├── resources
    ├── setting.png
    ├── wechat.jpeg
    ├── screenshot-20251209-181423.png
    ├── screenshot-20251210-120416.png
    ├── screenshot-20251210-120630.png
    ├── WECHAT.md
    ├── logo.svg
    ├── privacy_policy.txt
    └── privacy_policy_en.txt
├── phone_agent
    ├── model
    │   ├── __init__.py
    │   └── client.py
    ├── actions
    │   ├── __init__.py
    │   └── handler.py
    ├── __init__.py
    ├── adb
    │   ├── __init__.py
    │   ├── input.py
    │   ├── screenshot.py
    │   ├── device.py
    │   └── connection.py
    ├── hdc
    │   ├── __init__.py
    │   ├── screenshot.py
    │   ├── input.py
    │   ├── device.py
    │   └── connection.py
    ├── config
    │   ├── __init__.py
    │   ├── i18n.py
    │   ├── prompts_en.py
    │   ├── prompts.py
    │   ├── prompts_zh.py
    │   ├── timing.py
    │   ├── apps.py
    │   └── apps_harmonyos.py
    ├── device_factory.py
    └── agent.py
├── requirements.txt
├── .pre-commit-config.yaml
├── .gitignore
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature-request.yaml
    │   └── bug_report.yaml
    └── PULL_REQUEST_TEMPLATE.md
├── setup.py
├── examples
    ├── demo_thinking.py
    └── basic_usage.py
├── scripts
    ├── check_deployment_cn.py
    └── check_deployment_en.py
├── README_coding_agent.md
└── LICENSE


/resources/setting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/setting.png


--------------------------------------------------------------------------------
/resources/wechat.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/wechat.jpeg


--------------------------------------------------------------------------------
/resources/screenshot-20251209-181423.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251209-181423.png


--------------------------------------------------------------------------------
/resources/screenshot-20251210-120416.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251210-120416.png


--------------------------------------------------------------------------------
/resources/screenshot-20251210-120630.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251210-120630.png


--------------------------------------------------------------------------------
/phone_agent/model/__init__.py:
--------------------------------------------------------------------------------
1 | """Model client module for AI inference."""
2 | 
3 | from phone_agent.model.client import ModelClient, ModelConfig
4 | 
5 | __all__ = ["ModelClient", "ModelConfig"]
6 | 


--------------------------------------------------------------------------------
/phone_agent/actions/__init__.py:
--------------------------------------------------------------------------------
1 | """Action handling module for Phone Agent."""
2 | 
3 | from phone_agent.actions.handler import ActionHandler, ActionResult
4 | 
5 | __all__ = ["ActionHandler", "ActionResult"]
6 | 


--------------------------------------------------------------------------------
/resources/WECHAT.md:
--------------------------------------------------------------------------------
1 | <div align="center">
2 | <img src=wechat.jpeg width="60%"/>
3 | 
4 | <p> 扫码加入「Open-AutoGLM 交流群」 </p>
5 | <p> Scan the QR code to follow the official account and join the "Open-AutoGLM Discussion Group" </p>
6 | </div>
7 | 


--------------------------------------------------------------------------------
/phone_agent/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Phone Agent - An AI-powered phone automation framework.
 3 | 
 4 | This package provides tools for automating Android phone interactions
 5 | using AI models for visual understanding and decision making.
 6 | """
 7 | 
 8 | from phone_agent.agent import PhoneAgent
 9 | 
10 | __version__ = "0.1.0"
11 | __all__ = ["PhoneAgent"]
12 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | Pillow>=12.0.0
 2 | openai>=2.9.0
 3 | 
 4 | # For Model Deployment
 5 | 
 6 | ## After installing sglang or vLLM, please run pip install -U transformers again to upgrade to 5.0.0rc0.
 7 | ## Any dependency conflicts related to Transformers can be ignored.
 8 | 
 9 | # sglang>=0.5.6.post1
10 | # vllm>=0.12.0
11 | # transformers>=5.0.0rc0
12 | 
13 | # Optional: for development
14 | # pytest>=7.0.0
15 | # pre-commit>=4.5.0
16 | # black>=23.0.0
17 | # mypy>=1.0.0
18 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_install_hook_types:
 2 |   - pre-commit
 3 |   - commit-msg
 4 | exclude: '^phone_agent/config/apps\.py$'
 5 | exclude: '^README_en\.md$'
 6 | default_stages:
 7 |   - pre-commit # Run locally
 8 | repos:
 9 | - repo: https://github.com/astral-sh/ruff-pre-commit
10 |   rev: v0.11.7
11 |   hooks:
12 |   - id: ruff
13 |     args: [--output-format, github, --fix, --select, I]
14 |   - id: ruff-format
15 | - repo: https://github.com/crate-ci/typos
16 |   rev: v1.32.0
17 |   hooks:
18 |   - id: typos
19 | - repo: https://github.com/jackdewinter/pymarkdown
20 |   rev: v0.9.29
21 |   hooks:
22 |   - id: pymarkdown
23 |     args: [fix]
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual environments
24 | venv/
25 | ENV/
26 | env/
27 | .venv/
28 | 
29 | # IDE
30 | .idea/
31 | .vscode/
32 | *.swp
33 | *.swo
34 | *~
35 | 
36 | # Testing
37 | .pytest_cache/
38 | .coverage
39 | htmlcov/
40 | .tox/
41 | .nox/
42 | 
43 | # Type checking
44 | .mypy_cache/
45 | 
46 | # Jupyter
47 | .ipynb_checkpoints/
48 | 
49 | # OS
50 | .DS_Store
51 | Thumbs.db
52 | 
53 | # Project specific
54 | *.log
55 | /tmp/
56 | screenshots/
57 | 
58 | # Keep old files during transition
59 | call_model.py
60 | app_package_name.py
61 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.yaml:
--------------------------------------------------------------------------------
 1 | name: "\U0001F680 Feature request"
 2 | description: Submit a request for a new Open-AutoGLM feature / 提交一个新的 Open-AutoGLM 的功能建议
 3 | labels: [ "feature" ]
 4 | body:
 5 |   - type: textarea
 6 |     id: feature-request
 7 |     validations:
 8 |       required: true
 9 |     attributes:
10 |       label: Feature request  / 功能建议
11 |       description: |
12 |         A brief description of the functional proposal. Links to corresponding papers and code are desirable.
13 |         对功能建议的简述。最好提供对应的论文和代码链接
14 | 
15 |   - type: textarea
16 |     id: motivation
17 |     validations:
18 |       required: true
19 |     attributes:
20 |       label: Motivation / 动机
21 |       description: |
22 |         Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
23 |         您提出建议的动机。如果该动机与另一个 GitHub 问题有关，请在此处提供对应的链接。
24 | 
25 |   - type: textarea
26 |     id: contribution
27 |     validations:
28 |       required: true
29 |     attributes:
30 |       label: Your contribution / 您的贡献
31 |       description: |
32 | 
33 |         Your PR link or any other link you can help with.
34 |         您的PR链接或者其他您能提供帮助的链接。
35 | 


--------------------------------------------------------------------------------
/phone_agent/adb/__init__.py:
--------------------------------------------------------------------------------
 1 | """ADB utilities for Android device interaction."""
 2 | 
 3 | from phone_agent.adb.connection import (
 4 |     ADBConnection,
 5 |     ConnectionType,
 6 |     DeviceInfo,
 7 |     list_devices,
 8 |     quick_connect,
 9 | )
10 | from phone_agent.adb.device import (
11 |     back,
12 |     double_tap,
13 |     get_current_app,
14 |     home,
15 |     launch_app,
16 |     long_press,
17 |     swipe,
18 |     tap,
19 | )
20 | from phone_agent.adb.input import (
21 |     clear_text,
22 |     detect_and_set_adb_keyboard,
23 |     restore_keyboard,
24 |     type_text,
25 | )
26 | from phone_agent.adb.screenshot import get_screenshot
27 | 
28 | __all__ = [
29 |     # Screenshot
30 |     "get_screenshot",
31 |     # Input
32 |     "type_text",
33 |     "clear_text",
34 |     "detect_and_set_adb_keyboard",
35 |     "restore_keyboard",
36 |     # Device control
37 |     "get_current_app",
38 |     "tap",
39 |     "swipe",
40 |     "back",
41 |     "home",
42 |     "double_tap",
43 |     "long_press",
44 |     "launch_app",
45 |     # Connection management
46 |     "ADBConnection",
47 |     "DeviceInfo",
48 |     "ConnectionType",
49 |     "quick_connect",
50 |     "list_devices",
51 | ]
52 | 


--------------------------------------------------------------------------------
/phone_agent/hdc/__init__.py:
--------------------------------------------------------------------------------
 1 | """HDC utilities for HarmonyOS device interaction."""
 2 | 
 3 | from phone_agent.hdc.connection import (
 4 |     HDCConnection,
 5 |     ConnectionType,
 6 |     DeviceInfo,
 7 |     list_devices,
 8 |     quick_connect,
 9 |     set_hdc_verbose,
10 | )
11 | from phone_agent.hdc.device import (
12 |     back,
13 |     double_tap,
14 |     get_current_app,
15 |     home,
16 |     launch_app,
17 |     long_press,
18 |     swipe,
19 |     tap,
20 | )
21 | from phone_agent.hdc.input import (
22 |     clear_text,
23 |     detect_and_set_adb_keyboard,
24 |     restore_keyboard,
25 |     type_text,
26 | )
27 | from phone_agent.hdc.screenshot import get_screenshot
28 | 
29 | __all__ = [
30 |     # Screenshot
31 |     "get_screenshot",
32 |     # Input
33 |     "type_text",
34 |     "clear_text",
35 |     "detect_and_set_adb_keyboard",
36 |     "restore_keyboard",
37 |     # Device control
38 |     "get_current_app",
39 |     "tap",
40 |     "swipe",
41 |     "back",
42 |     "home",
43 |     "double_tap",
44 |     "long_press",
45 |     "launch_app",
46 |     # Connection management
47 |     "HDCConnection",
48 |     "DeviceInfo",
49 |     "ConnectionType",
50 |     "quick_connect",
51 |     "list_devices",
52 |     "set_hdc_verbose",
53 | ]
54 | 


--------------------------------------------------------------------------------
/phone_agent/config/__init__.py:
--------------------------------------------------------------------------------
 1 | """Configuration module for Phone Agent."""
 2 | 
 3 | from phone_agent.config.apps import APP_PACKAGES
 4 | from phone_agent.config.i18n import get_message, get_messages
 5 | from phone_agent.config.prompts_en import SYSTEM_PROMPT as SYSTEM_PROMPT_EN
 6 | from phone_agent.config.prompts_zh import SYSTEM_PROMPT as SYSTEM_PROMPT_ZH
 7 | from phone_agent.config.timing import (
 8 |     TIMING_CONFIG,
 9 |     ActionTimingConfig,
10 |     ConnectionTimingConfig,
11 |     DeviceTimingConfig,
12 |     TimingConfig,
13 |     get_timing_config,
14 |     update_timing_config,
15 | )
16 | 
17 | 
def get_system_prompt(lang: str = "cn") -> str:
    """
    Select the system prompt for the requested language.

    Args:
        lang: Language code. 'en' selects the English prompt; any other
            value (including the default 'cn') selects the Chinese prompt.

    Returns:
        System prompt string.
    """
    return SYSTEM_PROMPT_EN if lang == "en" else SYSTEM_PROMPT_ZH
31 | 
32 | 
33 | # Default to Chinese for backward compatibility
34 | SYSTEM_PROMPT = SYSTEM_PROMPT_ZH
35 | 
36 | __all__ = [
37 |     "APP_PACKAGES",
38 |     "SYSTEM_PROMPT",
39 |     "SYSTEM_PROMPT_ZH",
40 |     "SYSTEM_PROMPT_EN",
41 |     "get_system_prompt",
42 |     "get_messages",
43 |     "get_message",
44 |     "TIMING_CONFIG",
45 |     "TimingConfig",
46 |     "ActionTimingConfig",
47 |     "DeviceTimingConfig",
48 |     "ConnectionTimingConfig",
49 |     "get_timing_config",
50 |     "update_timing_config",
51 | ]
52 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | # Contribution Guide
 2 | 
 3 | We welcome your contributions to this repository. To ensure elegant code style and better code quality, we have prepared
 4 | the following contribution guidelines.
 5 | 
 6 | ## What We Accept
 7 | 
 8 | + This PR fixes a typo or improves the documentation (if this is the case, you may skip the other checks).
 9 | + This PR fixes a specific issue — please reference the issue number in the PR description. Make sure your code strictly
10 |   follows the coding standards below.
11 | + This PR introduces a new feature — please clearly explain the necessity and implementation of the feature. Make sure
12 |   your code strictly follows the coding standards below.
13 | 
14 | ## Code Style Guide
15 | 
16 | Good code style is an art. We have prepared a `pre-commit` hook to enforce consistent code
17 | formatting across the project. You can clean up your code following the steps below:
18 | 
19 | ```shell
20 | pre-commit run --all-files
21 | ```
22 | 
23 | If your code complies with the standards, you should not see any errors.
24 | 
25 | ## Naming Conventions
26 | 
27 | + Please use **English** for naming; do not use Pinyin or other languages. All comments should also be in English.
28 | + Follow **PEP8** naming conventions strictly, and use underscores to separate words. Avoid meaningless names such as
29 |   `a`, `b`, `c`.
30 | 
31 | ## For glmv-reward Contributors
32 | 
33 | Before opening a PR, please run:
34 | 
35 | ```bash
36 | cd glmv-reward/
37 | uv sync
38 | uv run poe lint
39 | uv run poe typecheck
40 | ```
41 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Setup script for Phone Agent."""
 3 | 
 4 | from setuptools import find_packages, setup
 5 | 
 6 | with open("README.md", "r", encoding="utf-8") as f:
 7 |     long_description = f.read()
 8 | 
 9 | setup(
10 |     name="phone-agent",
11 |     version="0.1.0",
12 |     author="Zhipu AI",
13 |     author_email="",
14 |     description="AI-powered phone automation framework",
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     url="https://github.com/yourusername/phone-agent",
18 |     packages=find_packages(),
19 |     classifiers=[
20 |         "Development Status :: 3 - Alpha",
21 |         "Intended Audience :: Developers",
22 |         "License :: OSI Approved :: Apache Software License",
23 |         "Operating System :: OS Independent",
24 |         "Programming Language :: Python :: 3",
25 |         "Programming Language :: Python :: 3.10",
26 |         "Programming Language :: Python :: 3.11",
27 |         "Programming Language :: Python :: 3.12",
28 |         "Topic :: Software Development :: Libraries :: Python Modules",
29 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
30 |     ],
31 |     python_requires=">=3.10",
32 |     install_requires=[
33 |         "Pillow>=12.0.0",
34 |         "openai>=2.9.0",
35 |     ],
36 |     extras_require={
37 |         "dev": [
38 |             "pytest>=7.0.0",
39 |             "black>=23.0.0",
40 |             "mypy>=1.0.0",
41 |             "ruff>=0.1.0",
42 |         ],
43 |     },
44 |     entry_points={
45 |         "console_scripts": [
46 |             "phone-agent=main:main",
47 |         ],
48 |     },
49 | )
50 | 


--------------------------------------------------------------------------------
/examples/demo_thinking.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Thinking Output Demo / 演示 thinking 输出的示例
 4 | 
 5 | This script demonstrates how the Agent outputs both thinking process and actions in verbose mode.
 6 | 这个脚本展示了在 verbose 模式下，Agent 会同时输出思考过程和执行动作。
 7 | """
 8 | 
 9 | from phone_agent import PhoneAgent
10 | from phone_agent.agent import AgentConfig
11 | from phone_agent.config import get_messages
12 | from phone_agent.model import ModelConfig
13 | 
14 | 
def main(lang: str = "cn"):
    """
    Run the thinking demo: build an agent, execute one task, print the result.

    Args:
        lang: Language code for UI messages, 'cn' for Chinese, 'en' for English.
    """
    ui_text = get_messages(lang)
    divider = "=" * 60

    print(divider)
    print("Phone Agent - Thinking Demo")
    print(divider)

    # The model endpoint is a server on localhost; verbose=True enables
    # detailed output from the agent.
    agent = PhoneAgent(
        model_config=ModelConfig(
            base_url="http://localhost:8000/v1",
            model_name="autoglm-phone-9b",
            temperature=0.1,
        ),
        agent_config=AgentConfig(
            max_steps=10,
            verbose=True,
            lang=lang,
        ),
    )

    # Run the hard-coded demo task and report what the agent returned.
    print(f"\n📱 {ui_text['starting_task']}...\n")
    result = agent.run("打开小红书搜索美食攻略")

    print("\n" + divider)
    print(f"📊 {ui_text['final_result']}: {result}")
    print(divider)
49 | 
50 | 
if __name__ == "__main__":
    import argparse

    # The UI language is the only command-line option of this demo.
    cli = argparse.ArgumentParser(description="Phone Agent Thinking Demo")
    cli.add_argument(
        "--lang",
        type=str,
        default="cn",
        choices=["cn", "en"],
        help="Language for UI messages (cn=Chinese, en=English)",
    )

    main(lang=cli.parse_args().lang)
65 | 


--------------------------------------------------------------------------------
/phone_agent/config/i18n.py:
--------------------------------------------------------------------------------
 1 | """Internationalization (i18n) module for Phone Agent UI messages."""
 2 | 
# Chinese UI messages, keyed by message identifier.
# The key set mirrors MESSAGES_EN so every message exists in both languages;
# get_message() falls back to the key itself when a key is missing.
MESSAGES_ZH = {
    "thinking": "思考过程",
    "action": "执行动作",
    "task_completed": "任务完成",
    "done": "完成",
    "starting_task": "开始执行任务",
    "final_result": "最终结果",
    "task_result": "任务结果",
    "confirmation_required": "需要确认",
    "continue_prompt": "是否继续？(y/n)",
    "manual_operation_required": "需要人工操作",
    "manual_operation_hint": "请手动完成操作...",
    "press_enter_when_done": "完成后按回车继续",
    "connection_failed": "连接失败",
    "connection_successful": "连接成功",
    "step": "步骤",
    "task": "任务",
    "result": "结果",
    "performance_metrics": "性能指标",
    "time_to_first_token": "首 Token 延迟 (TTFT)",
    "time_to_thinking_end": "思考完成延迟",
    "total_inference_time": "总推理时间",
}
27 | 
# English UI messages, keyed by message identifier.
# The key set mirrors MESSAGES_ZH so every message exists in both languages;
# get_message() falls back to the key itself when a key is missing.
MESSAGES_EN = {
    "thinking": "Thinking",
    "action": "Action",
    "task_completed": "Task Completed",
    "done": "Done",
    "starting_task": "Starting task",
    "final_result": "Final Result",
    "task_result": "Task Result",
    "confirmation_required": "Confirmation Required",
    "continue_prompt": "Continue? (y/n)",
    "manual_operation_required": "Manual Operation Required",
    "manual_operation_hint": "Please complete the operation manually...",
    "press_enter_when_done": "Press Enter when done",
    "connection_failed": "Connection Failed",
    "connection_successful": "Connection Successful",
    "step": "Step",
    "task": "Task",
    "result": "Result",
    "performance_metrics": "Performance Metrics",
    "time_to_first_token": "Time to First Token (TTFT)",
    "time_to_thinking_end": "Time to Thinking End",
    "total_inference_time": "Total Inference Time",
}
52 | 
53 | 
def get_messages(lang: str = "cn") -> dict:
    """
    Look up the UI message table for a language.

    Args:
        lang: Language code. 'en' selects the English table; any other
            value (including the default 'cn') selects the Chinese table.

    Returns:
        The module-level message dictionary itself (not a copy).
    """
    return MESSAGES_EN if lang == "en" else MESSAGES_ZH
67 | 
68 | 
def get_message(key: str, lang: str = "cn") -> str:
    """
    Fetch one UI message for the given key and language.

    Args:
        key: Message key.
        lang: Language code, 'cn' for Chinese, 'en' for English.

    Returns:
        The translated message, or ``key`` itself when the selected
        language table has no entry for it.
    """
    table = get_messages(lang)
    try:
        return table[key]
    except KeyError:
        # Unknown keys are echoed back instead of raising.
        return key
82 | 


--------------------------------------------------------------------------------
/phone_agent/config/prompts_en.py:
--------------------------------------------------------------------------------
 1 | """System prompts for the AI agent."""
 2 | 
 3 | from datetime import datetime
 4 | 
# NOTE: both values below are computed once, when this module is imported.
# The date embedded in SYSTEM_PROMPT is therefore the import date and is not
# refreshed for the lifetime of the process.
today = datetime.today()
# ISO date followed by the weekday name, e.g. "2025-12-10, Wednesday"
# (the weekday name produced by %A follows the active locale).
formatted_date = today.strftime("%Y-%m-%d, %A")

# English system prompt: the current date, followed by the required
# <think>/<answer> response format and the list of supported actions.
SYSTEM_PROMPT = (
    "The current date: "
    + formatted_date
    + """
# Setup
You are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.

# More details about the code
Your response format must be structured as follows:

Think first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.
Provide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.

Your output should STRICTLY follow the format:
<think>
[Your thought]
</think>
<answer>
[Your operation code]
</answer>

- **Tap**
  Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.
  **Example**:
  <answer>
  do(action="Tap", element=[x,y])
  </answer>
- **Type**
  Enter text into the currently focused input field.
  **Example**:
  <answer>
  do(action="Type", text="Hello World")
  </answer>
- **Swipe**
  Perform a swipe action with start point and end point.
  **Examples**:
  <answer>
  do(action="Swipe", start=[x1,y1], end=[x2,y2])
  </answer>
- **Long Press**
  Perform a long press action on a specified screen area.
  You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.
  **Example**:
  <answer>
  do(action="Long Press", element=[x,y])
  </answer>
- **Launch**
  Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.
  **Example**:
  <answer>
  do(action="Launch", app="Settings")
  </answer>
- **Back**
  Press the Back button to navigate to the previous screen.
  **Example**:
  <answer>
  do(action="Back")
  </answer>
- **Finish**
  Terminate the program and optionally print a message.
  **Example**:
  <answer>
  finish(message="Task completed.")
  </answer>


REMEMBER:
- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in <think> part.
- Only ONE LINE of action in <answer> part per response: Each step must contain exactly one line of executable code.
- Generate execution code strictly according to format requirements.
"""
)
80 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
 1 | name: "\U0001F41B Bug Report"
 2 | description: Submit a bug report to help us improve Open-AutoGLM / 提交一个 Bug 问题报告来帮助我们改进 Open-AutoGLM
 3 | body:
 4 |   - type: textarea
 5 |     id: system-info
 6 |     attributes:
 7 |       label: System Info / 系统信息
 8 |       description: Your operating environment / 您的运行环境信息
 9 |       placeholder: Includes Cuda version, Transformers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本，Transformers版本，Python版本，操作系统，硬件信息(如果您怀疑是硬件方面的问题)...
10 |     validations:
11 |       required: true
12 | 
13 |   - type: textarea
14 |     id: who-can-help
15 |     attributes:
16 |       label: Who can help? / 谁可以帮助到您？
17 |       description: |
18 |         Your issue will be replied to more quickly if you can figure out the right person to tag with @
19 |         All issues are read by one of the maintainers, so if you don't know who to tag, just leave this blank and our maintainer will ping the right person.
20 | 
21 |         Please tag fewer than 3 people.
22 | 
23 |         如果您能找到合适的标签 @，您的问题会更快得到回复。
24 |         所有问题都会由我们的维护者阅读，如果您不知道该标记谁，只需留空，我们的维护人员会找到合适的开发组成员来解决问题。
25 | 
26 |         标记的人数应该不超过 3 个人。
27 | 
28 |         If it's not a bug in these three subsections, you may not specify the helper. Our maintainer will find the right person in the development group to solve the problem.
29 | 
30 |         如果不是这三个子版块的bug，您可以不指明帮助者，我们的维护人员会找到合适的开发组成员来解决问题。
31 | 
32 |       placeholder: "@Username ..."
33 | 
34 |   - type: checkboxes
35 |     id: information-scripts-examples
36 |     attributes:
37 |       label: Information / 问题信息
38 |       description: 'The problem arises when using: / 问题出现在'
39 |       options:
40 |         - label: "The official example scripts / 官方的示例脚本"
41 |         - label: "My own modified scripts / 我自己修改的脚本和任务"
42 | 
43 |   - type: textarea
44 |     id: reproduction
45 |     validations:
46 |       required: true
47 |     attributes:
48 |       label: Reproduction / 复现过程
49 |       description: |
50 |         Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
51 |         If you have code snippets, error messages, stack traces, please provide them here as well.
52 |         Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
53 |         Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.
54 | 
55 |         请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
56 |         如果您有代码片段、错误信息、堆栈跟踪，也请在此提供。
57 |         请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
58 |         请勿使用截图，因为截图难以阅读，而且（更重要的是）不允许他人复制粘贴您的代码。
59 |       placeholder: |
60 |         Steps to reproduce the behavior/复现Bug的步骤:
61 | 
62 |           1.
63 |           2.
64 |           3.
65 | 
66 |   - type: textarea
67 |     id: expected-behavior
68 |     validations:
69 |       required: true
70 |     attributes:
71 |       label: Expected behavior / 期待表现
72 |       description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。"
73 | 


--------------------------------------------------------------------------------
/phone_agent/adb/input.py:
--------------------------------------------------------------------------------
  1 | """Input utilities for Android device text input."""
  2 | 
  3 | import base64
  4 | import subprocess
  5 | from typing import Optional
  6 | 
  7 | 
  8 | def type_text(text: str, device_id: str | None = None) -> None:
  9 |     """
 10 |     Type text into the currently focused input field using ADB Keyboard.
 11 | 
 12 |     Args:
 13 |         text: The text to type.
 14 |         device_id: Optional ADB device ID for multi-device setups.
 15 | 
 16 |     Note:
 17 |         Requires ADB Keyboard to be installed on the device.
 18 |         See: https://github.com/nicnocquee/AdbKeyboard
 19 |     """
 20 |     adb_prefix = _get_adb_prefix(device_id)
 21 |     encoded_text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
 22 | 
 23 |     subprocess.run(
 24 |         adb_prefix
 25 |         + [
 26 |             "shell",
 27 |             "am",
 28 |             "broadcast",
 29 |             "-a",
 30 |             "ADB_INPUT_B64",
 31 |             "--es",
 32 |             "msg",
 33 |             encoded_text,
 34 |         ],
 35 |         capture_output=True,
 36 |         text=True,
 37 |     )
 38 | 
 39 | 
 40 | def clear_text(device_id: str | None = None) -> None:
 41 |     """
 42 |     Clear text in the currently focused input field.
 43 | 
 44 |     Args:
 45 |         device_id: Optional ADB device ID for multi-device setups.
 46 |     """
 47 |     adb_prefix = _get_adb_prefix(device_id)
 48 | 
 49 |     subprocess.run(
 50 |         adb_prefix + ["shell", "am", "broadcast", "-a", "ADB_CLEAR_TEXT"],
 51 |         capture_output=True,
 52 |         text=True,
 53 |     )
 54 | 
 55 | 
 56 | def detect_and_set_adb_keyboard(device_id: str | None = None) -> str:
 57 |     """
 58 |     Detect current keyboard and switch to ADB Keyboard if needed.
 59 | 
 60 |     Args:
 61 |         device_id: Optional ADB device ID for multi-device setups.
 62 | 
 63 |     Returns:
 64 |         The original keyboard IME identifier for later restoration.
 65 |     """
 66 |     adb_prefix = _get_adb_prefix(device_id)
 67 | 
 68 |     # Get current IME
 69 |     result = subprocess.run(
 70 |         adb_prefix + ["shell", "settings", "get", "secure", "default_input_method"],
 71 |         capture_output=True,
 72 |         text=True,
 73 |     )
 74 |     current_ime = (result.stdout + result.stderr).strip()
 75 | 
 76 |     # Switch to ADB Keyboard if not already set
 77 |     if "com.android.adbkeyboard/.AdbIME" not in current_ime:
 78 |         subprocess.run(
 79 |             adb_prefix + ["shell", "ime", "set", "com.android.adbkeyboard/.AdbIME"],
 80 |             capture_output=True,
 81 |             text=True,
 82 |         )
 83 | 
 84 |     # Warm up the keyboard
 85 |     type_text("", device_id)
 86 | 
 87 |     return current_ime
 88 | 
 89 | 
 90 | def restore_keyboard(ime: str, device_id: str | None = None) -> None:
 91 |     """
 92 |     Restore the original keyboard IME.
 93 | 
 94 |     Args:
 95 |         ime: The IME identifier to restore.
 96 |         device_id: Optional ADB device ID for multi-device setups.
 97 |     """
 98 |     adb_prefix = _get_adb_prefix(device_id)
 99 | 
100 |     subprocess.run(
101 |         adb_prefix + ["shell", "ime", "set", ime], capture_output=True, text=True
102 |     )
103 | 
104 | 
105 | def _get_adb_prefix(device_id: str | None) -> list:
106 |     """Get ADB command prefix with optional device specifier."""
107 |     if device_id:
108 |         return ["adb", "-s", device_id]
109 |     return ["adb"]
110 | 


--------------------------------------------------------------------------------
/phone_agent/adb/screenshot.py:
--------------------------------------------------------------------------------
  1 | """Screenshot utilities for capturing Android device screen."""
  2 | 
  3 | import base64
  4 | import os
  5 | import subprocess
  6 | import tempfile
  7 | import uuid
  8 | from dataclasses import dataclass
  9 | from io import BytesIO
 10 | from typing import Tuple
 11 | 
 12 | from PIL import Image
 13 | 
 14 | 
 15 | @dataclass
 16 | class Screenshot:
 17 |     """Represents a captured screenshot."""
 18 | 
 19 |     base64_data: str
 20 |     width: int
 21 |     height: int
 22 |     is_sensitive: bool = False
 23 | 
 24 | 
 25 | def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screenshot:
 26 |     """
 27 |     Capture a screenshot from the connected Android device.
 28 | 
 29 |     Args:
 30 |         device_id: Optional ADB device ID for multi-device setups.
 31 |         timeout: Timeout in seconds for screenshot operations.
 32 | 
 33 |     Returns:
 34 |         Screenshot object containing base64 data and dimensions.
 35 | 
 36 |     Note:
 37 |         If the screenshot fails (e.g., on sensitive screens like payment pages),
 38 |         a black fallback image is returned with is_sensitive=True.
 39 |     """
 40 |     temp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4()}.png")
 41 |     adb_prefix = _get_adb_prefix(device_id)
 42 | 
 43 |     try:
 44 |         # Execute screenshot command
 45 |         result = subprocess.run(
 46 |             adb_prefix + ["shell", "screencap", "-p", "/sdcard/tmp.png"],
 47 |             capture_output=True,
 48 |             text=True,
 49 |             timeout=timeout,
 50 |         )
 51 | 
 52 |         # Check for screenshot failure (sensitive screen)
 53 |         output = result.stdout + result.stderr
 54 |         if "Status: -1" in output or "Failed" in output:
 55 |             return _create_fallback_screenshot(is_sensitive=True)
 56 | 
 57 |         # Pull screenshot to local temp path
 58 |         subprocess.run(
 59 |             adb_prefix + ["pull", "/sdcard/tmp.png", temp_path],
 60 |             capture_output=True,
 61 |             text=True,
 62 |             timeout=5,
 63 |         )
 64 | 
 65 |         if not os.path.exists(temp_path):
 66 |             return _create_fallback_screenshot(is_sensitive=False)
 67 | 
 68 |         # Read and encode image
 69 |         img = Image.open(temp_path)
 70 |         width, height = img.size
 71 | 
 72 |         buffered = BytesIO()
 73 |         img.save(buffered, format="PNG")
 74 |         base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
 75 | 
 76 |         # Cleanup
 77 |         os.remove(temp_path)
 78 | 
 79 |         return Screenshot(
 80 |             base64_data=base64_data, width=width, height=height, is_sensitive=False
 81 |         )
 82 | 
 83 |     except Exception as e:
 84 |         print(f"Screenshot error: {e}")
 85 |         return _create_fallback_screenshot(is_sensitive=False)
 86 | 
 87 | 
 88 | def _get_adb_prefix(device_id: str | None) -> list:
 89 |     """Get ADB command prefix with optional device specifier."""
 90 |     if device_id:
 91 |         return ["adb", "-s", device_id]
 92 |     return ["adb"]
 93 | 
 94 | 
 95 | def _create_fallback_screenshot(is_sensitive: bool) -> Screenshot:
 96 |     """Create a black fallback image when screenshot fails."""
 97 |     default_width, default_height = 1080, 2400
 98 | 
 99 |     black_img = Image.new("RGB", (default_width, default_height), color="black")
100 |     buffered = BytesIO()
101 |     black_img.save(buffered, format="PNG")
102 |     base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
103 | 
104 |     return Screenshot(
105 |         base64_data=base64_data,
106 |         width=default_width,
107 |         height=default_height,
108 |         is_sensitive=is_sensitive,
109 |     )
110 | 


--------------------------------------------------------------------------------
/scripts/check_deployment_cn.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | 
  5 | from openai import OpenAI
  6 | 
  7 | if __name__ == "__main__":
  8 |     parser = argparse.ArgumentParser(
  9 |         description="检查模型部署是否成功的工具",
 10 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 11 |         epilog="""
 12 | 使用示例:
 13 |   python scripts/check_deployment_cn.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b
 14 |   python scripts/check_deployment_cn.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b --messages-file custom.json
 15 |         """,
 16 |     )
 17 | 
 18 |     parser.add_argument(
 19 |         "--base-url",
 20 |         type=str,
 21 |         required=True,
 22 |         help="API 服务的 base URL，例如: http://localhost:8000/v1",
 23 |     )
 24 | 
 25 |     parser.add_argument(
 26 |         "--apikey", type=str, default="EMPTY", help="API 密钥 (默认: EMPTY)"
 27 |     )
 28 | 
 29 |     parser.add_argument(
 30 |         "--model",
 31 |         type=str,
 32 |         required=True,
 33 |         help="要测试的模型名称，例如: autoglm-phone-9b",
 34 |     )
 35 | 
 36 |     parser.add_argument(
 37 |         "--messages-file",
 38 |         type=str,
 39 |         default="scripts/sample_messages.json",
 40 |         help="包含测试消息的 JSON 文件路径 (默认: scripts/sample_messages.json)",
 41 |     )
 42 | 
 43 |     parser.add_argument(
 44 |         "--max-tokens", type=int, default=3000, help="最大生成 token 数 (默认: 3000)"
 45 |     )
 46 | 
 47 |     parser.add_argument(
 48 |         "--temperature", type=float, default=0.0, help="采样温度 (默认: 0.0)"
 49 |     )
 50 | 
 51 |     parser.add_argument(
 52 |         "--top_p", type=float, default=0.85, help="nucleus sampling 参数 (默认: 0.85)"
 53 |     )
 54 | 
 55 |     parser.add_argument(
 56 |         "--frequency_penalty", type=float, default=0.2, help="频率惩罚参数 (默认: 0.2)"
 57 |     )
 58 | 
 59 |     args = parser.parse_args()
 60 | 
 61 |     # 读取测试消息
 62 |     if not os.path.exists(args.messages_file):
 63 |         print(f"错误: 消息文件 {args.messages_file} 不存在")
 64 |         exit(1)
 65 | 
 66 |     with open(args.messages_file) as f:
 67 |         messages = json.load(f)
 68 | 
 69 |     base_url = args.base_url
 70 |     api_key = args.apikey
 71 |     model = args.model
 72 | 
 73 |     print(f"开始测试模型推理...")
 74 |     print(f"Base URL: {base_url}")
 75 |     print(f"Model: {model}")
 76 |     print(f"Messages file: {args.messages_file}")
 77 |     print("=" * 80)
 78 | 
 79 |     try:
 80 |         client = OpenAI(
 81 |             base_url=base_url,
 82 |             api_key=api_key,
 83 |         )
 84 | 
 85 |         response = client.chat.completions.create(
 86 |             messages=messages,
 87 |             model=model,
 88 |             max_tokens=args.max_tokens,
 89 |             temperature=args.temperature,
 90 |             top_p=args.top_p,
 91 |             frequency_penalty=args.frequency_penalty,
 92 |             stream=False,
 93 |         )
 94 | 
 95 |         print("\n模型推理结果:")
 96 |         print("=" * 80)
 97 |         print(response.choices[0].message.content)
 98 |         print("=" * 80)
 99 | 
100 |         if response.usage:
101 |             print(f"\n统计信息:")
102 |             print(f"  - Prompt tokens: {response.usage.prompt_tokens}")
103 |             print(f"  - Completion tokens: {response.usage.completion_tokens}")
104 |             print(f"  - Total tokens: {response.usage.total_tokens}")
105 | 
106 |         print(f"\n请根据上述推理结果判断模型部署是否符合预期。")
107 | 
108 |     except Exception as e:
109 |         print(f"\n调用 API 时发生错误:")
110 |         print(f"错误类型: {type(e).__name__}")
111 |         print(f"错误信息: {str(e)}")
112 |         print(
113 |             "\n提示: 请检查 base_url、api_key 和 model 参数是否正确，以及服务是否正在运行。"
114 |         )
115 |         exit(1)
116 | 


--------------------------------------------------------------------------------
/phone_agent/config/prompts.py:
--------------------------------------------------------------------------------
 1 | """System prompts for the AI agent."""
 2 | 
 3 | from datetime import datetime
 4 | 
# Evaluated once at import time, so the date embedded in SYSTEM_PROMPT is
# fixed for the lifetime of the process.
today = datetime.today()
formatted_date = today.strftime("%Y年%m月%d日")  # e.g. "2025年12月09日"
 7 | 
 8 | SYSTEM_PROMPT = (
 9 |     "今天的日期是: "
10 |     + formatted_date
11 |     + """
12 | 你是一个智能体分析专家，可以根据操作历史和当前状态图执行一系列操作来完成任务。
13 | 你必须严格按照要求输出以下格式：
14 | <think>{think}</think>
15 | <answer>{action}</answer>
16 | 
17 | 其中：
18 | - {think} 是对你为什么选择这个操作的简短推理说明。
19 | - {action} 是本次执行的具体操作指令，必须严格遵循下方定义的指令格式。
20 | 
21 | 操作指令及其作用如下：
22 | - do(action="Launch", app="xxx")  
23 |     Launch是启动目标app的操作，这比通过主屏幕导航更快。此操作完成后，您将自动收到结果状态的截图。
24 | - do(action="Tap", element=[x,y])  
25 |     Tap是点击操作，点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序，或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的截图。
26 | - do(action="Tap", element=[x,y], message="重要操作")  
27 |     基本功能同Tap，点击涉及财产、支付、隐私等敏感按钮时触发。
28 | - do(action="Type", text="xxx")  
29 |     Type是输入操作，在当前聚焦的输入框中输入文本。使用此操作前，请确保输入框已被聚焦（先点击它）。输入的文本将像使用键盘输入一样输入。重要提示：手机可能正在使用 ADB 键盘，该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活，请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本，或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本：当你使用输入操作时，输入框中现有的任何文本（包括占位符文本和实际输入）都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后，你将自动收到结果状态的截图。
30 | - do(action="Type_Name", text="xxx")  
31 |     Type_Name是输入人名的操作，基本功能同Type。
32 | - do(action="Interact")  
33 |     Interact是当有多个满足条件的选项时而触发的交互操作，询问用户如何选择。
34 | - do(action="Swipe", start=[x1,y1], end=[x2,y2])  
35 |     Swipe是滑动操作，通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后，您将自动收到结果状态的截图。
36 | - do(action="Note", message="True")  
37 |     记录当前页面内容以便后续总结。
38 | - do(action="Call_API", instruction="xxx")  
39 |     总结或评论当前页面或已记录的内容。
40 | - do(action="Long Press", element=[x,y])  
41 |     Long Pres是长按操作，在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的屏幕截图。
42 | - do(action="Double Tap", element=[x,y])  
43 |     Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互，如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的截图。
44 | - do(action="Take_over", message="xxx")  
45 |     Take_over是接管操作，表示在登录和验证阶段需要用户协助。
46 | - do(action="Back")  
47 |     导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后，您将自动收到结果状态的截图。
48 | - do(action="Home") 
49 |     Home是回到系统桌面的操作，相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器，或从已知状态启动新任务。此操作完成后，您将自动收到结果状态的截图。
50 | - do(action="Wait", duration="x seconds")  
51 |     等待页面加载，x为需要等待多少秒。
52 | - finish(message="xxx")  
53 |     finish是结束任务的操作，表示准确完整完成任务，message是终止信息。 
54 | 
55 | 必须遵循的规则：
56 | 1. 在执行任何操作前，先检查当前app是否是目标app，如果不是，先执行 Launch。
57 | 2. 如果进入到了无关页面，先执行 Back。如果执行Back后页面没有变化，请点击页面左上角的返回键进行返回，或者右上角的X号关闭。
58 | 3. 如果页面未加载出内容，最多连续 Wait 三次，否则执行 Back重新进入。
59 | 4. 如果页面显示网络问题，需要重新加载，请点击重新加载。
60 | 5. 如果当前页面找不到目标联系人、商品、店铺等信息，可以尝试 Swipe 滑动查找。
61 | 6. 遇到价格区间、时间区间等筛选条件，如果没有完全符合的，可以放宽要求。
62 | 7. 在做小红书总结类任务时一定要筛选图文笔记。
63 | 8. 购物车全选后再点击全选可以把状态设为全不选，在做购物车任务时，如果购物车里已经有商品被选中时，你需要点击全选后再点击取消全选，再去找需要购买或者删除的商品。
64 | 9. 在做外卖任务时，如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。
65 | 10. 在做点外卖任务时，如果用户需要点多个外卖，请尽量在同一店铺进行购买，如果无法找到可以下单，并说明某个商品未找到。
66 | 11. 请严格遵循用户意图执行任务，用户的特殊要求可以执行多次搜索，滑动查找。比如（i）用户要求点一杯咖啡，要咸的，你可以直接搜索咸咖啡，或者搜索咖啡后滑动查找咸的咖啡，比如海盐咖啡。（ii）用户要找到XX群，发一条消息，你可以先搜索XX群，找不到结果后，将"群"字去掉，搜索XX重试。（iii）用户要找到宠物友好的餐厅，你可以搜索餐厅，找到筛选，找到设施，选择可带宠物，或者直接搜索可带宠物，必要时可以使用AI搜索。
67 | 12. 在选择日期时，如果原滑动方向与预期日期越来越远，请向反方向滑动查找。
68 | 13. 执行任务过程中如果有多个可选择的项目栏，请逐个查找每个项目栏，直到完成任务，一定不要在同一项目栏多次查找，从而陷入死循环。
69 | 14. 在执行下一步操作前请一定要检查上一步的操作是否生效，如果点击没生效，可能因为app反应较慢，请先稍微等待一下，如果还是不生效请调整一下点击位置重试，如果仍然不生效请跳过这一步继续任务，并在finish message说明点击不生效。
70 | 15. 在执行任务中如果遇到滑动不生效的情况，请调整一下起始点位置，增大滑动距离重试，如果还是不生效，有可能是已经滑到底了，请继续向反方向滑动，直到顶部或底部，如果仍然没有符合要求的结果，请跳过这一步继续任务，并在finish message说明但没找到要求的项目。
71 | 16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗，如果多轮历史状态相似要检查自动战斗是否开启。
72 | 17. 如果没有合适的搜索结果，可能是因为搜索页面不对，请返回到搜索页面的上一级尝试重新搜索，如果尝试三次返回上一级搜索后仍然没有符合要求的结果，执行 finish(message="原因")。
73 | 18. 在结束任务前请一定要仔细检查任务是否完整准确的完成，如果出现错选、漏选、多选的情况，请返回之前的步骤进行纠正。
74 | """
75 | )
76 | 


--------------------------------------------------------------------------------
/resources/logo.svg:
--------------------------------------------------------------------------------
 1 | 
 2 | <svg width="1024" height="1024" viewBox="0 0 1024 1024" fill="none" xmlns="http://www.w3.org/2000/svg">
 3 | <rect width="1024" height="1024" fill="#FFF0DA"/>
 4 | <path d="M881.677 400.562C887.211 401.088 892.243 403.993 895.465 408.523C899.163 413.721 898.933 422.878 898.472 441.191L895.841 545.774C895.231 570.027 894.926 582.154 890.07 587.442C885.859 592.027 879.682 594.277 873.509 593.476C866.389 592.552 858.354 583.464 842.284 565.288L840.972 563.803L824.65 592.072C814.046 610.439 790.561 616.732 772.194 606.128C753.828 595.524 747.536 572.039 758.14 553.673L774.463 525.399L772.506 525.002C748.73 520.173 736.842 517.758 732.481 512.054C728.701 507.108 727.562 500.634 729.427 494.695C731.578 487.845 741.928 481.518 762.627 468.863L851.883 414.293C867.512 404.738 875.327 399.96 881.677 400.562Z" fill="#792200"/>
 5 | <path d="M377.549 258.56C426.829 258.56 466.487 272.875 496.525 301.504C526.562 330.134 548.152 367.211 561.293 412.736L609.869 325.44C613.624 318.4 619.021 312.768 626.061 308.544C633.57 303.851 641.549 301.504 649.997 301.504C662.669 301.504 673.464 305.963 682.381 314.88C691.298 323.798 695.757 334.357 695.757 346.56C695.757 354.539 692.941 364.16 687.309 375.424C639.906 460.843 604.706 522.326 581.709 559.873C592.504 629.803 612.92 664.768 642.957 664.768C648.589 664.768 653.517 663.829 657.741 661.952C661.965 660.074 667.597 657.259 674.637 653.504C679.33 650.688 684.493 648.342 690.125 646.464C695.757 644.118 701.154 642.944 706.317 642.944C718.519 642.944 729.08 647.637 737.997 657.024C747.384 666.411 752.077 676.737 752.077 688C752.077 700.203 746.445 711.702 735.181 722.497C724.386 732.822 710.072 741.035 692.237 747.136C674.872 753.238 657.037 756.289 638.733 756.289C581.474 756.288 539.938 723.669 514.125 658.432C454.989 729.771 389.048 765.44 316.301 765.44C279.224 765.44 246.371 756.523 217.741 738.688C189.112 720.853 166.818 695.979 150.861 664.064C134.903 632.149 126.925 595.306 126.925 553.536C126.925 505.194 137.25 458.261 157.901 412.736C179.021 367.211 208.589 330.134 246.605 301.504C285.09 272.875 328.738 258.56 377.549 258.56ZM376.141 350.08C347.042 350.08 320.29 359.701 295.885 378.944C271.949 398.187 252.941 423.531 238.861 454.976C225.25 486.421 218.445 519.275 218.445 553.536C218.445 591.552 227.128 621.12 244.493 642.24C261.859 663.36 285.794 673.92 316.301 673.92C343.522 673.92 368.397 665.941 390.925 649.984C413.453 634.027 434.808 612.438 454.989 585.216C464.845 571.606 475.405 556.118 486.669 538.752L483.149 497.216C479.863 452.161 469.773 416.491 452.877 390.208C435.981 363.457 410.402 350.08 376.141 350.08Z" fill="#FF5C1C"/>
 6 | <path d="M582.1 559.232C581.969 559.446 581.839 559.66 581.709 559.873C592.504 629.803 612.92 664.768 642.957 664.768C647.505 664.768 651.592 664.154 655.222 662.93L611.743 753.637C568.23 744.668 535.691 712.934 514.125 658.432C513.738 658.899 513.349 659.364 512.961 659.828L512.875 659.632C539.5 625.342 554.855 603.541 582.044 559.157L582.1 559.232Z" fill="url(#paint0_linear_2455_8751)"/>
 7 | <path d="M486.002 539.817L485.989 539.796C486.177 539.508 486.365 539.219 486.553 538.929C486.37 539.224 486.187 539.521 486.002 539.817ZM470.039 281.417C479.571 287.067 488.4 293.761 496.525 301.504C526.557 330.129 548.144 367.199 561.286 412.714C534.864 460.487 517.929 488.515 486.668 538.745L483.149 497.216C479.863 452.161 469.773 416.491 452.877 390.208C443.849 375.915 432.342 365.44 418.357 358.784L470.039 281.417Z" fill="url(#paint1_linear_2455_8751)"/>
 8 | <defs>
 9 | <linearGradient id="paint0_linear_2455_8751" x1="546.263" y1="596.665" x2="620.57" y2="716.515" gradientUnits="userSpaceOnUse">
10 | <stop stop-color="#E64404"/>
11 | <stop offset="1" stop-color="#FF5C1C"/>
12 | </linearGradient>
13 | <linearGradient id="paint1_linear_2455_8751" x1="549.179" y1="473.234" x2="429.329" y2="398.926" gradientUnits="userSpaceOnUse">
14 | <stop stop-color="#F34500"/>
15 | <stop offset="1" stop-color="#FF5C1C"/>
16 | </linearGradient>
17 | </defs>
18 | </svg>
19 | 


--------------------------------------------------------------------------------
/phone_agent/config/prompts_zh.py:
--------------------------------------------------------------------------------
 1 | """System prompts for the AI agent."""
 2 | 
 3 | from datetime import datetime
 4 | 
# Evaluated once at import time, so the date and weekday embedded in
# SYSTEM_PROMPT are fixed for the lifetime of the process.
today = datetime.today()
# Ordered to match datetime.weekday(), which returns 0 for Monday .. 6 for Sunday.
weekday_names = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
weekday = weekday_names[today.weekday()]
# Date followed by a space and the weekday name, e.g. "2025年12月09日 星期二".
formatted_date = today.strftime("%Y年%m月%d日") + " " + weekday
 9 | 
10 | SYSTEM_PROMPT = (
11 |     "今天的日期是: "
12 |     + formatted_date
13 |     + """
14 | 你是一个智能体分析专家，可以根据操作历史和当前状态图执行一系列操作来完成任务。
15 | 你必须严格按照要求输出以下格式：
16 | <think>{think}</think>
17 | <answer>{action}</answer>
18 | 
19 | 其中：
20 | - {think} 是对你为什么选择这个操作的简短推理说明。
21 | - {action} 是本次执行的具体操作指令，必须严格遵循下方定义的指令格式。
22 | 
23 | 操作指令及其作用如下：
24 | - do(action="Launch", app="xxx")  
25 |     Launch是启动目标app的操作，这比通过主屏幕导航更快。此操作完成后，您将自动收到结果状态的截图。
26 | - do(action="Tap", element=[x,y])  
27 |     Tap是点击操作，点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序，或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的截图。
28 | - do(action="Tap", element=[x,y], message="重要操作")  
29 |     基本功能同Tap，点击涉及财产、支付、隐私等敏感按钮时触发。
30 | - do(action="Type", text="xxx")  
31 |     Type是输入操作，在当前聚焦的输入框中输入文本。使用此操作前，请确保输入框已被聚焦（先点击它）。输入的文本将像使用键盘输入一样输入。重要提示：手机可能正在使用 ADB 键盘，该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活，请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本，或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本：当你使用输入操作时，输入框中现有的任何文本（包括占位符文本和实际输入）都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后，你将自动收到结果状态的截图。
32 | - do(action="Type_Name", text="xxx")  
33 |     Type_Name是输入人名的操作，基本功能同Type。
34 | - do(action="Interact")  
35 |     Interact是当有多个满足条件的选项时而触发的交互操作，询问用户如何选择。
36 | - do(action="Swipe", start=[x1,y1], end=[x2,y2])  
37 |     Swipe是滑动操作，通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后，您将自动收到结果状态的截图。
38 | - do(action="Note", message="True")  
39 |     记录当前页面内容以便后续总结。
40 | - do(action="Call_API", instruction="xxx")  
41 |     总结或评论当前页面或已记录的内容。
42 | - do(action="Long Press", element=[x,y])  
43 |     Long Pres是长按操作，在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的屏幕截图。
44 | - do(action="Double Tap", element=[x,y])  
45 |     Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互，如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角（999,999)结束。此操作完成后，您将自动收到结果状态的截图。
46 | - do(action="Take_over", message="xxx")  
47 |     Take_over是接管操作，表示在登录和验证阶段需要用户协助。
48 | - do(action="Back")  
49 |     导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后，您将自动收到结果状态的截图。
50 | - do(action="Home") 
51 |     Home是回到系统桌面的操作，相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器，或从已知状态启动新任务。此操作完成后，您将自动收到结果状态的截图。
52 | - do(action="Wait", duration="x seconds")  
53 |     等待页面加载，x为需要等待多少秒。
54 | - finish(message="xxx")  
55 |     finish是结束任务的操作，表示准确完整完成任务，message是终止信息。 
56 | 
57 | 必须遵循的规则：
58 | 1. 在执行任何操作前，先检查当前app是否是目标app，如果不是，先执行 Launch。
59 | 2. 如果进入到了无关页面，先执行 Back。如果执行Back后页面没有变化，请点击页面左上角的返回键进行返回，或者右上角的X号关闭。
60 | 3. 如果页面未加载出内容，最多连续 Wait 三次，否则执行 Back重新进入。
61 | 4. 如果页面显示网络问题，需要重新加载，请点击重新加载。
62 | 5. 如果当前页面找不到目标联系人、商品、店铺等信息，可以尝试 Swipe 滑动查找。
63 | 6. 遇到价格区间、时间区间等筛选条件，如果没有完全符合的，可以放宽要求。
64 | 7. 在做小红书总结类任务时一定要筛选图文笔记。
65 | 8. 购物车全选后再点击全选可以把状态设为全不选，在做购物车任务时，如果购物车里已经有商品被选中时，你需要点击全选后再点击取消全选，再去找需要购买或者删除的商品。
66 | 9. 在做外卖任务时，如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。
67 | 10. 在做点外卖任务时，如果用户需要点多个外卖，请尽量在同一店铺进行购买，如果无法找到可以下单，并说明某个商品未找到。
68 | 11. 请严格遵循用户意图执行任务，用户的特殊要求可以执行多次搜索，滑动查找。比如（i）用户要求点一杯咖啡，要咸的，你可以直接搜索咸咖啡，或者搜索咖啡后滑动查找咸的咖啡，比如海盐咖啡。（ii）用户要找到XX群，发一条消息，你可以先搜索XX群，找不到结果后，将"群"字去掉，搜索XX重试。（iii）用户要找到宠物友好的餐厅，你可以搜索餐厅，找到筛选，找到设施，选择可带宠物，或者直接搜索可带宠物，必要时可以使用AI搜索。
69 | 12. 在选择日期时，如果原滑动方向与预期日期越来越远，请向反方向滑动查找。
70 | 13. 执行任务过程中如果有多个可选择的项目栏，请逐个查找每个项目栏，直到完成任务，一定不要在同一项目栏多次查找，从而陷入死循环。
71 | 14. 在执行下一步操作前请一定要检查上一步的操作是否生效，如果点击没生效，可能因为app反应较慢，请先稍微等待一下，如果还是不生效请调整一下点击位置重试，如果仍然不生效请跳过这一步继续任务，并在finish message说明点击不生效。
72 | 15. 在执行任务中如果遇到滑动不生效的情况，请调整一下起始点位置，增大滑动距离重试，如果还是不生效，有可能是已经滑到底了，请继续向反方向滑动，直到顶部或底部，如果仍然没有符合要求的结果，请跳过这一步继续任务，并在finish message说明但没找到要求的项目。
73 | 16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗，如果多轮历史状态相似要检查自动战斗是否开启。
74 | 17. 如果没有合适的搜索结果，可能是因为搜索页面不对，请返回到搜索页面的上一级尝试重新搜索，如果尝试三次返回上一级搜索后仍然没有符合要求的结果，执行 finish(message="原因")。
75 | 18. 在结束任务前请一定要仔细检查任务是否完整准确的完成，如果出现错选、漏选、多选的情况，请返回之前的步骤进行纠正。
76 | """
77 | )
78 | 


--------------------------------------------------------------------------------
/scripts/check_deployment_en.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | 
  5 | from openai import OpenAI
  6 | 
  7 | if __name__ == "__main__":
  8 |     parser = argparse.ArgumentParser(
  9 |         description="Tool for checking if model deployment is successful",
 10 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 11 |         epilog="""
 12 | Usage examples:
 13 |   python scripts/check_deployment_en.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b
 14 |   python scripts/check_deployment_en.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b --messages-file custom.json
 15 |         """,
 16 |     )
 17 | 
 18 |     parser.add_argument(
 19 |         "--base-url",
 20 |         type=str,
 21 |         required=True,
 22 |         help="Base URL of the API service, e.g.: http://localhost:8000/v1",
 23 |     )
 24 | 
 25 |     parser.add_argument(
 26 |         "--apikey", type=str, default="EMPTY", help="API key (default: EMPTY)"
 27 |     )
 28 | 
 29 |     parser.add_argument(
 30 |         "--model",
 31 |         type=str,
 32 |         required=True,
 33 |         help="Name of the model to test, e.g.: autoglm-phone-9b",
 34 |     )
 35 | 
 36 |     parser.add_argument(
 37 |         "--messages-file",
 38 |         type=str,
 39 |         default="scripts/sample_messages_en.json",
 40 |         help="Path to JSON file containing test messages (default: scripts/sample_messages_en.json)",
 41 |     )
 42 | 
 43 |     parser.add_argument(
 44 |         "--max-tokens",
 45 |         type=int,
 46 |         default=3000,
 47 |         help="Maximum generation tokens (default: 3000)",
 48 |     )
 49 | 
 50 |     parser.add_argument(
 51 |         "--temperature",
 52 |         type=float,
 53 |         default=0.0,
 54 |         help="Sampling temperature (default: 0.0)",
 55 |     )
 56 | 
 57 |     parser.add_argument(
 58 |         "--top_p",
 59 |         type=float,
 60 |         default=0.85,
 61 |         help="Nucleus sampling parameter (default: 0.85)",
 62 |     )
 63 | 
 64 |     parser.add_argument(
 65 |         "--frequency_penalty",
 66 |         type=float,
 67 |         default=0.2,
 68 |         help="Frequency penalty parameter (default: 0.2)",
 69 |     )
 70 | 
 71 |     args = parser.parse_args()
 72 | 
 73 |     # Read test messages
 74 |     if not os.path.exists(args.messages_file):
 75 |         print(f"Error: Message file {args.messages_file} does not exist")
 76 |         exit(1)
 77 | 
 78 |     with open(args.messages_file) as f:
 79 |         messages = json.load(f)
 80 | 
 81 |     base_url = args.base_url
 82 |     api_key = args.apikey
 83 |     model = args.model
 84 | 
 85 |     print(f"Starting model inference test...")
 86 |     print(f"Base URL: {base_url}")
 87 |     print(f"Model: {model}")
 88 |     print(f"Messages file: {args.messages_file}")
 89 |     print("=" * 80)
 90 | 
 91 |     try:
 92 |         client = OpenAI(
 93 |             base_url=base_url,
 94 |             api_key=api_key,
 95 |         )
 96 | 
 97 |         response = client.chat.completions.create(
 98 |             messages=messages,
 99 |             model=model,
100 |             max_tokens=args.max_tokens,
101 |             temperature=args.temperature,
102 |             top_p=args.top_p,
103 |             frequency_penalty=args.frequency_penalty,
104 |             stream=False,
105 |         )
106 | 
107 |         print("\nModel inference result:")
108 |         print("=" * 80)
109 |         print(response.choices[0].message.content)
110 |         print("=" * 80)
111 | 
112 |         if response.usage:
113 |             print(f"\nStatistics:")
114 |             print(f"  - Prompt tokens: {response.usage.prompt_tokens}")
115 |             print(f"  - Completion tokens: {response.usage.completion_tokens}")
116 |             print(f"  - Total tokens: {response.usage.total_tokens}")
117 | 
118 |         print(
119 |             f"\nPlease evaluate the above inference result to determine if the model deployment meets expectations."
120 |         )
121 | 
122 |     except Exception as e:
123 |         print(f"\nError occurred while calling API:")
124 |         print(f"Error type: {type(e).__name__}")
125 |         print(f"Error message: {str(e)}")
126 |         print(
127 |             "\nTip: Please check if base_url, api_key and model parameters are correct, and if the service is running."
128 |         )
129 |         exit(1)
130 | 


--------------------------------------------------------------------------------
/phone_agent/hdc/screenshot.py:
--------------------------------------------------------------------------------
  1 | """Screenshot utilities for capturing HarmonyOS device screen."""
  2 | 
  3 | import base64
  4 | import os
  5 | import subprocess
  6 | import tempfile
  7 | import uuid
  8 | from dataclasses import dataclass
  9 | from io import BytesIO
 10 | from typing import Tuple
 11 | 
 12 | from PIL import Image
 13 | from phone_agent.hdc.connection import _run_hdc_command
 14 | 
 15 | 
 16 | @dataclass
 17 | class Screenshot:
 18 |     """Represents a captured screenshot."""
 19 | 
 20 |     base64_data: str
 21 |     width: int
 22 |     height: int
 23 |     is_sensitive: bool = False
 24 | 
 25 | 
 26 | def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screenshot:
 27 |     """
 28 |     Capture a screenshot from the connected HarmonyOS device.
 29 | 
 30 |     Args:
 31 |         device_id: Optional HDC device ID for multi-device setups.
 32 |         timeout: Timeout in seconds for screenshot operations.
 33 | 
 34 |     Returns:
 35 |         Screenshot object containing base64 data and dimensions.
 36 | 
 37 |     Note:
 38 |         If the screenshot fails (e.g., on sensitive screens like payment pages),
 39 |         a black fallback image is returned with is_sensitive=True.
 40 |     """
 41 |     temp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4()}.png")
 42 |     hdc_prefix = _get_hdc_prefix(device_id)
 43 | 
 44 |     try:
 45 |         # Execute screenshot command
 46 |         # HarmonyOS HDC only supports JPEG format
 47 |         remote_path = "/data/local/tmp/tmp_screenshot.jpeg"
 48 | 
 49 |         # Try method 1: hdc shell screenshot (newer HarmonyOS versions)
 50 |         result = _run_hdc_command(
 51 |             hdc_prefix + ["shell", "screenshot", remote_path],
 52 |             capture_output=True,
 53 |             text=True,
 54 |             timeout=timeout,
 55 |         )
 56 | 
 57 |         # Check for screenshot failure (sensitive screen)
 58 |         output = result.stdout + result.stderr
 59 |         if "fail" in output.lower() or "error" in output.lower() or "not found" in output.lower():
 60 |             # Try method 2: snapshot_display (older versions or different devices)
 61 |             result = _run_hdc_command(
 62 |                 hdc_prefix + ["shell", "snapshot_display", "-f", remote_path],
 63 |                 capture_output=True,
 64 |                 text=True,
 65 |                 timeout=timeout,
 66 |             )
 67 |             output = result.stdout + result.stderr
 68 |             if "fail" in output.lower() or "error" in output.lower():
 69 |                 return _create_fallback_screenshot(is_sensitive=True)
 70 | 
 71 |         # Pull screenshot to local temp path
 72 |         # Note: remote file is JPEG, but PIL can open it regardless of local extension
 73 |         _run_hdc_command(
 74 |             hdc_prefix + ["file", "recv", remote_path, temp_path],
 75 |             capture_output=True,
 76 |             text=True,
 77 |             timeout=5,
 78 |         )
 79 | 
 80 |         if not os.path.exists(temp_path):
 81 |             return _create_fallback_screenshot(is_sensitive=False)
 82 | 
 83 |         # Read JPEG image and convert to PNG for model inference
 84 |         # PIL automatically detects the image format from file content
 85 |         img = Image.open(temp_path)
 86 |         width, height = img.size
 87 | 
 88 |         buffered = BytesIO()
 89 |         img.save(buffered, format="PNG")
 90 |         base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
 91 | 
 92 |         # Cleanup
 93 |         os.remove(temp_path)
 94 | 
 95 |         return Screenshot(
 96 |             base64_data=base64_data, width=width, height=height, is_sensitive=False
 97 |         )
 98 | 
 99 |     except Exception as e:
100 |         print(f"Screenshot error: {e}")
101 |         return _create_fallback_screenshot(is_sensitive=False)
102 | 
103 | 
104 | def _get_hdc_prefix(device_id: str | None) -> list:
105 |     """Get HDC command prefix with optional device specifier."""
106 |     if device_id:
107 |         return ["hdc", "-t", device_id]
108 |     return ["hdc"]
109 | 
110 | 
def _create_fallback_screenshot(is_sensitive: bool) -> Screenshot:
    """Build an all-black 1080x2400 PNG placeholder for a failed capture."""
    size = (1080, 2400)

    png_buffer = BytesIO()
    Image.new("RGB", size, color="black").save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    return Screenshot(
        base64_data=encoded,
        width=size[0],
        height=size[1],
        is_sensitive=is_sensitive,
    )
126 | 


--------------------------------------------------------------------------------
/resources/privacy_policy.txt:
--------------------------------------------------------------------------------
  1 | 第一部分：模型/技术的安全性说明
  2 | 
  3 | 1. AutoGLM 技术机制与部署灵活性
  4 | AutoGLM 的核心功能是自动化操作执行。其工作原理如下：
  5 | - 指令驱动： 基于用户或开发者发出的操作指令。
  6 | - 屏幕理解： 获取当前操作环境的屏幕内容，将图像发送给大模型（可部署在本地或云端）进行分析理解。
  7 | - 操作模拟： 模拟人类操作方式（如点击、滑动、输入信息等）在目标环境中完成任务。
  8 | - 示例： 当指令要求订购高铁票时，AutoGLM 会打开相关应用，识别界面内容，按指令选择车次、完成下单等步骤，如同人工操作，用户或开发者可随时终止任务。
  9 | 
 10 | 关键灵活性：
 11 | - 模型部署： 开发者可自由选择将 AutoGLM 模型部署在本地设备或云端服务器上。
 12 | - 操作执行环境： 自动化操作可以在本地设备上执行，也可以在云设备上执行，具体由开发者根据应用场景和需求决定。
 13 | - 数据流向： 数据流向取决于部署选择：
 14 |   - 本地部署（模型+执行）： 屏幕捕获、模型分析、操作执行均在本地设备完成，数据不离开设备，隐私性最高。
 15 |   - 云端部署（模型+执行）： 屏幕内容需从操作环境（本机或云设备）传输到云端模型，模型分析后指令返回操作环境执行。开发者需确保传输和云端处理的安全性。
 16 |   - 混合部署（如本地执行+云端模型）： 屏幕内容在本地捕获，传输到云端模型分析，分析结果返回本地执行。开发者需关注数据传输安全。
 17 | 
 18 | 2. 系统权限调用说明（针对操作执行环境）
 19 | 为保证自动化操作正常执行，运行 AutoGLM 操作的环境可能需要获取以下权限：
 20 | - ADB (Android Debug Bridge) 权限： 用于获取信息并模拟点击、滑动、输入等用户交互操作。
 21 | - 存储权限： 用于临时存储必要的数据、模型文件（若本地部署）或日志。
 22 | - 网络权限： 用于访问在线服务（如调用云端模型、访问目标应用服务）。
 23 | - 其他特定权限： 根据具体任务可能需要（如麦克风用于语音指令）。
 24 | 
 25 | 开发者责任：
 26 | - 最小权限原则： 仅请求完成特定任务所必需的权限。
 27 | - 透明告知： 在应用或服务中清晰、明确地向最终用户说明每个权限的用途和必要性。
 28 | - 用户授权： 必须获得最终用户的明确授权后，才能在操作环境中启用相关权限和功能。
 29 | - 环境适配： 确保权限请求和获取机制适配所选择的操作执行环境（本地或云）。
 30 | 
 31 | 3. 数据处理与隐私保护原则
 32 | AutoGLM 开源项目本身不收集用户数据。数据处理和隐私保护的责任主体是基于 AutoGLM 开发具体应用或服务的开发者，其责任取决于部署方式：
 33 | - 本地部署（模型+执行）：
 34 |   - 开发者需在应用层面实现本地数据的安全存储和处理，所有数据处理（屏幕捕获、模型分析、操作执行）均在最终用户的本地设备上完成。
 35 |   - 开发者应确保其应用不主动将敏感数据（如屏幕内容、操作记录）上传到开发者服务器或第三方，除非用户明确知情同意且为必要功能。
 36 | - 云端部署（模型或执行或两者）：
 37 |   - 涉及数据（屏幕内容、操作指令、模型分析结果）在操作环境与云端之间传输。
 38 |   - 开发者必须：
 39 |     - 实施强加密保护所有传输和存储的数据。
 40 |     - 明确告知最终用户哪些数据会被发送到云端、发送目的、存储位置及保留期限，获得最终用户对数据传输和云端处理的明确同意。
 41 |     - 遵守适用的数据保护法规，提供清晰的隐私政策，说明数据处理实践。
 42 |     - 确保云端环境（模型服务器、操作环境服务器）的安全配置和访问控制。
 43 | - 通用原则（所有部署方式）：
 44 |   - 数据最小化： 仅收集和处理完成自动化任务所绝对必需的最少信息。
 45 |   - 目的限制： 数据仅用于实现用户指令的特定自动化操作目的。
 46 |   - 安全保障： 开发者有责任采取合理的技术和管理措施，保护其处理的所有用户数据（无论在本地还是云端）的安全性和保密性，防止未经授权的访问、使用、泄露或丢失。
 47 |   - 用户控制： 提供机制让最终用户能够查看、管理（如删除）与其相关的数据（在技术可行且符合部署方式的前提下）。
 48 | 
 49 | 
 50 | 
 51 | 第二部分：开发者/用户应该遵循的使用规范
 52 | 
 53 | 开发者/用户在使用 AutoGLM 开源项目过程中，应始终遵循《中华人民共和国网络安全法》《互联网信息服务算法推荐管理规定》《互联网信息服务深度合成管理规定》《生成式人工智能服务管理暂行办法》《网络安全技术 生成式人工智能服务安全基本要求》等使用地所适用的法律法规及标准，并根据《人工智能生成合成内容标识办法》《网络安全技术 人工智能生成合成内容标识方法》（GB 45438-2025）的要求和应用场景，对人工智能生成合成内容进行标识，包括但不限于显式标识、隐式标识（元数据标识和数字水印）等。
 54 | 
 55 | 1. 重要操作确认机制
 56 | 
 57 | 开发者必须在其基于 AutoGLM 开发的应用或服务中，为涉及以下6+1项高风险操作设计并实现明确的、强制性的用户确认步骤：
 58 | - 信息交互与内容传播：包括但不限于发送消息、邮件、发表评论、点赞、分享等。
 59 | - 文件处置与权限管理：包括但不限于创建、编辑、删除、移动文件或文件夹、开启或关闭任意权限等。
 60 | - 交易订单与权益处置：包括但不限于清空购物车、提交订单、修改/添加收货地址、使用优惠券/积分等。
 61 | - 资金流转与支付结算：包括但不限于转账、支付、收款、充值、提现、绑定/解绑支付方式等。
 62 | - 账户身份与安全配置：包括但不限于修改密码、设置/修改安全选项、删除账号或关联账号、删除好友/联系人、删除对话/记录等。
 63 | - 医疗健康与法律合规：包括但不限于诊疗记录/健康数据的访问、授权或处置、药品采购、生理或心理测试、电子协议的签署等。
 64 | - 其他高风险操作：其他任何可能对用户数据安全、财产安全、账号安全或声誉造成重大影响的操作。
 65 | 
 66 | 要求：
 67 | - 确认步骤必须在操作执行前触发，清晰展示即将执行的操作详情。
 68 | - 提供便捷的取消/终止机制，允许用户在确认前或操作过程中随时中止任务。
 69 | - 开发者责任： 未能实现有效确认机制导致用户损失的，开发者需承担相应责任。用户责任： 用户在确认后未及时终止错误操作导致的损失，由用户自行承担。
 70 | 
 71 | 2. 开发者与用户的义务
 72 | 
 73 | 开发者义务：
 74 | - 透明告知： 清晰、准确地向最终用户说明其应用/服务的功能、工作原理（特别是自动化部分）、数据收集和处理方式（包括是否涉及云端）、潜在风险以及用户如何控制。
 75 | - 提供监控与控制： 设计用户界面，允许最终用户：
 76 |   - 实时查看或了解自动化操作的当前状态和步骤。
 77 |   - 方便、快速地暂停、终止任何正在进行的自动化任务。
 78 |   - 管理自动化操作的权限和设置。
 79 | - 安全开发： 遵循安全编码实践，确保应用/服务本身的安全性，防止被恶意利用。
 80 | - 合规性： 确保其开发的应用/服务符合所有适用的法律法规、行业标准和第三方平台（如被操作的应用）的服务条款。
 81 | - 风险提示： 在适当位置（如功能入口、首次使用时、确认步骤中）向用户明确提示使用自动化功能可能存在的风险（如误操作、隐私风险、第三方平台政策风险）。
 82 | - 避免关键依赖： 谨慎评估，不建议将 AutoGLM 用于处理极端关键、高风险或一旦出错后果极其严重的操作（如医疗设备控制、关键基础设施操作、大额金融交易无人工复核）。
 83 | 
 84 | 用户义务：
 85 | - 理解风险： 在使用基于 AutoGLM 的自动化功能前，仔细阅读开发者提供的说明、隐私政策和风险提示，充分理解其工作原理和潜在风险。
 86 | - 谨慎授权： 仅在完全信任应用/服务开发者并理解授权内容后，才授予必要的权限。
 87 | - 主动监控： 在自动化任务执行期间，保持适当的关注，特别是在执行重要操作时。利用开发者提供的监控功能了解操作进展。
 88 | - 及时干预： 如发现操作错误、异常或不符合预期，应立即使用提供的终止功能停止任务。
 89 | - 承担责任： 对其发出的指令、确认的操作以及因未能及时监控和制止错误操作而导致的任何损失，自行承担责任。
 90 | 
 91 | 3. 开发者与用户行为规范
 92 | 
 93 | 严禁利用 AutoGLM 开源项目或基于其开发的应用/服务从事以下行为：
 94 | （1）批量自动化与恶意竞争行为
 95 | - 进行任何形式的虚假数据操作：刷单、刷票、刷赞、刷评论、刷流量、刷粉丝、刷播放量、刷下载量等。
 96 | - 批量操控账号：批量注册、批量登录、批量操作第三方平台账号（群控、多开、云控）。
 97 | - 扰乱市场秩序：恶意抢购、囤积居奇、抢占限量资源、批量领取/滥用优惠券/补贴、恶意占用服务资源（薅羊毛）。
 98 | - 操纵平台规则：刷榜、刷排名、操纵搜索结果、人为干预推荐算法、虚假提升/降低内容曝光度。
 99 | - 制造虚假活跃度：批量发布、转发、点赞、收藏、关注、取关等社交媒体操作。
100 | - 破坏游戏公平：游戏代练、工作室操作、批量刷装备/金币/经验/道具。
101 | - 破坏公正性：批量投票、刷票、操纵网络评选、调查结果。
102 | （2）虚假信息与欺诈行为
103 | - 制造误导信息：发布/传播虚假商品/服务评价、虚假用户反馈、虚假证言、虚假体验。
104 | - 伪造商业数据：制造虚假交易记录、虚假销量、虚假用户活跃度、虚假好评率。
105 | - 身份欺诈：冒充他人身份、虚构个人信息、盗用他人账号/头像/昵称、伪造身份证明。
106 | - 虚假营销：发布虚假广告、进行虚假宣传、夸大产品功效、隐瞒产品缺陷/风险。
107 | - 参与诈骗活动：网络诈骗、虚假投资、传销、非法集资、虚假中奖、钓鱼等。
108 | - 传播不实信息：制造或恶意传播虚假新闻、谣言、未经证实的信息。
109 | （3）破坏第三方服务与系统安全
110 | - 非授权访问：利用 AutoGLM 进行数据爬取（违反 robots.txt 或平台政策）、信息窃取、API 接口滥用、服务器渗透测试（未授权）。
111 | - 技术破坏：对第三方应用进行逆向工程、破解、修改、注入恶意代码、干扰其正常运行。
112 | - 资源滥用：恶意占用第三方服务器资源、发送垃圾请求、制造异常流量、进行 DDoS 攻击。
113 | - 违反平台规则：故意违反被操作第三方应用的用户协议、服务条款、社区规则。
114 | - 恶意竞争：恶意差评、恶意举报、恶意投诉、商业诋毁。
115 | - 传播有害内容：传播计算机病毒、木马、恶意软件、勒索软件、垃圾邮件、非法内容。
116 | - 侵犯数据权益：未经授权进行大规模商业数据采集、用户信息收集、隐私窥探。
117 | （4）侵犯他人合法权益
118 | - 账号盗用：盗用他人账号、密码、身份凭证进行操作。
119 | - 网络骚扰与霸凌：恶意骚扰、威胁、辱骂、诽谤、人肉搜索他人。
120 | - 侵犯隐私与秘密：未经授权收集、使用、传播他人个人信息、隐私数据、商业秘密。
121 | - 恶意抢注：抢注他人商标、域名、用户名、社交媒体账号等。
122 | - 骚扰行为：恶意刷屏、垃圾信息轰炸、强制关注/订阅。
123 | - 损害商业利益：商业间谍活动、不正当竞争、恶意挖角、窃取商业机密。
124 | （5）滥用资源与破坏项目生态
125 | - 滥用注册资源：恶意注册大量账号、虚假注册。
126 | - 浪费计算/设备资源：恶意占用本地设备或云设备资源、长时间闲置占用、运行与自动化任务无关的高耗能程序（如挖矿）。
127 | - 破坏稳定性：恶意测试系统性能、进行压力测试（未授权）、频繁重启服务、利用技术漏洞/缺陷牟利或损害项目/平台利益。
128 | - 违反开源协议：违反 AutoGLM 项目的开源许可证条款。
129 | 
130 | 违反后果：
131 | 
132 | 如开发者/用户在使用中未遵循相应的法律法规、政策、行业标准（包括但不限于技术规范、安全标准）及开源项目的约定（包括但不限于开源协议、使用须知），由此产生的全部法律责任、经济损失及一切不良后果，均由开发者/用户自行独立承担。


--------------------------------------------------------------------------------
/phone_agent/device_factory.py:
--------------------------------------------------------------------------------
  1 | """Device factory for selecting ADB or HDC based on device type."""
  2 | 
  3 | from enum import Enum
  4 | from typing import Any
  5 | 
  6 | 
  7 | class DeviceType(Enum):
  8 |     """Type of device connection tool."""
  9 | 
 10 |     ADB = "adb"
 11 |     HDC = "hdc"
 12 | 
 13 | 
 14 | class DeviceFactory:
 15 |     """
 16 |     Factory class for getting device-specific implementations.
 17 | 
 18 |     This allows the system to work with both Android (ADB) and HarmonyOS (HDC) devices.
 19 |     """
 20 | 
 21 |     def __init__(self, device_type: DeviceType = DeviceType.ADB):
 22 |         """
 23 |         Initialize the device factory.
 24 | 
 25 |         Args:
 26 |             device_type: The type of device to use (ADB or HDC).
 27 |         """
 28 |         self.device_type = device_type
 29 |         self._module = None
 30 | 
 31 |     @property
 32 |     def module(self):
 33 |         """Get the appropriate device module (adb or hdc)."""
 34 |         if self._module is None:
 35 |             if self.device_type == DeviceType.ADB:
 36 |                 from phone_agent import adb
 37 |                 self._module = adb
 38 |             elif self.device_type == DeviceType.HDC:
 39 |                 from phone_agent import hdc
 40 |                 self._module = hdc
 41 |             else:
 42 |                 raise ValueError(f"Unknown device type: {self.device_type}")
 43 |         return self._module
 44 | 
 45 |     def get_screenshot(self, device_id: str | None = None, timeout: int = 10):
 46 |         """Get screenshot from device."""
 47 |         return self.module.get_screenshot(device_id, timeout)
 48 | 
 49 |     def get_current_app(self, device_id: str | None = None) -> str:
 50 |         """Get current app name."""
 51 |         return self.module.get_current_app(device_id)
 52 | 
 53 |     def tap(self, x: int, y: int, device_id: str | None = None, delay: float | None = None):
 54 |         """Tap at coordinates."""
 55 |         return self.module.tap(x, y, device_id, delay)
 56 | 
 57 |     def double_tap(self, x: int, y: int, device_id: str | None = None, delay: float | None = None):
 58 |         """Double tap at coordinates."""
 59 |         return self.module.double_tap(x, y, device_id, delay)
 60 | 
 61 |     def long_press(self, x: int, y: int, duration_ms: int = 3000, device_id: str | None = None, delay: float | None = None):
 62 |         """Long press at coordinates."""
 63 |         return self.module.long_press(x, y, duration_ms, device_id, delay)
 64 | 
 65 |     def swipe(self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int | None = None, device_id: str | None = None, delay: float | None = None):
 66 |         """Swipe from start to end."""
 67 |         return self.module.swipe(start_x, start_y, end_x, end_y, duration_ms, device_id, delay)
 68 | 
 69 |     def back(self, device_id: str | None = None, delay: float | None = None):
 70 |         """Press back button."""
 71 |         return self.module.back(device_id, delay)
 72 | 
 73 |     def home(self, device_id: str | None = None, delay: float | None = None):
 74 |         """Press home button."""
 75 |         return self.module.home(device_id, delay)
 76 | 
 77 |     def launch_app(self, app_name: str, device_id: str | None = None, delay: float | None = None) -> bool:
 78 |         """Launch an app."""
 79 |         return self.module.launch_app(app_name, device_id, delay)
 80 | 
 81 |     def type_text(self, text: str, device_id: str | None = None):
 82 |         """Type text."""
 83 |         return self.module.type_text(text, device_id)
 84 | 
 85 |     def clear_text(self, device_id: str | None = None):
 86 |         """Clear text."""
 87 |         return self.module.clear_text(device_id)
 88 | 
 89 |     def detect_and_set_adb_keyboard(self, device_id: str | None = None) -> str:
 90 |         """Detect and set keyboard."""
 91 |         return self.module.detect_and_set_adb_keyboard(device_id)
 92 | 
 93 |     def restore_keyboard(self, ime: str, device_id: str | None = None):
 94 |         """Restore keyboard."""
 95 |         return self.module.restore_keyboard(ime, device_id)
 96 | 
 97 |     def list_devices(self):
 98 |         """List connected devices."""
 99 |         return self.module.list_devices()
100 | 
101 |     def get_connection_class(self):
102 |         """Get the connection class (ADBConnection or HDCConnection)."""
103 |         if self.device_type == DeviceType.ADB:
104 |             from phone_agent.adb import ADBConnection
105 |             return ADBConnection
106 |         elif self.device_type == DeviceType.HDC:
107 |             from phone_agent.hdc import HDCConnection
108 |             return HDCConnection
109 |         else:
110 |             raise ValueError(f"Unknown device type: {self.device_type}")
111 | 
112 | 
# Global device factory instance. It stays None until set_device_type()
# installs one or get_device_factory() creates the ADB default on first use.
_device_factory: DeviceFactory | None = None
115 | 
116 | 
def set_device_type(device_type: DeviceType):
    """
    Replace the global device factory with a new one for ``device_type``.

    Args:
        device_type: The device type to use (ADB or HDC).
    """
    global _device_factory
    replacement = DeviceFactory(device_type)
    _device_factory = replacement
126 | 
127 | 
def get_device_factory() -> DeviceFactory:
    """
    Return the global device factory, creating an ADB one if none exists yet.

    Returns:
        The device factory instance.
    """
    global _device_factory
    if _device_factory is not None:
        return _device_factory

    # No device type has been selected yet: default to ADB.
    _device_factory = DeviceFactory(DeviceType.ADB)
    return _device_factory
139 | 


--------------------------------------------------------------------------------
/phone_agent/hdc/input.py:
--------------------------------------------------------------------------------
  1 | """Input utilities for HarmonyOS device text input."""
  2 | 
  3 | import base64
  4 | import subprocess
  5 | from typing import Optional
  6 | 
  7 | from phone_agent.hdc.connection import _run_hdc_command
  8 | 
  9 | 
 10 | def type_text(text: str, device_id: str | None = None) -> None:
 11 |     """
 12 |     Type text into the currently focused input field.
 13 | 
 14 |     Args:
 15 |         text: The text to type. Supports multi-line text with newline characters.
 16 |         device_id: Optional HDC device ID for multi-device setups.
 17 | 
 18 |     Note:
 19 |         HarmonyOS uses: hdc shell uitest uiInput text "文本内容"
 20 |         This command works without coordinates when input field is focused.
 21 |         For multi-line text, the function splits by newlines and sends ENTER keyEvents.
 22 |         ENTER key code in HarmonyOS: 2054
 23 |         Recommendation: Click on the input field first to focus it, then use this function.
 24 |     """
 25 |     hdc_prefix = _get_hdc_prefix(device_id)
 26 | 
 27 |     # Handle multi-line text by splitting on newlines
 28 |     if '\n' in text:
 29 |         lines = text.split('\n')
 30 |         for i, line in enumerate(lines):
 31 |             if line:  # Only process non-empty lines
 32 |                 # Escape special characters for shell
 33 |                 escaped_line = line.replace('"', '\\"').replace("$", "\\$")
 34 | 
 35 |                 _run_hdc_command(
 36 |                     hdc_prefix + ["shell", "uitest", "uiInput", "text", escaped_line],
 37 |                     capture_output=True,
 38 |                     text=True,
 39 |                 )
 40 | 
 41 |             # Send ENTER key event after each line except the last one
 42 |             if i < len(lines) - 1:
 43 |                 try:
 44 |                     _run_hdc_command(
 45 |                         hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
 46 |                         capture_output=True,
 47 |                         text=True,
 48 |                     )
 49 |                 except Exception as e:
 50 |                     print(f"[HDC] ENTER keyEvent failed: {e}")
 51 |     else:
 52 |         # Single line text - original logic
 53 |         # Escape special characters for shell (keep quotes for proper text handling)
 54 |         # The text will be wrapped in quotes in the command
 55 |         escaped_text = text.replace('"', '\\"').replace("$", "\\$")
 56 | 
 57 |         # HarmonyOS uitest uiInput text command
 58 |         # Format: hdc shell uitest uiInput text "文本内容"
 59 |         _run_hdc_command(
 60 |             hdc_prefix + ["shell", "uitest", "uiInput", "text", escaped_text],
 61 |             capture_output=True,
 62 |             text=True,
 63 |         )
 64 | 
 65 | 
 66 | def clear_text(device_id: str | None = None) -> None:
 67 |     """
 68 |     Clear text in the currently focused input field.
 69 | 
 70 |     Args:
 71 |         device_id: Optional HDC device ID for multi-device setups.
 72 | 
 73 |     Note:
 74 |         This method uses repeated delete key events to clear text.
 75 |         For HarmonyOS, you might also use select all + delete for better efficiency.
 76 |     """
 77 |     hdc_prefix = _get_hdc_prefix(device_id)
 78 |     # Ctrl+A to select all (key code 2072 for Ctrl, 2017 for A)
 79 |     # Then delete
 80 |     _run_hdc_command(
 81 |         hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2072", "2017"],
 82 |         capture_output=True,
 83 |         text=True,
 84 |     )
 85 |     _run_hdc_command(
 86 |         hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2055"],  # Delete key
 87 |         capture_output=True,
 88 |         text=True,
 89 |     )
 90 | 
 91 | 
 92 | def detect_and_set_adb_keyboard(device_id: str | None = None) -> str:
 93 |     """
 94 |     Detect current keyboard and switch to ADB Keyboard if available.
 95 | 
 96 |     Args:
 97 |         device_id: Optional HDC device ID for multi-device setups.
 98 | 
 99 |     Returns:
100 |         The original keyboard IME identifier for later restoration.
101 | 
102 |     Note:
103 |         This is a placeholder. HarmonyOS may not support ADB Keyboard.
104 |         If there's a similar tool for HarmonyOS, integrate it here.
105 |     """
106 |     hdc_prefix = _get_hdc_prefix(device_id)
107 | 
108 |     # Get current IME (if HarmonyOS supports this)
109 |     try:
110 |         result = _run_hdc_command(
111 |             hdc_prefix + ["shell", "settings", "get", "secure", "default_input_method"],
112 |             capture_output=True,
113 |             text=True,
114 |         )
115 |         current_ime = (result.stdout + result.stderr).strip()
116 | 
117 |         # If ADB Keyboard equivalent exists for HarmonyOS, switch to it
118 |         # For now, we'll just return the current IME
119 |         return current_ime
120 |     except Exception:
121 |         return ""
122 | 
123 | 
def restore_keyboard(ime: str, device_id: str | None = None) -> None:
    """
    Restore the original keyboard IME via ``ime set``.

    Args:
        ime: The IME identifier to restore. Nothing is done when it is empty.
        device_id: Optional HDC device ID for multi-device setups.
    """
    if ime:
        command = _get_hdc_prefix(device_id) + ["shell", "ime", "set", ime]
        try:
            _run_hdc_command(command, capture_output=True, text=True)
        except Exception:
            # Restoration is best-effort: failures are deliberately ignored.
            pass
143 | 
144 | 
145 | def _get_hdc_prefix(device_id: str | None) -> list:
146 |     """Get HDC command prefix with optional device specifier."""
147 |     if device_id:
148 |         return ["hdc", "-t", device_id]
149 |     return ["hdc"]
150 | 


--------------------------------------------------------------------------------
/examples/basic_usage.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Phone Agent Usage Examples / Phone Agent 使用示例
  4 | 
  5 | Demonstrates how to use Phone Agent for phone automation tasks via Python API.
  6 | 演示如何通过 Python API 使用 Phone Agent 进行手机自动化任务。
  7 | """
  8 | 
  9 | from phone_agent import PhoneAgent
 10 | from phone_agent.agent import AgentConfig
 11 | from phone_agent.config import get_messages
 12 | from phone_agent.model import ModelConfig
 13 | 
 14 | 
 15 | def example_basic_task(lang: str = "cn"):
 16 |     """Basic task example / 基础任务示例"""
 17 |     msgs = get_messages(lang)
 18 | 
 19 |     # Configure model endpoint
 20 |     model_config = ModelConfig(
 21 |         base_url="http://localhost:8000/v1",
 22 |         model_name="autoglm-phone-9b",
 23 |         temperature=0.1,
 24 |     )
 25 | 
 26 |     # Configure Agent behavior
 27 |     agent_config = AgentConfig(
 28 |         max_steps=50,
 29 |         verbose=True,
 30 |         lang=lang,
 31 |     )
 32 | 
 33 |     # Create Agent
 34 |     agent = PhoneAgent(
 35 |         model_config=model_config,
 36 |         agent_config=agent_config,
 37 |     )
 38 | 
 39 |     # Execute task
 40 |     result = agent.run("打开小红书搜索美食攻略")
 41 |     print(f"{msgs['task_result']}: {result}")
 42 | 
 43 | 
 44 | def example_with_callbacks(lang: str = "cn"):
 45 |     """Task example with callbacks / 带回调的任务示例"""
 46 |     msgs = get_messages(lang)
 47 | 
 48 |     def my_confirmation(message: str) -> bool:
 49 |         """Sensitive operation confirmation callback / 敏感操作确认回调"""
 50 |         print(f"\n[{msgs['confirmation_required']}] {message}")
 51 |         response = input(f"{msgs['continue_prompt']}: ")
 52 |         return response.lower() in ("yes", "y", "是")
 53 | 
 54 |     def my_takeover(message: str) -> None:
 55 |         """Manual takeover callback / 人工接管回调"""
 56 |         print(f"\n[{msgs['manual_operation_required']}] {message}")
 57 |         print(msgs["manual_operation_hint"])
 58 |         input(f"{msgs['press_enter_when_done']}: ")
 59 | 
 60 |     # Create Agent with custom callbacks
 61 |     agent_config = AgentConfig(lang=lang)
 62 |     agent = PhoneAgent(
 63 |         agent_config=agent_config,
 64 |         confirmation_callback=my_confirmation,
 65 |         takeover_callback=my_takeover,
 66 |     )
 67 | 
 68 |     # Execute task that may require confirmation
 69 |     result = agent.run("打开淘宝搜索无线耳机并加入购物车")
 70 |     print(f"{msgs['task_result']}: {result}")
 71 | 
 72 | 
 73 | def example_step_by_step(lang: str = "cn"):
 74 |     """Step-by-step execution example (for debugging) / 单步执行示例（用于调试）"""
 75 |     msgs = get_messages(lang)
 76 | 
 77 |     agent_config = AgentConfig(lang=lang)
 78 |     agent = PhoneAgent(agent_config=agent_config)
 79 | 
 80 |     # Initialize task
 81 |     result = agent.step("打开美团搜索附近的火锅店")
 82 |     print(f"{msgs['step']} 1: {result.action}")
 83 | 
 84 |     # Continue if not finished
 85 |     while not result.finished and agent.step_count < 10:
 86 |         result = agent.step()
 87 |         print(f"{msgs['step']} {agent.step_count}: {result.action}")
 88 |         print(f"  {msgs['thinking']}: {result.thinking[:100]}...")
 89 | 
 90 |     print(f"\n{msgs['final_result']}: {result.message}")
 91 | 
 92 | 
 93 | def example_multiple_tasks(lang: str = "cn"):
 94 |     """Batch task example / 批量任务示例"""
 95 |     msgs = get_messages(lang)
 96 | 
 97 |     agent_config = AgentConfig(lang=lang)
 98 |     agent = PhoneAgent(agent_config=agent_config)
 99 | 
100 |     tasks = [
101 |         "打开高德地图查看实时路况",
102 |         "打开大众点评搜索附近的咖啡店",
103 |         "打开bilibili搜索Python教程",
104 |     ]
105 | 
106 |     for task in tasks:
107 |         print(f"\n{'=' * 50}")
108 |         print(f"{msgs['task']}: {task}")
109 |         print("=" * 50)
110 | 
111 |         result = agent.run(task)
112 |         print(f"{msgs['result']}: {result}")
113 | 
114 |         # Reset Agent state
115 |         agent.reset()
116 | 
117 | 
def example_remote_device(lang: str = "cn"):
    """Remote device example: connect over the network, run a task, disconnect.

    The device is disconnected even if creating the agent or running the task
    raises, so a failed run does not leave the remote connection open.
    """
    from phone_agent.adb import ADBConnection

    msgs = get_messages(lang)

    # Single source of truth for the remote device address.
    address = "192.168.1.100:5555"

    # Create connection manager
    conn = ADBConnection()

    # Connect to remote device
    success, message = conn.connect(address)
    if not success:
        print(f"{msgs['connection_failed']}: {message}")
        return

    print(f"{msgs['connection_successful']}: {message}")

    try:
        # Create Agent with device specified
        agent_config = AgentConfig(
            device_id=address,
            verbose=True,
            lang=lang,
        )
        agent = PhoneAgent(agent_config=agent_config)

        # Execute task
        result = agent.run("打开微信查看消息")
        print(f"{msgs['task_result']}: {result}")
    finally:
        # Always release the connection that was opened above.
        conn.disconnect(address)
150 | 
151 | 
if __name__ == "__main__":
    # Command-line entry point: parse --lang and run the basic example.
    import argparse

    parser = argparse.ArgumentParser(description="Phone Agent Usage Examples")
    parser.add_argument(
        "--lang",
        type=str,
        default="cn",
        choices=["cn", "en"],
        help="Language for UI messages (cn=Chinese, en=English)",
    )
    args = parser.parse_args()

    msgs = get_messages(args.lang)

    # Banner
    print("Phone Agent Usage Examples", "=" * 50, sep="\n")

    # Run basic example
    print("\n1. Basic Task Example")
    print("-" * 30)
    example_basic_task(args.lang)

    # Uncomment to run other examples
    # print(f"\n2. Task Example with Callbacks")
    # print("-" * 30)
    # example_with_callbacks(args.lang)

    # print(f"\n3. Step-by-step Example")
    # print("-" * 30)
    # example_step_by_step(args.lang)

    # print(f"\n4. Batch Task Example")
    # print("-" * 30)
    # example_multiple_tasks(args.lang)

    # print(f"\n5. Remote Device Example")
    # print("-" * 30)
    # example_remote_device(args.lang)
191 | 


--------------------------------------------------------------------------------
/phone_agent/config/timing.py:
--------------------------------------------------------------------------------
  1 | """Timing configuration for Phone Agent.
  2 | 
  3 | This module defines all configurable waiting times used throughout the application.
  4 | Users can customize these values by modifying this file or by setting environment variables.
  5 | """
  6 | 
  7 | import os
  8 | from dataclasses import dataclass
  9 | 
 10 | 
 11 | @dataclass
 12 | class ActionTimingConfig:
 13 |     """Configuration for action handler timing delays."""
 14 | 
 15 |     # Text input related delays (in seconds)
 16 |     keyboard_switch_delay: float = 1.0  # Delay after switching to ADB keyboard
 17 |     text_clear_delay: float = 1.0  # Delay after clearing text
 18 |     text_input_delay: float = 1.0  # Delay after typing text
 19 |     keyboard_restore_delay: float = 1.0  # Delay after restoring original keyboard
 20 | 
 21 |     def __post_init__(self):
 22 |         """Load values from environment variables if present."""
 23 |         self.keyboard_switch_delay = float(
 24 |             os.getenv("PHONE_AGENT_KEYBOARD_SWITCH_DELAY", self.keyboard_switch_delay)
 25 |         )
 26 |         self.text_clear_delay = float(
 27 |             os.getenv("PHONE_AGENT_TEXT_CLEAR_DELAY", self.text_clear_delay)
 28 |         )
 29 |         self.text_input_delay = float(
 30 |             os.getenv("PHONE_AGENT_TEXT_INPUT_DELAY", self.text_input_delay)
 31 |         )
 32 |         self.keyboard_restore_delay = float(
 33 |             os.getenv("PHONE_AGENT_KEYBOARD_RESTORE_DELAY", self.keyboard_restore_delay)
 34 |         )
 35 | 
 36 | 
 37 | @dataclass
 38 | class DeviceTimingConfig:
 39 |     """Configuration for device operation timing delays."""
 40 | 
 41 |     # Default delays for various device operations (in seconds)
 42 |     default_tap_delay: float = 1.0  # Default delay after tap
 43 |     default_double_tap_delay: float = 1.0  # Default delay after double tap
 44 |     double_tap_interval: float = 0.1  # Interval between two taps in double tap
 45 |     default_long_press_delay: float = 1.0  # Default delay after long press
 46 |     default_swipe_delay: float = 1.0  # Default delay after swipe
 47 |     default_back_delay: float = 1.0  # Default delay after back button
 48 |     default_home_delay: float = 1.0  # Default delay after home button
 49 |     default_launch_delay: float = 1.0  # Default delay after launching app
 50 | 
 51 |     def __post_init__(self):
 52 |         """Load values from environment variables if present."""
 53 |         self.default_tap_delay = float(
 54 |             os.getenv("PHONE_AGENT_TAP_DELAY", self.default_tap_delay)
 55 |         )
 56 |         self.default_double_tap_delay = float(
 57 |             os.getenv("PHONE_AGENT_DOUBLE_TAP_DELAY", self.default_double_tap_delay)
 58 |         )
 59 |         self.double_tap_interval = float(
 60 |             os.getenv("PHONE_AGENT_DOUBLE_TAP_INTERVAL", self.double_tap_interval)
 61 |         )
 62 |         self.default_long_press_delay = float(
 63 |             os.getenv("PHONE_AGENT_LONG_PRESS_DELAY", self.default_long_press_delay)
 64 |         )
 65 |         self.default_swipe_delay = float(
 66 |             os.getenv("PHONE_AGENT_SWIPE_DELAY", self.default_swipe_delay)
 67 |         )
 68 |         self.default_back_delay = float(
 69 |             os.getenv("PHONE_AGENT_BACK_DELAY", self.default_back_delay)
 70 |         )
 71 |         self.default_home_delay = float(
 72 |             os.getenv("PHONE_AGENT_HOME_DELAY", self.default_home_delay)
 73 |         )
 74 |         self.default_launch_delay = float(
 75 |             os.getenv("PHONE_AGENT_LAUNCH_DELAY", self.default_launch_delay)
 76 |         )
 77 | 
 78 | 
 79 | @dataclass
 80 | class ConnectionTimingConfig:
 81 |     """Configuration for ADB connection timing delays."""
 82 | 
 83 |     # ADB server and connection delays (in seconds)
 84 |     adb_restart_delay: float = 2.0  # Wait time after enabling TCP/IP mode
 85 |     server_restart_delay: float = (
 86 |         1.0  # Wait time between killing and starting ADB server
 87 |     )
 88 | 
 89 |     def __post_init__(self):
 90 |         """Load values from environment variables if present."""
 91 |         self.adb_restart_delay = float(
 92 |             os.getenv("PHONE_AGENT_ADB_RESTART_DELAY", self.adb_restart_delay)
 93 |         )
 94 |         self.server_restart_delay = float(
 95 |             os.getenv("PHONE_AGENT_SERVER_RESTART_DELAY", self.server_restart_delay)
 96 |         )
 97 | 
 98 | 
 99 | @dataclass
100 | class TimingConfig:
101 |     """Master timing configuration combining all timing settings."""
102 | 
103 |     action: ActionTimingConfig
104 |     device: DeviceTimingConfig
105 |     connection: ConnectionTimingConfig
106 | 
107 |     def __init__(self):
108 |         """Initialize all timing configurations."""
109 |         self.action = ActionTimingConfig()
110 |         self.device = DeviceTimingConfig()
111 |         self.connection = ConnectionTimingConfig()
112 | 
113 | 
# Global timing configuration instance, created once when this module is imported.
# Users can modify these values at runtime or through environment variables
TIMING_CONFIG = TimingConfig()
117 | 
118 | 
def get_timing_config() -> TimingConfig:
    """
    Return the module-level timing configuration object.

    Returns:
        The global TimingConfig instance (the same object on every call, so
        changes made to it are visible to all users of the configuration).
    """
    return TIMING_CONFIG
127 | 
128 | 
def update_timing_config(
    action: ActionTimingConfig | None = None,
    device: DeviceTimingConfig | None = None,
    connection: ConnectionTimingConfig | None = None,
) -> None:
    """
    Replace selected sections of the global timing configuration.

    Sections passed as None are left untouched.

    Args:
        action: New action timing configuration.
        device: New device timing configuration.
        connection: New connection timing configuration.

    Example:
        >>> from phone_agent.config.timing import update_timing_config, ActionTimingConfig
        >>> custom_action = ActionTimingConfig(
        ...     keyboard_switch_delay=0.5,
        ...     text_input_delay=0.5
        ... )
        >>> update_timing_config(action=custom_action)
    """
    replacements = (
        ("action", action),
        ("device", device),
        ("connection", connection),
    )
    for section, new_value in replacements:
        if new_value is not None:
            # Mutates the shared instance in place; TIMING_CONFIG is not rebound.
            setattr(TIMING_CONFIG, section, new_value)
157 | 
158 | 
# Names exported by ``from phone_agent.config.timing import *``.
__all__ = [
    "ActionTimingConfig",
    "DeviceTimingConfig",
    "ConnectionTimingConfig",
    "TimingConfig",
    "TIMING_CONFIG",
    "get_timing_config",
    "update_timing_config",
]
168 | 


--------------------------------------------------------------------------------
/phone_agent/adb/device.py:
--------------------------------------------------------------------------------
  1 | """Device control utilities for Android automation."""
  2 | 
  3 | import os
  4 | import subprocess
  5 | import time
  6 | from typing import List, Optional, Tuple
  7 | 
  8 | from phone_agent.config.apps import APP_PACKAGES
  9 | from phone_agent.config.timing import TIMING_CONFIG
 10 | 
 11 | 
 12 | def get_current_app(device_id: str | None = None) -> str:
 13 |     """
 14 |     Get the currently focused app name.
 15 | 
 16 |     Args:
 17 |         device_id: Optional ADB device ID for multi-device setups.
 18 | 
 19 |     Returns:
 20 |         The app name if recognized, otherwise "System Home".
 21 |     """
 22 |     adb_prefix = _get_adb_prefix(device_id)
 23 | 
 24 |     result = subprocess.run(
 25 |         adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True, encoding="utf-8"
 26 |     )
 27 |     output = result.stdout
 28 |     if not output:
 29 |         raise ValueError("No output from dumpsys window")
 30 | 
 31 |     # Parse window focus info
 32 |     for line in output.split("\n"):
 33 |         if "mCurrentFocus" in line or "mFocusedApp" in line:
 34 |             for app_name, package in APP_PACKAGES.items():
 35 |                 if package in line:
 36 |                     return app_name
 37 | 
 38 |     return "System Home"
 39 | 
 40 | 
 41 | def tap(
 42 |     x: int, y: int, device_id: str | None = None, delay: float | None = None
 43 | ) -> None:
 44 |     """
 45 |     Tap at the specified coordinates.
 46 | 
 47 |     Args:
 48 |         x: X coordinate.
 49 |         y: Y coordinate.
 50 |         device_id: Optional ADB device ID.
 51 |         delay: Delay in seconds after tap. If None, uses configured default.
 52 |     """
 53 |     if delay is None:
 54 |         delay = TIMING_CONFIG.device.default_tap_delay
 55 | 
 56 |     adb_prefix = _get_adb_prefix(device_id)
 57 | 
 58 |     subprocess.run(
 59 |         adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
 60 |     )
 61 |     time.sleep(delay)
 62 | 
 63 | 
 64 | def double_tap(
 65 |     x: int, y: int, device_id: str | None = None, delay: float | None = None
 66 | ) -> None:
 67 |     """
 68 |     Double tap at the specified coordinates.
 69 | 
 70 |     Args:
 71 |         x: X coordinate.
 72 |         y: Y coordinate.
 73 |         device_id: Optional ADB device ID.
 74 |         delay: Delay in seconds after double tap. If None, uses configured default.
 75 |     """
 76 |     if delay is None:
 77 |         delay = TIMING_CONFIG.device.default_double_tap_delay
 78 | 
 79 |     adb_prefix = _get_adb_prefix(device_id)
 80 | 
 81 |     subprocess.run(
 82 |         adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
 83 |     )
 84 |     time.sleep(TIMING_CONFIG.device.double_tap_interval)
 85 |     subprocess.run(
 86 |         adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
 87 |     )
 88 |     time.sleep(delay)
 89 | 
 90 | 
 91 | def long_press(
 92 |     x: int,
 93 |     y: int,
 94 |     duration_ms: int = 3000,
 95 |     device_id: str | None = None,
 96 |     delay: float | None = None,
 97 | ) -> None:
 98 |     """
 99 |     Long press at the specified coordinates.
100 | 
101 |     Args:
102 |         x: X coordinate.
103 |         y: Y coordinate.
104 |         duration_ms: Duration of press in milliseconds.
105 |         device_id: Optional ADB device ID.
106 |         delay: Delay in seconds after long press. If None, uses configured default.
107 |     """
108 |     if delay is None:
109 |         delay = TIMING_CONFIG.device.default_long_press_delay
110 | 
111 |     adb_prefix = _get_adb_prefix(device_id)
112 | 
113 |     subprocess.run(
114 |         adb_prefix
115 |         + ["shell", "input", "swipe", str(x), str(y), str(x), str(y), str(duration_ms)],
116 |         capture_output=True,
117 |     )
118 |     time.sleep(delay)
119 | 
120 | 
def swipe(
    start_x: int,
    start_y: int,
    end_x: int,
    end_y: int,
    duration_ms: int | None = None,
    device_id: str | None = None,
    delay: float | None = None,
) -> None:
    """
    Drag from one screen position to another.

    Args:
        start_x: Starting X coordinate.
        start_y: Starting Y coordinate.
        end_x: Ending X coordinate.
        end_y: Ending Y coordinate.
        duration_ms: Duration of swipe in milliseconds. When None it is
            derived from the squared distance between the two points and
            clamped to 1000-2000 ms.
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the swipe. When None, the configured
            ``default_swipe_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_swipe_delay if delay is None else delay
    prefix = _get_adb_prefix(device_id)

    if duration_ms is None:
        # Heuristic: squared pixel distance / 1000, kept within 1000-2000 ms.
        squared_distance = (start_x - end_x) ** 2 + (start_y - end_y) ** 2
        duration_ms = min(max(int(squared_distance / 1000), 1000), 2000)

    endpoints = [str(start_x), str(start_y), str(end_x), str(end_y)]
    command = [*prefix, "shell", "input", "swipe", *endpoints, str(duration_ms)]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
168 | 
169 | 
def back(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the back key to the device.

    Args:
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the key press. When None, the
            configured ``default_back_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_back_delay if delay is None else delay

    # Key code 4 is what Android's KeyEvent reference lists as KEYCODE_BACK.
    command = [*_get_adb_prefix(device_id), "shell", "input", "keyevent", "4"]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
187 | 
188 | 
def home(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the home key to the device.

    Args:
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the key press. When None, the
            configured ``default_home_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_home_delay if delay is None else delay

    command = [
        *_get_adb_prefix(device_id),
        "shell",
        "input",
        "keyevent",
        "KEYCODE_HOME",
    ]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
206 | 
207 | 
def launch_app(
    app_name: str, device_id: str | None = None, delay: float | None = None
) -> bool:
    """
    Start an app identified by its display name.

    The name is looked up in ``APP_PACKAGES`` and the resulting package is
    started through ``adb shell monkey`` with the LAUNCHER category.

    Args:
        app_name: The app name (must be in APP_PACKAGES).
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after launching. When None, the configured
            ``default_launch_delay`` is used.

    Returns:
        False when ``app_name`` is not a key of ``APP_PACKAGES``; True once
        the launch command has been issued (its outcome is not checked).
    """
    pause = TIMING_CONFIG.device.default_launch_delay if delay is None else delay

    try:
        package = APP_PACKAGES[app_name]
    except KeyError:
        return False

    command = [
        *_get_adb_prefix(device_id),
        "shell",
        "monkey",
        "-p",
        package,
        "-c",
        "android.intent.category.LAUNCHER",
        "1",
    ]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
    return True
246 | 
247 | 
248 | def _get_adb_prefix(device_id: str | None) -> list:
249 |     """Get ADB command prefix with optional device specifier."""
250 |     if device_id:
251 |         return ["adb", "-s", device_id]
252 |     return ["adb"]
253 | 


--------------------------------------------------------------------------------
/phone_agent/hdc/device.py:
--------------------------------------------------------------------------------
  1 | """Device control utilities for HarmonyOS automation."""
  2 | 
  3 | import os
  4 | import subprocess
  5 | import time
  6 | from typing import List, Optional, Tuple
  7 | 
  8 | from phone_agent.config.apps_harmonyos import APP_ABILITIES, APP_PACKAGES
  9 | from phone_agent.config.timing import TIMING_CONFIG
 10 | from phone_agent.hdc.connection import _run_hdc_command
 11 | 
 12 | 
 13 | def get_current_app(device_id: str | None = None) -> str:
 14 |     """
 15 |     Get the currently focused app name.
 16 | 
 17 |     Args:
 18 |         device_id: Optional HDC device ID for multi-device setups.
 19 | 
 20 |     Returns:
 21 |         The app name if recognized, otherwise "System Home".
 22 |     """
 23 |     hdc_prefix = _get_hdc_prefix(device_id)
 24 | 
 25 |     result = _run_hdc_command(
 26 |         hdc_prefix + ["shell", "hidumper", "-s", "WindowManagerService", "-a", "-a"],
 27 |         capture_output=True,
 28 |         text=True,
 29 |         encoding="utf-8"
 30 |     )
 31 |     output = result.stdout
 32 |     if not output:
 33 |         raise ValueError("No output from hidumper")
 34 | 
 35 |     # Parse window focus info
 36 |     for line in output.split("\n"):
 37 |         if "focused" in line.lower() or "current" in line.lower():
 38 |             for app_name, package in APP_PACKAGES.items():
 39 |                 if package in line:
 40 |                     return app_name
 41 | 
 42 |     return "System Home"
 43 | 
 44 | 
 45 | def tap(
 46 |     x: int, y: int, device_id: str | None = None, delay: float | None = None
 47 | ) -> None:
 48 |     """
 49 |     Tap at the specified coordinates.
 50 | 
 51 |     Args:
 52 |         x: X coordinate.
 53 |         y: Y coordinate.
 54 |         device_id: Optional HDC device ID.
 55 |         delay: Delay in seconds after tap. If None, uses configured default.
 56 |     """
 57 |     if delay is None:
 58 |         delay = TIMING_CONFIG.device.default_tap_delay
 59 | 
 60 |     hdc_prefix = _get_hdc_prefix(device_id)
 61 | 
 62 |     # HarmonyOS uses uitest uiInput click
 63 |     _run_hdc_command(
 64 |         hdc_prefix + ["shell", "uitest", "uiInput", "click", str(x), str(y)],
 65 |         capture_output=True
 66 |     )
 67 |     time.sleep(delay)
 68 | 
 69 | 
 70 | def double_tap(
 71 |     x: int, y: int, device_id: str | None = None, delay: float | None = None
 72 | ) -> None:
 73 |     """
 74 |     Double tap at the specified coordinates.
 75 | 
 76 |     Args:
 77 |         x: X coordinate.
 78 |         y: Y coordinate.
 79 |         device_id: Optional HDC device ID.
 80 |         delay: Delay in seconds after double tap. If None, uses configured default.
 81 |     """
 82 |     if delay is None:
 83 |         delay = TIMING_CONFIG.device.default_double_tap_delay
 84 | 
 85 |     hdc_prefix = _get_hdc_prefix(device_id)
 86 | 
 87 |     # HarmonyOS uses uitest uiInput doubleClick
 88 |     _run_hdc_command(
 89 |         hdc_prefix + ["shell", "uitest", "uiInput", "doubleClick", str(x), str(y)],
 90 |         capture_output=True
 91 |     )
 92 |     time.sleep(delay)
 93 | 
 94 | 
 95 | def long_press(
 96 |     x: int,
 97 |     y: int,
 98 |     duration_ms: int = 3000,
 99 |     device_id: str | None = None,
100 |     delay: float | None = None,
101 | ) -> None:
102 |     """
103 |     Long press at the specified coordinates.
104 | 
105 |     Args:
106 |         x: X coordinate.
107 |         y: Y coordinate.
108 |         duration_ms: Duration of press in milliseconds (note: HarmonyOS longClick may not support duration).
109 |         device_id: Optional HDC device ID.
110 |         delay: Delay in seconds after long press. If None, uses configured default.
111 |     """
112 |     if delay is None:
113 |         delay = TIMING_CONFIG.device.default_long_press_delay
114 | 
115 |     hdc_prefix = _get_hdc_prefix(device_id)
116 | 
117 |     # HarmonyOS uses uitest uiInput longClick
118 |     # Note: longClick may have a fixed duration, duration_ms parameter might not be supported
119 |     _run_hdc_command(
120 |         hdc_prefix + ["shell", "uitest", "uiInput", "longClick", str(x), str(y)],
121 |         capture_output=True,
122 |     )
123 |     time.sleep(delay)
124 | 
125 | 
def swipe(
    start_x: int,
    start_y: int,
    end_x: int,
    end_y: int,
    duration_ms: int | None = None,
    device_id: str | None = None,
    delay: float | None = None,
) -> None:
    """
    Drag from one screen position to another.

    Args:
        start_x: Starting X coordinate.
        start_y: Starting Y coordinate.
        end_x: Ending X coordinate.
        end_y: Ending Y coordinate.
        duration_ms: Value passed as the last argument of the swipe command.
            When None it is derived from the squared distance between the
            two points and clamped to 500-1000.
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the swipe. When None, the configured
            ``default_swipe_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_swipe_delay if delay is None else delay
    prefix = _get_hdc_prefix(device_id)

    if duration_ms is None:
        # Heuristic: squared pixel distance / 1000, kept within 500-1000.
        squared_distance = (start_x - end_x) ** 2 + (start_y - end_y) ** 2
        duration_ms = min(max(int(squared_distance / 1000), 500), 1000)

    # Argument layout: swipe startX startY endX endY duration
    endpoints = [str(start_x), str(start_y), str(end_x), str(end_y)]
    command = [*prefix, "shell", "uitest", "uiInput", "swipe", *endpoints, str(duration_ms)]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
176 | 
177 | 
def back(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the Back key event to the device.

    Args:
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the key press. When None, the
            configured ``default_back_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_back_delay if delay is None else delay

    # On HarmonyOS key presses go through `uitest uiInput keyEvent <name>`.
    command = [
        *_get_hdc_prefix(device_id),
        "shell",
        "uitest",
        "uiInput",
        "keyEvent",
        "Back",
    ]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
197 | 
198 | 
def home(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the Home key event to the device.

    Args:
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the key press. When None, the
            configured ``default_home_delay`` is used.
    """
    pause = TIMING_CONFIG.device.default_home_delay if delay is None else delay

    # On HarmonyOS key presses go through `uitest uiInput keyEvent <name>`.
    command = [
        *_get_hdc_prefix(device_id),
        "shell",
        "uitest",
        "uiInput",
        "keyEvent",
        "Home",
    ]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
218 | 
219 | 
def launch_app(
    app_name: str, device_id: str | None = None, delay: float | None = None
) -> bool:
    """
    Start an app identified by its display name.

    The name is resolved to a bundle through ``APP_PACKAGES`` and started
    with ``aa start -b <bundle> -a <ability>``. The ability comes from
    ``APP_ABILITIES`` and falls back to "EntryAbility" when the bundle has
    no entry there.

    Args:
        app_name: The app name (must be in APP_PACKAGES).
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after launching. When None, the configured
            ``default_launch_delay`` is used.

    Returns:
        False when ``app_name`` is not a key of ``APP_PACKAGES`` (a hint
        listing up to ten known names is printed); True once the launch
        command has been issued (its outcome is not checked).
    """
    pause = TIMING_CONFIG.device.default_launch_delay if delay is None else delay

    try:
        bundle = APP_PACKAGES[app_name]
    except KeyError:
        # Show only the first ten names, alphabetically, to keep the hint short.
        preview = ", ".join(sorted(APP_PACKAGES)[:10])
        print(f"[HDC] App '{app_name}' not found in HarmonyOS app list")
        print(f"[HDC] Available apps: {preview}...")
        return False

    prefix = _get_hdc_prefix(device_id)
    ability = APP_ABILITIES.get(bundle, "EntryAbility")

    command = [*prefix, "shell", "aa", "start", "-b", bundle, "-a", ability]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
    return True
266 | 
267 | 
268 | def _get_hdc_prefix(device_id: str | None) -> list:
269 |     """Get HDC command prefix with optional device specifier."""
270 |     if device_id:
271 |         return ["hdc", "-t", device_id]
272 |     return ["hdc"]
273 | 


--------------------------------------------------------------------------------
/phone_agent/agent.py:
--------------------------------------------------------------------------------
  1 | """Main PhoneAgent class for orchestrating phone automation."""
  2 | 
  3 | import json
  4 | import traceback
  5 | from dataclasses import dataclass
  6 | from typing import Any, Callable
  7 | 
  8 | from phone_agent.actions import ActionHandler
  9 | from phone_agent.actions.handler import do, finish, parse_action
 10 | from phone_agent.config import get_messages, get_system_prompt
 11 | from phone_agent.device_factory import get_device_factory
 12 | from phone_agent.model import ModelClient, ModelConfig
 13 | from phone_agent.model.client import MessageBuilder
 14 | 
 15 | 
 16 | @dataclass
 17 | class AgentConfig:
 18 |     """Configuration for the PhoneAgent."""
 19 | 
 20 |     max_steps: int = 100
 21 |     device_id: str | None = None
 22 |     lang: str = "cn"
 23 |     system_prompt: str | None = None
 24 |     verbose: bool = True
 25 | 
 26 |     def __post_init__(self):
 27 |         if self.system_prompt is None:
 28 |             self.system_prompt = get_system_prompt(self.lang)
 29 | 
 30 | 
 31 | @dataclass
 32 | class StepResult:
 33 |     """Result of a single agent step."""
 34 | 
 35 |     success: bool
 36 |     finished: bool
 37 |     action: dict[str, Any] | None
 38 |     thinking: str
 39 |     message: str | None = None
 40 | 
 41 | 
 42 | class PhoneAgent:
 43 |     """
 44 |     AI-powered agent for automating Android phone interactions.
 45 | 
 46 |     The agent uses a vision-language model to understand screen content
 47 |     and decide on actions to complete user tasks.
 48 | 
 49 |     Args:
 50 |         model_config: Configuration for the AI model.
 51 |         agent_config: Configuration for the agent behavior.
 52 |         confirmation_callback: Optional callback for sensitive action confirmation.
 53 |         takeover_callback: Optional callback for takeover requests.
 54 | 
 55 |     Example:
 56 |         >>> from phone_agent import PhoneAgent
 57 |         >>> from phone_agent.model import ModelConfig
 58 |         >>>
 59 |         >>> model_config = ModelConfig(base_url="http://localhost:8000/v1")
 60 |         >>> agent = PhoneAgent(model_config)
 61 |         >>> agent.run("Open WeChat and send a message to John")
 62 |     """
 63 | 
 64 |     def __init__(
 65 |         self,
 66 |         model_config: ModelConfig | None = None,
 67 |         agent_config: AgentConfig | None = None,
 68 |         confirmation_callback: Callable[[str], bool] | None = None,
 69 |         takeover_callback: Callable[[str], None] | None = None,
 70 |     ):
 71 |         self.model_config = model_config or ModelConfig()
 72 |         self.agent_config = agent_config or AgentConfig()
 73 | 
 74 |         self.model_client = ModelClient(self.model_config)
 75 |         self.action_handler = ActionHandler(
 76 |             device_id=self.agent_config.device_id,
 77 |             confirmation_callback=confirmation_callback,
 78 |             takeover_callback=takeover_callback,
 79 |         )
 80 | 
 81 |         self._context: list[dict[str, Any]] = []
 82 |         self._step_count = 0
 83 | 
 84 |     def run(self, task: str) -> str:
 85 |         """
 86 |         Run the agent to complete a task.
 87 | 
 88 |         Args:
 89 |             task: Natural language description of the task.
 90 | 
 91 |         Returns:
 92 |             Final message from the agent.
 93 |         """
 94 |         self._context = []
 95 |         self._step_count = 0
 96 | 
 97 |         # First step with user prompt
 98 |         result = self._execute_step(task, is_first=True)
 99 | 
100 |         if result.finished:
101 |             return result.message or "Task completed"
102 | 
103 |         # Continue until finished or max steps reached
104 |         while self._step_count < self.agent_config.max_steps:
105 |             result = self._execute_step(is_first=False)
106 | 
107 |             if result.finished:
108 |                 return result.message or "Task completed"
109 | 
110 |         return "Max steps reached"
111 | 
112 |     def step(self, task: str | None = None) -> StepResult:
113 |         """
114 |         Execute a single step of the agent.
115 | 
116 |         Useful for manual control or debugging.
117 | 
118 |         Args:
119 |             task: Task description (only needed for first step).
120 | 
121 |         Returns:
122 |             StepResult with step details.
123 |         """
124 |         is_first = len(self._context) == 0
125 | 
126 |         if is_first and not task:
127 |             raise ValueError("Task is required for the first step")
128 | 
129 |         return self._execute_step(task, is_first)
130 | 
131 |     def reset(self) -> None:
132 |         """Reset the agent state for a new task."""
133 |         self._context = []
134 |         self._step_count = 0
135 | 
136 |     def _execute_step(
137 |         self, user_prompt: str | None = None, is_first: bool = False
138 |     ) -> StepResult:
139 |         """Execute a single step of the agent loop."""
140 |         self._step_count += 1
141 | 
142 |         # Capture current screen state
143 |         device_factory = get_device_factory()
144 |         screenshot = device_factory.get_screenshot(self.agent_config.device_id)
145 |         current_app = device_factory.get_current_app(self.agent_config.device_id)
146 | 
147 |         # Build messages
148 |         if is_first:
149 |             self._context.append(
150 |                 MessageBuilder.create_system_message(self.agent_config.system_prompt)
151 |             )
152 | 
153 |             screen_info = MessageBuilder.build_screen_info(current_app)
154 |             text_content = f"{user_prompt}\n\n{screen_info}"
155 | 
156 |             self._context.append(
157 |                 MessageBuilder.create_user_message(
158 |                     text=text_content, image_base64=screenshot.base64_data
159 |                 )
160 |             )
161 |         else:
162 |             screen_info = MessageBuilder.build_screen_info(current_app)
163 |             text_content = f"** Screen Info **\n\n{screen_info}"
164 | 
165 |             self._context.append(
166 |                 MessageBuilder.create_user_message(
167 |                     text=text_content, image_base64=screenshot.base64_data
168 |                 )
169 |             )
170 | 
171 |         # Get model response
172 |         try:
173 |             msgs = get_messages(self.agent_config.lang)
174 |             print("\n" + "=" * 50)
175 |             print(f"💭 {msgs['thinking']}:")
176 |             print("-" * 50)
177 |             response = self.model_client.request(self._context)
178 |         except Exception as e:
179 |             if self.agent_config.verbose:
180 |                 traceback.print_exc()
181 |             return StepResult(
182 |                 success=False,
183 |                 finished=True,
184 |                 action=None,
185 |                 thinking="",
186 |                 message=f"Model error: {e}",
187 |             )
188 | 
189 |         # Parse action from response
190 |         try:
191 |             action = parse_action(response.action)
192 |         except ValueError:
193 |             if self.agent_config.verbose:
194 |                 traceback.print_exc()
195 |             action = finish(message=response.action)
196 | 
197 |         if self.agent_config.verbose:
198 |             # Print thinking process
199 |             print("-" * 50)
200 |             print(f"🎯 {msgs['action']}:")
201 |             print(json.dumps(action, ensure_ascii=False, indent=2))
202 |             print("=" * 50 + "\n")
203 | 
204 |         # Remove image from context to save space
205 |         self._context[-1] = MessageBuilder.remove_images_from_message(self._context[-1])
206 | 
207 |         # Execute action
208 |         try:
209 |             result = self.action_handler.execute(
210 |                 action, screenshot.width, screenshot.height
211 |             )
212 |         except Exception as e:
213 |             if self.agent_config.verbose:
214 |                 traceback.print_exc()
215 |             result = self.action_handler.execute(
216 |                 finish(message=str(e)), screenshot.width, screenshot.height
217 |             )
218 | 
219 |         # Add assistant response to context
220 |         self._context.append(
221 |             MessageBuilder.create_assistant_message(
222 |                 f"<think>{response.thinking}</think><answer>{response.action}</answer>"
223 |             )
224 |         )
225 | 
226 |         # Check if finished
227 |         finished = action.get("_metadata") == "finish" or result.should_finish
228 | 
229 |         if finished and self.agent_config.verbose:
230 |             msgs = get_messages(self.agent_config.lang)
231 |             print("\n" + "🎉 " + "=" * 48)
232 |             print(
233 |                 f"✅ {msgs['task_completed']}: {result.message or action.get('message', msgs['done'])}"
234 |             )
235 |             print("=" * 50 + "\n")
236 | 
237 |         return StepResult(
238 |             success=result.success,
239 |             finished=finished,
240 |             action=action,
241 |             thinking=response.thinking,
242 |             message=result.message or action.get("message"),
243 |         )
244 | 
245 |     @property
246 |     def context(self) -> list[dict[str, Any]]:
247 |         """Get the current conversation context."""
248 |         return self._context.copy()
249 | 
250 |     @property
251 |     def step_count(self) -> int:
252 |         """Get the current step count."""
253 |         return self._step_count
254 | 


--------------------------------------------------------------------------------
/phone_agent/config/apps.py:
--------------------------------------------------------------------------------
  1 | """App name to package name mapping for supported applications."""
  2 | 
  3 | APP_PACKAGES: dict[str, str] = {
  4 |     # Social & Messaging
  5 |     "微信": "com.tencent.mm",
  6 |     "QQ": "com.tencent.mobileqq",
  7 |     "微博": "com.sina.weibo",
  8 |     # E-commerce
  9 |     "淘宝": "com.taobao.taobao",
 10 |     "京东": "com.jingdong.app.mall",
 11 |     "拼多多": "com.xunmeng.pinduoduo",
 12 |     "淘宝闪购": "com.taobao.taobao",
 13 |     "京东秒送": "com.jingdong.app.mall",
 14 |     # Lifestyle & Social
 15 |     "小红书": "com.xingin.xhs",
 16 |     "豆瓣": "com.douban.frodo",
 17 |     "知乎": "com.zhihu.android",
 18 |     # Maps & Navigation
 19 |     "高德地图": "com.autonavi.minimap",
 20 |     "百度地图": "com.baidu.BaiduMap",
 21 |     # Food & Services
 22 |     "美团": "com.sankuai.meituan",
 23 |     "大众点评": "com.dianping.v1",
 24 |     "饿了么": "me.ele",
 25 |     "肯德基": "com.yek.android.kfc.activitys",
 26 |     # Travel
 27 |     "携程": "ctrip.android.view",
 28 |     "铁路12306": "com.MobileTicket",
 29 |     "12306": "com.MobileTicket",
 30 |     "去哪儿": "com.Qunar",
 31 |     "去哪儿旅行": "com.Qunar",
 32 |     "滴滴出行": "com.sdu.didi.psnger",
 33 |     # Video & Entertainment
 34 |     "bilibili": "tv.danmaku.bili",
 35 |     "抖音": "com.ss.android.ugc.aweme",
 36 |     "快手": "com.smile.gifmaker",
 37 |     "腾讯视频": "com.tencent.qqlive",
 38 |     "爱奇艺": "com.qiyi.video",
 39 |     "优酷视频": "com.youku.phone",
 40 |     "芒果TV": "com.hunantv.imgo.activity",
 41 |     "红果短剧": "com.phoenix.read",
 42 |     # Music & Audio
 43 |     "网易云音乐": "com.netease.cloudmusic",
 44 |     "QQ音乐": "com.tencent.qqmusic",
 45 |     "汽水音乐": "com.luna.music",
 46 |     "喜马拉雅": "com.ximalaya.ting.android",
 47 |     # Reading
 48 |     "番茄小说": "com.dragon.read",
 49 |     "番茄免费小说": "com.dragon.read",
 50 |     "七猫免费小说": "com.kmxs.reader",
 51 |     # Productivity
 52 |     "飞书": "com.ss.android.lark",
 53 |     "QQ邮箱": "com.tencent.androidqqmail",
 54 |     # AI & Tools
 55 |     "豆包": "com.larus.nova",
 56 |     # Health & Fitness
 57 |     "keep": "com.gotokeep.keep",
 58 |     "美柚": "com.lingan.seeyou",
 59 |     # News & Information
 60 |     "腾讯新闻": "com.tencent.news",
 61 |     "今日头条": "com.ss.android.article.news",
 62 |     # Real Estate
 63 |     "贝壳找房": "com.lianjia.beike",
 64 |     "安居客": "com.anjuke.android.app",
 65 |     # Finance
 66 |     "同花顺": "com.hexin.plat.android",
 67 |     # Games
 68 |     "星穹铁道": "com.miHoYo.hkrpg",
 69 |     "崩坏：星穹铁道": "com.miHoYo.hkrpg",
 70 |     "恋与深空": "com.papegames.lysk.cn",
 71 |     "AndroidSystemSettings": "com.android.settings",
 72 |     "Android System Settings": "com.android.settings",
 73 |     "Android  System Settings": "com.android.settings",
 74 |     "Android-System-Settings": "com.android.settings",
 75 |     "Settings": "com.android.settings",
 76 |     "AudioRecorder": "com.android.soundrecorder",
 77 |     "audiorecorder": "com.android.soundrecorder",
 78 |     "Bluecoins": "com.rammigsoftware.bluecoins",
 79 |     "bluecoins": "com.rammigsoftware.bluecoins",
 80 |     "Broccoli": "com.flauschcode.broccoli",
 81 |     "broccoli": "com.flauschcode.broccoli",
 82 |     "Booking.com": "com.booking",
 83 |     "Booking": "com.booking",
 84 |     "booking.com": "com.booking",
 85 |     "booking": "com.booking",
 86 |     "BOOKING.COM": "com.booking",
 87 |     "Chrome": "com.android.chrome",
 88 |     "chrome": "com.android.chrome",
 89 |     "Google Chrome": "com.android.chrome",
 90 |     "Clock": "com.android.deskclock",
 91 |     "clock": "com.android.deskclock",
 92 |     "Contacts": "com.android.contacts",
 93 |     "contacts": "com.android.contacts",
 94 |     "Duolingo": "com.duolingo",
 95 |     "duolingo": "com.duolingo",
 96 |     "Expedia": "com.expedia.bookings",
 97 |     "expedia": "com.expedia.bookings",
 98 |     "Files": "com.android.fileexplorer",
 99 |     "files": "com.android.fileexplorer",
100 |     "File Manager": "com.android.fileexplorer",
101 |     "file manager": "com.android.fileexplorer",
102 |     "gmail": "com.google.android.gm",
103 |     "Gmail": "com.google.android.gm",
104 |     "GoogleMail": "com.google.android.gm",
105 |     "Google Mail": "com.google.android.gm",
106 |     "GoogleFiles": "com.google.android.apps.nbu.files",
107 |     "googlefiles": "com.google.android.apps.nbu.files",
108 |     "FilesbyGoogle": "com.google.android.apps.nbu.files",
109 |     "GoogleCalendar": "com.google.android.calendar",
110 |     "Google-Calendar": "com.google.android.calendar",
111 |     "Google Calendar": "com.google.android.calendar",
112 |     "google-calendar": "com.google.android.calendar",
113 |     "google calendar": "com.google.android.calendar",
114 |     "GoogleChat": "com.google.android.apps.dynamite",
115 |     "Google Chat": "com.google.android.apps.dynamite",
116 |     "Google-Chat": "com.google.android.apps.dynamite",
117 |     "GoogleClock": "com.google.android.deskclock",
118 |     "Google Clock": "com.google.android.deskclock",
119 |     "Google-Clock": "com.google.android.deskclock",
120 |     "GoogleContacts": "com.google.android.contacts",
121 |     "Google-Contacts": "com.google.android.contacts",
122 |     "Google Contacts": "com.google.android.contacts",
123 |     "google-contacts": "com.google.android.contacts",
124 |     "google contacts": "com.google.android.contacts",
125 |     "GoogleDocs": "com.google.android.apps.docs.editors.docs",
126 |     "Google Docs": "com.google.android.apps.docs.editors.docs",
127 |     "googledocs": "com.google.android.apps.docs.editors.docs",
128 |     "google docs": "com.google.android.apps.docs.editors.docs",
129 |     "Google Drive": "com.google.android.apps.docs",
130 |     "Google-Drive": "com.google.android.apps.docs",
131 |     "google drive": "com.google.android.apps.docs",
132 |     "google-drive": "com.google.android.apps.docs",
133 |     "GoogleDrive": "com.google.android.apps.docs",
134 |     "Googledrive": "com.google.android.apps.docs",
135 |     "googledrive": "com.google.android.apps.docs",
136 |     "GoogleFit": "com.google.android.apps.fitness",
137 |     "googlefit": "com.google.android.apps.fitness",
138 |     "GoogleKeep": "com.google.android.keep",
139 |     "googlekeep": "com.google.android.keep",
140 |     "GoogleMaps": "com.google.android.apps.maps",
141 |     "Google Maps": "com.google.android.apps.maps",
142 |     "googlemaps": "com.google.android.apps.maps",
143 |     "google maps": "com.google.android.apps.maps",
144 |     "Google Play Books": "com.google.android.apps.books",
145 |     "Google-Play-Books": "com.google.android.apps.books",
146 |     "google play books": "com.google.android.apps.books",
147 |     "google-play-books": "com.google.android.apps.books",
148 |     "GooglePlayBooks": "com.google.android.apps.books",
149 |     "googleplaybooks": "com.google.android.apps.books",
150 |     "GooglePlayStore": "com.android.vending",
151 |     "Google Play Store": "com.android.vending",
152 |     "Google-Play-Store": "com.android.vending",
153 |     "GoogleSlides": "com.google.android.apps.docs.editors.slides",
154 |     "Google Slides": "com.google.android.apps.docs.editors.slides",
155 |     "Google-Slides": "com.google.android.apps.docs.editors.slides",
156 |     "GoogleTasks": "com.google.android.apps.tasks",
157 |     "Google Tasks": "com.google.android.apps.tasks",
158 |     "Google-Tasks": "com.google.android.apps.tasks",
159 |     "Joplin": "net.cozic.joplin",
160 |     "joplin": "net.cozic.joplin",
161 |     "McDonald": "com.mcdonalds.app",
162 |     "mcdonald": "com.mcdonalds.app",
163 |     "Osmand": "net.osmand",
164 |     "osmand": "net.osmand",
165 |     "PiMusicPlayer": "com.Project100Pi.themusicplayer",
166 |     "pimusicplayer": "com.Project100Pi.themusicplayer",
167 |     "Quora": "com.quora.android",
168 |     "quora": "com.quora.android",
169 |     "Reddit": "com.reddit.frontpage",
170 |     "reddit": "com.reddit.frontpage",
171 |     "RetroMusic": "code.name.monkey.retromusic",
172 |     "retromusic": "code.name.monkey.retromusic",
173 |     "SimpleCalendarPro": "com.scientificcalculatorplus.simplecalculator.basiccalculator.mathcalc",
174 |     "SimpleSMSMessenger": "com.simplemobiletools.smsmessenger",
175 |     "Telegram": "org.telegram.messenger",
176 |     "temu": "com.einnovation.temu",
177 |     "Temu": "com.einnovation.temu",
178 |     "Tiktok": "com.zhiliaoapp.musically",
179 |     "tiktok": "com.zhiliaoapp.musically",
180 |     "Twitter": "com.twitter.android",
181 |     "twitter": "com.twitter.android",
182 |     "X": "com.twitter.android",
183 |     "VLC": "org.videolan.vlc",
184 |     "WeChat": "com.tencent.mm",
185 |     "wechat": "com.tencent.mm",
186 |     "Whatsapp": "com.whatsapp",
187 |     "WhatsApp": "com.whatsapp",
188 | }
189 | 
190 | 
def get_package_name(app_name: str) -> str | None:
    """
    Resolve an app display name to its Android package name.

    The lookup is an exact match against the keys of ``APP_PACKAGES``;
    no case folding or whitespace normalization is applied.

    Args:
        app_name: The display name of the app.

    Returns:
        The Android package name, or None if the name is not registered.
    """
    try:
        return APP_PACKAGES[app_name]
    except KeyError:
        # Unknown display name: report absence instead of raising.
        return None
202 | 
203 | 
def get_app_name(package_name: str) -> str | None:
    """
    Reverse-lookup a display name from an Android package name.

    Several display names can map to the same package; the first one in
    ``APP_PACKAGES`` insertion order is returned.

    Args:
        package_name: The Android package name.

    Returns:
        The display name of the app, or None if no entry uses this package.
    """
    matching_names = (
        display_name
        for display_name, registered_package in APP_PACKAGES.items()
        if registered_package == package_name
    )
    return next(matching_names, None)
218 | 
219 | 
def list_supported_apps() -> list[str]:
    """
    Return every app display name registered in ``APP_PACKAGES``.

    Returns:
        A new list of app names, in the mapping's insertion order.
    """
    return [*APP_PACKAGES]
228 | 


--------------------------------------------------------------------------------
/README_coding_agent.md:
--------------------------------------------------------------------------------
  1 | # Open-AutoGLM Quick Start for Coding Agent
  2 | 
  3 | <div align="center">
<img src="resources/logo.svg" width="20%"/>
  5 | </div>
  6 | 
  7 | > **本文专为 AI 助手（如 Claude Code）阅读，用于自动化部署 Open-AutoGLM。**
  8 | >
  9 | > **This document is designed for AI assistants (such as Claude Code) to automate the deployment of Open-AutoGLM.**
 10 | >
 11 | > 如果你是人类读者，可以跳过本文，按照 README.md 文档操作即可。
 12 | >
 13 | > If you are a human reader, you can skip this document and follow the README.md instructions instead.
 14 | 
 15 | ---
 16 | 
 17 | ## Table of Contents / 目录
 18 | 
 19 | - [English](#english)
 20 | - [中文](#中文)
 21 | 
 22 | ---
 23 | 
 24 | # English
 25 | 
 26 | ## Prerequisites
 27 | 
 28 | ### 1. Python Environment
 29 | 
 30 | Python 3.10 or higher is required.
 31 | 
 32 | ### 2. ADB (Android Debug Bridge)
 33 | 
 34 | 1. Download the official ADB [installation package](https://developer.android.com/tools/releases/platform-tools)
 35 | 2. Extract and configure environment variables:
 36 | 
 37 | **macOS:**
 38 | 
 39 | ```bash
 40 | # Assuming extracted to ~/Downloads/platform-tools
 41 | export PATH=${PATH}:~/Downloads/platform-tools
 42 | ```
 43 | 
 44 | **Windows:** Add the extracted folder path to your system PATH. Refer to [this tutorial](https://blog.csdn.net/x2584179909/article/details/108319973) if needed.
 45 | 
 46 | ### 3. Android Device Setup
 47 | 
 48 | Requirements:
 49 | - Android 7.0+ device or emulator
 50 | - Developer Mode enabled
 51 | - USB Debugging enabled
 52 | 
 53 | **Enable Developer Mode:**
 54 | 1. Go to `Settings > About Phone > Build Number`
 55 | 2. Tap rapidly about 10 times until "Developer mode enabled" appears
 56 | 
 57 | **Enable USB Debugging:**
 58 | 1. Go to `Settings > Developer Options > USB Debugging`
 59 | 2. Enable the toggle
 60 | 3. Some devices may require a restart
 61 | 
 62 | **Important permissions to check:**
 63 | 
 64 | ![Permissions](resources/screenshot-20251210-120416.png)
 65 | 
 66 | ### 4. Install ADB Keyboard
 67 | 
 68 | Download and install [ADB Keyboard APK](https://github.com/senzhk/ADBKeyBoard/blob/master/ADBKeyboard.apk) on your device.
 69 | 
 70 | After installation, enable it in `Settings > Input Method` or `Settings > Keyboard List`.
 71 | 
 72 | ---
 73 | 
 74 | ## Installation
 75 | 
 76 | ```bash
 77 | # Install dependencies
 78 | pip install -r requirements.txt
 79 | 
 80 | # Install package
 81 | pip install -e .
 82 | ```
 83 | 
 84 | ---
 85 | 
 86 | ## ADB Configuration
 87 | 
 88 | **Ensure your USB cable supports data transfer (not charging only).**
 89 | 
 90 | ### Verify Connection
 91 | 
 92 | ```bash
 93 | # Check connected devices
 94 | adb devices
 95 | 
 96 | # Expected output:
 97 | # List of devices attached
 98 | # emulator-5554   device
 99 | ```
100 | 
101 | ### Remote Debugging (WiFi)
102 | 
103 | Ensure your phone and computer are on the same WiFi network.
104 | 
105 | ![Enable Wireless Debugging](resources/screenshot-20251210-120630.png)
106 | 
107 | ```bash
108 | # Connect via WiFi (replace with your phone's IP and port)
109 | adb connect 192.168.1.100:5555
110 | 
111 | # Verify connection
112 | adb devices
113 | ```
114 | 
115 | ### Device Management
116 | 
117 | ```bash
118 | # List all devices
119 | adb devices
120 | 
121 | # Connect remote device
122 | adb connect <ip>:<port>
123 | 
124 | # Disconnect device
125 | adb disconnect <ip>:<port>
126 | ```
127 | 
128 | ---
129 | 
130 | ## Usage
131 | 
132 | ### Command Line
133 | 
134 | ```bash
135 | # Interactive mode
136 | python main.py --base-url <MODEL_API_URL> --model <MODEL_NAME>
137 | 
138 | # Execute specific task
139 | python main.py --base-url <MODEL_API_URL> "Open Chrome browser"
140 | 
141 | # Use API key authentication
142 | python main.py --apikey sk-xxxxx
143 | 
144 | # English system prompt
145 | python main.py --lang en --base-url <MODEL_API_URL> "Open Chrome browser"
146 | 
147 | # List supported apps
148 | python main.py --list-apps
149 | 
150 | # Specify device
151 | python main.py --device-id 192.168.1.100:5555 --base-url <MODEL_API_URL> "Open TikTok"
152 | ```
153 | 
154 | ### Python API
155 | 
156 | ```python
157 | from phone_agent import PhoneAgent
158 | from phone_agent.model import ModelConfig
159 | 
160 | # Configure model
161 | model_config = ModelConfig(
162 |     base_url="<MODEL_API_URL>",
163 |     model_name="<MODEL_NAME>",
164 | )
165 | 
166 | # Create Agent
167 | agent = PhoneAgent(model_config=model_config)
168 | 
169 | # Execute task
170 | result = agent.run("Open eBay and search for wireless earbuds")
171 | print(result)
172 | ```
173 | 
174 | ---
175 | 
176 | ## Environment Variables
177 | 
178 | | Variable                  | Description               | Default                      |
179 | |---------------------------|---------------------------|------------------------------|
180 | | `PHONE_AGENT_BASE_URL`    | Model API URL             | `http://localhost:8000/v1`   |
181 | | `PHONE_AGENT_MODEL`       | Model name                | `autoglm-phone-9b`           |
182 | | `PHONE_AGENT_API_KEY`     | API key                   | `EMPTY`                      |
183 | | `PHONE_AGENT_MAX_STEPS`   | Max steps per task        | `100`                        |
184 | | `PHONE_AGENT_DEVICE_ID`   | ADB device ID             | (auto-detect)                |
185 | | `PHONE_AGENT_LANG`        | Language (`cn`/`en`)      | `cn`                         |
186 | 
187 | ---
188 | 
189 | ## Troubleshooting
190 | 
191 | ### Device Not Found
192 | 
193 | ```bash
194 | adb kill-server
195 | adb start-server
196 | adb devices
197 | ```
198 | 
199 | Check:
200 | 1. USB debugging enabled
201 | 2. USB cable supports data transfer
202 | 3. Authorization popup approved on phone
203 | 4. Try different USB port/cable
204 | 
205 | ### Can Open Apps but Cannot Tap
206 | 
207 | Enable both in `Settings > Developer Options`:
208 | - **USB Debugging**
209 | - **USB Debugging (Security Settings)**
210 | 
211 | ### Text Input Not Working
212 | 
213 | 1. Ensure ADB Keyboard is installed
214 | 2. Enable in `Settings > System > Language & Input > Virtual Keyboard`
215 | 
216 | ### Windows Encoding Issues
217 | 
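Set the `PYTHONIOENCODING` environment variable before running. The inline form below works in bash-style shells (e.g. Git Bash); in `cmd` run `set PYTHONIOENCODING=utf-8` first, and in PowerShell run `$env:PYTHONIOENCODING = "utf-8"` first: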
219 | 
220 | ```bash
221 | PYTHONIOENCODING=utf-8 python main.py ...
222 | ```
223 | 
224 | ---
225 | 
226 | # 中文
227 | 
228 | ## 环境要求
229 | 
230 | ### 1. Python 环境
231 | 
232 | 需要 Python 3.10 及以上版本。
233 | 
234 | ### 2. ADB (Android Debug Bridge)
235 | 
236 | 1. 下载官方 ADB [安装包](https://developer.android.com/tools/releases/platform-tools?hl=zh-cn)
237 | 2. 解压并配置环境变量：
238 | 
239 | **macOS:**
240 | 
241 | ```bash
242 | # 假设解压到 ~/Downloads/platform-tools
243 | export PATH=${PATH}:~/Downloads/platform-tools
244 | ```
245 | 
246 | **Windows:** 将解压后的文件夹路径添加到系统 PATH。可参考[此教程](https://blog.csdn.net/x2584179909/article/details/108319973)。
247 | 
248 | ### 3. 安卓设备配置
249 | 
250 | 要求：
251 | - Android 7.0+ 设备或模拟器
252 | - 开发者模式已启用
253 | - USB 调试已启用
254 | 
255 | **启用开发者模式：**
256 | 1. 进入 `设置 > 关于手机 > 版本号`
257 | 2. 连续快速点击约 10 次，直到提示"开发者模式已启用"
258 | 
259 | **启用 USB 调试：**
260 | 1. 进入 `设置 > 开发者选项 > USB 调试`
261 | 2. 开启开关
262 | 3. 部分设备可能需要重启
263 | 
264 | **请务必检查以下权限：**
265 | 
266 | ![权限](resources/screenshot-20251209-181423.png)
267 | 
268 | ### 4. 安装 ADB Keyboard
269 | 
270 | 在设备上下载并安装 [ADB Keyboard APK](https://github.com/senzhk/ADBKeyBoard/blob/master/ADBKeyboard.apk)。
271 | 
272 | 安装后，在 `设置 > 输入法` 或 `设置 > 键盘列表` 中启用。
273 | 
274 | ---
275 | 
276 | ## 安装
277 | 
278 | ```bash
279 | # 安装依赖
280 | pip install -r requirements.txt
281 | 
282 | # 安装包
283 | pip install -e .
284 | ```
285 | 
286 | ---
287 | 
288 | ## ADB 配置
289 | 
290 | **请确保 USB 数据线支持数据传输（而非仅充电）。**
291 | 
292 | ### 验证连接
293 | 
294 | ```bash
295 | # 检查已连接设备
296 | adb devices
297 | 
298 | # 预期输出：
299 | # List of devices attached
300 | # emulator-5554   device
301 | ```
302 | 
303 | ### 远程调试（WiFi）
304 | 
305 | 确保手机和电脑在同一 WiFi 网络中。
306 | 
307 | ![开启无线调试](resources/setting.png)
308 | 
309 | ```bash
310 | # 通过 WiFi 连接（替换为手机显示的 IP 和端口）
311 | adb connect 192.168.1.100:5555
312 | 
313 | # 验证连接
314 | adb devices
315 | ```
316 | 
317 | ### 设备管理
318 | 
319 | ```bash
320 | # 列出所有设备
321 | adb devices
322 | 
323 | # 连接远程设备
324 | adb connect <ip>:<port>
325 | 
326 | # 断开设备
327 | adb disconnect <ip>:<port>
328 | ```
329 | 
330 | ---
331 | 
332 | ## 使用方法
333 | 
334 | ### 命令行
335 | 
336 | ```bash
337 | # 交互模式
338 | python main.py --base-url <模型API地址> --model <模型名称>
339 | 
340 | # 执行指定任务
341 | python main.py --base-url <模型API地址> "打开美团搜索附近的火锅店"
342 | 
343 | # 使用 API Key 认证
344 | python main.py --apikey sk-xxxxx
345 | 
346 | # 使用英文系统提示词
347 | python main.py --lang en --base-url <模型API地址> "Open Chrome browser"
348 | 
349 | # 列出支持的应用
350 | python main.py --list-apps
351 | 
352 | # 指定设备
353 | python main.py --device-id 192.168.1.100:5555 --base-url <模型API地址> "打开抖音刷视频"
354 | ```
355 | 
356 | ### Python API
357 | 
358 | ```python
359 | from phone_agent import PhoneAgent
360 | from phone_agent.model import ModelConfig
361 | 
362 | # 配置模型
363 | model_config = ModelConfig(
364 |     base_url="<模型API地址>",
365 |     model_name="<模型名称>",
366 | )
367 | 
368 | # 创建 Agent
369 | agent = PhoneAgent(model_config=model_config)
370 | 
371 | # 执行任务
372 | result = agent.run("打开淘宝搜索无线耳机")
373 | print(result)
374 | ```
375 | 
376 | ---
377 | 
378 | ## 环境变量
379 | 
380 | | 变量                        | 描述               | 默认值                        |
381 | |---------------------------|------------------|----------------------------|
382 | | `PHONE_AGENT_BASE_URL`    | 模型 API 地址        | `http://localhost:8000/v1` |
383 | | `PHONE_AGENT_MODEL`       | 模型名称             | `autoglm-phone-9b`         |
384 | | `PHONE_AGENT_API_KEY`     | API Key          | `EMPTY`                    |
385 | | `PHONE_AGENT_MAX_STEPS`   | 每个任务最大步数         | `100`                      |
386 | | `PHONE_AGENT_DEVICE_ID`   | ADB 设备 ID        | (自动检测)                     |
387 | | `PHONE_AGENT_LANG`        | 语言 (`cn`/`en`)   | `cn`                       |
388 | 
389 | ---
390 | 
391 | ## 常见问题
392 | 
393 | ### 设备未找到
394 | 
395 | ```bash
396 | adb kill-server
397 | adb start-server
398 | adb devices
399 | ```
400 | 
401 | 检查：
402 | 1. USB 调试是否已开启
403 | 2. 数据线是否支持数据传输
404 | 3. 手机上的授权弹窗是否已点击「允许」
405 | 4. 尝试更换 USB 接口或数据线
406 | 
407 | ### 能打开应用但无法点击
408 | 
409 | 在 `设置 > 开发者选项` 中同时启用：
410 | - **USB 调试**
411 | - **USB 调试（安全设置）**
412 | 
413 | ### 文本输入不工作
414 | 
415 | 1. 确保已安装 ADB Keyboard
416 | 2. 在 `设置 > 系统 > 语言和输入法 > 虚拟键盘` 中启用
417 | 
418 | ### Windows 编码异常
419 | 
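运行代码前设置 `PYTHONIOENCODING` 环境变量。下面的写法适用于 bash 类终端（如 Git Bash）；在 `cmd` 中请先执行 `set PYTHONIOENCODING=utf-8`，在 PowerShell 中请先执行 `$env:PYTHONIOENCODING = "utf-8"`：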
421 | 
422 | ```bash
423 | PYTHONIOENCODING=utf-8 python main.py ...
424 | ```
425 | 
426 | ---
427 | 
428 | ## License
429 | 
This project is for research and learning purposes only. See [Terms of Use](resources/privacy_policy_en.txt) / [使用条款](resources/privacy_policy.txt).
431 | 


--------------------------------------------------------------------------------
/phone_agent/config/apps_harmonyos.py:
--------------------------------------------------------------------------------
  1 | """HarmonyOS application package name mappings.
  2 | 
  3 | Maps user-friendly app names to HarmonyOS bundle names.
  4 | These bundle names are used with the 'hdc shell aa start -b <bundle>' command.
  5 | """
  6 | 
# Custom ability names for apps that don't use the default "EntryAbility".
# Maps bundle_name -> ability_name; bundles that are absent from this table
# are expected to fall back to "EntryAbility".
# NOTE(review): presumably the ability name is supplied together with the
# bundle name when the app is launched through hdc - confirm against the
# hdc device code.
# Generated by: python test/find_abilities.py
APP_ABILITIES: dict[str, str] = {
    # Third-party apps
    "cn.wps.mobileoffice.hap": "DocumentAbility",
    "com.ccb.mobilebank.hm": "CcbMainAbility",
    "com.dewu.hos": "HomeAbility",
    "com.larus.nova.hm": "MainAbility",
    "com.luna.hm.music": "MainAbility",
    "com.meitu.meitupic": "MainAbility",
    "com.ss.hm.article.news": "MainAbility",
    "com.ss.hm.ugc.aweme": "MainAbility",
    "com.taobao.taobao4hmos": "Taobao_mainAbility",
    "com.tencent.videohm": "AppAbility",
    "com.ximalaya.ting.xmharmony": "MainBundleAbility",
    "com.zhihu.hmos": "PhoneAbility",

    # Huawei system apps
    "com.huawei.hmos.browser": "MainAbility",
    "com.huawei.hmos.calculator": "com.huawei.hmos.calculator.CalculatorAbility",
    "com.huawei.hmos.calendar": "MainAbility",
    "com.huawei.hmos.camera": "com.huawei.hmos.camera.MainAbility",
    "com.huawei.hmos.clock": "com.huawei.hmos.clock.phone",
    "com.huawei.hmos.clouddrive": "MainAbility",
    "com.huawei.hmos.email": "ApplicationAbility",
    "com.huawei.hmos.filemanager": "MainAbility",
    "com.huawei.hmos.health": "Activity_card_entryAbility",
    "com.huawei.hmos.notepad": "MainAbility",
    "com.huawei.hmos.photos": "MainAbility",
    "com.huawei.hmos.screenrecorder": "com.huawei.hmos.screenrecorder.ServiceExtAbility",
    "com.huawei.hmos.screenshot": "com.huawei.hmos.screenshot.ServiceExtAbility",
    "com.huawei.hmos.settings": "com.huawei.hmos.settings.MainAbility",
    "com.huawei.hmos.soundrecorder": "MainAbility",
    "com.huawei.hmos.vassistant": "AiCaptionServiceExtAbility",
    "com.huawei.hmos.wallet": "MainAbility",

    # Huawei services
    "com.huawei.hmsapp.appgallery": "MainAbility",
    "com.huawei.hmsapp.books": "MainAbility",
    "com.huawei.hmsapp.himovie": "MainAbility",
    "com.huawei.hmsapp.hisearch": "MainAbility",
    "com.huawei.hmsapp.music": "MainAbility",
    "com.huawei.hmsapp.thememanager": "MainAbility",
    "com.huawei.hmsapp.totemweather": "com.huawei.hmsapp.totemweather.MainAbility",

    # OHOS system apps
    "com.ohos.callui": "com.ohos.callui.ServiceAbility",
    "com.ohos.contacts": "com.ohos.contacts.MainAbility",
    "com.ohos.mms": "com.ohos.mms.MainAbility",
}
 58 | 
# Maps user-facing app names (including aliases) to HarmonyOS bundle names.
# Several names may point at the same bundle; get_app_name() returns the
# first matching name in insertion order, so the order of entries matters.
# Commented-out entries are apps whose bundle was not found in the hdc list.
APP_PACKAGES: dict[str, str] = {
    # Social & Messaging
    "微信": "com.tencent.wechat",
    "QQ": "com.tencent.mqq",
    "微博": "com.sina.weibo.stage",
    # E-commerce
    "淘宝": "com.taobao.taobao4hmos",
    "京东": "com.jd.hm.mall",
    "拼多多": "com.xunmeng.pinduoduo.hos",
    "淘宝闪购": "com.taobao.taobao4hmos",
    "京东秒送": "com.jd.hm.mall",
    # Lifestyle & Social
    "小红书": "com.xingin.xhs_hos",
    "知乎": "com.zhihu.hmos",
    # "豆瓣": "com.douban.frodo",  # not found in the hdc list
    # Maps & Navigation
    "高德地图": "com.amap.hmapp",
    "百度地图": "com.baidu.hmmap",
    # Food & Services
    "美团": "com.sankuai.hmeituan",
    "美团外卖": "com.meituan.takeaway",
    "大众点评": "com.sankuai.dianping",
    # "肯德基": "com.yek.android.kfc.activitys",  # not found in the hdc list
    # Travel
    # "携程": "ctrip.android.view",  # not found in the hdc list
    "铁路12306": "com.chinarailway.ticketingHM",
    "12306": "com.chinarailway.ticketingHM",
    # "去哪儿": "com.Qunar",  # not found in the hdc list
    # "去哪儿旅行": "com.Qunar",  # not found in the hdc list
    "滴滴出行": "com.sdu.didi.hmos.psnger",
    # Video & Entertainment
    "bilibili": "yylx.danmaku.bili",
    "抖音": "com.ss.hm.ugc.aweme",
    "快手": "com.kuaishou.hmapp",
    "腾讯视频": "com.tencent.videohm",
    "爱奇艺": "com.qiyi.video.hmy",
    "芒果TV": "com.mgtv.phone",
    # "优酷视频": "com.youku.phone",  # not found in the hdc list
    # "红果短剧": "com.phoenix.read",  # not found in the hdc list
    # Music & Audio
    # "网易云音乐": "com.netease.cloudmusic",  # not found in the hdc list
    "QQ音乐": "com.tencent.hm.qqmusic",
    "汽水音乐": "com.luna.hm.music",
    "喜马拉雅": "com.ximalaya.ting.xmharmony",
    # Reading
    # "番茄小说": "com.dragon.read",  # not found in the hdc list
    # "番茄免费小说": "com.dragon.read",  # not found in the hdc list
    # "七猫免费小说": "com.kmxs.reader",  # not found in the hdc list
    # Productivity
    "飞书": "com.ss.feishu",
    # "QQ邮箱": "com.tencent.androidqqmail",  # not found in the hdc list
    # AI & Tools
    "豆包": "com.larus.nova.hm",
    # Health & Fitness
    # "keep": "com.gotokeep.keep",  # not found in the hdc list
    # "美柚": "com.lingan.seeyou",  # not found in the hdc list
    # News & Information
    # "腾讯新闻": "com.tencent.news",  # not found in the hdc list
    "今日头条": "com.ss.hm.article.news",
    # Real Estate
    # "贝壳找房": "com.lianjia.beike",  # not found in the hdc list
    # "安居客": "com.anjuke.android.app",  # not found in the hdc list
    # Finance
    # "同花顺": "com.hexin.plat.android",  # not found in the hdc list
    # Games
    # "星穹铁道": "com.miHoYo.hkrpg",  # not found in the hdc list
    # "崩坏：星穹铁道": "com.miHoYo.hkrpg",  # not found in the hdc list
    # "恋与深空": "com.papegames.lysk.cn",  # not found in the hdc list

    # HarmonyOS third-party apps
    "百度": "com.baidu.baiduapp",
    "阿里巴巴": "com.alibaba.wireless_hmos",
    "WPS": "cn.wps.mobileoffice.hap",
    "企业微信": "com.tencent.wework.hmos",
    "同程": "com.tongcheng.hmos",
    "同程旅行": "com.tongcheng.hmos",
    "唯品会": "com.vip.hosapp",
    "支付宝": "com.alipay.mobile.client",
    "UC浏览器": "com.uc.mobile",
    "闲鱼": "com.taobao.idlefish4ohos",
    "转转": "com.zhuanzhuan.hmoszz",
    "迅雷": "com.xunlei.thunder",
    "搜狗输入法": "com.sogou.input",
    "扫描全能王": "com.intsig.camscanner.hap",
    "美图秀秀": "com.meitu.meitupic",
    "58同城": "com.wuba.life",
    "得物": "com.dewu.hos",
    "海底捞": "com.haidilao.haros",
    "中国移动": "com.droi.tong",
    "中国联通": "com.sinovatech.unicom.ha",
    "国家税务总局": "cn.gov.chinatax.gt4.hm",
    "建设银行": "com.ccb.mobilebank.hm",
    "快手极速版": "com.kuaishou.hmnebula",

    # HarmonyOS system apps - utilities
    "浏览器": "com.huawei.hmos.browser",
    "计算器": "com.huawei.hmos.calculator",
    "日历": "com.huawei.hmos.calendar",
    "相机": "com.huawei.hmos.camera",
    "时钟": "com.huawei.hmos.clock",
    "云盘": "com.huawei.hmos.clouddrive",
    "云空间": "com.huawei.hmos.clouddrive",
    "邮件": "com.huawei.hmos.email",
    "文件管理器": "com.huawei.hmos.filemanager",
    "文件": "com.huawei.hmos.files",
    "查找设备": "com.huawei.hmos.finddevice",
    "查找手机": "com.huawei.hmos.finddevice",
    "录音机": "com.huawei.hmos.soundrecorder",
    "录音": "com.huawei.hmos.soundrecorder",
    "录屏": "com.huawei.hmos.screenrecorder",
    "截屏": "com.huawei.hmos.screenshot",
    "笔记": "com.huawei.hmos.notepad",
    "备忘录": "com.huawei.hmos.notepad",

    # HarmonyOS system apps - media
    "相册": "com.huawei.hmos.photos",
    "图库": "com.huawei.hmos.photos",
    # "视频": "com.huawei.hmos.mediaplayer",  # not found in the hdc list, but com.huawei.hmsapp.himovie exists

    # HarmonyOS system apps - communication
    "联系人": "com.ohos.contacts",
    "通讯录": "com.ohos.contacts",
    "短信": "com.ohos.mms",
    "信息": "com.ohos.mms",
    "电话": "com.ohos.callui",
    "拨号": "com.ohos.callui",

    # HarmonyOS system apps - settings
    "设置": "com.huawei.hmos.settings",
    "系统设置": "com.huawei.hmos.settings",
    "AndroidSystemSettings": "com.huawei.hmos.settings",
    "Android System Settings": "com.huawei.hmos.settings",
    "Android  System Settings": "com.huawei.hmos.settings",
    "Android-System-Settings": "com.huawei.hmos.settings",
    "Settings": "com.huawei.hmos.settings",

    # HarmonyOS system apps - lifestyle services
    "健康": "com.huawei.hmos.health",
    "运动健康": "com.huawei.hmos.health",
    "地图": "com.huawei.hmos.maps.app",
    "华为地图": "com.huawei.hmos.maps.app",
    "钱包": "com.huawei.hmos.wallet",
    "华为钱包": "com.huawei.hmos.wallet",
    "智慧生活": "com.huawei.hmos.ailife",
    "智能助手": "com.huawei.hmos.vassistant",
    "小艺": "com.huawei.hmos.vassistant",

    # HarmonyOS services
    "应用市场": "com.huawei.hmsapp.appgallery",
    "华为应用市场": "com.huawei.hmsapp.appgallery",
    "音乐": "com.huawei.hmsapp.music",
    "华为音乐": "com.huawei.hmsapp.music",
    "主题": "com.huawei.hmsapp.thememanager",
    "主题管理": "com.huawei.hmsapp.thememanager",
    "天气": "com.huawei.hmsapp.totemweather",
    "华为天气": "com.huawei.hmsapp.totemweather",
    "视频": "com.huawei.hmsapp.himovie",
    "华为视频": "com.huawei.hmsapp.himovie",
    "阅读": "com.huawei.hmsapp.books",
    "华为阅读": "com.huawei.hmsapp.books",
    "游戏中心": "com.huawei.hmsapp.gamecenter",
    "华为游戏中心": "com.huawei.hmsapp.gamecenter",
    "搜索": "com.huawei.hmsapp.hisearch",
    "华为搜索": "com.huawei.hmsapp.hisearch",
    "指南针": "com.huawei.hmsapp.compass",
    "会员中心": "com.huawei.hmos.myhuawei",
    "我的华为": "com.huawei.hmos.myhuawei",
    "华为会员": "com.huawei.hmos.myhuawei",
}
228 | 
229 | 
def get_package_name(app_name: str) -> str | None:
    """
    Resolve an app display name to its HarmonyOS bundle name.

    The lookup is an exact match against the keys of ``APP_PACKAGES``.

    Args:
        app_name: The display name of the app.

    Returns:
        The HarmonyOS bundle name, or None if the name is not registered.
    """
    try:
        bundle_name = APP_PACKAGES[app_name]
    except KeyError:
        # Unknown display name: report absence instead of raising.
        return None
    return bundle_name
241 | 
242 | 
def get_app_name(package_name: str) -> str | None:
    """
    Reverse-lookup a display name from a HarmonyOS bundle name.

    Several display names can map to the same bundle; the first one in
    ``APP_PACKAGES`` insertion order is returned.

    Args:
        package_name: The HarmonyOS bundle name.

    Returns:
        The display name of the app, or None if no entry uses this bundle.
    """
    candidates = (
        display_name
        for display_name, bundle_name in APP_PACKAGES.items()
        if bundle_name == package_name
    )
    return next(candidates, None)
257 | 
258 | 
def list_supported_apps() -> list[str]:
    """
    Return every app display name registered in ``APP_PACKAGES``.

    Returns:
        A new list of app names, in the mapping's insertion order.
    """
    return [*APP_PACKAGES]
267 | 


--------------------------------------------------------------------------------
/phone_agent/model/client.py:
--------------------------------------------------------------------------------
  1 | """Model client for AI inference using OpenAI-compatible API."""
  2 | 
  3 | import json
  4 | import time
  5 | from dataclasses import dataclass, field
  6 | from typing import Any
  7 | 
  8 | from openai import OpenAI
  9 | 
 10 | from phone_agent.config.i18n import get_message
 11 | 
 12 | 
@dataclass
class ModelConfig:
    """Configuration for the AI model.

    ``ModelClient`` uses ``base_url`` and ``api_key`` to construct the OpenAI
    client, forwards the model/sampling fields and ``extra_body`` unchanged to
    ``chat.completions.create`` on every request, and uses ``lang`` to select
    the language of the messages it prints.
    """

    # Base URL of the OpenAI-compatible endpoint.
    base_url: str = "http://localhost:8000/v1"
    # API key handed to the OpenAI client.
    api_key: str = "EMPTY"
    # Model identifier sent as ``model``.
    model_name: str = "autoglm-phone-9b"
    # Sent as ``max_tokens``.
    max_tokens: int = 3000
    # Sent as ``temperature``.
    temperature: float = 0.0
    # Sent as ``top_p``.
    top_p: float = 0.85
    # Sent as ``frequency_penalty``.
    frequency_penalty: float = 0.2
    # Sent as ``extra_body``; default_factory gives each instance its own dict.
    extra_body: dict[str, Any] = field(default_factory=dict)
    lang: str = "cn"  # Language for UI messages: 'cn' or 'en'
 26 | 
 27 | 
@dataclass
class ModelResponse:
    """Response from the AI model.

    Produced by ``ModelClient.request``: ``thinking`` and ``action`` are the
    two parts returned by ``ModelClient._parse_response`` for the full reply,
    and ``raw_content`` is the unparsed reply text accumulated from the stream.
    """

    # Text the parser classified as reasoning.
    thinking: str
    # Text the parser classified as the action.
    action: str
    # Full reply text, concatenated from all streamed content chunks.
    raw_content: str
    # Performance metrics, all measured from the moment the request was started.
    # Stays None when no content chunk was received.
    time_to_first_token: float | None = None  # Time to first token (seconds)
    # Stays None when no action marker was seen while streaming.
    time_to_thinking_end: float | None = None  # Time to thinking end (seconds)
    total_time: float | None = None  # Total inference time (seconds)
 39 | 
 40 | 
 41 | class ModelClient:
 42 |     """
 43 |     Client for interacting with OpenAI-compatible vision-language models.
 44 | 
 45 |     Args:
 46 |         config: Model configuration.
 47 |     """
 48 | 
 49 |     def __init__(self, config: ModelConfig | None = None):
 50 |         self.config = config or ModelConfig()
 51 |         self.client = OpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
 52 | 
 53 |     def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
 54 |         """
 55 |         Send a request to the model.
 56 | 
 57 |         Args:
 58 |             messages: List of message dictionaries in OpenAI format.
 59 | 
 60 |         Returns:
 61 |             ModelResponse containing thinking and action.
 62 | 
 63 |         Raises:
 64 |             ValueError: If the response cannot be parsed.
 65 |         """
 66 |         # Start timing
 67 |         start_time = time.time()
 68 |         time_to_first_token = None
 69 |         time_to_thinking_end = None
 70 | 
 71 |         stream = self.client.chat.completions.create(
 72 |             messages=messages,
 73 |             model=self.config.model_name,
 74 |             max_tokens=self.config.max_tokens,
 75 |             temperature=self.config.temperature,
 76 |             top_p=self.config.top_p,
 77 |             frequency_penalty=self.config.frequency_penalty,
 78 |             extra_body=self.config.extra_body,
 79 |             stream=True,
 80 |         )
 81 | 
 82 |         raw_content = ""
 83 |         buffer = ""  # Buffer to hold content that might be part of a marker
 84 |         action_markers = ["finish(message=", "do(action="]
 85 |         in_action_phase = False  # Track if we've entered the action phase
 86 |         first_token_received = False
 87 | 
 88 |         for chunk in stream:
 89 |             if len(chunk.choices) == 0:
 90 |                 continue
 91 |             if chunk.choices[0].delta.content is not None:
 92 |                 content = chunk.choices[0].delta.content
 93 |                 raw_content += content
 94 | 
 95 |                 # Record time to first token
 96 |                 if not first_token_received:
 97 |                     time_to_first_token = time.time() - start_time
 98 |                     first_token_received = True
 99 | 
100 |                 if in_action_phase:
101 |                     # Already in action phase, just accumulate content without printing
102 |                     continue
103 | 
104 |                 buffer += content
105 | 
106 |                 # Check if any marker is fully present in buffer
107 |                 marker_found = False
108 |                 for marker in action_markers:
109 |                     if marker in buffer:
110 |                         # Marker found, print everything before it
111 |                         thinking_part = buffer.split(marker, 1)[0]
112 |                         print(thinking_part, end="", flush=True)
113 |                         print()  # Print newline after thinking is complete
114 |                         in_action_phase = True
115 |                         marker_found = True
116 | 
117 |                         # Record time to thinking end
118 |                         if time_to_thinking_end is None:
119 |                             time_to_thinking_end = time.time() - start_time
120 | 
121 |                         break
122 | 
123 |                 if marker_found:
124 |                     continue  # Continue to collect remaining content
125 | 
126 |                 # Check if buffer ends with a prefix of any marker
127 |                 # If so, don't print yet (wait for more content)
128 |                 is_potential_marker = False
129 |                 for marker in action_markers:
130 |                     for i in range(1, len(marker)):
131 |                         if buffer.endswith(marker[:i]):
132 |                             is_potential_marker = True
133 |                             break
134 |                     if is_potential_marker:
135 |                         break
136 | 
137 |                 if not is_potential_marker:
138 |                     # Safe to print the buffer
139 |                     print(buffer, end="", flush=True)
140 |                     buffer = ""
141 | 
142 |         # Calculate total time
143 |         total_time = time.time() - start_time
144 | 
145 |         # Parse thinking and action from response
146 |         thinking, action = self._parse_response(raw_content)
147 | 
148 |         # Print performance metrics
149 |         lang = self.config.lang
150 |         print()
151 |         print("=" * 50)
152 |         print(f"⏱️  {get_message('performance_metrics', lang)}:")
153 |         print("-" * 50)
154 |         if time_to_first_token is not None:
155 |             print(
156 |                 f"{get_message('time_to_first_token', lang)}: {time_to_first_token:.3f}s"
157 |             )
158 |         if time_to_thinking_end is not None:
159 |             print(
160 |                 f"{get_message('time_to_thinking_end', lang)}:        {time_to_thinking_end:.3f}s"
161 |             )
162 |         print(
163 |             f"{get_message('total_inference_time', lang)}:          {total_time:.3f}s"
164 |         )
165 |         print("=" * 50)
166 | 
167 |         return ModelResponse(
168 |             thinking=thinking,
169 |             action=action,
170 |             raw_content=raw_content,
171 |             time_to_first_token=time_to_first_token,
172 |             time_to_thinking_end=time_to_thinking_end,
173 |             total_time=total_time,
174 |         )
175 | 
176 |     def _parse_response(self, content: str) -> tuple[str, str]:
177 |         """
178 |         Parse the model response into thinking and action parts.
179 | 
180 |         Parsing rules:
181 |         1. If content contains 'finish(message=', everything before is thinking,
182 |            everything from 'finish(message=' onwards is action.
183 |         2. If rule 1 doesn't apply but content contains 'do(action=',
184 |            everything before is thinking, everything from 'do(action=' onwards is action.
185 |         3. Fallback: If content contains '<answer>', use legacy parsing with XML tags.
186 |         4. Otherwise, return empty thinking and full content as action.
187 | 
188 |         Args:
189 |             content: Raw response content.
190 | 
191 |         Returns:
192 |             Tuple of (thinking, action).
193 |         """
194 |         # Rule 1: Check for finish(message=
195 |         if "finish(message=" in content:
196 |             parts = content.split("finish(message=", 1)
197 |             thinking = parts[0].strip()
198 |             action = "finish(message=" + parts[1]
199 |             return thinking, action
200 | 
201 |         # Rule 2: Check for do(action=
202 |         if "do(action=" in content:
203 |             parts = content.split("do(action=", 1)
204 |             thinking = parts[0].strip()
205 |             action = "do(action=" + parts[1]
206 |             return thinking, action
207 | 
208 |         # Rule 3: Fallback to legacy XML tag parsing
209 |         if "<answer>" in content:
210 |             parts = content.split("<answer>", 1)
211 |             thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
212 |             action = parts[1].replace("</answer>", "").strip()
213 |             return thinking, action
214 | 
215 |         # Rule 4: No markers found, return content as action
216 |         return "", content
217 | 
218 | 
219 | class MessageBuilder:
220 |     """Helper class for building conversation messages."""
221 | 
222 |     @staticmethod
223 |     def create_system_message(content: str) -> dict[str, Any]:
224 |         """Create a system message."""
225 |         return {"role": "system", "content": content}
226 | 
227 |     @staticmethod
228 |     def create_user_message(
229 |         text: str, image_base64: str | None = None
230 |     ) -> dict[str, Any]:
231 |         """
232 |         Create a user message with optional image.
233 | 
234 |         Args:
235 |             text: Text content.
236 |             image_base64: Optional base64-encoded image.
237 | 
238 |         Returns:
239 |             Message dictionary.
240 |         """
241 |         content = []
242 | 
243 |         if image_base64:
244 |             content.append(
245 |                 {
246 |                     "type": "image_url",
247 |                     "image_url": {"url": f"data:image/png;base64,{image_base64}"},
248 |                 }
249 |             )
250 | 
251 |         content.append({"type": "text", "text": text})
252 | 
253 |         return {"role": "user", "content": content}
254 | 
255 |     @staticmethod
256 |     def create_assistant_message(content: str) -> dict[str, Any]:
257 |         """Create an assistant message."""
258 |         return {"role": "assistant", "content": content}
259 | 
260 |     @staticmethod
261 |     def remove_images_from_message(message: dict[str, Any]) -> dict[str, Any]:
262 |         """
263 |         Remove image content from a message to save context space.
264 | 
265 |         Args:
266 |             message: Message dictionary.
267 | 
268 |         Returns:
269 |             Message with images removed.
270 |         """
271 |         if isinstance(message.get("content"), list):
272 |             message["content"] = [
273 |                 item for item in message["content"] if item.get("type") == "text"
274 |             ]
275 |         return message
276 | 
277 |     @staticmethod
278 |     def build_screen_info(current_app: str, **extra_info) -> str:
279 |         """
280 |         Build screen info string for the model.
281 | 
282 |         Args:
283 |             current_app: Current app name.
284 |             **extra_info: Additional info to include.
285 | 
286 |         Returns:
287 |             JSON string with screen info.
288 |         """
289 |         info = {"current_app": current_app, **extra_info}
290 |         return json.dumps(info, ensure_ascii=False)
291 | 


--------------------------------------------------------------------------------
/phone_agent/adb/connection.py:
--------------------------------------------------------------------------------
  1 | """ADB connection management for local and remote devices."""
  2 | 
  3 | import subprocess
  4 | import time
  5 | from dataclasses import dataclass
  6 | from enum import Enum
  7 | from typing import Optional
  8 | 
  9 | from phone_agent.config.timing import TIMING_CONFIG
 10 | 
 11 | 
class ConnectionType(Enum):
    """
    Type of ADB connection.

    Each member's value is its lowercase name as a string.
    """

    # Assigned by ADBConnection.list_devices to ids without a ":".
    USB = "usb"
    # Not assigned by ADBConnection.list_devices in this file.
    WIFI = "wifi"
    # Assigned by ADBConnection.list_devices to ids containing ":" (host:port).
    REMOTE = "remote"
 18 | 
 19 | 
@dataclass
class DeviceInfo:
    """
    Information about a connected device.

    Instances are built by ADBConnection.list_devices from one line of
    ``adb devices -l`` output.
    """

    # First whitespace-separated column of the device line (serial or host:port).
    device_id: str
    # Second column of the device line; "device" is what is_connected treats as connected.
    status: str
    # How the device is attached, derived from the shape of device_id.
    connection_type: ConnectionType
    # Value of the "model:" token on the device line, or None when absent.
    model: str | None = None
    # Defaults to None; list_devices in this file never fills it in.
    android_version: str | None = None
 29 | 
 30 | 
 31 | class ADBConnection:
 32 |     """
 33 |     Manages ADB connections to Android devices.
 34 | 
 35 |     Supports USB, WiFi, and remote TCP/IP connections.
 36 | 
 37 |     Example:
 38 |         >>> conn = ADBConnection()
 39 |         >>> # Connect to remote device
 40 |         >>> conn.connect("192.168.1.100:5555")
 41 |         >>> # List devices
 42 |         >>> devices = conn.list_devices()
 43 |         >>> # Disconnect
 44 |         >>> conn.disconnect("192.168.1.100:5555")
 45 |     """
 46 | 
 47 |     def __init__(self, adb_path: str = "adb"):
 48 |         """
 49 |         Initialize ADB connection manager.
 50 | 
 51 |         Args:
 52 |             adb_path: Path to ADB executable.
 53 |         """
 54 |         self.adb_path = adb_path
 55 | 
 56 |     def connect(self, address: str, timeout: int = 10) -> tuple[bool, str]:
 57 |         """
 58 |         Connect to a remote device via TCP/IP.
 59 | 
 60 |         Args:
 61 |             address: Device address in format "host:port" (e.g., "192.168.1.100:5555").
 62 |             timeout: Connection timeout in seconds.
 63 | 
 64 |         Returns:
 65 |             Tuple of (success, message).
 66 | 
 67 |         Note:
 68 |             The remote device must have TCP/IP debugging enabled.
 69 |             On the device, run: adb tcpip 5555
 70 |         """
 71 |         # Validate address format
 72 |         if ":" not in address:
 73 |             address = f"{address}:5555"  # Default ADB port
 74 | 
 75 |         try:
 76 |             result = subprocess.run(
 77 |                 [self.adb_path, "connect", address],
 78 |                 capture_output=True,
 79 |                 text=True,
 80 |                 timeout=timeout,
 81 |             )
 82 | 
 83 |             output = result.stdout + result.stderr
 84 | 
 85 |             if "connected" in output.lower():
 86 |                 return True, f"Connected to {address}"
 87 |             elif "already connected" in output.lower():
 88 |                 return True, f"Already connected to {address}"
 89 |             else:
 90 |                 return False, output.strip()
 91 | 
 92 |         except subprocess.TimeoutExpired:
 93 |             return False, f"Connection timeout after {timeout}s"
 94 |         except Exception as e:
 95 |             return False, f"Connection error: {e}"
 96 | 
 97 |     def disconnect(self, address: str | None = None) -> tuple[bool, str]:
 98 |         """
 99 |         Disconnect from a remote device.
100 | 
101 |         Args:
102 |             address: Device address to disconnect. If None, disconnects all.
103 | 
104 |         Returns:
105 |             Tuple of (success, message).
106 |         """
107 |         try:
108 |             cmd = [self.adb_path, "disconnect"]
109 |             if address:
110 |                 cmd.append(address)
111 | 
112 |             result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
113 | 
114 |             output = result.stdout + result.stderr
115 |             return True, output.strip() or "Disconnected"
116 | 
117 |         except Exception as e:
118 |             return False, f"Disconnect error: {e}"
119 | 
120 |     def list_devices(self) -> list[DeviceInfo]:
121 |         """
122 |         List all connected devices.
123 | 
124 |         Returns:
125 |             List of DeviceInfo objects.
126 |         """
127 |         try:
128 |             result = subprocess.run(
129 |                 [self.adb_path, "devices", "-l"],
130 |                 capture_output=True,
131 |                 text=True,
132 |                 timeout=5,
133 |             )
134 | 
135 |             devices = []
136 |             for line in result.stdout.strip().split("\n")[1:]:  # Skip header
137 |                 if not line.strip():
138 |                     continue
139 | 
140 |                 parts = line.split()
141 |                 if len(parts) >= 2:
142 |                     device_id = parts[0]
143 |                     status = parts[1]
144 | 
145 |                     # Determine connection type
146 |                     if ":" in device_id:
147 |                         conn_type = ConnectionType.REMOTE
148 |                     elif "emulator" in device_id:
149 |                         conn_type = ConnectionType.USB  # Emulator via USB
150 |                     else:
151 |                         conn_type = ConnectionType.USB
152 | 
153 |                     # Parse additional info
154 |                     model = None
155 |                     for part in parts[2:]:
156 |                         if part.startswith("model:"):
157 |                             model = part.split(":", 1)[1]
158 |                             break
159 | 
160 |                     devices.append(
161 |                         DeviceInfo(
162 |                             device_id=device_id,
163 |                             status=status,
164 |                             connection_type=conn_type,
165 |                             model=model,
166 |                         )
167 |                     )
168 | 
169 |             return devices
170 | 
171 |         except Exception as e:
172 |             print(f"Error listing devices: {e}")
173 |             return []
174 | 
175 |     def get_device_info(self, device_id: str | None = None) -> DeviceInfo | None:
176 |         """
177 |         Get detailed information about a device.
178 | 
179 |         Args:
180 |             device_id: Device ID. If None, uses first available device.
181 | 
182 |         Returns:
183 |             DeviceInfo or None if not found.
184 |         """
185 |         devices = self.list_devices()
186 | 
187 |         if not devices:
188 |             return None
189 | 
190 |         if device_id is None:
191 |             return devices[0]
192 | 
193 |         for device in devices:
194 |             if device.device_id == device_id:
195 |                 return device
196 | 
197 |         return None
198 | 
199 |     def is_connected(self, device_id: str | None = None) -> bool:
200 |         """
201 |         Check if a device is connected.
202 | 
203 |         Args:
204 |             device_id: Device ID to check. If None, checks if any device is connected.
205 | 
206 |         Returns:
207 |             True if connected, False otherwise.
208 |         """
209 |         devices = self.list_devices()
210 | 
211 |         if not devices:
212 |             return False
213 | 
214 |         if device_id is None:
215 |             return any(d.status == "device" for d in devices)
216 | 
217 |         return any(d.device_id == device_id and d.status == "device" for d in devices)
218 | 
219 |     def enable_tcpip(
220 |         self, port: int = 5555, device_id: str | None = None
221 |     ) -> tuple[bool, str]:
222 |         """
223 |         Enable TCP/IP debugging on a USB-connected device.
224 | 
225 |         This allows subsequent wireless connections to the device.
226 | 
227 |         Args:
228 |             port: TCP port for ADB (default: 5555).
229 |             device_id: Device ID. If None, uses first available device.
230 | 
231 |         Returns:
232 |             Tuple of (success, message).
233 | 
234 |         Note:
235 |             The device must be connected via USB first.
236 |             After this, you can disconnect USB and connect via WiFi.
237 |         """
238 |         try:
239 |             cmd = [self.adb_path]
240 |             if device_id:
241 |                 cmd.extend(["-s", device_id])
242 |             cmd.extend(["tcpip", str(port)])
243 | 
244 |             result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10)
245 | 
246 |             output = result.stdout + result.stderr
247 | 
248 |             if "restarting" in output.lower() or result.returncode == 0:
249 |                 time.sleep(TIMING_CONFIG.connection.adb_restart_delay)
250 |                 return True, f"TCP/IP mode enabled on port {port}"
251 |             else:
252 |                 return False, output.strip()
253 | 
254 |         except Exception as e:
255 |             return False, f"Error enabling TCP/IP: {e}"
256 | 
257 |     def get_device_ip(self, device_id: str | None = None) -> str | None:
258 |         """
259 |         Get the IP address of a connected device.
260 | 
261 |         Args:
262 |             device_id: Device ID. If None, uses first available device.
263 | 
264 |         Returns:
265 |             IP address string or None if not found.
266 |         """
267 |         try:
268 |             cmd = [self.adb_path]
269 |             if device_id:
270 |                 cmd.extend(["-s", device_id])
271 |             cmd.extend(["shell", "ip", "route"])
272 | 
273 |             result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
274 | 
275 |             # Parse IP from route output
276 |             for line in result.stdout.split("\n"):
277 |                 if "src" in line:
278 |                     parts = line.split()
279 |                     for i, part in enumerate(parts):
280 |                         if part == "src" and i + 1 < len(parts):
281 |                             return parts[i + 1]
282 | 
283 |             # Alternative: try wlan0 interface
284 |             cmd[-1] = "ip addr show wlan0"
285 |             result = subprocess.run(
286 |                 cmd[:-1] + ["shell", "ip", "addr", "show", "wlan0"],
287 |                 capture_output=True,
288 |                 text=True,
289 |                 encoding="utf-8",
290 |                 timeout=5,
291 |             )
292 | 
293 |             for line in result.stdout.split("\n"):
294 |                 if "inet " in line:
295 |                     parts = line.strip().split()
296 |                     if len(parts) >= 2:
297 |                         return parts[1].split("/")[0]
298 | 
299 |             return None
300 | 
301 |         except Exception as e:
302 |             print(f"Error getting device IP: {e}")
303 |             return None
304 | 
305 |     def restart_server(self) -> tuple[bool, str]:
306 |         """
307 |         Restart the ADB server.
308 | 
309 |         Returns:
310 |             Tuple of (success, message).
311 |         """
312 |         try:
313 |             # Kill server
314 |             subprocess.run(
315 |                 [self.adb_path, "kill-server"], capture_output=True, timeout=5
316 |             )
317 | 
318 |             time.sleep(TIMING_CONFIG.connection.server_restart_delay)
319 | 
320 |             # Start server
321 |             subprocess.run(
322 |                 [self.adb_path, "start-server"], capture_output=True, timeout=5
323 |             )
324 | 
325 |             return True, "ADB server restarted"
326 | 
327 |         except Exception as e:
328 |             return False, f"Error restarting server: {e}"
329 | 
330 | 
def quick_connect(address: str) -> tuple[bool, str]:
    """
    Connect to a remote device using a throwaway ADBConnection.

    The connection manager is created with its default ADB path and the
    default connect timeout.

    Args:
        address: Device address (e.g., "192.168.1.100" or "192.168.1.100:5555").

    Returns:
        Tuple of (success, message).
    """
    return ADBConnection().connect(address)
343 | 
344 | 
def list_devices() -> list[DeviceInfo]:
    """
    List connected devices using a throwaway ADBConnection.

    The connection manager is created with its default ADB path.

    Returns:
        List of DeviceInfo objects.
    """
    return ADBConnection().list_devices()
354 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to the Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2025 Zhipu AI
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/phone_agent/hdc/connection.py:
--------------------------------------------------------------------------------
  1 | """HDC connection management for HarmonyOS devices."""
  2 | 
  3 | import os
  4 | import subprocess
  5 | import time
  6 | from dataclasses import dataclass
  7 | from enum import Enum
  8 | from typing import Optional
  9 | 
 10 | from phone_agent.config.timing import TIMING_CONFIG
 11 | 
 12 | 
 13 | # Global flag to control HDC command output
 14 | _HDC_VERBOSE = os.getenv("HDC_VERBOSE", "false").lower() in ("true", "1", "yes")
 15 | 
 16 | 
 17 | def _run_hdc_command(cmd: list, **kwargs) -> subprocess.CompletedProcess:
 18 |     """
 19 |     Run HDC command with optional verbose output.
 20 | 
 21 |     Args:
 22 |         cmd: Command list to execute.
 23 |         **kwargs: Additional arguments for subprocess.run.
 24 | 
 25 |     Returns:
 26 |         CompletedProcess result.
 27 |     """
 28 |     if _HDC_VERBOSE:
 29 |         print(f"[HDC] Running command: {' '.join(cmd)}")
 30 | 
 31 |     result = subprocess.run(cmd, **kwargs)
 32 | 
 33 |     if _HDC_VERBOSE and result.returncode != 0:
 34 |         print(f"[HDC] Command failed with return code {result.returncode}")
 35 |         if hasattr(result, 'stderr') and result.stderr:
 36 |             print(f"[HDC] Error: {result.stderr}")
 37 | 
 38 |     return result
 39 | 
 40 | 
def set_hdc_verbose(verbose: bool) -> None:
    """
    Set HDC verbose mode globally.

    Overwrites the module-level _HDC_VERBOSE flag, replacing whatever value
    was derived from the HDC_VERBOSE environment variable at import time.

    Args:
        verbose: New value for the flag; when true, _run_hdc_command prints
            each command it runs and any failure output.
    """
    global _HDC_VERBOSE
    _HDC_VERBOSE = verbose
 45 | 
 46 | 
class ConnectionType(Enum):
    """Type of HDC connection."""

    # Assigned by HDCConnection.list_devices when the device id has no ":".
    USB = "usb"
    # Not assigned by any code in this module.
    WIFI = "wifi"
    # Assigned by HDCConnection.list_devices when the device id contains ":"
    # (i.e. it looks like a host:port address).
    REMOTE = "remote"
 53 | 
 54 | 
@dataclass
class DeviceInfo:
    """Information about a connected device."""

    # Identifier as printed by "hdc list targets" (whitespace-stripped line).
    device_id: str
    # Device state string; HDCConnection.list_devices always sets "device".
    status: str
    # How the device is attached; derived from the shape of device_id.
    connection_type: ConnectionType
    # Device model name; HDCConnection.list_devices leaves this as None.
    model: str | None = None
    # HarmonyOS version; not populated by any code in this module.
    harmony_version: str | None = None
 64 | 
 65 | 
 66 | class HDCConnection:
 67 |     """
 68 |     Manages HDC connections to HarmonyOS devices.
 69 | 
 70 |     Supports USB, WiFi, and remote TCP/IP connections.
 71 | 
 72 |     Example:
 73 |         >>> conn = HDCConnection()
 74 |         >>> # Connect to remote device
 75 |         >>> conn.connect("192.168.1.100:5555")
 76 |         >>> # List devices
 77 |         >>> devices = conn.list_devices()
 78 |         >>> # Disconnect
 79 |         >>> conn.disconnect("192.168.1.100:5555")
 80 |     """
 81 | 
 82 |     def __init__(self, hdc_path: str = "hdc"):
 83 |         """
 84 |         Initialize HDC connection manager.
 85 | 
 86 |         Args:
 87 |             hdc_path: Path to HDC executable.
 88 |         """
 89 |         self.hdc_path = hdc_path
 90 | 
 91 |     def connect(self, address: str, timeout: int = 10) -> tuple[bool, str]:
 92 |         """
 93 |         Connect to a remote device via TCP/IP.
 94 | 
 95 |         Args:
 96 |             address: Device address in format "host:port" (e.g., "192.168.1.100:5555").
 97 |             timeout: Connection timeout in seconds.
 98 | 
 99 |         Returns:
100 |             Tuple of (success, message).
101 | 
102 |         Note:
103 |             The remote device must have TCP/IP debugging enabled.
104 |         """
105 |         # Validate address format
106 |         if ":" not in address:
107 |             address = f"{address}:5555"  # Default HDC port
108 | 
109 |         try:
110 |             result = _run_hdc_command(
111 |                 [self.hdc_path, "tconn", address],
112 |                 capture_output=True,
113 |                 text=True,
114 |                 timeout=timeout,
115 |             )
116 | 
117 |             output = result.stdout + result.stderr
118 | 
119 |             if "Connect OK" in output or "connected" in output.lower():
120 |                 return True, f"Connected to {address}"
121 |             elif "already connected" in output.lower():
122 |                 return True, f"Already connected to {address}"
123 |             else:
124 |                 return False, output.strip()
125 | 
126 |         except subprocess.TimeoutExpired:
127 |             return False, f"Connection timeout after {timeout}s"
128 |         except Exception as e:
129 |             return False, f"Connection error: {e}"
130 | 
131 |     def disconnect(self, address: str | None = None) -> tuple[bool, str]:
132 |         """
133 |         Disconnect from a remote device.
134 | 
135 |         Args:
136 |             address: Device address to disconnect. If None, disconnects all.
137 | 
138 |         Returns:
139 |             Tuple of (success, message).
140 |         """
141 |         try:
142 |             if address:
143 |                 cmd = [self.hdc_path, "tdisconn", address]
144 |             else:
145 |                 # HDC doesn't have a "disconnect all" command, so we need to list and disconnect each
146 |                 devices = self.list_devices()
147 |                 for device in devices:
148 |                     if ":" in device.device_id:  # Remote device
149 |                         _run_hdc_command(
150 |                             [self.hdc_path, "tdisconn", device.device_id],
151 |                             capture_output=True,
152 |                             text=True,
153 |                             timeout=5
154 |                         )
155 |                 return True, "Disconnected all remote devices"
156 | 
157 |             result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
158 | 
159 |             output = result.stdout + result.stderr
160 |             return True, output.strip() or "Disconnected"
161 | 
162 |         except Exception as e:
163 |             return False, f"Disconnect error: {e}"
164 | 
165 |     def list_devices(self) -> list[DeviceInfo]:
166 |         """
167 |         List all connected devices.
168 | 
169 |         Returns:
170 |             List of DeviceInfo objects.
171 |         """
172 |         try:
173 |             result = _run_hdc_command(
174 |                 [self.hdc_path, "list", "targets"],
175 |                 capture_output=True,
176 |                 text=True,
177 |                 timeout=5,
178 |             )
179 | 
180 |             devices = []
181 |             for line in result.stdout.strip().split("\n"):
182 |                 if not line.strip():
183 |                     continue
184 | 
185 |                 # HDC output format: device_id (status)
186 |                 # Example: "192.168.1.100:5555" or "FMR0223C13000649"
187 |                 device_id = line.strip()
188 | 
189 |                 # Determine connection type
190 |                 if ":" in device_id:
191 |                     conn_type = ConnectionType.REMOTE
192 |                 else:
193 |                     conn_type = ConnectionType.USB
194 | 
195 |                 # HDC doesn't provide detailed status in list command
196 |                 # We assume "Connected" status for devices that appear
197 |                 devices.append(
198 |                     DeviceInfo(
199 |                         device_id=device_id,
200 |                         status="device",
201 |                         connection_type=conn_type,
202 |                         model=None,
203 |                     )
204 |                 )
205 | 
206 |             return devices
207 | 
208 |         except Exception as e:
209 |             print(f"Error listing devices: {e}")
210 |             return []
211 | 
212 |     def get_device_info(self, device_id: str | None = None) -> DeviceInfo | None:
213 |         """
214 |         Get detailed information about a device.
215 | 
216 |         Args:
217 |             device_id: Device ID. If None, uses first available device.
218 | 
219 |         Returns:
220 |             DeviceInfo or None if not found.
221 |         """
222 |         devices = self.list_devices()
223 | 
224 |         if not devices:
225 |             return None
226 | 
227 |         if device_id is None:
228 |             return devices[0]
229 | 
230 |         for device in devices:
231 |             if device.device_id == device_id:
232 |                 return device
233 | 
234 |         return None
235 | 
236 |     def is_connected(self, device_id: str | None = None) -> bool:
237 |         """
238 |         Check if a device is connected.
239 | 
240 |         Args:
241 |             device_id: Device ID to check. If None, checks if any device is connected.
242 | 
243 |         Returns:
244 |             True if connected, False otherwise.
245 |         """
246 |         devices = self.list_devices()
247 | 
248 |         if not devices:
249 |             return False
250 | 
251 |         if device_id is None:
252 |             return len(devices) > 0
253 | 
254 |         return any(d.device_id == device_id for d in devices)
255 | 
256 |     def enable_tcpip(
257 |         self, port: int = 5555, device_id: str | None = None
258 |     ) -> tuple[bool, str]:
259 |         """
260 |         Enable TCP/IP debugging on a USB-connected device.
261 | 
262 |         This allows subsequent wireless connections to the device.
263 | 
264 |         Args:
265 |             port: TCP port for HDC (default: 5555).
266 |             device_id: Device ID. If None, uses first available device.
267 | 
268 |         Returns:
269 |             Tuple of (success, message).
270 | 
271 |         Note:
272 |             The device must be connected via USB first.
273 |             After this, you can disconnect USB and connect via WiFi.
274 |         """
275 |         try:
276 |             cmd = [self.hdc_path]
277 |             if device_id:
278 |                 cmd.extend(["-t", device_id])
279 |             cmd.extend(["tmode", "port", str(port)])
280 | 
281 |             result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10)
282 | 
283 |             output = result.stdout + result.stderr
284 | 
285 |             if result.returncode == 0 or "success" in output.lower():
286 |                 time.sleep(TIMING_CONFIG.connection.adb_restart_delay)
287 |                 return True, f"TCP/IP mode enabled on port {port}"
288 |             else:
289 |                 return False, output.strip()
290 | 
291 |         except Exception as e:
292 |             return False, f"Error enabling TCP/IP: {e}"
293 | 
294 |     def get_device_ip(self, device_id: str | None = None) -> str | None:
295 |         """
296 |         Get the IP address of a connected device.
297 | 
298 |         Args:
299 |             device_id: Device ID. If None, uses first available device.
300 | 
301 |         Returns:
302 |             IP address string or None if not found.
303 |         """
304 |         try:
305 |             cmd = [self.hdc_path]
306 |             if device_id:
307 |                 cmd.extend(["-t", device_id])
308 |             cmd.extend(["shell", "ifconfig"])
309 | 
310 |             result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
311 | 
312 |             # Parse IP from ifconfig output
313 |             for line in result.stdout.split("\n"):
314 |                 if "inet addr:" in line or "inet " in line:
315 |                     parts = line.strip().split()
316 |                     for i, part in enumerate(parts):
317 |                         if "addr:" in part:
318 |                             ip = part.split(":")[1]
319 |                             # Filter out localhost
320 |                             if not ip.startswith("127."):
321 |                                 return ip
322 |                         elif part == "inet" and i + 1 < len(parts):
323 |                             ip = parts[i + 1].split("/")[0]
324 |                             if not ip.startswith("127."):
325 |                                 return ip
326 | 
327 |             return None
328 | 
329 |         except Exception as e:
330 |             print(f"Error getting device IP: {e}")
331 |             return None
332 | 
333 |     def restart_server(self) -> tuple[bool, str]:
334 |         """
335 |         Restart the HDC server.
336 | 
337 |         Returns:
338 |             Tuple of (success, message).
339 |         """
340 |         try:
341 |             # Kill server
342 |             _run_hdc_command(
343 |                 [self.hdc_path, "kill"], capture_output=True, timeout=5
344 |             )
345 | 
346 |             time.sleep(TIMING_CONFIG.connection.server_restart_delay)
347 | 
348 |             # Start server (HDC auto-starts when running commands)
349 |             _run_hdc_command(
350 |                 [self.hdc_path, "start", "-r"], capture_output=True, timeout=5
351 |             )
352 | 
353 |             return True, "HDC server restarted"
354 | 
355 |         except Exception as e:
356 |             return False, f"Error restarting server: {e}"
357 | 
358 | 
def quick_connect(address: str) -> tuple[bool, str]:
    """
    Connect to a remote device using a throwaway HDCConnection.

    Args:
        address: Device address (e.g., "192.168.1.100" or "192.168.1.100:5555").

    Returns:
        Tuple of (success, message), exactly as HDCConnection.connect returns it.
    """
    return HDCConnection().connect(address)
371 | 
372 | 
def list_devices() -> list[DeviceInfo]:
    """
    List connected devices using a throwaway HDCConnection.

    Returns:
        List of DeviceInfo objects, exactly as HDCConnection.list_devices
        returns it.
    """
    return HDCConnection().list_devices()
382 | 


--------------------------------------------------------------------------------
/resources/privacy_policy_en.txt:
--------------------------------------------------------------------------------
  1 | Part I: Safety Description of Model/Technology
  2 | 
  3 | 1. AutoGLM Technical Mechanism and Deployment Flexibility
  4 | The core functionality of AutoGLM is automated operation execution. Its working principle is as follows:
  5 | - Instruction-Driven: Based on operation instructions issued by the user or developer.
  6 | - Screen Understanding: Captures the screen content of the current operating environment and sends the image to a large model (which can be deployed locally or in the cloud) for analysis and understanding.
  7 | - Operation Simulation: Simulates human interaction methods (such as clicking, swiping, inputting information, etc.) to complete tasks in the target environment.
  8 | - Example: When instructed to book a high-speed rail ticket, AutoGLM would open the relevant application, identify the interface content, and follow the instructions to select a train, complete the order, etc., similar to manual operation. The user or developer can terminate the task at any time.
  9 |   
 10 | Key Flexibility:
 11 | - Model Deployment: Developers can freely choose to deploy the AutoGLM model on local devices or on cloud servers.
 12 | - Operation Execution Environment: Automated operations can be executed on local devices or on cloud-based devices, as determined by the developer based on application scenarios and requirements.
 13 | - Data Flow: The data flow depends on the deployment choice:
 14 |   - Local Deployment (Model + Execution): Screen capture, model analysis, and operation execution are all completed on the local device. Data does not leave the device, offering the highest level of privacy.
 15 |   - Cloud Deployment (Model + Execution): Screen content needs to be transmitted from the operating environment (local or cloud device) to the cloud-based model. After analysis, the model returns instructions to the operating environment for execution. Developers must ensure the security of transmission and cloud processing.
 16 |   - Hybrid Deployment (e.g., Local Execution + Cloud Model): Screen content is captured locally, transmitted to the cloud model for analysis, and the analysis results are returned to the local environment for execution. Developers need to pay attention to data transmission security.
 17 |     
 18 | 2. System Permission Usage Description (For the Operation Execution Environment)
 19 | To ensure the normal execution of automated operations, the environment running AutoGLM operations may need to obtain the following permissions:
 20 | - ADB (Android Debug Bridge) Permissions: Used to obtain information and simulate user interaction operations such as clicking, swiping, and inputting.
 21 | - Storage Permissions: Used for temporary storage of necessary data, model files (if deployed locally), or logs.
 22 | - Network Permissions: Used to access online services (e.g., calling cloud models, accessing target application services).
 23 | - Other Specific Permissions: May be required for specific tasks (e.g., microphone for voice commands).
 24 |   
 25 | Developer Responsibilities:
 26 | - Principle of Least Privilege: Only request permissions absolutely necessary to complete a specific task.
 27 | - Transparent Disclosure: Clearly and explicitly inform end-users in the application or service about the purpose and necessity of each permission.
 28 | - User Authorization: Must obtain explicit authorization from the end-user before enabling relevant permissions and functionalities in the operating environment.
 29 | - Environment Adaptation: Ensure that the permission request and acquisition mechanisms are adapted to the chosen operation execution environment (local or cloud).
 30 |   
 31 | 3. Data Processing and Privacy Protection Principles
 32 | The AutoGLM open-source project itself does not collect user data. The responsibility for data processing and privacy protection lies with the developers who build specific applications or services based on AutoGLM. Their responsibilities vary depending on the deployment method:
 33 | - Local Deployment (Model + Execution):
 34 |   - Developers must implement secure local data storage and processing at the application level. All data processing (screen capture, model analysis, operation execution) is completed on the end-user's local device.
 35 |   - Developers should ensure their application does not actively upload sensitive data (such as screen content, operation logs) to the developer's servers or third parties, unless with the user's explicit, informed consent and for a necessary functionality.
 36 | - Cloud Deployment (Model and/or Execution):
 37 |   - Involves data transmission (screen content, operation instructions, model analysis results) between the operating environment and the cloud.
 38 |   - Developers must:
 39 |     - Implement strong encryption to protect all data in transit and at rest.
 40 |     - Clearly inform end-users about what data will be sent to the cloud, the purpose of transmission, storage location, and retention period, and obtain the end-user's explicit consent for data transmission and cloud processing.
 41 |     - Comply with applicable data protection regulations and provide a clear privacy policy explaining data processing practices.
 42 |     - Ensure secure configuration and access control for the cloud environment (model servers, operating environment servers).
 43 | - General Principles (All Deployment Methods):
 44 |   - Data Minimization: Collect and process only the minimum information absolutely necessary to complete the automated task.
 45 |   - Purpose Limitation: Use data solely for the specific purpose of the automated operation to fulfill the user's instruction.
 46 |   - Security Safeguards: Developers are responsible for taking reasonable technical and administrative measures to protect the security and confidentiality of all user data they process (whether locally or in the cloud), preventing unauthorized access, use, disclosure, or loss.
 47 |   - User Control: Provide mechanisms allowing end-users to view and manage (e.g., delete) data related to them (where technically feasible and consistent with the deployment method).
 48 |     
 49 | 
 50 | ---
 51 | 
 52 | Part II: Usage Norms Developers/Users Should Follow
 53 | Developers/users must always comply with applicable laws and regulations when using the AutoGLM open-source project.
 54 | 
 55 | 1. Critical Operation Confirmation Mechanism
 56 | Developers must design and implement explicit, mandatory user confirmation steps within their applications or services built on AutoGLM for the following 6+1 types of high-risk operations:
 57 | - Information Interaction and Content Dissemination: Including but not limited to sending messages, emails, posting comments, liking, sharing, etc.
 58 | - File Handling and Permission Management: Including but not limited to creating, editing, deleting, moving files or folders, enabling or disabling any permissions, etc.
 59 | - Transaction Orders and Disposal of Rights/Interests: Including but not limited to clearing shopping carts, submitting orders, modifying/adding shipping addresses, using coupons/points, etc.
 60 | - Fund Transfers and Payment Settlement: Including but not limited to transfers, payments, receiving funds, recharging, withdrawals, binding/unbinding payment methods, etc.
 61 | - Account Identity and Security Configuration: Including but not limited to changing passwords, setting/modifying security options, deleting accounts or linked accounts, deleting friends/contacts, deleting conversations/records, etc.
 62 | - Healthcare and Legal Compliance: Including but not limited to accessing, authorizing, or disposing of medical records/health data, purchasing medication, physical or psychological testing, signing electronic agreements, etc.
 63 | - Other High-Risk Operations: Any other operation that may significantly impact user data security, property security, account security, or reputation.
 64 |   
 65 | Requirements:
 66 | - The confirmation step must be triggered before operation execution, clearly displaying the details of the upcoming operation.
 67 | - Provide convenient cancel/termination mechanisms, allowing users to abort the task at any time before confirmation or during the operation process.
 68 | - Developer Responsibility: Developers shall bear corresponding responsibility for losses caused to users due to failure to implement an effective confirmation mechanism.
 69 | - User Responsibility: Users shall bear losses resulting from their failure to promptly terminate erroneous operations after confirmation.
 70 |   
 71 | 2. Obligations of Developers and Users
 72 | Developer Obligations:
 73 | - Transparent Disclosure: Clearly and accurately explain to end-users the functionality, working principles (especially the automated parts), data collection and processing methods (including whether the cloud is involved), potential risks, and how users can exercise control.
 74 | - Provide Monitoring and Control: Design a user interface that allows end-users to:
 75 |   - View or understand the current status and steps of automated operations in real-time.
 76 |   - Conveniently and quickly pause or terminate any ongoing automated task.
 77 |   - Manage permissions and settings for automated operations.
 78 | - Secure Development: Follow secure coding practices to ensure the security of the application/service itself and prevent malicious exploitation.
 79 | - Compliance: Ensure that the developed application/service complies with all applicable laws, regulations, industry standards, and third-party platform (e.g., the application being operated on) terms of service.
 80 | - Risk Warning: Clearly warn users in appropriate locations (e.g., feature entry points, first-time use, confirmation steps) about potential risks of using automation functions (such as misoperation, privacy risks, third-party platform policy risks).
 81 | - Avoid Critical Dependencies: Carefully evaluate and refrain from recommending AutoGLM for handling extremely critical, high-risk operations or those with severe consequences upon error (e.g., medical device control, critical infrastructure operations, large financial transactions without human review).
 82 |   
 83 | User Obligations:
 84 | - Understand Risks: Before using AutoGLM-based automation features, carefully read the developer's instructions, privacy policy, and risk warnings to fully understand their working principles and potential risks.
 85 | - Grant Permissions Cautiously: Only grant necessary permissions after fully trusting the application/service developer and understanding the authorization content.
 86 | - Active Monitoring: Maintain appropriate attention during the execution of automated tasks, especially for important operations. Utilize monitoring functions provided by the developer to understand operation progress.
 87 | - Timely Intervention: Immediately use the provided termination function to stop the task if any operation error, abnormality, or deviation from expectation is observed.
 88 | - Assume Responsibility: Bear responsibility for instructions issued, operations confirmed, and any losses resulting from failure to promptly monitor and stop erroneous operations.
 89 |   
 90 | 3. Developer and User Code of Conduct
 91 | It is strictly prohibited to use the AutoGLM open-source project or applications/services developed based on it to engage in the following behaviors:
 92 | 
 93 | (1) Bulk Automation and Malicious Competition
 94 | - Any form of falsified data manipulation: brushing orders, votes, likes, comments, traffic, followers, play counts, downloads, etc.
 95 | - Bulk account manipulation: bulk registration, bulk login, bulk operation of third-party platform accounts (group control, multi-instance, cloud control).
 96 | - Disrupting market order: malicious bulk purchasing, hoarding and profiteering, snatching limited resources, bulk claiming/abusing coupons/subsidies, maliciously occupying service resources ("薅羊毛").
 97 | - Manipulating platform rules: brushing rankings/search results, artificially influencing recommendation algorithms, artificially inflating/deflating content exposure.
 98 | - Creating false engagement: bulk publishing, reposting, liking, collecting, following, unfollowing, etc., on social media.
 99 | - Undermining game fairness: power-leveling services, studio operations, bulk farming of equipment/currency/experience/items.
100 | - Undermining fairness: bulk voting, ballot stuffing, manipulating online polls/survey results.
101 |   
102 | (2) False Information and Fraudulent Behavior
103 | - Creating misleading information: publishing/spreading false product/service reviews, false user feedback, false testimonials, false experiences.
104 | - Fabricating commercial data: creating false transaction records, sales figures, user engagement, positive review rates.
105 | - Identity fraud: impersonating others, fabricating personal information, stealing others' accounts/avatars/nicknames, forging identity documents.
106 | - False marketing: publishing false advertisements, conducting false promotions, exaggerating product efficacy, concealing product defects/risks.
107 | - Participating in fraudulent activities: online scams, false investments, pyramid schemes, illegal fundraising, fake prize wins, phishing, etc.
108 | - Spreading unverified information: creating or maliciously spreading fake news, rumors, unverified information.
109 |   
110 | (3) Harming Third-Party Services and System Security
111 | - Unauthorized access: using AutoGLM for data scraping (violating robots.txt or platform policies), information theft, API abuse, unauthorized penetration testing.
112 | - Technical sabotage: reverse engineering, cracking, modifying, injecting malicious code into third-party applications, disrupting their normal operation.
113 | - Resource abuse: maliciously occupying third-party server resources, sending spam requests, generating abnormal traffic, conducting DDoS attacks.
114 | - Violating platform rules: intentionally violating the user agreements, terms of service, or community rules of the third-party application being operated on.
115 | - Malicious competition: malicious negative reviews, false reporting, false complaints, commercial defamation.
116 | - Spreading harmful content: spreading computer viruses, trojans, malware, ransomware, spam, illegal content.
117 | - Infringing data rights: unauthorized large-scale commercial data collection, user information gathering, privacy snooping.
118 |   
119 | (4) Infringing on Others' Legitimate Rights and Interests
120 | - Account theft: stealing others' accounts, passwords, identity credentials for operations.
121 | - Online harassment and bullying: malicious harassment, threats, insults, defamation, doxxing others.
122 | - Privacy and secret infringement: unauthorized collection, use, or dissemination of others' personal information, private data, trade secrets.
123 | - Cybersquatting: registering others' trademarks, domain names, usernames, social media accounts, etc., in bad faith.
124 | - Harassment: malicious spamming, message bombing, forced following/subscription.
125 | - Harming commercial interests: industrial espionage, unfair competition, malicious poaching, theft of trade secrets.
126 |   
127 | (5) Resource Abuse and Damaging Project Ecosystem
128 | - Abusing registration resources: maliciously registering numerous accounts, fake registration.
129 | - Wasting computing/device resources: maliciously occupying local or cloud device resources, long-term idle occupancy, running high-energy-consumption programs unrelated to automated tasks (e.g., cryptocurrency mining).
130 | - Destabilizing systems: maliciously testing system performance, conducting unauthorized stress tests, frequently restarting services, exploiting technical vulnerabilities/defects for personal gain or to harm the project/platform.
131 | - Violating open-source licenses: violating the terms of the AutoGLM project's open-source license.
132 |   
133 | Consequences of Violation:
134 | If developers/users fail to follow the corresponding laws, regulations, policies, industry standards (including but not limited to technical specifications, security standards), and the project's agreements (including but not limited to open-source licenses, usage notes) during use, all resulting legal liabilities, economic losses, and any adverse consequences shall be solely and independently borne by the developers/users.


--------------------------------------------------------------------------------
/phone_agent/actions/handler.py:
--------------------------------------------------------------------------------
  1 | """Action handler for processing AI model outputs."""
  2 | 
  3 | import ast
  4 | import re
  5 | import subprocess
  6 | import time
  7 | from dataclasses import dataclass
  8 | from typing import Any, Callable
  9 | 
 10 | from phone_agent.config.timing import TIMING_CONFIG
 11 | from phone_agent.device_factory import get_device_factory
 12 | 
 13 | 
@dataclass
class ActionResult:
    """Result of an action execution."""

    # False when the action was unknown or its handler raised.
    success: bool
    # ActionHandler.execute sets this for "finish" actions and for
    # unrecognised action types; unknown action names and handler failures
    # leave it False.
    should_finish: bool
    # Optional detail: the finish message or a description of the error.
    message: str | None = None
    # NOTE(review): not set by the code visible here; presumably marks an
    # action that needs user confirmation. Confirm against the handlers.
    requires_confirmation: bool = False
 22 | 
 23 | 
 24 | class ActionHandler:
 25 |     """
 26 |     Handles execution of actions from AI model output.
 27 | 
 28 |     Args:
 29 |         device_id: Optional ADB device ID for multi-device setups.
 30 |         confirmation_callback: Optional callback for sensitive action confirmation.
 31 |             Should return True to proceed, False to cancel.
 32 |         takeover_callback: Optional callback for takeover requests (login, captcha).
 33 |     """
 34 | 
 35 |     def __init__(
 36 |         self,
 37 |         device_id: str | None = None,
 38 |         confirmation_callback: Callable[[str], bool] | None = None,
 39 |         takeover_callback: Callable[[str], None] | None = None,
 40 |     ):
 41 |         self.device_id = device_id
 42 |         self.confirmation_callback = confirmation_callback or self._default_confirmation
 43 |         self.takeover_callback = takeover_callback or self._default_takeover
 44 | 
 45 |     def execute(
 46 |         self, action: dict[str, Any], screen_width: int, screen_height: int
 47 |     ) -> ActionResult:
 48 |         """
 49 |         Execute an action from the AI model.
 50 | 
 51 |         Args:
 52 |             action: The action dictionary from the model.
 53 |             screen_width: Current screen width in pixels.
 54 |             screen_height: Current screen height in pixels.
 55 | 
 56 |         Returns:
 57 |             ActionResult indicating success and whether to finish.
 58 |         """
 59 |         action_type = action.get("_metadata")
 60 | 
 61 |         if action_type == "finish":
 62 |             return ActionResult(
 63 |                 success=True, should_finish=True, message=action.get("message")
 64 |             )
 65 | 
 66 |         if action_type != "do":
 67 |             return ActionResult(
 68 |                 success=False,
 69 |                 should_finish=True,
 70 |                 message=f"Unknown action type: {action_type}",
 71 |             )
 72 | 
 73 |         action_name = action.get("action")
 74 |         handler_method = self._get_handler(action_name)
 75 | 
 76 |         if handler_method is None:
 77 |             return ActionResult(
 78 |                 success=False,
 79 |                 should_finish=False,
 80 |                 message=f"Unknown action: {action_name}",
 81 |             )
 82 | 
 83 |         try:
 84 |             return handler_method(action, screen_width, screen_height)
 85 |         except Exception as e:
 86 |             return ActionResult(
 87 |                 success=False, should_finish=False, message=f"Action failed: {e}"
 88 |             )
 89 | 
 90 |     def _get_handler(self, action_name: str) -> Callable | None:
 91 |         """Get the handler method for an action."""
 92 |         handlers = {
 93 |             "Launch": self._handle_launch,
 94 |             "Tap": self._handle_tap,
 95 |             "Type": self._handle_type,
 96 |             "Type_Name": self._handle_type,
 97 |             "Swipe": self._handle_swipe,
 98 |             "Back": self._handle_back,
 99 |             "Home": self._handle_home,
100 |             "Double Tap": self._handle_double_tap,
101 |             "Long Press": self._handle_long_press,
102 |             "Wait": self._handle_wait,
103 |             "Take_over": self._handle_takeover,
104 |             "Note": self._handle_note,
105 |             "Call_API": self._handle_call_api,
106 |             "Interact": self._handle_interact,
107 |         }
108 |         return handlers.get(action_name)
109 | 
110 |     def _convert_relative_to_absolute(
111 |         self, element: list[int], screen_width: int, screen_height: int
112 |     ) -> tuple[int, int]:
113 |         """Convert relative coordinates (0-1000) to absolute pixels."""
114 |         x = int(element[0] / 1000 * screen_width)
115 |         y = int(element[1] / 1000 * screen_height)
116 |         return x, y
117 | 
118 |     def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
119 |         """Handle app launch action."""
120 |         app_name = action.get("app")
121 |         if not app_name:
122 |             return ActionResult(False, False, "No app name specified")
123 | 
124 |         device_factory = get_device_factory()
125 |         success = device_factory.launch_app(app_name, self.device_id)
126 |         if success:
127 |             return ActionResult(True, False)
128 |         return ActionResult(False, False, f"App not found: {app_name}")
129 | 
130 |     def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
131 |         """Handle tap action."""
132 |         element = action.get("element")
133 |         if not element:
134 |             return ActionResult(False, False, "No element coordinates")
135 | 
136 |         x, y = self._convert_relative_to_absolute(element, width, height)
137 | 
138 |         # Check for sensitive operation
139 |         if "message" in action:
140 |             if not self.confirmation_callback(action["message"]):
141 |                 return ActionResult(
142 |                     success=False,
143 |                     should_finish=True,
144 |                     message="User cancelled sensitive operation",
145 |                 )
146 | 
147 |         device_factory = get_device_factory()
148 |         device_factory.tap(x, y, self.device_id)
149 |         return ActionResult(True, False)
150 | 
151 |     def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
152 |         """Handle text input action."""
153 |         text = action.get("text", "")
154 | 
155 |         device_factory = get_device_factory()
156 | 
157 |         # Switch to ADB keyboard
158 |         original_ime = device_factory.detect_and_set_adb_keyboard(self.device_id)
159 |         time.sleep(TIMING_CONFIG.action.keyboard_switch_delay)
160 | 
161 |         # Clear existing text and type new text
162 |         device_factory.clear_text(self.device_id)
163 |         time.sleep(TIMING_CONFIG.action.text_clear_delay)
164 | 
165 |         # Handle multiline text by splitting on newlines
166 |         device_factory.type_text(text, self.device_id)
167 |         time.sleep(TIMING_CONFIG.action.text_input_delay)
168 | 
169 |         # Restore original keyboard
170 |         device_factory.restore_keyboard(original_ime, self.device_id)
171 |         time.sleep(TIMING_CONFIG.action.keyboard_restore_delay)
172 | 
173 |         return ActionResult(True, False)
174 | 
175 |     def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
176 |         """Handle swipe action."""
177 |         start = action.get("start")
178 |         end = action.get("end")
179 | 
180 |         if not start or not end:
181 |             return ActionResult(False, False, "Missing swipe coordinates")
182 | 
183 |         start_x, start_y = self._convert_relative_to_absolute(start, width, height)
184 |         end_x, end_y = self._convert_relative_to_absolute(end, width, height)
185 | 
186 |         device_factory = get_device_factory()
187 |         device_factory.swipe(start_x, start_y, end_x, end_y, device_id=self.device_id)
188 |         return ActionResult(True, False)
189 | 
190 |     def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
191 |         """Handle back button action."""
192 |         device_factory = get_device_factory()
193 |         device_factory.back(self.device_id)
194 |         return ActionResult(True, False)
195 | 
196 |     def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
197 |         """Handle home button action."""
198 |         device_factory = get_device_factory()
199 |         device_factory.home(self.device_id)
200 |         return ActionResult(True, False)
201 | 
202 |     def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
203 |         """Handle double tap action."""
204 |         element = action.get("element")
205 |         if not element:
206 |             return ActionResult(False, False, "No element coordinates")
207 | 
208 |         x, y = self._convert_relative_to_absolute(element, width, height)
209 |         device_factory = get_device_factory()
210 |         device_factory.double_tap(x, y, self.device_id)
211 |         return ActionResult(True, False)
212 | 
213 |     def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
214 |         """Handle long press action."""
215 |         element = action.get("element")
216 |         if not element:
217 |             return ActionResult(False, False, "No element coordinates")
218 | 
219 |         x, y = self._convert_relative_to_absolute(element, width, height)
220 |         device_factory = get_device_factory()
221 |         device_factory.long_press(x, y, device_id=self.device_id)
222 |         return ActionResult(True, False)
223 | 
224 |     def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
225 |         """Handle wait action."""
226 |         duration_str = action.get("duration", "1 seconds")
227 |         try:
228 |             duration = float(duration_str.replace("seconds", "").strip())
229 |         except ValueError:
230 |             duration = 1.0
231 | 
232 |         time.sleep(duration)
233 |         return ActionResult(True, False)
234 | 
235 |     def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
236 |         """Handle takeover request (login, captcha, etc.)."""
237 |         message = action.get("message", "User intervention required")
238 |         self.takeover_callback(message)
239 |         return ActionResult(True, False)
240 | 
241 |     def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
242 |         """Handle note action (placeholder for content recording)."""
243 |         # This action is typically used for recording page content
244 |         # Implementation depends on specific requirements
245 |         return ActionResult(True, False)
246 | 
247 |     def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
248 |         """Handle API call action (placeholder for summarization)."""
249 |         # This action is typically used for content summarization
250 |         # Implementation depends on specific requirements
251 |         return ActionResult(True, False)
252 | 
253 |     def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
254 |         """Handle interaction request (user choice needed)."""
255 |         # This action signals that user input is needed
256 |         return ActionResult(True, False, message="User interaction required")
257 | 
258 |     def _send_keyevent(self, keycode: str) -> None:
259 |         """Send a keyevent to the device."""
260 |         from phone_agent.device_factory import DeviceType, get_device_factory
261 |         from phone_agent.hdc.connection import _run_hdc_command
262 | 
263 |         device_factory = get_device_factory()
264 | 
265 |         # Handle HDC devices with HarmonyOS-specific keyEvent command
266 |         if device_factory.device_type == DeviceType.HDC:
267 |             hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"]
268 |             
269 |             # Map common keycodes to HarmonyOS keyEvent codes
270 |             # KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code)
271 |             if keycode == "KEYCODE_ENTER" or keycode == "66":
272 |                 _run_hdc_command(
273 |                     hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
274 |                     capture_output=True,
275 |                     text=True,
276 |                 )
277 |             else:
278 |                 # For other keys, try to use the numeric code directly
279 |                 # If keycode is a string like "KEYCODE_ENTER", convert it
280 |                 try:
281 |                     # Try to extract numeric code from string or use as-is
282 |                     if keycode.startswith("KEYCODE_"):
283 |                         # For now, only handle ENTER, other keys may need mapping
284 |                         if "ENTER" in keycode:
285 |                             _run_hdc_command(
286 |                                 hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
287 |                                 capture_output=True,
288 |                                 text=True,
289 |                             )
290 |                         else:
291 |                             # Fallback to ADB-style command for unsupported keys
292 |                             subprocess.run(
293 |                                 hdc_prefix + ["shell", "input", "keyevent", keycode],
294 |                                 capture_output=True,
295 |                                 text=True,
296 |                             )
297 |                     else:
298 |                         # Assume it's a numeric code
299 |                         _run_hdc_command(
300 |                             hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
301 |                             capture_output=True,
302 |                             text=True,
303 |                         )
304 |                 except Exception:
305 |                     # Fallback to ADB-style command
306 |                     subprocess.run(
307 |                         hdc_prefix + ["shell", "input", "keyevent", keycode],
308 |                         capture_output=True,
309 |                         text=True,
310 |                     )
311 |         else:
312 |             # ADB devices use standard input keyevent command
313 |             cmd_prefix = ["adb", "-s", self.device_id] if self.device_id else ["adb"]
314 |             subprocess.run(
315 |                 cmd_prefix + ["shell", "input", "keyevent", keycode],
316 |                 capture_output=True,
317 |                 text=True,
318 |             )
319 | 
320 |     @staticmethod
321 |     def _default_confirmation(message: str) -> bool:
322 |         """Default confirmation callback using console input."""
323 |         response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
324 |         return response.upper() == "Y"
325 | 
326 |     @staticmethod
327 |     def _default_takeover(message: str) -> None:
328 |         """Default takeover callback using console input."""
329 |         input(f"{message}\nPress Enter after completing manual operation...")
330 | 
331 | 
def parse_action(response: str) -> dict[str, Any]:
    """
    Parse action from model response.

    Three response shapes are recognised:

    * ``do(action="Type", text="...")`` / ``do(action="Type_Name", ...)``:
      the text is sliced out verbatim (everything after ``text=`` minus the
      opening quote and the trailing ``")``), so unescaped quotes inside the
      text are tolerated. Both variants are reported as action "Type".
    * any other ``do(...)`` call: keyword arguments are read from the AST and
      evaluated with ``ast.literal_eval`` (no arbitrary code is executed).
    * ``finish(message="...")``: the message is sliced out verbatim.

    Args:
        response: Raw response string from the model.

    Returns:
        Parsed action dictionary with a "_metadata" entry of "do" or "finish".

    Raises:
        ValueError: If the response cannot be parsed.
    """
    print(f"Parsing action: {response}")
    try:
        response = response.strip()

        if response.startswith(('do(action="Type"', 'do(action="Type_Name"')):
            text = response.split("text=", 1)[1][1:-2]
            return {"_metadata": "do", "action": "Type", "text": text}

        if response.startswith("do"):
            # Use AST parsing instead of eval for safety
            try:
                # Escape raw control characters so string literals that span
                # lines become valid Python source.
                escaped = (
                    response.replace("\n", "\\n")
                    .replace("\r", "\\r")
                    .replace("\t", "\\t")
                )

                tree = ast.parse(escaped, mode="eval")
                if not isinstance(tree.body, ast.Call):
                    raise ValueError("Expected a function call")

                # Extract keyword arguments safely
                action: dict[str, Any] = {"_metadata": "do"}
                for keyword in tree.body.keywords:
                    action[keyword.arg] = ast.literal_eval(keyword.value)
                return action
            except (SyntaxError, ValueError) as e:
                raise ValueError(f"Failed to parse do() action: {e}") from e

        if response.startswith("finish"):
            # Strip the call prefix once only: str.replace would also delete
            # any occurrence of the prefix inside the message itself.
            prefix = "finish(message="
            payload = response[len(prefix):] if response.startswith(prefix) else response
            # Drop the opening quote and the trailing quote + ")".
            return {"_metadata": "finish", "message": payload[1:-2]}

        raise ValueError(f"Failed to parse action: {response}")
    except Exception as e:
        raise ValueError(f"Failed to parse action: {e}") from e
388 | 
389 | 
def do(**kwargs) -> dict[str, Any]:
    """Build a 'do' action dictionary from the given keyword arguments."""
    return {**kwargs, "_metadata": "do"}
394 | 
395 | 
def finish(**kwargs) -> dict[str, Any]:
    """Build a 'finish' action dictionary from the given keyword arguments."""
    return {**kwargs, "_metadata": "finish"}
400 | 


--------------------------------------------------------------------------------