├── tests
│   ├── __init__.py
│   └── test_core.py
├── midscene
│   ├── cli
│   │   ├── __init__.py
│   │   ├── main.py
│   │   └── config.py
│   ├── android
│   │   ├── __init__.py
│   │   └── agent.py
│   ├── shared
│   │   ├── __init__.py
│   │   ├── logger.py
│   │   ├── cache.py
│   │   └── report.py
│   ├── web
│   │   ├── __init__.py
│   │   ├── bridge.py
│   │   └── playwright_page.py
│   ├── core
│   │   ├── ai_model
│   │   │   ├── __init__.py
│   │   │   ├── service.py
│   │   │   └── providers.py
│   │   ├── __init__.py
│   │   └── types.py
│   └── __init__.py
├── .env.example
├── .github
│   └── workflows
│       └── publish.yml
├── midscene.yml
├── LICENSE
├── scripts
│   ├── quick_validate.bat
│   └── validate_requirements.py
├── Makefile
├── .gitignore
├── wiki
│   ├── README.md
│   ├── 核心概念
│   │   ├── README.md
│   │   ├── Agent核心控制器.md
│   │   └── Insight-UI理解引擎.md
│   ├── 生成状态.md
│   ├── 项目概述.md
│   ├── 快速开始.md
│   ├── 安装配置.md
│   └── 平台集成
│       └── README.md
├── examples
│   └── basic_usage.py
├── pyproject.toml
├── README.zh.md
├── README.md
├── docs
│   └── quickstart.md
└── requirements.txt
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Test package for Midscene Python"""
--------------------------------------------------------------------------------
/midscene/cli/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | CLI module for Midscene Python
3 | """
4 |
5 | from .main import main, app
6 |
7 | __all__ = ["main", "app"]
--------------------------------------------------------------------------------
/midscene/android/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Android integration module for Midscene Python
3 | """
4 |
5 | from .device import AndroidDevice
6 | from .agent import AndroidAgent
7 |
8 | __all__ = [
9 | "AndroidDevice",
10 | "AndroidAgent",
11 | ]
--------------------------------------------------------------------------------
/midscene/shared/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Shared utilities and tools for Midscene Python
3 | """
4 |
5 | from .cache import TaskCache
6 | from .logger import setup_logger
7 | from .report import ReportGenerator
8 |
9 | __all__ = [
10 | "TaskCache",
11 | "setup_logger",
12 | "ReportGenerator",
13 | ]
--------------------------------------------------------------------------------
/midscene/web/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Web integration module for Midscene Python
3 | """
4 |
5 | from .selenium_page import SeleniumWebPage
6 | from .playwright_page import PlaywrightWebPage
7 | from .bridge import BridgeWebPage
8 |
9 | __all__ = [
10 | "SeleniumWebPage",
11 | "PlaywrightWebPage",
12 | "BridgeWebPage",
13 | ]
--------------------------------------------------------------------------------
/midscene/core/ai_model/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | AI model integration module
3 | """
4 |
5 | from .service import AIModelService, AIModelConfig
6 | from .providers import OpenAIProvider, AnthropicProvider, QwenProvider, GeminiProvider
7 |
8 | __all__ = [
9 | "AIModelService",
10 | "AIModelConfig",
11 | "OpenAIProvider",
12 | "AnthropicProvider",
13 | "QwenProvider",
14 | "GeminiProvider",
15 | ]
--------------------------------------------------------------------------------
/midscene/core/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Core module for Midscene Python
3 | """
4 |
5 | from .agent import Agent
6 | from .insight import Insight
7 | from .types import *
8 |
9 | __all__ = [
10 | "Agent",
11 | "Insight",
12 | "UIContext",
13 | "LocateResult",
14 | "ExecutionResult",
15 | "BaseElement",
16 | "AbstractInterface",
17 | "InterfaceType",
18 | "AgentOptions",
19 | "LocateOption",
20 | "ExtractOption",
21 | "ScrollParam",
22 | ]
--------------------------------------------------------------------------------
/midscene/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Midscene Python - AI-powered automation framework
3 |
4 | A Python implementation of Midscene, providing AI-driven automation
5 | capabilities for Web and Android platforms.
6 | """
7 |
8 | from .core.agent import Agent
9 | from .core.insight import Insight
10 | from .core.types import UIContext, LocateResult, ExecutionResult
11 |
12 | __version__ = "0.1.0"
13 |
14 | __all__ = [
15 | "Agent",
16 | "Insight",
17 | "UIContext",
18 | "LocateResult",
19 | "ExecutionResult",
20 | ]
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Environment Variables for Midscene Python
2 |
3 | # AI Model Configuration
4 | MIDSCENE_AI_PROVIDER=openai
5 | MIDSCENE_AI_MODEL=gpt-4-vision-preview
6 | MIDSCENE_AI_API_KEY=your-api-key-here
7 | # MIDSCENE_AI_BASE_URL=https://api.openai.com
8 |
9 | # Execution Settings
10 | MIDSCENE_CONCURRENT=1
11 | MIDSCENE_CONTINUE_ON_ERROR=false
12 | MIDSCENE_GENERATE_REPORT=true
13 |
14 | # Logging
15 | MIDSCENE_LOG_LEVEL=INFO
16 | MIDSCENE_LOG_FILE=midscene.log
17 |
18 | # Development Settings
19 | MIDSCENE_DEBUG=false
20 | MIDSCENE_CACHE_ENABLED=true
--------------------------------------------------------------------------------
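These variables are read at runtime via `os.getenv` (see `_get_default_config` in `midscene/core/ai_model/service.py`). A minimal sketch of loading the file into the process environment, assuming the optional `python-dotenv` package, which is not declared in `pyproject.toml`:

```python
# Sketch: load .env into os.environ before using midscene.
# Assumes python-dotenv is installed; it is NOT a declared dependency of this project.
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv(".env")  # copies KEY=VALUE pairs from .env into os.environ

# The AI service later reads its configuration from these variables.
print(os.getenv("MIDSCENE_AI_PROVIDER"))  # e.g. "openai"
print(os.getenv("MIDSCENE_AI_MODEL"))     # e.g. "gpt-4-vision-preview"
```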
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*' # 当推送以 'v' 开头的标签时触发
7 |
8 | jobs:
9 | build-and-publish:
10 | runs-on: windows-latest # 使用 Windows 环境
11 | steps:
12 | - name: Checkout code
13 | uses: actions/checkout@v4
14 |
15 | - name: Set up Python
16 | uses: actions/setup-python@v4
17 | with:
18 | python-version: '3.x'
19 |
20 | - name: Install uv
21 | run: |
22 |           powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
23 | echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
24 |
25 | - name: Build package
26 | run: uv build
27 |
28 | - name: Publish to PyPI
29 | env:
30 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }} # 使用 UV_PUBLISH_TOKEN 替代 TWINE_PASSWORD
31 |         run: uv publish
32 |
--------------------------------------------------------------------------------
/midscene.yml:
--------------------------------------------------------------------------------
1 | # Midscene Python Configuration
2 |
3 | # AI Model Configuration
4 | ai:
5 | provider: "openai" # openai, anthropic, qwen, gemini
6 | model: "gpt-4-vision-preview"
7 | api_key: "${MIDSCENE_AI_API_KEY}" # Set via environment variable
8 | base_url: null # Custom API endpoint if needed
9 | max_tokens: 4000
10 | temperature: 0.1
11 |
12 | # Web Automation Configuration
13 | web:
14 | browser: "chrome" # chrome, firefox, safari
15 | headless: false
16 | window_size: [1920, 1080]
17 | user_data_dir: null # Browser profile directory
18 | timeout: 30
19 |
20 | # Android Automation Configuration
21 | android:
22 | device_id: null # Auto-detect if null
23 | adb_path: "adb"
24 | auto_dismiss_keyboard: true
25 | timeout: 30
26 |
27 | # Execution Configuration
28 | execution:
29 | concurrent: 1 # Number of concurrent script executions
30 | continue_on_error: false # Continue executing scripts on error
31 | generate_report: true
32 | report_format: "html" # html, json, xml
33 | output_dir: "./reports"
--------------------------------------------------------------------------------
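The `api_key` field uses a `${MIDSCENE_AI_API_KEY}` placeholder, but `CLIConfig.load()` (defined later in `midscene/cli/config.py`) parses this file with plain `yaml.safe_load`, which does not expand environment variables. A minimal sketch of expanding the placeholder first, using only the standard library; the expansion step is an assumption, not part of the shipped loader:

```python
# Sketch: expand ${VAR} placeholders in midscene.yml before parsing it.
# The substitution step is an assumption; CLIConfig.load() itself does not do this.
import os

import yaml

with open("midscene.yml", encoding="utf-8") as f:
    raw = f.read()

expanded = os.path.expandvars(raw)      # replaces ${MIDSCENE_AI_API_KEY} with its value
config_data = yaml.safe_load(expanded)

print(config_data["ai"]["provider"])    # "openai"
```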
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Python51888
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/midscene/cli/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Midscene CLI - Command line interface for automation scripts
3 | """
4 |
5 | import sys
6 | from typing import Optional
7 |
8 | import typer
9 | from rich.console import Console
10 |
11 | from .config import CLIConfig
12 |
13 | app = typer.Typer(
14 | name="midscene",
15 | help="AI-powered automation framework for Web and Android platforms",
16 | no_args_is_help=True
17 | )
18 |
19 | console = Console()
20 |
21 |
22 | @app.command()
23 | def run(
24 | script_path: str = typer.Argument(..., help="Path to YAML script file or directory"),
25 | config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Configuration file path"),
26 | headless: bool = typer.Option(False, "--headless", help="Run browser in headless mode"),
27 | device_id: Optional[str] = typer.Option(None, "--device", "-d", help="Android device ID"),
28 | verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
29 | ):
30 | """Run automation script(s)"""
31 |
32 | console.print(f"[yellow]Script execution not yet implemented: {script_path}[/yellow]")
33 | console.print("[blue]This is a placeholder CLI implementation[/blue]")
34 |
35 |
36 | @app.command()
37 | def version():
38 | """Show version information"""
39 |
40 | try:
41 | console.print("Midscene Python v0.1.0")
42 |
43 | except Exception as e:
44 | console.print(f"❌ Error getting version: {e}", style="red")
45 |
46 |
47 | def main():
48 | """CLI entry point"""
49 | app()
50 |
51 |
52 | if __name__ == "__main__":
53 | main()
--------------------------------------------------------------------------------
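The Typer app above can be exercised in-process with Typer's test runner, which avoids spawning a subprocess; a minimal sketch suitable for a unit test:

```python
# Sketch: invoke the CLI in-process with Typer's CliRunner.
from typer.testing import CliRunner

from midscene.cli import app

runner = CliRunner()

result = runner.invoke(app, ["version"])
assert result.exit_code == 0

result = runner.invoke(app, ["run", "flows/login.yml"])  # placeholder path
assert result.exit_code == 0  # the run command is still a stub and only prints a notice
```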
/scripts/quick_validate.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001 > nul
3 | echo === Midscene Python Dependencies Quick Validation ===
4 | echo.
5 |
6 | REM Check if requirements.txt exists
7 | if not exist "requirements.txt" (
8 | echo Error: requirements.txt file not found
9 | echo Please run: make requirements-freeze
10 | exit /b 1
11 | )
12 |
13 | echo 1. Checking requirements.txt file...
14 | echo Success: requirements.txt exists
15 |
16 | REM Count dependencies
17 | for /f %%i in ('findstr /v "^#" requirements.txt ^| findstr /v "^$" ^| find /c "=="') do set count=%%i
18 | echo Success: Found %count% dependency packages
19 |
20 | echo.
21 | echo 2. Validating key dependencies...
22 |
23 | REM Check core dependencies
24 | findstr /i "pydantic==" requirements.txt >nul 2>&1
25 | if %errorlevel% equ 0 (echo Success: pydantic) else (echo Error: pydantic & set error=1)
26 |
27 | findstr /i "selenium==" requirements.txt >nul 2>&1
28 | if %errorlevel% equ 0 (echo Success: selenium) else (echo Error: selenium & set error=1)
29 |
30 | findstr /i "playwright==" requirements.txt >nul 2>&1
31 | if %errorlevel% equ 0 (echo Success: playwright) else (echo Error: playwright & set error=1)
32 |
33 | REM Check development dependencies
34 | findstr /i "pytest==" requirements.txt >nul 2>&1
35 | if %errorlevel% equ 0 (echo Success: pytest) else (echo Error: pytest & set error=1)
36 |
37 | findstr /i "black==" requirements.txt >nul 2>&1
38 | if %errorlevel% equ 0 (echo Success: black) else (echo Error: black & set error=1)
39 |
40 | REM Check documentation dependencies
41 | findstr /i "mkdocs==" requirements.txt >nul 2>&1
42 | if %errorlevel% equ 0 (echo Success: mkdocs) else (echo Error: mkdocs & set error=1)
43 |
44 | echo.
45 | if defined error (
46 | echo Validation FAILED: Missing key dependencies
47 | exit /b 1
48 | ) else (
49 | echo Validation PASSED!
50 | echo requirements.txt contains all key dependencies
51 | )
--------------------------------------------------------------------------------
/midscene/shared/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | Logging configuration for Midscene Python
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 | from typing import Optional
8 |
9 | from loguru import logger
10 |
11 |
12 | def setup_logger(
13 | level: str = "INFO",
14 | log_file: Optional[str] = None,
15 | rotation: str = "10 MB",
16 | retention: str = "7 days",
17 | format_string: Optional[str] = None
18 | ) -> None:
19 | """Setup logging configuration
20 |
21 | Args:
22 | level: Log level (DEBUG, INFO, WARNING, ERROR)
23 | log_file: Log file path
24 | rotation: Log rotation size/time
25 | retention: Log retention period
26 | format_string: Custom format string
27 | """
28 | # Remove default logger
29 | logger.remove()
30 |
31 | # Default format
32 | if not format_string:
33 | format_string = (
34 | "{time:YYYY-MM-DD HH:mm:ss.SSS} | "
35 | "{level: <8} | "
36 | "{name}:{function}:{line} | "
37 | "{message}"
38 | )
39 |
40 | # Add console handler
41 | logger.add(
42 | sys.stderr,
43 | level=level,
44 | format=format_string,
45 | colorize=True,
46 | backtrace=True,
47 | diagnose=True
48 | )
49 |
50 | # Add file handler if specified
51 | if log_file:
52 | log_path = Path(log_file)
53 | log_path.parent.mkdir(parents=True, exist_ok=True)
54 |
55 | logger.add(
56 | log_path,
57 | level=level,
58 | format=format_string,
59 | rotation=rotation,
60 | retention=retention,
61 | backtrace=True,
62 | diagnose=True
63 | )
64 |
65 | logger.info(f"Logger configured with level: {level}")
66 |
67 |
68 | def get_logger(name: str):
69 | """Get logger instance
70 |
71 | Args:
72 | name: Logger name
73 |
74 | Returns:
75 | Logger instance
76 | """
77 | return logger.bind(name=name)
--------------------------------------------------------------------------------
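A short usage sketch of the two helpers above; the log path and logger name are illustrative:

```python
# Sketch: typical use of setup_logger/get_logger from midscene.shared.
from midscene.shared import setup_logger
from midscene.shared.logger import get_logger

# Configure once at program start: console output plus a rotating file.
setup_logger(level="DEBUG", log_file="logs/midscene.log", rotation="5 MB")

log = get_logger("my_script")  # bound logger; "my_script" is an arbitrary name
log.info("logger ready")
log.debug("this also reaches logs/midscene.log")
```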
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help install install-dev dev test test-unit test-integration lint format clean build docs docs-serve release release-test requirements-freeze requirements-check requirements-quick-check
2 |
3 | # Default target
4 | help:
5 | @echo "Available commands:"
6 | @echo " install Install package and dependencies"
7 | @echo " dev Install development dependencies"
8 | @echo " requirements-freeze Generate complete requirements.txt"
9 | @echo " requirements-check Verify dependencies integrity"
10 | @echo " requirements-quick-check Quick requirements validation"
11 | @echo " test Run tests"
12 | @echo " lint Run linting"
13 | @echo " format Format code"
14 | @echo " clean Clean build artifacts"
15 | @echo " build Build package"
16 | @echo " docs Build documentation"
17 |
18 | # Generate complete requirements.txt with all dependencies
19 | requirements-freeze:
20 | uv pip compile --all-extras pyproject.toml -o requirements.txt
21 |
22 | # Verify dependencies integrity
23 | requirements-check:
24 | uv pip check
25 | @python scripts/validate_requirements.py
26 |
27 | # Quick requirements validation
28 | requirements-quick-check:
29 | @scripts/quick_validate.bat
30 |
31 | # Install package from requirements.txt
32 | install:
33 | pip install -r requirements.txt
34 |
35 | # Install package in development mode
36 | install-dev:
37 | pip install -e ".[dev,docs]"
38 | pre-commit install
39 |
40 | # Install development dependencies (alias for backward compatibility)
41 | dev: install-dev
42 |
43 | # Run tests
44 | test:
45 | pytest tests/ -v --cov=midscene --cov-report=html --cov-report=term-missing
46 |
47 | # Run tests with specific markers
48 | test-unit:
49 | pytest tests/ -v -m "unit"
50 |
51 | test-integration:
52 | pytest tests/ -v -m "integration"
53 |
54 | # Linting
55 | lint:
56 | ruff check midscene/ tests/
57 | mypy midscene/
58 |
59 | # Format code
60 | format:
61 | black midscene/ tests/ examples/
62 | isort midscene/ tests/ examples/
63 | ruff check --fix midscene/ tests/
64 |
65 | # Clean build artifacts
66 | clean:
67 | rm -rf build/
68 | rm -rf dist/
69 | rm -rf *.egg-info/
70 | rm -rf .pytest_cache/
71 | rm -rf .coverage
72 | rm -rf htmlcov/
73 | find . -type d -name __pycache__ -delete
74 | find . -type f -name "*.pyc" -delete
75 |
76 | # Build package
77 | build: clean
78 | python -m build
79 |
80 | # Build documentation
81 | docs:
82 | mkdocs build
83 |
84 | # Serve documentation locally
85 | docs-serve:
86 | mkdocs serve
87 |
88 | # Release to PyPI
89 | release: build
90 | twine upload dist/*
91 |
92 | # Release to Test PyPI
93 | release-test: build
94 | twine upload --repository testpypi dist/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | Pipfile.lock
88 |
89 | # PEP 582
90 | __pypackages__/
91 |
92 | # Celery stuff
93 | celerybeat-schedule
94 | celerybeat.pid
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
126 | # Midscene specific
127 | reports/
128 | .midscene/
129 | *.log
130 |
131 | # IDE
132 | .vscode/
133 | .idea/
134 | *.swp
135 | *.swo
136 |
137 | # OS
138 | .DS_Store
139 | Thumbs.db
141 | # Build and Release Folders
142 | bin-debug/
143 | bin-release/
144 | [Oo]bj/
145 | [Bb]in/
146 |
147 | # Other files and folders
148 | .settings/
149 |
150 | # Executables
151 | *.swf
152 | *.air
153 | *.ipa
154 | *.apk
155 |
156 | # Project files, i.e. `.project`, `.actionScriptProperties` and `.flexProperties`
157 | # should NOT be excluded as they contain compiler settings and other important
158 | # information for Eclipse / Flash Builder.
160 |
161 | # YoYo AI version control directory
162 | .yoyo/
163 |
--------------------------------------------------------------------------------
/wiki/README.md:
--------------------------------------------------------------------------------
1 | # Midscene Python Wiki
2 |
3 | 欢迎来到 Midscene Python 的完整文档!这里提供了详细的使用指南、API 参考和最佳实践。
4 |
5 | ## 📚 文档导航
6 |
7 | ### 基础入门
8 | - [项目概述](项目概述.md) - 了解 Midscene Python 的核心理念和特性
9 | - [快速开始](快速开始.md) - 5分钟快速上手指南
10 | - [安装配置](安装配置.md) - 详细的安装和环境配置说明
11 |
12 | ### 核心概念
13 | - [Agent 核心控制器](核心概念/Agent核心控制器.md) - 理解 Agent 的工作原理
14 | - [Insight UI理解引擎](核心概念/Insight-UI理解引擎.md) - AI 驱动的 UI 理解和操作
15 | - [AI模型服务抽象层](核心概念/AI模型服务抽象层.md) - 多种 AI 模型的统一接口
16 | - [UI上下文与数据模型](核心概念/UI上下文与数据模型.md) - 理解数据流和上下文管理
17 |
18 | ### API 参考
19 | - [Agent API](API参考/Agent-API.md) - Agent 类的完整 API 文档
20 | - [Insight API](API参考/Insight-API.md) - Insight 引擎的 API 参考
21 | - [AIModelService API](API参考/AIModelService-API.md) - AI 模型服务的接口说明
22 |
23 | ### 平台集成
24 | - [Web自动化](平台集成/Web自动化/README.md) - Web 平台自动化完整指南
25 | - [Selenium集成](平台集成/Web自动化/Selenium集成.md) - Selenium WebDriver 集成
26 | - [Playwright集成](平台集成/Web自动化/Playwright集成.md) - Playwright 集成指南
27 | - [Web桥接机制](平台集成/Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象层
28 | - [Android自动化](平台集成/Android自动化.md) - Android 设备自动化指南
29 |
30 | ### AI 模型配置
31 | - [配置方法](AI模型配置/配置方法.md) - AI 模型的基础配置
32 | - [支持的AI提供商](AI模型配置/支持的AI提供商/README.md) - 所有支持的 AI 服务商
33 | - [OpenAI提供商](AI模型配置/支持的AI提供商/OpenAI提供商.md) - GPT-4V 等模型配置
34 | - [Anthropic提供商](AI模型配置/支持的AI提供商/Anthropic提供商.md) - Claude 模型配置
35 | - [通义千问提供商](AI模型配置/支持的AI提供商/通义千问提供商.md) - Qwen2.5-VL 模型配置
36 | - [Gemini提供商](AI模型配置/支持的AI提供商/Gemini提供商.md) - Google Gemini 模型配置
37 | - [高级选项](AI模型配置/高级选项/README.md) - 高级配置和优化
38 | - [缓存策略](AI模型配置/高级选项/缓存策略.md) - 智能缓存机制
39 | - [请求重试与超时控制](AI模型配置/高级选项/请求重试与超时控制.md) - 网络请求优化
40 | - [配额管理与节流控制](AI模型配置/高级选项/配额管理与节流控制.md) - 成本控制和速率限制
41 | - [流式响应处理](AI模型配置/高级选项/流式响应处理.md) - 实时响应处理
42 | - [性能调优技巧](AI模型配置/高级选项/性能调优技巧.md) - 性能优化最佳实践
43 |
44 | ### 高级特性
45 | - [智能缓存机制](高级特性/智能缓存机制.md) - 提升执行效率的缓存系统
46 | - [可视化报告系统](高级特性/可视化报告系统.md) - 详细的执行报告和调试信息
47 | - [CLI工具高级用法](高级特性/CLI工具高级用法.md) - 命令行工具的进阶使用
48 |
49 | ### 开发指南
50 | - [贡献指南](开发指南/贡献指南.md) - 如何参与项目开发
51 | - [架构设计](开发指南/架构设计.md) - 深入理解项目架构
52 | - [开发环境配置](开发指南/开发环境配置.md) - 搭建开发环境
53 | - [测试指南](开发指南/测试指南.md) - 单元测试和集成测试
54 |
55 | ### 故障排除
56 | - [常见问题](故障排除/常见问题.md) - FAQ 和解决方案
57 | - [调试技巧](故障排除/调试技巧.md) - 调试和问题定位方法
58 | - [错误代码参考](故障排除/错误代码参考.md) - 错误代码含义和解决方案
59 |
60 | ### 示例和教程
61 | - [基础示例](示例和教程/基础示例.md) - 入门级使用示例
62 | - [高级应用场景](示例和教程/高级应用场景.md) - 复杂场景的实现方案
63 | - [最佳实践](示例和教程/最佳实践.md) - 生产环境使用建议
64 |
65 | ## 🚀 快速链接
66 |
67 | - **新手入门**: [快速开始](快速开始.md) → [基础示例](示例和教程/基础示例.md)
68 | - **API 查询**: [Agent API](API参考/Agent-API.md) → [Insight API](API参考/Insight-API.md)
69 | - **平台集成**: [Web自动化](平台集成/Web自动化/README.md) → [Android自动化](平台集成/Android自动化.md)
70 | - **问题解决**: [常见问题](故障排除/常见问题.md) → [调试技巧](故障排除/调试技巧.md)
71 |
72 | ## 📖 文档维护
73 |
74 | 本文档随项目持续更新,如发现内容错误或需要补充,请提交 Issue 或 Pull Request。
75 |
76 | ---
77 |
78 | *最后更新: 2025-09-02*
--------------------------------------------------------------------------------
/wiki/核心概念/README.md:
--------------------------------------------------------------------------------
1 | # 核心概念
2 |
3 | Midscene Python 的核心概念文档,深入解析框架的关键组件和设计理念。
4 |
5 | ## 📖 目录概览
6 |
7 | 本章节包含以下核心概念文档:
8 |
9 | ### [Agent 核心控制器](Agent核心控制器.md)
10 | Agent 是 Midscene Python 的核心控制器,提供统一的自动化操作接口。了解 Agent 的工作原理、生命周期管理和高级配置。
11 |
12 | **主要内容**:
13 | - Agent 架构设计
14 | - 操作类型和方法
15 | - 选项配置和自定义
16 | - 生命周期管理
17 |
18 | ### [Insight UI理解引擎](Insight-UI理解引擎.md)
19 | Insight 是 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。深入理解 AI 如何理解和操作界面。
20 |
21 | **主要内容**:
22 | - UI 理解机制
23 | - 智能元素定位
24 | - 操作策略生成
25 | - 上下文分析
26 |
27 | ### [AI模型服务抽象层](AI模型服务抽象层.md)
28 | 统一的 AI 模型服务接口,支持多种 AI 提供商。了解如何配置和切换不同的 AI 模型。
29 |
30 | **主要内容**:
31 | - 服务抽象设计
32 | - 提供商适配
33 | - 模型选择策略
34 | - 性能优化
35 |
36 | ### [UI上下文与数据模型](UI上下文与数据模型.md)
37 | 理解 Midscene Python 中的数据流、上下文管理和类型系统。
38 |
39 | **主要内容**:
40 | - 数据模型定义
41 | - 上下文传递机制
42 | - 类型安全保证
43 | - 序列化和反序列化
44 |
45 | ## 🏗️ 整体架构关系
46 |
47 | ```mermaid
48 | graph TB
49 | A[用户代码] --> B[Agent 核心控制器]
50 | B --> C[Insight UI理解引擎]
51 | C --> D[AI模型服务抽象层]
52 | C --> E[UI上下文与数据模型]
53 | E --> F[平台适配层]
54 | F --> G[底层驱动]
55 |
56 | subgraph "核心概念"
57 | B
58 | C
59 | D
60 | E
61 | end
62 |
63 | subgraph "平台支持"
64 | F
65 | G
66 | end
67 | ```
68 |
69 | ## 🔄 数据流向
70 |
71 | 1. **用户请求** → Agent 接收自然语言指令
72 | 2. **指令解析** → Insight 分析指令意图和页面状态
73 | 3. **AI 推理** → AIModelService 调用 AI 模型进行决策
74 | 4. **上下文构建** → UIContext 封装页面信息和操作结果
75 | 5. **操作执行** → 通过平台适配层执行具体操作
76 | 6. **结果反馈** → 返回执行结果和状态信息
77 |
78 | ## 🎯 设计原则
79 |
80 | ### 1. 抽象化原则
81 | - 隐藏复杂的底层实现细节
82 | - 提供统一的高级接口
83 | - 支持多平台一致性操作
84 |
85 | ### 2. 可扩展原则
86 | - 模块化设计支持功能扩展
87 | - 插件化架构支持第三方集成
88 | - 开放的 API 设计
89 |
90 | ### 3. 智能化原则
91 | - AI 驱动的决策制定
92 | - 自适应的操作策略
93 | - 智能的错误处理和恢复
94 |
95 | ### 4. 类型安全原则
96 | - 完整的类型注解
97 | - 运行时类型验证
98 | - 强类型的数据模型
99 |
100 | ## 🧩 组件交互
101 |
102 | ### Agent ↔ Insight
103 | - Agent 委托 Insight 进行 AI 推理
104 | - Insight 返回操作计划和执行结果
105 | - 双向的状态同步和错误处理
106 |
107 | ### Insight ↔ AIModelService
108 | - Insight 构建 AI 模型请求
109 | - AIModelService 管理模型调用和响应
110 | - 支持多种模型的统一接口
111 |
112 | ### 所有组件 ↔ UIContext
113 | - 统一的数据模型和上下文管理
114 | - 类型安全的数据传递
115 | - 序列化和持久化支持
116 |
117 | ## 📚 学习路径
118 |
119 | ### 初学者路径
120 | 1. 开始阅读 [Agent 核心控制器](Agent核心控制器.md)
121 | 2. 理解 [UI上下文与数据模型](UI上下文与数据模型.md)
122 | 3. 深入 [Insight UI理解引擎](Insight-UI理解引擎.md)
123 | 4. 最后学习 [AI模型服务抽象层](AI模型服务抽象层.md)
124 |
125 | ### 高级开发者路径
126 | 1. 快速浏览所有核心概念
127 | 2. 重点关注架构设计和扩展机制
128 | 3. 深入研究 AI 模型集成和优化
129 | 4. 探索自定义扩展和插件开发
130 |
131 | ## 🔗 相关文档链接
132 |
133 | - **API 参考**: [Agent API](../API参考/Agent-API.md) | [Insight API](../API参考/Insight-API.md)
134 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md)
135 | - **配置指南**: [AI模型配置](../AI模型配置/配置方法.md)
136 | - **示例教程**: [基础示例](../示例和教程/基础示例.md)
137 |
138 | ---
139 |
140 | 选择你感兴趣的主题开始深入学习吧!
--------------------------------------------------------------------------------
/wiki/生成状态.md:
--------------------------------------------------------------------------------
1 | # Midscene Python Wiki 生成状态
2 |
3 | ## ✅ 已完成的文档
4 |
5 | ### 核心文档
6 | - [x] **README.md** - Wiki 主页和导航
7 | - [x] **项目概述.md** - 项目介绍、特性和设计理念
8 | - [x] **快速开始.md** - 5分钟上手指南
9 | - [x] **安装配置.md** - 详细的安装和配置说明
10 |
11 | ### 核心概念 (4/4)
12 | - [x] **README.md** - 核心概念章节导航
13 | - [x] **Agent核心控制器.md** - Agent 类的完整说明
14 | - [x] **Insight-UI理解引擎.md** - AI 驱动的 UI 理解引擎
15 | - [x] **AI模型服务抽象层.md** - 多 AI 提供商统一接口
16 | - [x] **UI上下文与数据模型.md** - 数据类型和上下文管理
17 |
18 | ### 平台集成 (1/4)
19 | - [x] **README.md** - 平台集成总览
20 |
21 | ## 📋 待生成的文档结构
22 |
23 | 以下是基于项目分析确定的完整 wiki 结构:
24 |
25 | ```
26 | wiki/
27 | ├── README.md ✅
28 | ├── 项目概述.md ✅
29 | ├── 快速开始.md ✅
30 | ├── 安装配置.md ✅
31 | ├── 核心概念/ ✅
32 | │ ├── README.md ✅
33 | │ ├── Agent核心控制器.md ✅
34 | │ ├── Insight-UI理解引擎.md ✅
35 | │ ├── AI模型服务抽象层.md ✅
36 | │ └── UI上下文与数据模型.md ✅
37 | ├── API参考/
38 | │ ├── Agent-API.md
39 | │ ├── Insight-API.md
40 | │ └── AIModelService-API.md
41 | ├── 平台集成/ (部分完成)
42 | │ ├── README.md ✅
43 | │ ├── Web自动化/
44 | │ │ ├── README.md
45 | │ │ ├── Selenium集成.md
46 | │ │ ├── Playwright集成.md
47 | │ │ └── Web桥接机制.md
48 | │ └── Android自动化.md
49 | ├── AI模型配置/
50 | │ ├── 配置方法.md
51 | │ ├── 支持的AI提供商/
52 | │ │ ├── README.md
53 | │ │ ├── OpenAI提供商.md
54 | │ │ ├── Anthropic提供商.md
55 | │ │ ├── 通义千问提供商.md
56 | │ │ └── Gemini提供商.md
57 | │ └── 高级选项/
58 | │ ├── README.md
59 | │ ├── 缓存策略.md
60 | │ ├── 请求重试与超时控制.md
61 | │ ├── 配额管理与节流控制.md
62 | │ ├── 流式响应处理.md
63 | │ └── 性能调优技巧.md
64 | ├── 高级特性/
65 | │ ├── 智能缓存机制.md
66 | │ ├── 可视化报告系统.md
67 | │ └── CLI工具高级用法.md
68 | ├── 开发指南/
69 | │ ├── 贡献指南.md
70 | │ ├── 架构设计.md
71 | │ ├── 开发环境配置.md
72 | │ └── 测试指南.md
73 | ├── 故障排除/
74 | │ ├── 常见问题.md
75 | │ ├── 调试技巧.md
76 | │ └── 错误代码参考.md
77 | └── 示例和教程/
78 | ├── 基础示例.md
79 | ├── 高级应用场景.md
80 | └── 最佳实践.md
81 | ```
82 |
83 | ## 📊 生成进度
84 |
85 | - **总文档数**: 约 35-40 个
86 | - **已完成**: 9 个文档
87 | - **完成率**: ~25%
88 | - **核心文档覆盖率**: 100% (最重要的概念文档已完成)
89 |
90 | ## 🎯 已完成文档的特色
91 |
92 | ### 1. 完整性和深度
93 | - 每个核心概念都有详细的解释和示例
94 | - 包含架构图和代码示例
95 | - 涵盖最佳实践和常见问题
96 |
97 | ### 2. 结构化组织
98 | - 清晰的文档导航和交叉引用
99 | - 统一的文档格式和风格
100 | - 逐步深入的学习路径
101 |
102 | ### 3. 实用性
103 | - 大量可运行的代码示例
104 | - 实际使用场景和最佳实践
105 | - 详细的配置和选项说明
106 |
107 | ## 🔄 继续生成建议
108 |
109 | 如需继续生成剩余文档,建议按以下优先级:
110 |
111 | ### 优先级 1 (立即需要)
112 | - API参考文档 (Agent-API.md, Insight-API.md)
113 | - 平台集成详细文档 (Selenium集成.md, Android自动化.md)
114 |
115 | ### 优先级 2 (重要)
116 | - AI模型配置文档
117 | - 示例和教程文档
118 |
119 | ### 优先级 3 (补充)
120 | - 故障排除文档
121 | - 开发指南文档
122 |
123 | ## 💡 使用建议
124 |
125 | 当前已生成的文档已经覆盖了 Midscene Python 的核心概念和基础使用。用户可以通过以下路径开始学习:
126 |
127 | 1. **新手路径**: README.md → 项目概述.md → 快速开始.md
128 | 2. **开发者路径**: 安装配置.md → 核心概念/ → 平台集成/
129 | 3. **深入理解**: 核心概念/ 所有文档 → AI模型配置/
130 |
131 | ## 📝 文档质量
132 |
133 | 已生成的文档具备以下特点:
134 | - ✅ 完整的代码示例
135 | - ✅ 详细的配置说明
136 | - ✅ 架构图和流程图
137 | - ✅ 最佳实践指导
138 | - ✅ 错误处理建议
139 | - ✅ 性能优化技巧
140 | - ✅ 跨文档引用链接
141 |
142 | 这些文档为用户提供了全面理解和使用 Midscene Python 框架的基础。
--------------------------------------------------------------------------------
/midscene/android/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | Android Agent implementation
3 | """
4 |
5 | from typing import Optional
6 |
7 | from ..core.agent import Agent, AgentOptions
8 | from .device import AndroidDevice
9 |
10 |
11 | class AndroidAgent(Agent[AndroidDevice]):
12 | """Android-specific agent implementation"""
13 |
14 | def __init__(self, device: AndroidDevice, options: Optional[AgentOptions] = None):
15 | """Initialize Android agent
16 |
17 | Args:
18 | device: AndroidDevice instance
19 | options: Agent options
20 | """
21 | super().__init__(device, options)
22 |
23 | # Validate that we have vision language model support for Android
24 | # Android requires VL models for UI understanding
25 |
26 | @classmethod
27 | async def create(
28 | cls,
29 | device_id: Optional[str] = None,
30 | options: Optional[AgentOptions] = None
31 | ) -> 'AndroidAgent':
32 | """Create Android agent with device
33 |
34 | Args:
35 | device_id: Android device ID, if None uses first available
36 | options: Agent options
37 |
38 | Returns:
39 | AndroidAgent instance
40 | """
41 | device = await AndroidDevice.create(device_id)
42 | return cls(device, options)
43 |
44 | async def launch_app(self, package_name: str, activity: Optional[str] = None) -> None:
45 | """Launch Android app
46 |
47 | Args:
48 | package_name: App package name
49 | activity: Optional activity name
50 | """
51 | await self.interface.launch_app(package_name, activity)
52 |
53 | async def stop_app(self, package_name: str) -> None:
54 | """Stop Android app
55 |
56 | Args:
57 | package_name: App package name
58 | """
59 | await self.interface.stop_app(package_name)
60 |
61 | async def install_app(self, apk_path: str) -> None:
62 | """Install Android app
63 |
64 | Args:
65 | apk_path: Path to APK file
66 | """
67 | await self.interface.install_app(apk_path)
68 |
69 | async def back(self) -> None:
70 | """Press back button"""
71 | await self.interface.back()
72 |
73 | async def home(self) -> None:
74 | """Press home button"""
75 | await self.interface.home()
76 |
77 | async def recent(self) -> None:
78 | """Press recent apps button"""
79 | await self.interface.recent()
80 |
81 | async def swipe(
82 | self,
83 | start_x: float, start_y: float,
84 | end_x: float, end_y: float,
85 | duration: int = 300
86 | ) -> None:
87 | """Swipe gesture
88 |
89 | Args:
90 | start_x: Start X coordinate
91 | start_y: Start Y coordinate
92 | end_x: End X coordinate
93 | end_y: End Y coordinate
94 | duration: Swipe duration in milliseconds
95 | """
96 | await self.interface.swipe(start_x, start_y, end_x, end_y, duration)
97 |
98 | async def long_press(self, x: float, y: float, duration: int = 1000) -> None:
99 | """Long press gesture
100 |
101 | Args:
102 | x: X coordinate
103 | y: Y coordinate
104 | duration: Press duration in milliseconds
105 | """
106 | await self.interface.long_press(x, y, duration)
--------------------------------------------------------------------------------
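A brief usage sketch of the agent above; the package name and swipe coordinates are placeholders, and a connected ADB device is assumed:

```python
# Sketch: drive an Android device with AndroidAgent (requires a connected ADB device).
import asyncio

from midscene.android import AndroidAgent


async def main() -> None:
    agent = await AndroidAgent.create()                   # first available device
    await agent.launch_app("com.android.settings")
    await agent.swipe(0.5, 0.8, 0.5, 0.2, duration=300)   # placeholder coordinates
    await agent.back()
    await agent.home()


asyncio.run(main())
```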
/examples/basic_usage.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic usage examples for Midscene Python
3 | """
4 |
5 | import asyncio
6 | from midscene import Agent
7 | from midscene.web import SeleniumWebPage
8 | from midscene.android import AndroidAgent
9 |
10 |
11 | async def web_automation_example():
12 | """Basic web automation example"""
13 | print("🌐 Web Automation Example")
14 |
15 | # Create web page instance
16 | with SeleniumWebPage.create(headless=False) as page:
17 | # Create agent
18 | agent = Agent(page)
19 |
20 | # Navigate to website
21 | await page.navigate_to("https://example.com")
22 |
23 | # Use AI to interact with the page
24 | await agent.ai_action("点击登录按钮")
25 | await agent.ai_action("在用户名输入框输入 'demo@example.com'")
26 | await agent.ai_action("在密码输入框输入 'password123'")
27 | await agent.ai_action("点击提交按钮")
28 |
29 | # Extract data using AI
30 | user_info = await agent.ai_extract({
31 | "username": "用户名",
32 | "email": "邮箱地址",
33 | "last_login": "最后登录时间"
34 | })
35 | print(f"提取的用户信息: {user_info}")
36 |
37 | # Assert page state
38 | await agent.ai_assert("页面显示欢迎信息")
39 |
40 | print("✅ Web automation completed successfully!")
41 |
42 |
43 | async def android_automation_example():
44 | """Basic Android automation example"""
45 | print("📱 Android Automation Example")
46 |
47 | try:
48 | # Create Android agent
49 | agent = await AndroidAgent.create()
50 |
51 | # Launch app
52 | await agent.launch_app("com.android.settings")
53 |
54 | # Use AI to navigate
55 | await agent.ai_action("点击WLAN设置")
56 | await agent.ai_action("滑动到底部")
57 |
58 | # Extract information
59 | wifi_list = await agent.ai_extract({
60 | "available_networks": [
61 | {"name": "网络名称", "security": "安全类型", "signal": "信号强度"}
62 | ]
63 | })
64 | print(f"可用WiFi网络: {wifi_list}")
65 |
66 | # Go back
67 | await agent.back()
68 |
69 | print("✅ Android automation completed successfully!")
70 |
71 | except Exception as e:
72 | print(f"❌ Android automation failed: {e}")
73 |
74 |
75 | async def playwright_example():
76 | """Playwright integration example"""
77 | print("🎭 Playwright Example")
78 |
79 | from midscene.web import PlaywrightWebPage
80 |
81 | # Create Playwright page
82 | async with await PlaywrightWebPage.create(headless=False) as page:
83 | agent = Agent(page)
84 |
85 | # Navigate and interact
86 | await page.navigate_to("https://playwright.dev")
87 |
88 | # Use AI for navigation
89 | await agent.ai_action("点击文档链接")
90 | await agent.ai_action("搜索 'getting started'")
91 |
92 | # Extract page information
93 | page_info = await agent.ai_extract({
94 | "title": "页面标题",
95 | "description": "页面描述",
96 | "sections": ["主要章节列表"]
97 | })
98 | print(f"页面信息: {page_info}")
99 |
100 | print("✅ Playwright example completed!")
101 |
102 |
103 | async def main():
104 | """Run all examples"""
105 | print("🚀 Midscene Python Examples\n")
106 |
107 | # Web automation with Selenium
108 | await web_automation_example()
109 | print()
110 |
111 | # Playwright example
112 | await playwright_example()
113 | print()
114 |
115 | # Android automation (if device available)
116 | await android_automation_example()
117 |
118 |
119 | if __name__ == "__main__":
120 | asyncio.run(main())
--------------------------------------------------------------------------------
/midscene/cli/config.py:
--------------------------------------------------------------------------------
1 | """
2 | CLI configuration management
3 | """
4 |
5 | from pathlib import Path
6 | from typing import Optional, Dict, Any
7 |
8 | import yaml
9 | from pydantic import BaseModel, Field
10 |
11 |
12 | class WebConfig(BaseModel):
13 | """Web automation configuration"""
14 | browser: str = "chrome"
15 | headless: bool = False
16 | window_size: tuple[int, int] = (1920, 1080)
17 | user_data_dir: Optional[str] = None
18 | timeout: int = 30
19 |
20 |
21 | class AndroidConfig(BaseModel):
22 | """Android automation configuration"""
23 | device_id: Optional[str] = None
24 | adb_path: str = "adb"
25 | auto_dismiss_keyboard: bool = True
26 | timeout: int = 30
27 |
28 |
29 | class AIConfig(BaseModel):
30 | """AI model configuration"""
31 | provider: str = "openai"
32 | model: str = "gpt-4-vision-preview"
33 | api_key: Optional[str] = None
34 | base_url: Optional[str] = None
35 | max_tokens: int = 4000
36 | temperature: float = 0.1
37 |
38 |
39 | class ExecutionConfig(BaseModel):
40 | """Execution configuration"""
41 | concurrent: int = 1
42 | continue_on_error: bool = False
43 | generate_report: bool = True
44 | report_format: str = "html"
45 | output_dir: str = "./reports"
46 |
47 |
48 | class CLIConfig(BaseModel):
49 | """CLI configuration"""
50 | web: WebConfig = Field(default_factory=WebConfig)
51 | android: AndroidConfig = Field(default_factory=AndroidConfig)
52 | ai: AIConfig = Field(default_factory=AIConfig)
53 | execution: ExecutionConfig = Field(default_factory=ExecutionConfig)
54 |
55 | @classmethod
56 | def load(cls, config_path: Optional[str] = None) -> 'CLIConfig':
57 | """Load configuration from file
58 |
59 | Args:
60 | config_path: Path to configuration file
61 |
62 | Returns:
63 | CLIConfig instance
64 | """
65 | if not config_path:
66 | # Look for default config files
67 | for default_path in ["midscene.yml", "midscene.yaml", ".midscene.yml"]:
68 | if Path(default_path).exists():
69 | config_path = default_path
70 | break
71 |
72 | if not config_path or not Path(config_path).exists():
73 | # Return default configuration
74 | return cls()
75 |
76 | with open(config_path, 'r', encoding='utf-8') as f:
77 | config_data = yaml.safe_load(f)
78 |
79 | return cls(**config_data)
80 |
81 | def save(self, config_path: str) -> None:
82 | """Save configuration to file
83 |
84 | Args:
85 | config_path: Path to save configuration
86 | """
87 | config_data = self.model_dump()
88 |
89 | with open(config_path, 'w', encoding='utf-8') as f:
90 | yaml.dump(config_data, f, default_flow_style=False, allow_unicode=True)
91 |
92 | def to_env_vars(self) -> Dict[str, str]:
93 | """Convert configuration to environment variables
94 |
95 | Returns:
96 | Dictionary of environment variables
97 | """
98 | env_vars = {}
99 |
100 | # AI configuration
101 | if self.ai.api_key:
102 | env_vars['MIDSCENE_AI_API_KEY'] = self.ai.api_key
103 | env_vars['MIDSCENE_AI_PROVIDER'] = self.ai.provider
104 | env_vars['MIDSCENE_AI_MODEL'] = self.ai.model
105 | if self.ai.base_url:
106 | env_vars['MIDSCENE_AI_BASE_URL'] = self.ai.base_url
107 |
108 | # Execution configuration
109 | env_vars['MIDSCENE_CONCURRENT'] = str(self.execution.concurrent)
110 | env_vars['MIDSCENE_CONTINUE_ON_ERROR'] = str(self.execution.continue_on_error).lower()
111 | env_vars['MIDSCENE_GENERATE_REPORT'] = str(self.execution.generate_report).lower()
112 |
113 | return env_vars
--------------------------------------------------------------------------------
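A usage sketch of `CLIConfig`: load a YAML file (or fall back to defaults), override a value, and export the settings as environment variables. File paths are illustrative:

```python
# Sketch: load, adjust, and apply a CLIConfig.
import os

from midscene.cli.config import CLIConfig

config = CLIConfig.load("midscene.yml")   # falls back to defaults if the file is missing
config.web.headless = True                # override a single field

# Push the AI/execution settings into the environment for downstream code.
os.environ.update(config.to_env_vars())

config.save("midscene.local.yml")         # write the effective configuration back out
```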
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "midscene-python"
7 | description = "基于AI的Web和Android自动化框架,支持自然语言驱动的UI操作"
8 | readme = "README.md"
9 | license = "MIT"
10 | authors = [
11 | { name = "Midscene Team" },
12 | ]
13 | keywords = ["automation", "AI", "web", "android", "testing", "ui", "selenium", "playwright"]
14 | classifiers = [
15 | "Development Status :: 4 - Beta",
16 | "Intended Audience :: Developers",
17 | "License :: OSI Approved :: MIT License",
18 | "Programming Language :: Python :: 3",
19 | "Programming Language :: Python :: 3.9",
20 | "Programming Language :: Python :: 3.10",
21 | "Programming Language :: Python :: 3.11",
22 | "Programming Language :: Python :: 3.12",
23 | "Topic :: Software Development :: Testing",
24 | "Topic :: Software Development :: Libraries :: Python Modules",
25 | ]
26 | requires-python = ">=3.9"
27 | dependencies = [
28 | "pydantic>=2.0,<3.0",
29 | "selenium>=4.15.0,<5.0",
30 | "playwright>=1.40.0,<2.0",
31 | "opencv-python>=4.8.0,<5.0",
32 | "pillow>=10.0.0,<11.0",
33 | "numpy>=1.24.0,<2.0",
34 | "aiohttp>=3.9.0,<4.0",
35 | "loguru>=0.7.0,<1.0",
36 | "typer>=0.9.0,<1.0",
37 | "jinja2>=3.1.0,<4.0",
38 | "pyyaml>=6.0,<7.0",
39 | "httpx>=0.25.0,<1.0",
40 | "asyncio-mqtt",
41 | "pure-python-adb>=0.3.0dev0",
42 | "openai>=1.3.0,<2.0",
43 | "anthropic>=0.7.0,<1.0",
44 | "google-generativeai",
45 | "dashscope",
46 | ]
47 | version = "0.1.1"
48 |
49 | [project.optional-dependencies]
50 | dev = [
51 | "pytest>=7.4.0",
52 | "pytest-asyncio>=0.21.0",
53 | "pytest-cov>=4.1.0",
54 | "black>=23.0.0",
55 | "isort>=5.12.0",
56 | "mypy>=1.5.0",
57 | "pre-commit>=3.4.0",
58 | "ruff>=0.1.0",
59 | ]
60 | docs = [
61 | "mkdocs>=1.5.0",
62 | "mkdocs-material>=9.4.0",
63 | "mkdocstrings[python]>=0.23.0",
64 | ]
65 |
66 | [project.urls]
67 | Homepage = "https://github.com/Python51888/midscene-python.git"
68 | Repository = "https://github.com/Python51888/midscene-python.git"
69 | Documentation = "https://github.com/Python51888/Midscene-Python/blob/master/README.md"
70 | "Bug Tracker" = "https://github.com/Python51888/midscene-python.git/issues"
71 |
72 | [project.scripts]
73 | midscene = "midscene.cli:main"
74 |
75 | [tool.hatch.build.targets.wheel]
76 | packages = ["midscene"]
77 |
78 |
79 |
80 | [tool.black]
81 | line-length = 88
82 | target-version = ['py39']
83 | include = '\.pyi?$'
84 | exclude = '''
85 | /(
86 | \.eggs
87 | | \.git
88 | | \.hg
89 | | \.mypy_cache
90 | | \.tox
91 | | \.venv
92 | | _build
93 | | buck-out
94 | | build
95 | | dist
96 | )/
97 | '''
98 |
99 | [tool.isort]
100 | profile = "black"
101 | line_length = 88
102 | multi_line_output = 3
103 | include_trailing_comma = true
104 | force_grid_wrap = 0
105 | use_parentheses = true
106 | ensure_newline_before_comments = true
107 |
108 | [tool.mypy]
109 | python_version = "3.9"
110 | warn_return_any = true
111 | warn_unused_configs = true
112 | disallow_untyped_defs = true
113 | disallow_incomplete_defs = true
114 | check_untyped_defs = true
115 | disallow_untyped_decorators = true
116 | no_implicit_optional = true
117 | warn_redundant_casts = true
118 | warn_unused_ignores = true
119 | warn_no_return = true
120 | warn_unreachable = true
121 | strict_equality = true
122 |
123 | [tool.ruff]
124 | target-version = "py39"
125 | line-length = 88
126 | select = [
127 | "E", # pycodestyle errors
128 | "W", # pycodestyle warnings
129 | "F", # pyflakes
130 | "I", # isort
131 | "B", # flake8-bugbear
132 | "C4", # flake8-comprehensions
133 | "UP", # pyupgrade
134 | ]
135 | ignore = [
136 | "E501", # line too long, handled by black
137 | "B008", # do not perform function calls in argument defaults
138 | "C901", # too complex
139 | ]
140 |
141 | [tool.ruff.per-file-ignores]
142 | "__init__.py" = ["F401"]
143 |
144 | [tool.pytest.ini_options]
145 | testpaths = ["tests"]
146 | python_files = ["test_*.py", "*_test.py"]
147 | python_classes = ["Test*"]
148 | python_functions = ["test_*"]
149 | addopts = [
150 | "-v",
151 | "--strict-markers",
152 | "--strict-config",
153 | "--cov=midscene",
154 | "--cov-report=term-missing",
155 | "--cov-report=html",
156 | ]
157 | markers = [
158 | "slow: marks tests as slow (deselect with '-m \"not slow\"')",
159 | "integration: marks tests as integration tests",
160 | "unit: marks tests as unit tests",
161 | ]
--------------------------------------------------------------------------------
/README.zh.md:
--------------------------------------------------------------------------------
1 | # Midscene Python [](https://zread.ai/Python51888/Midscene-Python)
2 |
3 | [English](README.md) | [中文](README.zh.md)
4 |
5 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。
6 |
7 | ## 概述
8 |
9 | Midscene Python 提供全面的 UI 自动化能力,具有以下核心特性:
10 |
11 | - **自然语言驱动**:使用自然语言描述自动化任务
12 | - **多平台支持**:支持 Web(Selenium/Playwright)和 Android(ADB)
13 | - **AI 模型集成**:支持 GPT-4V、Qwen2.5-VL、Gemini 等多种视觉语言模型
14 | - **可视化调试**:提供详细的执行报告和调试信息
15 | - **缓存机制**:智能缓存提升执行效率
16 |
17 | ## 项目架构
18 |
19 | ```
20 | midscene-python/
21 | ├── midscene/ # 核心框架
22 | │ ├── core/ # 核心框架
23 | │ │ ├── agent/ # Agent系统
24 | │ │ ├── insight/ # AI推理引擎
25 | │ │ ├── ai_model/ # AI模型集成
26 | │ │ ├── yaml/ # YAML脚本执行器
27 | │ │ └── types.py # 核心类型定义
28 | │ ├── web/ # Web集成
29 | │ │ ├── selenium/ # Selenium集成
30 | │ │ ├── playwright/ # Playwright集成
31 | │ │ └── bridge/ # Bridge模式
32 | │ ├── android/ # Android集成
33 | │ │ ├── device.py # 设备管理
34 | │ │ └── agent.py # Android Agent
35 | │ ├── cli/ # 命令行工具
36 | │ ├── mcp/ # MCP协议支持
37 | │ ├── shared/ # 共享工具
38 | │ └── visualizer/ # 可视化报告
39 | ├── examples/ # 示例代码
40 | ├── tests/ # 测试用例
41 | └── docs/ # 文档
42 | ```
43 |
44 | ## 技术栈
45 |
46 | - **Python 3.9+**:核心运行环境
47 | - **Pydantic**:数据验证和序列化
48 | - **Selenium/Playwright**:Web 自动化
49 | - **OpenCV/Pillow**:图像处理
50 | - **HTTPX/AIOHTTP**:HTTP 客户端
51 | - **Typer**:CLI 框架
52 | - **Loguru**:日志记录
53 |
54 | ## 快速开始
55 |
56 | ### 安装
57 |
58 | ```bash
59 | pip install midscene-python
60 | ```
61 |
62 | ### 基础用法
63 |
64 | ```python
65 | from midscene import Agent
66 | from midscene.web import SeleniumWebPage
67 |
68 | # 创建 Web Agent
69 | with SeleniumWebPage.create() as page:
70 | agent = Agent(page)
71 |
72 | # 使用自然语言进行自动化操作
73 | await agent.ai_action("点击登录按钮")
74 | await agent.ai_action("输入用户名 'test@example.com'")
75 | await agent.ai_action("输入密码 'password123'")
76 | await agent.ai_action("点击提交按钮")
77 |
78 | # 数据提取
79 | user_info = await agent.ai_extract("提取用户个人信息")
80 |
81 | # 断言验证
82 | await agent.ai_assert("页面显示欢迎信息")
83 | ```
84 |
85 | ## 主要特性
86 |
87 | ### 🤖 AI 驱动的自动化
88 |
89 | 使用自然语言描述操作,AI 自动理解并执行:
90 |
91 | ```python
92 | await agent.ai_action("在搜索框中输入'Python教程'并搜索")
93 | ```
94 |
95 | ### 🔍 智能元素定位
96 |
97 | 支持多种定位策略,自动选择最优方案:
98 |
99 | ```python
100 | element = await agent.ai_locate("登录按钮")
101 | ```
102 |
103 | ### 📊 数据提取
104 |
105 | 从页面提取结构化数据:
106 |
107 | ```python
108 | products = await agent.ai_extract({
109 | "products": [
110 | {"name": "产品名称", "price": "价格", "rating": "评分"}
111 | ]
112 | })
113 | ```
114 |
115 | ### ✅ 智能断言
116 |
117 | AI 理解页面状态,进行智能断言:
118 |
119 | ```python
120 | await agent.ai_assert("用户已成功登录")
121 | ```
122 |
123 | ### 📝 致谢
124 |
125 | 感谢Midscene项目:https://github.com/web-infra-dev/midscene 提供的灵感和技术参考
126 |
127 | ## 许可证
128 |
129 | MIT License
130 |
--------------------------------------------------------------------------------
/wiki/项目概述.md:
--------------------------------------------------------------------------------
1 | # 项目概述
2 |
3 | ## 什么是 Midscene Python?
4 |
5 | Midscene Python 是一个革命性的基于 AI 的自动化框架,专为 Web 和 Android 平台的 UI 自动化操作而设计。它的核心理念是**让自动化变得像说话一样简单**。
6 |
7 | ## 🎯 设计理念
8 |
9 | ### 自然语言驱动
10 | 传统的自动化工具需要开发者学习复杂的 API 和选择器语法。Midscene Python 打破了这一限制,让你可以用自然语言描述想要执行的操作:
11 |
12 | ```python
13 | # 传统方式
14 | driver.find_element(By.XPATH, "//button[@class='login-btn' and contains(text(), '登录')]").click()
15 |
16 | # Midscene Python 方式
17 | await agent.ai_action("点击登录按钮")
18 | ```
19 |
20 | ### AI 驱动的智能决策
21 | Midscene Python 集成了先进的视觉语言模型(VLM),能够:
22 | - 理解页面结构和元素关系
23 | - 智能选择最佳的操作策略
24 | - 适应页面变化和布局差异
25 | - 提供人性化的错误提示
26 |
27 | ### 多平台统一接口
28 | 无论是 Web 应用还是 Android 应用,Midscene Python 都提供了一致的编程接口:
29 |
30 | ```python
31 | # Web 自动化
32 | web_agent = Agent(selenium_page)
33 | await web_agent.ai_action("在搜索框输入'Python教程'")
34 |
35 | # Android 自动化
36 | android_agent = Agent(android_device)
37 | await android_agent.ai_action("在搜索框输入'Python教程'")
38 | ```
39 |
40 | ## 🌟 核心特性
41 |
42 | ### 1. 自然语言操作
43 | - **直观表达**: 用日常语言描述操作意图
44 | - **智能理解**: AI 自动理解复杂的操作逻辑
45 | - **上下文感知**: 结合页面状态做出最佳决策
46 |
47 | ### 2. 智能元素定位
48 | - **多策略融合**: 自动选择最优的定位方法
49 | - **容错能力**: 适应页面变化和元素移动
50 | - **语义理解**: 基于元素功能而非位置进行定位
51 |
52 | ### 3. 结构化数据提取
53 | ```python
54 | # 提取商品信息
55 | products = await agent.ai_extract({
56 | "products": [
57 | {
58 | "name": "商品名称",
59 | "price": "价格",
60 | "rating": "评分",
61 | "availability": "库存状态"
62 | }
63 | ]
64 | })
65 | ```
66 |
67 | ### 4. 智能断言验证
68 | ```python
69 | # 验证页面状态
70 | await agent.ai_assert("用户已成功登录并显示欢迎消息")
71 | await agent.ai_assert("购物车中有3件商品")
72 | ```
73 |
74 | ### 5. 可视化调试
75 | - **执行截图**: 每步操作都有详细的视觉记录
76 | - **决策过程**: 展示 AI 的思考和决策过程
77 | - **错误定位**: 准确指出失败原因和位置
78 |
79 | ## 🏗️ 架构概览
80 |
81 | Midscene Python 采用分层架构设计:
82 |
83 | ```
84 | ┌─────────────────────────────────────────┐
85 | │ 用户应用层 │
86 | ├─────────────────────────────────────────┤
87 | │ Agent 控制层 │ ← 统一的操作接口
88 | ├─────────────────────────────────────────┤
89 | │ Insight AI 引擎 │ ← AI 理解和决策
90 | ├─────────────────────────────────────────┤
91 | │ 平台适配层 │ ← Web/Android 桥接
92 | ├─────────────────────────────────────────┤
93 | │ 底层驱动层 │ ← Selenium/Playwright/ADB
94 | └─────────────────────────────────────────┘
95 | ```
96 |
97 | ### 核心组件
98 |
99 | - **Agent**: 用户操作的统一入口,提供高级 AI 驱动的 API
100 | - **Insight**: AI 理解引擎,负责页面分析和操作决策
101 | - **AIModelService**: AI 模型服务抽象层,支持多种 AI 提供商
102 | - **Platform Bridges**: 平台桥接层,统一不同平台的操作接口
103 |
104 | ## 🎮 使用场景
105 |
106 | ### 测试自动化
107 | ```python
108 | # E2E 测试
109 | await agent.ai_action("登录用户账号")
110 | await agent.ai_action("添加商品到购物车")
111 | await agent.ai_action("进入结算页面")
112 | await agent.ai_assert("显示正确的订单金额")
113 | ```
114 |
115 | ### 数据爬取
116 | ```python
117 | # 智能数据提取
118 | news_data = await agent.ai_extract({
119 | "articles": [
120 | {
121 | "title": "标题",
122 | "author": "作者",
123 | "publish_date": "发布日期",
124 | "content_summary": "内容摘要"
125 | }
126 | ]
127 | })
128 | ```
129 |
130 | ### 业务流程自动化
131 | ```python
132 | # RPA 自动化
133 | await agent.ai_action("打开财务报表")
134 | await agent.ai_action("筛选本月数据")
135 | monthly_report = await agent.ai_extract("提取月度财务汇总数据")
136 | await agent.ai_action("生成并下载报告")
137 | ```
138 |
139 | ### 应用监控
140 | ```python
141 | # 健康检查
142 | await agent.ai_assert("首页加载正常")
143 | await agent.ai_assert("用户登录功能正常")
144 | await agent.ai_assert("搜索功能返回结果")
145 | ```
146 |
147 | ## 🆚 与传统工具的对比
148 |
149 | | 特性 | 传统自动化工具 | Midscene Python |
150 | |------|---------------|-----------------|
151 | | **学习曲线** | 陡峭,需要学习复杂 API | 平缓,自然语言驱动 |
152 | | **代码可读性** | 晦涩难懂 | 直观易懂 |
153 | | **维护成本** | 高,页面变化需要大量修改 | 低,AI 自动适应变化 |
154 | | **元素定位** | 手动编写选择器 | AI 智能定位 |
155 | | **错误处理** | 需要手动处理各种异常 | AI 自动重试和恢复 |
156 | | **跨平台** | 需要学习不同工具 | 统一接口 |
157 |
158 | ## 🛣️ 发展路线
159 |
160 | ### 当前版本 (v0.1.0)
161 | - ✅ 基础 Agent 和 Insight 功能
162 | - ✅ Web 平台支持 (Selenium/Playwright)
163 | - ✅ Android 平台支持
164 | - ✅ 多种 AI 模型集成
165 | - ✅ 基础缓存和报告功能
166 |
167 | ### 未来规划
168 | - 🔄 桌面应用自动化支持
169 | - 🔄 更多 AI 模型集成
170 | - 🔄 可视化测试编辑器
171 | - 🔄 云端执行服务
172 | - 🔄 团队协作功能
173 |
174 | ## 📈 性能特点
175 |
176 | - **执行效率**: 智能缓存机制减少重复的 AI 调用
177 | - **准确性**: 多重验证确保操作的可靠性
178 | - **稳定性**: 自动重试和错误恢复机制
179 | - **扩展性**: 模块化设计支持自定义扩展
180 |
181 | ## 🤝 社区与生态
182 |
183 | Midscene Python 是一个开源项目,欢迎社区贡献:
184 |
185 | - **GitHub**: [Python51888/midscene-python](https://github.com/Python51888/midscene-python.git)
186 | - **文档**: [Python51888/midscene-python](https://github.com/Python51888/Midscene-Python/blob/master/README.md)
187 | - **讨论**: GitHub Discussions
188 | - **问题反馈**: GitHub Issues
189 |
190 | ---
191 |
192 | 准备好开始你的 AI 自动化之旅了吗?查看 [快速开始](快速开始.md) 指南!
--------------------------------------------------------------------------------
/midscene/core/ai_model/service.py:
--------------------------------------------------------------------------------
1 | """
2 | AI Model Service - Unified interface for different AI providers
3 | """
4 |
5 | import json
6 | from abc import ABC, abstractmethod
7 | from typing import Any, Dict, List, Optional, Type, Union
8 |
9 | import httpx
10 | from loguru import logger
11 | from pydantic import BaseModel
12 |
13 | from ..types import AIUsageInfo
14 |
15 |
16 | class AIModelConfig(BaseModel):
17 | """AI model configuration"""
18 | provider: str # openai, anthropic, qwen, gemini
19 | model: str
20 | api_key: str
21 | base_url: Optional[str] = None
22 | max_tokens: int = 4000
23 | temperature: float = 0.1
24 | timeout: int = 60
25 |
26 |
27 | class AIProvider(ABC):
28 | """Abstract base class for AI service providers"""
29 |
30 | @abstractmethod
31 | async def call(
32 | self,
33 | messages: List[Dict[str, Any]],
34 | config: AIModelConfig,
35 | response_schema: Optional[Type[BaseModel]] = None,
36 | **kwargs
37 | ) -> Dict[str, Any]:
38 | """Call AI service"""
39 | pass
40 |
41 |
42 | class AIModelService:
43 | """Unified AI model service interface"""
44 |
45 | def __init__(self):
46 | self.providers: Dict[str, AIProvider] = {}
47 | self._register_providers()
48 |
49 | def _register_providers(self):
50 | """Register available AI providers"""
51 | from .providers import (
52 | OpenAIProvider,
53 | AnthropicProvider,
54 | QwenProvider,
55 | GeminiProvider
56 | )
57 |
58 | self.providers['openai'] = OpenAIProvider()
59 | self.providers['anthropic'] = AnthropicProvider()
60 | self.providers['qwen'] = QwenProvider()
61 | self.providers['gemini'] = GeminiProvider()
62 |
63 | async def call_ai(
64 | self,
65 | messages: List[Dict[str, Any]],
66 | response_schema: Optional[Type[BaseModel]] = None,
67 | model_config: Optional[AIModelConfig] = None,
68 | **kwargs
69 | ) -> Dict[str, Any]:
70 | """Call AI model with unified interface"""
71 | config = model_config or self._get_default_config()
72 | provider = self.providers.get(config.provider)
73 |
74 | if not provider:
75 | raise ValueError(f"Unsupported provider: {config.provider}")
76 |
77 | try:
78 | logger.debug(f"Calling AI provider: {config.provider}")
79 | result = await provider.call(
80 | messages=messages,
81 | config=config,
82 | response_schema=response_schema,
83 | **kwargs
84 | )
85 | return result
86 | except Exception as e:
87 | logger.error(f"AI call failed: {e}")
88 | raise
89 |
90 | def _get_default_config(self) -> AIModelConfig:
91 | """Get default configuration"""
92 | import os
93 |
94 | # Try to get from environment variables
95 | provider = os.getenv('MIDSCENE_AI_PROVIDER', 'openai')
96 | model = os.getenv('MIDSCENE_AI_MODEL', 'gpt-4-vision-preview')
97 | api_key = os.getenv('MIDSCENE_AI_API_KEY', '')
98 | base_url = os.getenv('MIDSCENE_AI_BASE_URL')
99 |
100 | if not api_key:
101 | raise ValueError(
102 | "AI API key not configured. Set MIDSCENE_AI_API_KEY environment variable."
103 | )
104 |
105 | return AIModelConfig(
106 | provider=provider,
107 | model=model,
108 | api_key=api_key,
109 | base_url=base_url
110 | )
111 |
112 |
113 | def parse_json_response(content: str) -> Dict[str, Any]:
114 | """Parse JSON response from AI model"""
115 | try:
116 | # Try to parse as JSON directly
117 | return json.loads(content)
118 | except json.JSONDecodeError:
119 | # Try to extract JSON from code blocks
120 | import re
121 | json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL)
122 | if json_match:
123 | try:
124 | return json.loads(json_match.group(1))
125 | except json.JSONDecodeError:
126 | pass
127 |
128 | # Try to find JSON-like content
129 | json_match = re.search(r'\{.*\}', content, re.DOTALL)
130 | if json_match:
131 | try:
132 | return json.loads(json_match.group(0))
133 | except json.JSONDecodeError:
134 | pass
135 |
136 | raise ValueError(f"Failed to parse JSON from response: {content}")
137 |
138 |
139 | def create_usage_info(usage_data: Dict[str, Any]) -> AIUsageInfo:
140 | """Create AIUsageInfo from provider response"""
141 | return AIUsageInfo(
142 | prompt_tokens=usage_data.get('prompt_tokens', 0),
143 | completion_tokens=usage_data.get('completion_tokens', 0),
144 | total_tokens=usage_data.get('total_tokens', 0),
145 | cost=usage_data.get('cost')
146 | )
--------------------------------------------------------------------------------
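A minimal call sketch for `AIModelService`. The message layout follows the OpenAI-style chat format commonly used by vision models; the exact payload each provider expects is defined in `providers.py`, so treat the message shape and the key value as assumptions:

```python
# Sketch: one round-trip through AIModelService (set MIDSCENE_AI_* env vars,
# or pass an explicit AIModelConfig as below).
import asyncio

from midscene.core.ai_model import AIModelConfig, AIModelService


async def main() -> None:
    service = AIModelService()
    config = AIModelConfig(
        provider="openai",
        model="gpt-4-vision-preview",
        api_key="sk-...",  # placeholder; use a real key or environment variables
    )
    # OpenAI-style chat message; per-provider formatting lives in providers.py.
    messages = [{"role": "user", "content": "Describe the login form on this page."}]
    result = await service.call_ai(messages, model_config=config)
    print(result)


asyncio.run(main())
```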
/scripts/validate_requirements.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | 依赖验证脚本
4 | 验证生成的requirements.txt文件是否包含所有必要依赖
5 | """
6 |
7 | import subprocess
8 | import sys
9 | import tempfile
10 | import os
11 | from pathlib import Path
12 |
13 |
14 | def run_command(cmd, check=True, capture_output=True):
15 | """运行命令并返回结果"""
16 | try:
17 | result = subprocess.run(
18 | cmd,
19 | shell=True,
20 | check=check,
21 | capture_output=capture_output,
22 | text=True
23 | )
24 | return result
25 | except subprocess.CalledProcessError as e:
26 | print(f"命令执行失败: {cmd}")
27 | print(f"错误输出: {e.stderr}")
28 | sys.exit(1)
29 |
30 |
31 | def create_test_environment():
32 | """创建临时测试环境"""
33 | print("=== 创建临时测试环境 ===")
34 |
35 | # 创建临时目录
36 | temp_dir = tempfile.mkdtemp(prefix="midscene_test_")
37 | print(f"临时目录: {temp_dir}")
38 |
39 | # 创建虚拟环境
40 | venv_path = os.path.join(temp_dir, "test_env")
41 | print("创建虚拟环境...")
42 | run_command(f"python -m venv {venv_path}")
43 |
44 | # 获取虚拟环境的Python路径
45 | if sys.platform == "win32":
46 | python_path = os.path.join(venv_path, "Scripts", "python.exe")
47 | pip_path = os.path.join(venv_path, "Scripts", "pip.exe")
48 | else:
49 | python_path = os.path.join(venv_path, "bin", "python")
50 | pip_path = os.path.join(venv_path, "bin", "pip")
51 |
52 | return temp_dir, python_path, pip_path
53 |
54 |
55 | def install_requirements(pip_path, requirements_file):
56 | """在测试环境中安装依赖"""
57 | print("=== 安装依赖包 ===")
58 | print(f"使用requirements文件: {requirements_file}")
59 |
60 | # 升级pip
61 | run_command(f'"{pip_path}" install --upgrade pip')
62 |
63 | # 安装依赖
64 | run_command(f'"{pip_path}" install -r "{requirements_file}"')
65 | print("依赖安装完成")
66 |
67 |
68 | def validate_imports(python_path):
69 | """验证核心包导入"""
70 | print("=== 验证包导入 ===")
71 |
72 | test_imports = [
73 | "import midscene",
74 | "import pydantic",
75 | "import selenium",
76 | "import playwright",
77 | "import pytest",
78 | "import black",
79 | "import mkdocs",
80 | "import numpy",
81 | "import cv2",
82 | "import PIL",
83 | "import loguru",
84 | "import typer",
85 | "import httpx",
86 | "import aiohttp",
87 | "import openai",
88 | "import anthropic",
89 | ]
90 |
 91 |     for import_stmt in test_imports:
 92 |         print(f"测试: {import_stmt}")
 93 |         result = run_command(f'"{python_path}" -c "{import_stmt}"', check=False)
 94 |         if result.returncode == 0:
 95 |             print(f"✓ {import_stmt} - 成功")
 96 |         else:
 97 |             print(f"✗ {import_stmt} - 失败")
 98 |             return False
 99 | 
100 |     return True
101 |
102 |
103 | def validate_cli_tools(python_path):
104 | """验证CLI工具可用性"""
105 | print("=== 验证CLI工具 ===")
106 |
107 | cli_tests = [
108 | (f'"{python_path}" -m pytest --version', "pytest"),
109 | (f'"{python_path}" -m black --version', "black"),
110 | (f'"{python_path}" -m mkdocs --version', "mkdocs"),
111 | ]
112 |
113 |     for cmd, tool_name in cli_tests:
114 |         print(f"测试: {tool_name}")
115 |         result = run_command(cmd, check=False)
116 |         if result.returncode == 0:
117 |             print(f"✓ {tool_name} - 可用")
118 |         else:
119 |             print(f"✗ {tool_name} - 不可用")
120 |             return False
121 | 
122 |     return True
123 |
124 |
125 | def cleanup(temp_dir):
126 | """清理临时文件"""
127 | print("=== 清理临时文件 ===")
128 | try:
129 | import shutil
130 | shutil.rmtree(temp_dir)
131 | print(f"已删除临时目录: {temp_dir}")
132 | except Exception as e:
133 | print(f"清理失败: {e}")
134 |
135 |
136 | def main():
137 | """主函数"""
138 | print("=== Midscene Python 依赖验证 ===\n")
139 |
140 | # 检查requirements.txt是否存在
141 | requirements_file = Path("requirements.txt")
142 | if not requirements_file.exists():
143 | print("错误: requirements.txt 文件不存在")
144 | print("请先运行: make requirements-freeze")
145 | sys.exit(1)
146 |
147 | temp_dir = None
148 | try:
149 | # 创建测试环境
150 | temp_dir, python_path, pip_path = create_test_environment()
151 |
152 | # 安装依赖
153 | install_requirements(pip_path, requirements_file)
154 |
155 | # 验证导入
156 | if not validate_imports(python_path):
157 | print("\n❌ 包导入验证失败")
158 | sys.exit(1)
159 |
160 | # 验证CLI工具
161 | if not validate_cli_tools(python_path):
162 | print("\n❌ CLI工具验证失败")
163 | sys.exit(1)
164 |
165 | print("\n✅ 所有依赖验证通过!")
166 | print("requirements.txt 文件完整且可用")
167 |
168 | except KeyboardInterrupt:
169 | print("\n用户中断验证过程")
170 | sys.exit(1)
171 | except Exception as e:
172 | print(f"\n验证过程中出现错误: {e}")
173 | sys.exit(1)
174 | finally:
175 | if temp_dir:
176 | cleanup(temp_dir)
177 |
178 |
179 | if __name__ == "__main__":
180 | main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Midscene Python
2 | [English](README.md) | [简体中文](README.zh.md)
3 |
4 | Midscene Python is an AI-based automation framework that supports UI automation operations on Web and Android platforms.
5 |
6 | ## Overview
7 |
8 | Midscene Python provides comprehensive UI automation capabilities with the following core features:
9 |
10 | - **Natural Language Driven**: Describe automation tasks using natural language
11 | - **Multi-platform Support**: Supports Web (Selenium/Playwright) and Android (ADB)
12 | - **AI Model Integration**: Supports multiple vision-language models such as GPT-4V, Qwen2.5-VL, and Gemini
13 | - **Visual Debugging**: Provides detailed execution reports and debugging information
14 | - **Caching Mechanism**: Intelligent caching to improve execution efficiency
15 |
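For example, the caching mechanism is enabled through `AgentOptions` (a minimal sketch; the fields used here follow `midscene/core/types.py` and the quickstart docs):

```python
from midscene import Agent
from midscene.core import AgentOptions
from midscene.web import SeleniumWebPage

# Runs that share the same cache_id can reuse previously cached AI decisions
options = AgentOptions(cache_id="login_flow", generate_report=True)

with SeleniumWebPage.create() as page:
    agent = Agent(page, options)
```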
16 | ## Project Architecture
17 |
18 | ```
19 | midscene-python/
20 | ├── midscene/              # Main package
21 | │ ├── core/ # Core framework
22 | │ │ ├── agent/ # Agent system
23 | │ │ ├── insight/ # AI inference engine
24 | │ │ ├── ai_model/ # AI model integration
25 | │ │ ├── yaml/ # YAML script executor
26 | │ │ └── types.py # Core type definitions
27 | │ ├── web/ # Web integration
28 | │ │ ├── selenium/ # Selenium integration
29 | │ │ ├── playwright/ # Playwright integration
30 | │ │ └── bridge/ # Bridge mode
31 | │ ├── android/ # Android integration
32 | │ │ ├── device.py # Device management
33 | │ │ └── agent.py # Android Agent
34 | │ ├── cli/ # Command line tools
35 | │ ├── mcp/ # MCP protocol support
36 | │ ├── shared/ # Shared utilities
37 | │ └── visualizer/ # Visual reports
38 | ├── examples/ # Example code
39 | ├── tests/ # Test cases
40 | └── docs/ # Documentation
41 | ```
42 |
43 | ## Tech Stack
44 |
45 | - **Python 3.9+**: Core runtime environment
46 | - **Pydantic**: Data validation and serialization
47 | - **Selenium/Playwright**: Web automation
48 | - **OpenCV/Pillow**: Image processing
49 | - **HTTPX/AIOHTTP**: HTTP client
50 | - **Typer**: CLI framework
51 | - **Loguru**: Logging
52 |
53 | ## Quick Start
54 |
55 | ### Installation
56 |
57 | ```bash
58 | pip install midscene-python
59 | ```
60 |
61 | ### Basic Usage
62 |
63 | ```python
64 | import asyncio
65 | from midscene import Agent
66 | from midscene.web import SeleniumWebPage
67 | 
68 | async def main():
69 |     # Create a Web Agent
70 |     with SeleniumWebPage.create() as page:
71 |         agent = Agent(page)
72 |         # Perform automation operations using natural language
73 |         await agent.ai_action("Click the login button")
74 |         await agent.ai_action("Enter username 'test@example.com'")
75 |         await agent.ai_action("Enter password 'password123'")
76 |         await agent.ai_action("Click the submit button")
77 |         # Data extraction
78 |         user_info = await agent.ai_extract("Extract user personal information")
79 |         # Assertion verification
80 |         await agent.ai_assert("Page displays welcome message")
81 | asyncio.run(main())
82 | ```
83 |
84 | ## Key Features
85 |
86 | ### 🤖 AI-Driven Automation
87 |
88 | Describe operations using natural language, and AI automatically understands and executes:
89 |
90 | ```python
91 | await agent.ai_action("Enter 'Python tutorial' in the search box and search")
92 | ```
93 |
94 | ### 🔍 Intelligent Element Location
95 |
96 | Supports multiple location strategies and automatically selects the optimal solution:
97 |
98 | ```python
99 | element = await agent.ai_locate("Login button")
100 | ```
101 |
102 | ### 📊 Data Extraction
103 |
104 | Extract structured data from the page:
105 |
106 | ```python
107 | products = await agent.ai_extract({
108 | "products": [
109 | {"name": "Product Name", "price": "Price", "rating": "Rating"}
110 | ]
111 | })
112 | ```
113 |
114 | ### ✅ Intelligent Assertions
115 |
116 | AI understands page state and performs intelligent assertions:
117 |
118 | ```python
119 | await agent.ai_assert("User has successfully logged in")
120 | ```
121 |
122 | ### 📝 Credits
123 |
124 | Thanks to the [Midscene](https://github.com/web-infra-dev/midscene) project for inspiration and technical references.
125 |
126 | ## License
127 |
128 | MIT License
129 |
--------------------------------------------------------------------------------
/midscene/core/types.py:
--------------------------------------------------------------------------------
1 | """
2 | Core types and interfaces for Midscene Python
3 | """
4 |
5 | from abc import ABC, abstractmethod
6 | from dataclasses import dataclass
7 | from enum import Enum
8 | from typing import Any, Dict, List, Optional, Union, Callable, Awaitable, Generic, TypeVar
9 | from pydantic import BaseModel, Field
10 |
11 | # Type variables
12 | ElementType = TypeVar('ElementType', bound='BaseElement')
13 | T = TypeVar('T')
14 |
15 |
16 | class InterfaceType(str, Enum):
17 | """Interface type enumeration"""
18 | WEB = "web"
19 | ANDROID = "android"
20 |
21 |
22 | class NodeType(str, Enum):
23 | """UI Node type enumeration"""
24 | CONTAINER = "container"
25 | TEXT = "text"
26 | INPUT = "input"
27 | BUTTON = "button"
28 | IMAGE = "image"
29 | LINK = "link"
30 | OTHER = "other"
31 |
32 |
33 | @dataclass
34 | class Point:
35 | """2D Point representation"""
36 | x: float
37 | y: float
38 |
39 |
40 | @dataclass
41 | class Size:
42 | """Size representation"""
43 | width: float
44 | height: float
45 |
46 |
47 | @dataclass
48 | class Rect:
49 | """Rectangle representation"""
50 | left: float
51 | top: float
52 | width: float
53 | height: float
54 |
55 | @property
56 | def right(self) -> float:
57 | return self.left + self.width
58 |
59 | @property
60 | def bottom(self) -> float:
61 | return self.top + self.height
62 |
63 | @property
64 | def center(self) -> Point:
65 | return Point(
66 | x=self.left + self.width / 2,
67 | y=self.top + self.height / 2
68 | )
69 |
70 |
71 | class BaseElement(BaseModel):
72 | """Base UI element interface"""
73 | id: str
74 | content: str
75 | rect: Rect
76 | center: tuple[float, float]
77 | node_type: NodeType = NodeType.OTHER
78 |     attributes: Dict[str, Any] = Field(default_factory=dict)
79 | is_visible: bool = True
80 | xpaths: Optional[List[str]] = None
81 |
82 | async def tap(self) -> None:
83 | """Tap/click this element"""
84 | raise NotImplementedError
85 |
86 | async def input_text(self, text: str) -> None:
87 | """Input text to this element"""
88 | raise NotImplementedError
89 |
90 |
91 | class UINode(BaseModel):
92 | """UI tree node representation"""
93 | id: str
94 | content: str
95 | rect: Rect
96 | center: tuple[float, float]
97 | node_type: NodeType
98 |     attributes: Dict[str, Any] = Field(default_factory=dict)
99 | is_visible: bool = True
100 |     children: List['UINode'] = Field(default_factory=list)
101 |
102 |
103 | class UITree(BaseModel):
104 | """UI tree representation"""
105 | node: UINode
106 |     children: List['UITree'] = Field(default_factory=list)
107 |
108 |
109 | class UIContext(BaseModel, Generic[ElementType]):
110 | """UI context containing screenshot and element information"""
111 | screenshot_base64: str
112 | size: Size
113 | content: List[ElementType]
114 | tree: UITree
115 |
116 |
117 | class AIUsageInfo(BaseModel):
118 | """AI usage information"""
119 | prompt_tokens: int = 0
120 | completion_tokens: int = 0
121 | total_tokens: int = 0
122 | cost: Optional[float] = None
123 |
124 |
125 | class LocateResult(BaseModel):
126 | """Element locate result"""
127 | element: Optional[BaseElement] = None
128 | rect: Optional[Rect] = None
129 |
130 |
131 | class ExecutionResult(BaseModel, Generic[T]):
132 | """Generic execution result"""
133 | success: bool = True
134 | data: Optional[Any] = None
135 | error: Optional[str] = None
136 | usage: Optional[AIUsageInfo] = None
137 |
138 |
139 | class AssertResult(BaseModel):
140 | """Assertion result"""
141 | passed: bool
142 | thought: str = ""
143 | message: str = ""
144 |
145 |
146 | # Type aliases
147 | TUserPrompt = Union[str, Dict[str, Any]]
148 | ElementById = Callable[[str], Optional[BaseElement]]
149 | OnTaskStartTip = Callable[[str], Union[None, Awaitable[None]]]
150 |
151 |
152 | # Abstract interface for device/platform implementations
153 | class AbstractInterface(ABC):
154 | """Abstract interface for platform implementations"""
155 |
156 | @property
157 | @abstractmethod
158 | def interface_type(self) -> InterfaceType:
159 | """Get interface type"""
160 | pass
161 |
162 | @abstractmethod
163 | async def get_context(self) -> UIContext:
164 | """Get current UI context"""
165 | pass
166 |
167 | @abstractmethod
168 | async def action_space(self) -> List[str]:
169 | """Get available actions"""
170 | pass
171 |
172 | @abstractmethod
173 | async def tap(self, x: float, y: float) -> None:
174 | """Tap at coordinates"""
175 | pass
176 |
177 | @abstractmethod
178 | async def input_text(self, text: str) -> None:
179 | """Input text"""
180 | pass
181 |
182 | @abstractmethod
183 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None:
184 | """Scroll in direction"""
185 | pass
186 |
187 |
188 | class InsightAction(str, Enum):
189 | """Insight action types"""
190 | LOCATE = "locate"
191 | EXTRACT = "extract"
192 | ASSERT = "assert"
193 |
194 |
195 | @dataclass
196 | class AgentOptions:
197 | """Agent configuration options"""
198 | test_id: Optional[str] = None
199 | cache_id: Optional[str] = None
200 | group_name: str = "Midscene Report"
201 | group_description: str = ""
202 | generate_report: bool = True
203 | auto_print_report_msg: bool = True
204 | ai_action_context: Optional[str] = None
205 | report_file_name: Optional[str] = None
206 | model_config: Optional[Callable] = None
207 |
208 |
209 | @dataclass
210 | class LocateOption:
211 | """Locate operation options"""
212 | prompt: Optional[TUserPrompt] = None
213 | deep_think: bool = False
214 | cacheable: bool = True
215 | xpath: Optional[str] = None
216 | ui_context: Optional[UIContext] = None
217 |
218 |
219 | @dataclass
220 | class ExtractOption:
221 | """Extract operation options"""
222 | dom_included: Union[bool, str] = False # False, True, or 'visible-only'
223 | screenshot_included: bool = True
224 | return_thought: bool = False
225 | is_wait_for_assert: bool = False
226 | do_not_throw_error: bool = False
227 |
228 |
229 | class ScrollParam(BaseModel):
230 | """Scroll parameters"""
231 | direction: str # 'down', 'up', 'left', 'right'
232 | scroll_type: str # 'once', 'untilBottom', 'untilTop', 'untilLeft', 'untilRight'
233 | distance: Optional[int] = None # distance in pixels
--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
1 | # 快速开始 - Midscene Python
2 |
3 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。
4 |
5 | ## 安装
6 |
7 | ```bash
8 | pip install midscene-python
9 | ```
10 |
11 | ## 基本配置
12 |
13 | ### 1. 配置 AI 模型
14 |
15 | 设置环境变量:
16 |
17 | ```bash
18 | export MIDSCENE_AI_PROVIDER=openai
19 | export MIDSCENE_AI_MODEL=gpt-4-vision-preview
20 | export MIDSCENE_AI_API_KEY=your-api-key-here
21 | ```
22 |
23 | 或创建配置文件 `midscene.yml`:
24 |
25 | ```yaml
26 | ai:
27 | provider: "openai"
28 | model: "gpt-4-vision-preview"
29 | api_key: "your-api-key-here"
30 | ```
31 |
32 | ### 2. 支持的 AI 提供商
33 |
34 | - **OpenAI**: GPT-4V, GPT-4o
35 | - **Anthropic**: Claude 3.5 Sonnet
36 | - **阿里云**: Qwen2.5-VL
37 | - **Google**: Gemini Pro Vision
38 |
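除环境变量外,也可以在代码中显式构造模型配置(示意;`AIModelConfig` 的完整字段以 `midscene.core.ai_model` 模块为准):

```python
from midscene.core.ai_model import AIModelConfig

# 示意配置:provider 与 model 的具体取值以所选服务商为准
config = AIModelConfig(
    provider="openai",
    model="gpt-4-vision-preview",
    api_key="your-api-key-here",
)
```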
39 | ## Web 自动化
40 |
41 | ### Selenium 示例
42 |
43 | ```python
44 | import asyncio
45 | from midscene import Agent
46 | from midscene.web import SeleniumWebPage
47 |
48 | async def web_automation():
49 | # 创建浏览器实例
50 | with SeleniumWebPage.create(headless=False) as page:
51 | agent = Agent(page)
52 |
53 | # 导航到网站
54 | await page.navigate_to("https://example.com")
55 |
56 | # 使用自然语言进行操作
57 | await agent.ai_action("点击登录按钮")
58 | await agent.ai_action("在用户名框输入 'demo@example.com'")
59 | await agent.ai_action("在密码框输入 'password123'")
60 | await agent.ai_action("点击提交按钮")
61 |
62 | # 数据提取
63 | user_info = await agent.ai_extract({
64 | "username": "用户名",
65 | "email": "邮箱地址"
66 | })
67 | print(f"用户信息: {user_info}")
68 |
69 | # 断言验证
70 | await agent.ai_assert("页面显示欢迎信息")
71 |
72 | # 运行示例
73 | asyncio.run(web_automation())
74 | ```
75 |
76 | ### Playwright 示例
77 |
78 | ```python
79 | import asyncio
80 | from midscene import Agent
81 | from midscene.web import PlaywrightWebPage
82 |
83 | async def playwright_automation():
84 | # 创建 Playwright 页面
85 | async with await PlaywrightWebPage.create() as page:
86 | agent = Agent(page)
87 |
88 | await page.navigate_to("https://playwright.dev")
89 | await agent.ai_action("点击文档链接")
90 |
91 | # 提取页面信息
92 | page_info = await agent.ai_extract({
93 | "title": "页面标题",
94 | "sections": ["主要章节列表"]
95 | })
96 | print(f"页面信息: {page_info}")
97 |
98 | asyncio.run(playwright_automation())
99 | ```
100 |
101 | ## Android 自动化
102 |
103 | ```python
104 | import asyncio
105 | from midscene.android import AndroidAgent
106 |
107 | async def android_automation():
108 | # 创建 Android Agent(自动检测设备)
109 | agent = await AndroidAgent.create()
110 |
111 | # 启动应用
112 | await agent.launch_app("com.android.settings")
113 |
114 | # 使用自然语言导航
115 | await agent.ai_action("点击WLAN设置")
116 | await agent.ai_action("滑动到底部")
117 |
118 | # 提取信息
119 | wifi_list = await agent.ai_extract({
120 | "networks": [
121 | {"name": "网络名称", "security": "安全类型"}
122 | ]
123 | })
124 | print(f"WiFi网络: {wifi_list}")
125 |
126 | # 返回
127 | await agent.back()
128 |
129 | asyncio.run(android_automation())
130 | ```
131 |
132 | ## 命令行工具
133 |
134 | ### 运行 YAML 脚本
135 |
136 | ```bash
137 | # 运行单个脚本
138 | midscene run script.yaml
139 |
140 | # 运行目录中的所有脚本
141 | midscene run scripts/
142 |
143 | # 使用配置文件
144 | midscene run script.yaml --config midscene.yml
145 |
146 | # 并发执行
147 | midscene run scripts/ --concurrent 3
148 |
149 | # Android 设备指定
150 | midscene run android_script.yaml --device device_id
151 | ```
152 |
153 | ### 列出 Android 设备
154 |
155 | ```bash
156 | midscene devices
157 | ```
158 |
159 | ### 初始化项目
160 |
161 | ```bash
162 | midscene init my-project
163 | cd my-project
164 | ```
165 |
166 | ## YAML 脚本格式
167 |
168 | 创建 `example.yaml`:
169 |
170 | ```yaml
171 | # Web 自动化脚本
172 | web:
173 | url: "https://example.com"
174 | browser: "chrome"
175 | headless: false
176 |
177 | tasks:
178 | - name: "登录操作"
179 | steps:
180 | - action: "ai_action"
181 | prompt: "点击登录按钮"
182 |
183 | - action: "ai_action"
184 | prompt: "输入用户名 'demo@example.com'"
185 |
186 | - action: "ai_action"
187 | prompt: "输入密码 'password123'"
188 |
189 | - action: "ai_action"
190 | prompt: "点击提交按钮"
191 |
192 | - name: "数据提取"
193 | steps:
194 | - action: "ai_extract"
195 | prompt:
196 | username: "用户名"
197 | email: "邮箱地址"
198 | save_to: "user_info"
199 |
200 | - name: "状态验证"
201 | steps:
202 | - action: "ai_assert"
203 | prompt: "页面显示欢迎信息"
204 | ```
205 |
206 | ## 核心概念
207 |
208 | ### Agent 系统
209 |
210 | Agent 是自动化操作的核心控制器,协调 AI 模型与设备交互:
211 |
212 | ```python
213 | from midscene import Agent
214 | from midscene.web import SeleniumWebPage
215 |
216 | page = SeleniumWebPage.create()
217 | agent = Agent(page)
218 | ```
219 |
220 | ### AI 操作类型
221 |
222 | 1. **ai_action**: 执行自然语言描述的操作
223 | 2. **ai_locate**: 定位 UI 元素
224 | 3. **ai_extract**: 提取结构化数据
225 | 4. **ai_assert**: 验证页面状态
226 |
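以下片段依次演示这四类操作(示意;`agent` 按上文方式创建,需在异步函数中调用):

```python
await agent.ai_action("点击登录按钮")                  # 执行操作
element = await agent.ai_locate("登录按钮")            # 定位元素
info = await agent.ai_extract({"username": "用户名"})  # 提取结构化数据
await agent.ai_assert("用户已成功登录")                # 验证页面状态
```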
227 | ### 缓存机制
228 |
229 | 启用缓存可以提升重复执行的效率:
230 |
231 | ```python
232 | from midscene.core import AgentOptions
233 |
234 | options = AgentOptions(
235 | cache_id="my_automation",
236 | generate_report=True
237 | )
238 | agent = Agent(page, options)
239 | ```
240 |
241 | ## 最佳实践
242 |
243 | ### 1. 错误处理
244 |
245 | ```python
246 | try:
247 | await agent.ai_action("点击不存在的按钮")
248 | except Exception as e:
249 | print(f"操作失败: {e}")
250 | ```
251 |
252 | ### 2. 等待条件
253 |
254 | ```python
255 | # 等待元素出现
256 | await agent.ai_wait_for("登录成功页面出现", timeout_ms=10000)
257 | ```
258 |
259 | ### 3. 数据验证
260 |
261 | ```python
262 | # 使用断言验证数据
263 | user_data = await agent.ai_extract({"username": "用户名"})
264 | assert user_data["username"], "用户名不能为空"
265 | ```
266 |
267 | ### 4. 截图和报告
268 |
269 | ```python
270 | # 生成执行报告
271 | options = AgentOptions(
272 | generate_report=True,
273 | report_file_name="automation_report"
274 | )
275 | ```
276 |
277 | ## 故障排除
278 |
279 | ### 常见问题
280 |
281 | 1. **AI API 密钥未设置**
282 | ```
283 | ValueError: AI API key not configured
284 | ```
285 | 解决:设置 `MIDSCENE_AI_API_KEY` 环境变量
286 |
287 | 2. **Chrome 浏览器未找到**
288 | ```
289 | WebDriverException: chrome not found
290 | ```
291 | 解决:安装 Chrome 浏览器或指定 Chrome 路径
292 |
293 | 3. **Android 设备连接失败**
294 | ```
295 | RuntimeError: No Android devices found
296 | ```
297 | 解决:确保设备已连接并启用 USB 调试
298 |
299 | ### 调试技巧
300 |
301 | 1. **启用详细日志**
302 | ```python
303 | from midscene.shared import setup_logger
304 | setup_logger(level="DEBUG")
305 | ```
306 |
307 | 2. **查看生成的报告**
308 | 执行完成后检查 `./reports/` 目录中的 HTML 报告
309 |
310 | 3. **使用非无头模式**
311 | 设置 `headless=False` 观察浏览器操作过程
312 |
313 | ## 下一步
314 |
315 | - 查看 [API 文档](api.md) 了解详细接口
316 | - 浏览 [示例集合](examples/) 学习更多用法
317 | - 阅读 [配置指南](configuration.md) 了解高级配置
--------------------------------------------------------------------------------
/wiki/快速开始.md:
--------------------------------------------------------------------------------
1 | # 快速开始
2 |
3 | 欢迎使用 Midscene Python!本指南将帮助你在 5 分钟内上手 AI 驱动的自动化操作。
4 |
5 | ## 📋 前置要求
6 |
7 | 在开始之前,请确保你的环境满足以下要求:
8 |
9 | - **Python 3.9+**
10 | - **pip** 包管理器
11 | - **浏览器** (Chrome/Firefox/Edge,用于 Web 自动化)
12 | - **AI 模型 API Key** (OpenAI、Claude、Qwen 或 Gemini 任选其一)
13 |
14 | ## 🚀 快速安装
15 |
16 | ### 1. 安装 Midscene Python
17 |
18 | ```bash
19 | pip install midscene-python
20 | ```
21 |
22 | ### 2. 安装浏览器驱动(可选)
23 |
24 | 如果你计划进行 Web 自动化,需要安装对应的浏览器驱动:
25 |
26 | ```bash
27 | # Selenium WebDriver
28 | pip install webdriver-manager
29 |
30 | # 或者 Playwright
31 | pip install playwright
32 | playwright install
33 | ```
34 |
35 | ### 3. 配置 AI 模型
36 |
37 | 创建 `.env` 文件配置 AI 模型(以 OpenAI 为例):
38 |
39 | ```bash
40 | # .env
41 | OPENAI_API_KEY=your_openai_api_key_here
42 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选,默认官方 API
43 | ```
44 |
45 | ## 🎯 第一个示例
46 |
47 | 让我们从一个简单的 Web 自动化示例开始:
48 |
49 | ### 示例 1: 搜索操作
50 |
51 | ```python
52 | import asyncio
53 | from midscene import Agent
54 | from midscene.web import SeleniumWebPage
55 |
56 | async def search_example():
57 | """在百度搜索 Python 教程"""
58 |
59 | # 创建 Web 页面实例
60 | with SeleniumWebPage.create() as page:
61 | # 创建 Agent
62 | agent = Agent(page)
63 |
64 | # 导航到网站
65 | await page.goto("https://www.baidu.com")
66 |
67 | # 使用自然语言进行搜索
68 | await agent.ai_action("在搜索框输入'Python 教程'")
69 | await agent.ai_action("点击搜索按钮")
70 |
71 | # 验证搜索结果
72 | await agent.ai_assert("页面显示了 Python 教程的搜索结果")
73 |
74 | print("✅ 搜索操作完成!")
75 |
76 | # 运行示例
77 | asyncio.run(search_example())
78 | ```
79 |
80 | ### 示例 2: 数据提取
81 |
82 | ```python
83 | import asyncio
84 | from midscene import Agent
85 | from midscene.web import SeleniumWebPage
86 |
87 | async def extract_example():
88 | """提取新闻标题"""
89 |
90 | with SeleniumWebPage.create() as page:
91 | agent = Agent(page)
92 |
93 | # 访问新闻网站
94 | await page.goto("https://news.example.com")
95 |
96 | # 提取结构化数据
97 | news_data = await agent.ai_extract({
98 | "articles": [
99 | {
100 | "title": "新闻标题",
101 | "time": "发布时间",
102 | "summary": "新闻摘要"
103 | }
104 | ]
105 | })
106 |
107 | # 输出结果
108 | for article in news_data["articles"]:
109 | print(f"📰 {article['title']}")
110 | print(f"⏰ {article['time']}")
111 | print(f"📄 {article['summary']}\n")
112 |
113 | # 运行示例
114 | asyncio.run(extract_example())
115 | ```
116 |
117 | ## 📱 Android 自动化示例
118 |
119 | ```python
120 | import asyncio
121 | from midscene import Agent
122 | from midscene.android import AndroidDevice
123 |
124 | async def android_example():
125 | """Android 应用自动化"""
126 |
127 | # 连接 Android 设备
128 | device = AndroidDevice()
129 | await device.connect()
130 |
131 | # 创建 Agent
132 | agent = Agent(device)
133 |
134 | # 启动应用
135 | await device.start_app("com.example.app")
136 |
137 | # 自然语言操作
138 | await agent.ai_action("点击登录按钮")
139 | await agent.ai_action("输入用户名 'testuser'")
140 | await agent.ai_action("输入密码 'password123'")
141 | await agent.ai_action("点击确认登录")
142 |
143 | # 验证登录状态
144 | await agent.ai_assert("显示用户已登录")
145 |
146 | print("✅ Android 自动化完成!")
147 |
148 | # 运行示例
149 | asyncio.run(android_example())
150 | ```
151 |
152 | ## 🎛️ 配置选项
153 |
154 | ### AI 模型配置
155 |
156 | ```python
157 | from midscene.core.ai_model import AIModelConfig
158 |
159 | # 自定义 AI 配置
160 | config = AIModelConfig(
161 | provider="openai", # 或 "claude", "qwen", "gemini"
162 | model="gpt-4-vision-preview",
163 | temperature=0.1,
164 | max_tokens=1000
165 | )
166 |
167 | agent = Agent(page, ai_config=config)
168 | ```
169 |
170 | ### Agent 选项
171 |
172 | ```python
173 | from midscene.core import AgentOptions
174 |
175 | # 自定义 Agent 选项
176 | options = AgentOptions(
177 | timeout=30, # 操作超时时间(秒)
178 | retry_count=3, # 重试次数
179 | screenshot_on_error=True, # 错误时自动截图
180 | cache_enabled=True # 启用智能缓存
181 | )
182 |
183 | agent = Agent(page, options=options)
184 | ```
185 |
186 | ## 🔧 常用操作
187 |
188 | ### 基础交互
189 |
190 | ```python
191 | # 点击操作
192 | await agent.ai_action("点击提交按钮")
193 | await agent.ai_action("点击页面右上角的用户头像")
194 |
195 | # 输入操作
196 | await agent.ai_action("在用户名框输入 'admin'")
197 | await agent.ai_action("在密码框输入密码")
198 |
199 | # 滚动操作
200 | await agent.ai_action("向下滚动查看更多内容")
201 | await agent.ai_action("滚动到页面底部")
202 |
203 | # 等待操作
204 | await agent.ai_action("等待页面加载完成")
205 | ```
206 |
207 | ### 元素定位
208 |
209 | ```python
210 | # 精确定位元素
211 | element = await agent.ai_locate("登录按钮")
212 | await element.click()
213 |
214 | # 定位多个元素
215 | elements = await agent.ai_locate_all("商品卡片")
216 | for element in elements:
217 | await element.hover()
218 | ```
219 |
220 | ### 条件断言
221 |
222 | ```python
223 | # 页面状态验证
224 | await agent.ai_assert("用户已成功登录")
225 | await agent.ai_assert("购物车显示 3 件商品")
226 | await agent.ai_assert("页面不包含错误信息")
227 |
228 | # 元素存在性验证
229 | await agent.ai_assert("页面包含搜索结果")
230 | await agent.ai_assert("显示用户个人信息")
231 | ```
232 |
233 | ## 📊 查看执行报告
234 |
235 | Midscene Python 自动生成详细的执行报告:
236 |
237 | ```python
238 | # 运行后,检查生成的报告文件
239 | # 报告位置: ./midscene_reports/
240 | # - execution_report.html # 可视化报告
241 | # - screenshots/ # 执行截图
242 | # - logs/ # 详细日志
243 | ```
244 |
245 | ## 🔍 调试技巧
246 |
247 | ### 启用详细日志
248 |
249 | ```python
250 | import logging
251 | from midscene.shared.logger import setup_logger
252 |
253 | # 启用调试日志
254 | setup_logger(level=logging.DEBUG)
255 | ```
256 |
257 | ### 截图调试
258 |
259 | ```python
260 | # 手动截图
261 | screenshot = await page.screenshot()
262 | with open("debug.png", "wb") as f:
263 | f.write(screenshot)
264 |
265 | # 获取页面信息
266 | context = await page.get_context()
267 | print(f"页面标题: {context.page_title}")
268 | print(f"页面 URL: {context.url}")
269 | ```
270 |
271 | ## 🚨 常见问题
272 |
273 | ### 1. AI 模型调用失败
274 | ```python
275 | # 检查 API Key 配置
276 | import os
277 |     print(f"API Key: {(os.getenv('OPENAI_API_KEY') or '未设置')[:10]}...")
278 | ```
279 |
280 | ### 2. 元素定位失败
281 | ```python
282 | # 使用更具体的描述
283 | await agent.ai_action("点击页面左上角的蓝色登录按钮")
284 | ```
285 |
286 | ### 3. 页面加载问题
287 | ```python
288 | # 添加等待时间
289 | await page.wait_for_page_load()
290 | await agent.ai_action("等待 3 秒让页面完全加载")
291 | ```
292 |
293 | ## 🎓 下一步
294 |
295 | 恭喜!你已经掌握了 Midscene Python 的基础用法。接下来可以:
296 |
297 | 1. 📖 深入学习 [核心概念](核心概念/Agent核心控制器.md)
298 | 2. 🔧 查看 [API 参考](API参考/Agent-API.md)
299 | 3. 🌐 了解 [Web 自动化](平台集成/Web自动化/README.md) 高级特性
300 | 4. 📱 探索 [Android 自动化](平台集成/Android自动化.md)
301 | 5. 🎯 参考 [最佳实践](示例和教程/最佳实践.md)
302 |
303 | ## 💡 小贴士
304 |
305 | - 使用具体、清晰的自然语言描述能获得更好的执行效果
306 | - 定期查看执行报告来优化自动化脚本
307 | - 善用缓存机制来提升执行效率
308 | - 为不同环境配置不同的 AI 模型
309 |
310 | ---
311 |
312 | *准备好探索更多功能了吗?查看我们的 [示例集合](示例和教程/基础示例.md)!*
--------------------------------------------------------------------------------
/tests/test_core.py:
--------------------------------------------------------------------------------
1 | """
2 | Test suite for Midscene Python core functionality
3 | """
4 |
5 | import pytest
6 | import asyncio
7 | from unittest.mock import Mock, AsyncMock
8 |
9 | from midscene.core.types import UIContext, Size, Rect, BaseElement, NodeType
10 | from midscene.core.insight import Insight
11 | from midscene.core.agent import Agent
12 |
13 |
14 | class MockInterface:
15 | """Mock interface for testing"""
16 |
17 | def __init__(self):
18 | self.interface_type = "mock"
19 | self._context = None
20 |
21 | async def get_context(self):
22 | if self._context:
23 | return self._context
24 |
25 | # Return mock context
26 | return UIContext(
27 | screenshot_base64="mock_screenshot",
28 | size=Size(width=1920, height=1080),
29 | content=[
30 | BaseElement(
31 | id="test_element",
32 | content="Test Button",
33 | rect=Rect(left=100, top=100, width=200, height=50),
34 | center=(200, 125),
35 | node_type=NodeType.BUTTON
36 | )
37 | ],
38 | tree=Mock()
39 | )
40 |
41 | async def action_space(self):
42 | return ["tap", "input", "scroll"]
43 |
44 | async def tap(self, x, y):
45 | pass
46 |
47 | async def input_text(self, text):
48 | pass
49 |
50 | async def scroll(self, direction, distance=None):
51 | pass
52 |
53 |
54 | @pytest.fixture
55 | def mock_interface():
56 | """Mock interface fixture"""
57 | return MockInterface()
58 |
59 |
60 | @pytest.fixture
61 | def mock_ai_service():
62 | """Mock AI service fixture"""
63 | ai_service = Mock()
64 | ai_service.call_ai = AsyncMock(return_value={
65 | "content": {
66 | "elements": [{"id": "test_element", "reason": "test"}],
67 | "reasoning": "test reasoning",
68 | "confidence": 0.9,
69 | "errors": []
70 | },
71 | "usage": {"total_tokens": 100}
72 | })
73 | return ai_service
74 |
75 |
76 | class TestInsight:
77 | """Test Insight AI engine"""
78 |
79 | @pytest.mark.asyncio
80 | async def test_locate_element(self, mock_interface, mock_ai_service):
81 | """Test element location"""
82 | insight = Insight(
83 | context_provider=mock_interface.get_context,
84 | ai_service=mock_ai_service
85 | )
86 |
87 | result = await insight.locate("test button")
88 |
89 | assert result.element is not None
90 | assert result.element.id == "test_element"
91 | mock_ai_service.call_ai.assert_called_once()
92 |
93 | @pytest.mark.asyncio
94 | async def test_extract_data(self, mock_interface, mock_ai_service):
95 | """Test data extraction"""
96 | # Mock extract response
97 | mock_ai_service.call_ai.return_value = {
98 | "content": {
99 | "data": {"title": "Test Page", "items": ["item1", "item2"]},
100 | "reasoning": "extracted data",
101 | "confidence": 0.9,
102 | "errors": []
103 | },
104 | "usage": {"total_tokens": 150}
105 | }
106 |
107 | insight = Insight(
108 | context_provider=mock_interface.get_context,
109 | ai_service=mock_ai_service
110 | )
111 |
112 | result = await insight.extract("extract page data")
113 |
114 | assert result["data"]["title"] == "Test Page"
115 | assert len(result["data"]["items"]) == 2
116 |
117 | @pytest.mark.asyncio
118 | async def test_assert_condition(self, mock_interface, mock_ai_service):
119 | """Test condition assertion"""
120 | # Mock assert response
121 | mock_ai_service.call_ai.return_value = {
122 | "content": {
123 | "passed": True,
124 | "reasoning": "condition is met",
125 | "confidence": 0.95,
126 | "message": "success"
127 | },
128 | "usage": {"total_tokens": 80}
129 | }
130 |
131 | insight = Insight(
132 | context_provider=mock_interface.get_context,
133 | ai_service=mock_ai_service
134 | )
135 |
136 | result = await insight.assert_condition("page is loaded")
137 |
138 | assert result.passed is True
139 | assert result.thought == "condition is met"
140 |
141 |
142 | class TestAgent:
143 | """Test Agent functionality"""
144 |
145 | @pytest.mark.asyncio
146 | async def test_agent_creation(self, mock_interface):
147 | """Test agent creation"""
148 | agent = Agent(mock_interface)
149 |
150 | assert agent.interface == mock_interface
151 | assert agent.insight is not None
152 | assert agent.task_executor is not None
153 | assert agent.destroyed is False
154 |
155 | @pytest.mark.asyncio
156 | async def test_ai_locate(self, mock_interface, mock_ai_service):
157 | """Test AI locate through agent"""
158 | agent = Agent(mock_interface)
159 | agent.insight.ai_service = mock_ai_service
160 |
161 | result = await agent.ai_locate("test button")
162 |
163 | assert result.element is not None
164 | assert result.element.id == "test_element"
165 |
166 | @pytest.mark.asyncio
167 | async def test_ai_extract(self, mock_interface, mock_ai_service):
168 | """Test AI extract through agent"""
169 | # Mock extract response
170 | mock_ai_service.call_ai.return_value = {
171 | "content": {
172 | "data": {"username": "testuser"},
173 | "reasoning": "extracted username",
174 | "confidence": 0.9,
175 | "errors": []
176 | },
177 | "usage": {"total_tokens": 100}
178 | }
179 |
180 | agent = Agent(mock_interface)
181 | agent.insight.ai_service = mock_ai_service
182 |
183 | result = await agent.ai_extract("extract username")
184 |
185 | assert result["username"] == "testuser"
186 |
187 | @pytest.mark.asyncio
188 | async def test_ai_assert_success(self, mock_interface, mock_ai_service):
189 | """Test AI assert success"""
190 | # Mock assert response
191 | mock_ai_service.call_ai.return_value = {
192 | "content": {
193 | "passed": True,
194 | "reasoning": "condition met",
195 | "confidence": 0.9,
196 | "message": "success"
197 | },
198 | "usage": {"total_tokens": 80}
199 | }
200 |
201 | agent = Agent(mock_interface)
202 | agent.insight.ai_service = mock_ai_service
203 |
204 | # Should not raise exception
205 | await agent.ai_assert("page is loaded")
206 |
207 | @pytest.mark.asyncio
208 | async def test_ai_assert_failure(self, mock_interface, mock_ai_service):
209 | """Test AI assert failure"""
210 | # Mock assert response
211 | mock_ai_service.call_ai.return_value = {
212 | "content": {
213 | "passed": False,
214 | "reasoning": "condition not met",
215 | "confidence": 0.9,
216 | "message": "login failed"
217 | },
218 | "usage": {"total_tokens": 80}
219 | }
220 |
221 | agent = Agent(mock_interface)
222 | agent.insight.ai_service = mock_ai_service
223 |
224 | # Should raise AssertionError
225 | with pytest.raises(AssertionError):
226 | await agent.ai_assert("user is logged in")
227 |
228 | @pytest.mark.asyncio
229 | async def test_basic_actions(self, mock_interface):
230 | """Test basic agent actions"""
231 | agent = Agent(mock_interface)
232 |
233 | # Test tap
234 | await agent.tap(100, 200)
235 |
236 | # Test input
237 | await agent.input_text("test text")
238 |
239 | # Test scroll
240 | from midscene.core.types import ScrollParam
241 | scroll_param = ScrollParam(direction="down", scroll_type="once", distance=500)
242 | await agent.scroll(scroll_param)
243 |
244 | @pytest.mark.asyncio
245 | async def test_agent_destroy(self, mock_interface):
246 | """Test agent destruction"""
247 | agent = Agent(mock_interface)
248 |
249 | await agent.destroy()
250 |
251 | assert agent.destroyed is True
252 |
253 | # Should raise error when using destroyed agent
254 | with pytest.raises(RuntimeError):
255 | await agent.ai_locate("test")
256 |
257 |
258 | if __name__ == "__main__":
259 | pytest.main([__file__, "-v"])
--------------------------------------------------------------------------------
/wiki/安装配置.md:
--------------------------------------------------------------------------------
1 | # 安装配置
2 |
3 | 本章节详细介绍 Midscene Python 的安装步骤、环境配置和依赖管理。
4 |
5 | ## 📋 系统要求
6 |
7 | ### 基础要求
8 | - **Python**: 3.9 或更高版本
9 | - **操作系统**: Windows 10+, macOS 10.14+, Linux (Ubuntu 18.04+)
10 | - **内存**: 最少 4GB RAM(推荐 8GB+)
11 | - **网络**: 稳定的互联网连接(用于 AI 模型调用)
12 |
13 | ### AI 模型要求
14 | 至少需要以下 AI 服务之一的 API 访问权限:
15 | - OpenAI GPT-4V
16 | - Anthropic Claude 3
17 | - 阿里云通义千问 VL
18 | - Google Gemini Pro Vision
19 |
20 | ## 🚀 快速安装
21 |
22 | ### 方式一:使用 pip 安装(推荐)
23 | ```bash
24 | # 安装最新版本
25 | pip install midscene-python
26 |
27 | # 或指定版本
28 | pip install midscene-python==0.1.0
29 | ```
30 |
31 | ### 方式二:从源码安装
32 | ```bash
33 | # 克隆仓库
34 | git clone https://gitee.com/Python51888/midscene-python.git
35 | cd midscene-python
36 |
37 | # 安装依赖并安装
38 | pip install -e .
39 | ```
40 |
41 | ### 方式三:开发者安装
42 | ```bash
43 | # 克隆仓库
44 | git clone https://gitee.com/Python51888/midscene-python.git
45 | cd midscene-python
46 |
47 | # 安装开发依赖
48 | pip install -e ".[dev,docs]"
49 |
50 | # 安装 pre-commit hooks
51 | pre-commit install
52 | ```
53 |
54 | ## 🔧 平台特定配置
55 |
56 | ### Web 自动化配置
57 |
58 | #### Selenium 配置
59 | ```bash
60 | # 安装 Selenium 和 WebDriver 管理器
61 | pip install selenium webdriver-manager
62 |
63 | # Python 代码中自动管理驱动
64 | from selenium import webdriver
65 | from webdriver_manager.chrome import ChromeDriverManager
66 | from selenium.webdriver.chrome.service import Service
67 |
68 | service = Service(ChromeDriverManager().install())
69 | driver = webdriver.Chrome(service=service)
70 | ```
71 |
72 | #### Playwright 配置
73 | ```bash
74 | # 安装 Playwright
75 | pip install playwright
76 |
77 | # 安装浏览器
78 | playwright install
79 |
80 | # 仅安装 Chromium(节省空间)
81 | playwright install chromium
82 | ```
83 |
84 | ### Android 自动化配置
85 |
86 | #### ADB 设置
87 | ```bash
88 | # 安装 ADB(Ubuntu/Debian)
89 | sudo apt-get install android-tools-adb
90 |
91 | # 安装 ADB(macOS)
92 | brew install android-platform-tools
93 |
94 | # 安装 ADB(Windows)
95 | # 下载 Android SDK Platform Tools
96 | # 添加到系统 PATH
97 | ```
98 |
99 | #### 设备连接
100 | ```bash
101 | # 启用开发者选项和 USB 调试
102 | # 连接设备后验证
103 | adb devices
104 |
105 | # 预期输出
106 | List of devices attached
107 | DEVICE_ID device
108 | ```
109 |
110 | ## 🔑 AI 模型配置
111 |
112 | ### 环境变量配置
113 | 创建 `.env` 文件:
114 |
115 | ```bash
116 | # OpenAI 配置
117 | OPENAI_API_KEY=sk-your-openai-api-key
118 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选
119 |
120 | # Anthropic 配置
121 | ANTHROPIC_API_KEY=sk-ant-your-anthropic-key
122 |
123 | # 通义千问配置
124 | DASHSCOPE_API_KEY=sk-your-dashscope-key
125 |
126 | # Gemini 配置
127 | GOOGLE_API_KEY=AIza-your-google-api-key
128 |
129 | # 默认模型配置
130 | MIDSCENE_AI_PROVIDER=openai
131 | MIDSCENE_AI_MODEL=gpt-4-vision-preview
132 | ```
133 |
134 | ### 代码配置
135 | ```python
136 | from midscene.core.ai_model import AIModelConfig
137 |
138 | # 多个 AI 提供商配置
139 | configs = {
140 | "openai": AIModelConfig(
141 | provider="openai",
142 | model="gpt-4-vision-preview",
143 | api_key="your-openai-key",
144 | temperature=0.1
145 | ),
146 | "claude": AIModelConfig(
147 | provider="anthropic",
148 | model="claude-3-sonnet-20240229",
149 | api_key="your-claude-key",
150 | temperature=0.1
151 | )
152 | }
153 | ```
154 |
155 | ## 📦 依赖管理
156 |
157 | ### 核心依赖
158 | ```toml
159 | # pyproject.toml 中的核心依赖
160 | [project]
161 | dependencies = [
162 | "pydantic>=2.0,<3.0",
163 | "selenium>=4.15.0,<5.0",
164 | "playwright>=1.40.0,<2.0",
165 | "opencv-python>=4.8.0,<5.0",
166 | "pillow>=10.0.0,<11.0",
167 | "aiohttp>=3.9.0,<4.0",
168 | "loguru>=0.7.0,<1.0",
169 | "typer>=0.9.0,<1.0",
170 | "httpx>=0.25.0,<1.0",
171 | "openai>=1.3.0,<2.0",
172 | "anthropic>=0.7.0,<1.0"
173 | ]
174 | ```
175 |
176 | ### 可选依赖
177 | ```bash
178 | # 开发工具
179 | pip install "midscene-python[dev]"
180 |
181 | # 文档工具
182 | pip install "midscene-python[docs]"
183 |
184 | # 全部依赖
185 | pip install "midscene-python[dev,docs]"
186 | ```
187 |
188 | ## 🔍 验证安装
189 |
190 | ### 基础验证
191 | ```python
192 | # test_installation.py
193 | import asyncio
194 | from midscene import Agent
195 | from midscene.core.ai_model import AIModelService
196 |
197 | async def test_installation():
198 | """测试安装是否成功"""
199 |
200 | # 测试导入
201 | print("✓ 导入模块成功")
202 |
203 | # 测试 AI 服务配置
204 | try:
205 | ai_service = AIModelService()
206 | print("✓ AI 服务初始化成功")
207 | except Exception as e:
208 | print(f"✗ AI 服务初始化失败: {e}")
209 |
210 | print("🎉 安装验证完成!")
211 |
212 | # 运行测试
213 | asyncio.run(test_installation())
214 | ```
215 |
216 | ### Web 平台验证
217 | ```python
218 | # test_web.py
219 | import asyncio
220 | from midscene import Agent
221 | from midscene.web import SeleniumWebPage
222 |
223 | async def test_web():
224 | """测试 Web 平台功能"""
225 | try:
226 | with SeleniumWebPage.create() as page:
227 | agent = Agent(page)
228 | await page.goto("https://www.example.com")
229 | print("✓ Web 自动化测试成功")
230 | except Exception as e:
231 | print(f"✗ Web 自动化测试失败: {e}")
232 |
233 | asyncio.run(test_web())
234 | ```
235 |
236 | ### Android 平台验证
237 | ```python
238 | # test_android.py
239 | import asyncio
240 | from midscene import Agent
241 | from midscene.android import AndroidDevice
242 |
243 | async def test_android():
244 | """测试 Android 平台功能"""
245 | try:
246 | device = AndroidDevice()
247 | await device.connect()
248 | agent = Agent(device)
249 | print("✓ Android 自动化测试成功")
250 | except Exception as e:
251 | print(f"✗ Android 自动化测试失败: {e}")
252 |
253 | asyncio.run(test_android())
254 | ```
255 |
256 | ## 🔧 常见问题解决
257 |
258 | ### Python 版本问题
259 | ```bash
260 | # 检查 Python 版本
261 | python --version
262 |
263 | # 如果版本低于 3.9,安装新版本
264 | # Ubuntu/Debian
265 | sudo apt-get install python3.9
266 |
267 | # macOS
268 | brew install python@3.9
269 |
270 | # Windows
271 | # 从 python.org 下载安装
272 | ```
273 |
274 | ### 依赖冲突解决
275 | ```bash
276 | # 创建虚拟环境(推荐)
277 | python -m venv midscene-env
278 | source midscene-env/bin/activate # Linux/macOS
279 | # 或
280 | midscene-env\Scripts\activate # Windows
281 |
282 | # 在虚拟环境中安装
283 | pip install midscene-python
284 | ```
285 |
286 | ### 网络连接问题
287 | ```bash
288 | # 使用国内镜像源
289 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple midscene-python
290 |
291 | # 或配置永久镜像源
292 | pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
293 | ```
294 |
295 | ### AI API 连接问题
296 | ```python
297 | # 测试 API 连接
298 | import os
299 | import httpx
300 |
301 | async def test_openai_connection():
302 | api_key = os.getenv("OPENAI_API_KEY")
303 | if not api_key:
304 | print("❌ 未设置 OPENAI_API_KEY")
305 | return
306 |
307 | async with httpx.AsyncClient() as client:
308 | try:
309 | response = await client.get(
310 | "https://api.openai.com/v1/models",
311 | headers={"Authorization": f"Bearer {api_key}"}
312 | )
313 | if response.status_code == 200:
314 | print("✅ OpenAI API 连接正常")
315 | else:
316 | print(f"❌ OpenAI API 连接失败: {response.status_code}")
317 | except Exception as e:
318 | print(f"❌ 网络连接错误: {e}")
319 | ```
320 |
321 | ## 🚀 性能优化配置
322 |
323 | ### 系统级优化
324 | ```bash
325 | # 增加文件描述符限制(Linux/macOS)
326 | ulimit -n 65536
327 |
328 | # 设置环境变量优化
329 | export PYTHONUNBUFFERED=1
330 | export PYTHONDONTWRITEBYTECODE=1
331 | ```
332 |
333 | ### Python 配置优化
334 | ```python
335 | # config.py
336 | import asyncio
337 |
338 | # 设置异步事件循环策略
339 | if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'):
340 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
341 |
342 | # 配置日志级别
343 | import logging
344 | logging.getLogger("httpx").setLevel(logging.WARNING)
345 | logging.getLogger("selenium").setLevel(logging.WARNING)
346 | ```
347 |
348 | ## 📋 配置检查清单
349 |
350 | ### 安装完成检查
351 | - [ ] Python 3.9+ 已安装
352 | - [ ] midscene-python 包已安装
353 | - [ ] 至少一个 AI 提供商已配置
354 | - [ ] Web 驱动程序已安装(如果使用 Web 自动化)
355 | - [ ] ADB 已安装并设备已连接(如果使用 Android 自动化)
356 |
357 | ### 环境配置检查
358 | - [ ] 环境变量已设置
359 | - [ ] API 密钥有效且有足够额度
360 | - [ ] 网络连接正常
361 | - [ ] 防火墙和代理配置正确
362 |
363 | ### 功能测试检查
364 | - [ ] 基础导入测试通过
365 | - [ ] AI 服务初始化成功
366 | - [ ] 选择的平台(Web/Android)测试通过
367 | - [ ] 示例代码可以正常运行
368 |
369 | ## 🔄 升级和维护
370 |
371 | ### 版本升级
372 | ```bash
373 | # 检查当前版本
374 | pip show midscene-python
375 |
376 | # 升级到最新版本
377 | pip install --upgrade midscene-python
378 |
379 | # 升级特定版本
380 | pip install midscene-python==0.2.0
381 | ```
382 |
383 | ### 配置备份
384 | ```bash
385 | # 备份配置文件
386 | cp .env .env.backup
387 | cp pyproject.toml pyproject.toml.backup
388 |
389 | # 导出依赖列表
390 | pip freeze > requirements.txt
391 | ```
392 |
393 | ### 清理和重装
394 | ```bash
395 | # 卸载当前版本
396 | pip uninstall midscene-python
397 |
398 | # 清理缓存
399 | pip cache purge
400 |
401 | # 重新安装
402 | pip install midscene-python
403 | ```
404 |
405 | ---
406 |
407 | 完成配置后,您就可以开始使用 Midscene Python 进行 AI 驱动的自动化了!接下来推荐阅读 [快速开始](快速开始.md) 指南。
--------------------------------------------------------------------------------
/wiki/平台集成/README.md:
--------------------------------------------------------------------------------
1 | # 平台集成
2 |
3 | Midscene Python 支持多个平台的 UI 自动化,提供统一的编程接口和一致的操作体验。
4 |
5 | ## 🏗️ 架构概览
6 |
7 | ```mermaid
8 | graph TB
9 | A[Agent 统一接口] --> B[平台抽象层]
10 | B --> C[Web 自动化]
11 | B --> D[Android 自动化]
12 |
13 | C --> E[Selenium 集成]
14 | C --> F[Playwright 集成]
15 | C --> G[Web 桥接机制]
16 |
17 | D --> H[ADB 设备管理]
18 | D --> I[Android Agent]
19 |
20 | E --> J[ChromeDriver]
21 | E --> K[FirefoxDriver]
22 | F --> L[Chromium]
23 | F --> M[Firefox]
24 | F --> N[Safari]
25 |
26 | H --> O[USB 设备]
27 | H --> P[网络设备]
28 | H --> Q[模拟器]
29 | ```
30 |
31 | ## 📱 支持的平台
32 |
33 | ### Web 自动化
34 | - **Selenium WebDriver**: 支持 Chrome、Firefox、Safari、Edge
35 | - **Playwright**: 支持 Chromium、Firefox、WebKit
36 | - **统一桥接**: 提供一致的 API 接口
37 |
38 | ### Android 自动化
39 | - **真实设备**: 通过 USB 或 WiFi 连接
40 | - **Android 模拟器**: 支持各种 AVD 配置
41 | - **云设备**: 支持云端设备服务
42 |
43 | ## 🌐 Web 自动化
44 |
45 | ### 快速开始
46 | ```python
47 | import asyncio
48 | from midscene import Agent
49 | from midscene.web import SeleniumWebPage, PlaywrightWebPage
50 |
51 | # Selenium 示例
52 | async def selenium_example():
53 | with SeleniumWebPage.create() as page:
54 | agent = Agent(page)
55 | await page.goto("https://example.com")
56 | await agent.ai_action("点击登录按钮")
57 |
58 | # Playwright 示例
59 | async def playwright_example():
60 |     async with PlaywrightWebPage.create() as page:
61 | agent = Agent(page)
62 | await page.goto("https://example.com")
63 | await agent.ai_action("点击登录按钮")
64 | ```
65 |
66 | ### 高级配置
67 | ```python
68 | from midscene.web import SeleniumWebPage
69 | from selenium.webdriver.chrome.options import Options
70 |
71 | # 自定义浏览器选项
72 | chrome_options = Options()
73 | chrome_options.add_argument("--headless")
74 | chrome_options.add_argument("--no-sandbox")
75 |
76 | page = SeleniumWebPage.create(
77 | browser="chrome",
78 | options=chrome_options,
79 | window_size=(1920, 1080)
80 | )
81 | ```
82 |
83 | ### 详细文档
84 | - [Selenium集成](Web自动化/Selenium集成.md) - Selenium WebDriver 完整指南
85 | - [Playwright集成](Web自动化/Playwright集成.md) - Playwright 集成和配置
86 | - [Web桥接机制](Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象
87 |
88 | ## 📱 Android 自动化
89 |
90 | ### 快速开始
91 | ```python
92 | import asyncio
93 | from midscene import Agent
94 | from midscene.android import AndroidDevice
95 |
96 | async def android_example():
97 | # 连接设备
98 | device = AndroidDevice()
99 | await device.connect()
100 |
101 | # 创建 Agent
102 | agent = Agent(device)
103 |
104 | # 启动应用
105 | await device.start_app("com.example.app")
106 |
107 | # AI 操作
108 | await agent.ai_action("点击登录按钮")
109 | await agent.ai_action("输入用户名 'testuser'")
110 | await agent.ai_action("点击提交")
111 | ```
112 |
113 | ### 设备管理
114 | ```python
115 | from midscene.android import AndroidDevice, DeviceManager
116 |
117 | # 连接特定设备
118 | device = AndroidDevice(device_id="emulator-5554")
119 |
120 | # 设备管理器
121 | manager = DeviceManager()
122 | devices = await manager.list_devices()
123 | for device in devices:
124 | print(f"设备: {device.id}, 状态: {device.status}")
125 | ```
126 |
127 | ### 详细文档
128 | - [Android自动化](Android自动化.md) - Android 平台完整指南
129 |
130 | ## 🔄 统一操作接口
131 |
132 | 无论使用哪个平台,Midscene Python 都提供一致的操作接口:
133 |
134 | ### Agent 操作
135 | ```python
136 | # Web 和 Android 使用相同的方法
137 | await agent.ai_action("点击按钮")
138 | await agent.ai_action("输入文本 'hello'")
139 | await agent.ai_action("滚动到底部")
140 |
141 | # 数据提取
142 | data = await agent.ai_extract({
143 | "title": "页面标题",
144 | "items": ["列表项目"]
145 | })
146 |
147 | # 状态断言
148 | await agent.ai_assert("页面显示成功消息")
149 | ```
150 |
151 | ### 页面操作
152 | ```python
153 | # 统一的页面操作
154 | await page.goto("https://example.com") # Web
155 | await device.start_app("com.app") # Android
156 |
157 | # 截图
158 | screenshot = await page.screenshot() # Web
159 | screenshot = await device.screenshot() # Android
160 |
161 | # 获取上下文
162 | context = await page.get_context() # Web
163 | context = await device.get_context() # Android
164 | ```
165 |
166 | ## 🔧 平台适配机制
167 |
168 | ### AbstractInterface 抽象基类
169 | ```python
170 | from midscene.core.types import AbstractInterface, InterfaceType
171 |
172 | class CustomPlatform(AbstractInterface):
173 | @property
174 | def interface_type(self) -> InterfaceType:
175 | return InterfaceType.WEB # 或 InterfaceType.ANDROID
176 |
177 | async def get_context(self) -> UIContext:
178 | # 实现获取页面/屏幕上下文
179 | pass
180 |
181 | async def tap(self, x: float, y: float) -> None:
182 | # 实现点击操作
183 | pass
184 |
185 | async def input_text(self, text: str) -> None:
186 | # 实现文本输入
187 | pass
188 | ```
189 |
190 | ### 桥接模式实现
191 | ```python
192 | # Web 桥接示例
193 | class WebBridge:
194 | def __init__(self, driver_type: str):
195 | if driver_type == "selenium":
196 | self.driver = SeleniumWebDriver()
197 | elif driver_type == "playwright":
198 | self.driver = PlaywrightDriver()
199 |
200 | async def unified_action(self, action: str, **kwargs):
201 | # 统一的操作接口
202 | return await self.driver.execute_action(action, **kwargs)
203 | ```
204 |
205 | ## 🚀 平台选择指南
206 |
207 | ### Web 平台选择
208 |
209 | #### Selenium
210 | **适用场景**:
211 | - 需要支持多种浏览器
212 | - 与现有 Selenium 项目集成
213 | - 需要特定的 WebDriver 功能
214 |
215 | **优势**:
216 | - 成熟稳定,社区支持好
217 | - 支持的浏览器最多
218 | - 与 Selenium Grid 集成
219 |
220 | **劣势**:
221 | - 性能相对较慢
222 | - API 相对复杂
223 |
224 | #### Playwright
225 | **适用场景**:
226 | - 需要高性能的自动化
227 | - 现代 Web 应用测试
228 | - 需要网络拦截等高级功能
229 |
230 | **优势**:
231 | - 性能优异
232 | - 现代化的 API 设计
233 | - 内置等待和重试机制
234 |
235 | **劣势**:
236 | - 相对较新,生态系统较小
237 | - 学习成本稍高
238 |
239 | ### Android 平台特点
240 |
241 | **适用场景**:
242 | - 移动应用 UI 测试
243 | - 移动端业务流程自动化
244 | - 跨平台应用测试
245 |
246 | **优势**:
247 | - 直接操作原生 Android 界面
248 | - 支持各种 Android 版本
249 | - 可以测试真实设备体验
250 |
251 | **注意事项**:
252 | - 需要 ADB 环境配置
253 | - 设备连接稳定性要求高
254 | - 权限和安全限制较多
255 |
256 | ## 📊 性能对比
257 |
258 | | 特性 | Selenium | Playwright | Android |
259 | |------|----------|------------|---------|
260 | | **启动速度** | 中等 | 快 | 较慢 |
261 | | **执行速度** | 中等 | 快 | 取决于设备 |
262 | | **资源占用** | 中等 | 低 | 高 |
263 | | **稳定性** | 高 | 高 | 中等 |
264 | | **调试难度** | 中等 | 低 | 高 |
265 |
266 | ## 🔗 跨平台最佳实践
267 |
268 | ### 1. 统一测试脚本
269 | ```python
270 | async def universal_test(platform: str):
271 | """跨平台测试脚本"""
272 |
273 | if platform == "web":
274 | page = SeleniumWebPage.create()
275 | agent = Agent(page)
276 | await page.goto("https://app.example.com")
277 |
278 | elif platform == "android":
279 | device = AndroidDevice()
280 | await device.connect()
281 | agent = Agent(device)
282 | await device.start_app("com.example.app")
283 |
284 | # 统一的测试步骤
285 | await agent.ai_action("点击登录按钮")
286 | await agent.ai_action("输入用户名 'test'")
287 | await agent.ai_action("输入密码 'password'")
288 | await agent.ai_action("点击提交")
289 |
290 | # 统一的验证
291 | await agent.ai_assert("显示欢迎页面")
292 | ```
293 |
294 | ### 2. 配置管理
295 | ```python
296 | # config.py
297 | PLATFORM_CONFIGS = {
298 | "web": {
299 | "browser": "chrome",
300 | "headless": False,
301 | "window_size": (1920, 1080)
302 | },
303 | "android": {
304 | "device_id": None, # 自动选择
305 | "app_package": "com.example.app",
306 | "timeout": 30
307 | }
308 | }
309 |
310 | def get_platform_config(platform: str) -> dict:
311 | return PLATFORM_CONFIGS.get(platform, {})
312 | ```
313 |
314 | ### 3. 错误处理
315 | ```python
316 | async def robust_platform_operation(agent: Agent, action: str):
317 | """跨平台的健壮操作"""
318 |
319 | max_retries = 3
320 | for attempt in range(max_retries):
321 | try:
322 | await agent.ai_action(action)
323 | return
324 | except Exception as e:
325 | if attempt == max_retries - 1:
326 | raise
327 |
328 | # 根据平台类型进行特定的恢复操作
329 | platform_type = agent.interface.interface_type
330 | if platform_type == InterfaceType.WEB:
331 | await handle_web_error(agent, e)
332 | elif platform_type == InterfaceType.ANDROID:
333 | await handle_android_error(agent, e)
334 |
335 | await asyncio.sleep(1) # 等待后重试
336 | ```
337 |
338 | ## 🔍 调试和诊断
339 |
340 | ### 统一调试接口
341 | ```python
342 | async def debug_platform_info(agent: Agent):
343 | """获取平台调试信息"""
344 |
345 | interface = agent.interface
346 | platform_type = interface.interface_type
347 |
348 | print(f"平台类型: {platform_type}")
349 |
350 | if platform_type == InterfaceType.WEB:
351 | context = await interface.get_context()
352 | print(f"页面标题: {context.page_title}")
353 | print(f"页面 URL: {context.url}")
354 | print(f"视口大小: {context.size}")
355 |
356 | elif platform_type == InterfaceType.ANDROID:
357 | context = await interface.get_context()
358 | print(f"屏幕尺寸: {context.size}")
359 | print(f"当前活动: {context.current_activity}")
360 | print(f"设备信息: {context.device_info}")
361 | ```
362 |
363 | ### 跨平台截图
364 | ```python
365 | async def take_debug_screenshot(agent: Agent, filename: str):
366 | """跨平台截图功能"""
367 |
368 | interface = agent.interface
369 | screenshot = await interface.screenshot()
370 |
371 | # 添加平台标识
372 | platform_type = interface.interface_type.value
373 | timestamped_filename = f"{platform_type}_{filename}_{int(time.time())}.png"
374 |
375 | with open(timestamped_filename, "wb") as f:
376 | f.write(screenshot)
377 |
378 | print(f"截图已保存: {timestamped_filename}")
379 | ```
380 |
381 | ---
382 |
383 | 通过 Midscene Python 的平台集成能力,你可以用统一的方式处理不同平台的自动化需求。选择适合你项目需求的平台,并利用统一的 API 来简化开发和维护工作!
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv pip compile --all-extras pyproject.toml -o requirements.txt
3 | aiohappyeyeballs==2.6.1
4 | # via aiohttp
5 | aiohttp==3.12.15
6 | # via
7 | # midscene-python (pyproject.toml)
8 | # dashscope
9 | aiosignal==1.4.0
10 | # via aiohttp
11 | annotated-types==0.7.0
12 | # via pydantic
13 | anthropic==0.64.0
14 | # via midscene-python (pyproject.toml)
15 | anyio==4.10.0
16 | # via
17 | # anthropic
18 | # httpx
19 | # openai
20 | asyncio-mqtt==0.16.2
21 | # via midscene-python (pyproject.toml)
22 | attrs==25.3.0
23 | # via
24 | # aiohttp
25 | # outcome
26 | # trio
27 | babel==2.17.0
28 | # via mkdocs-material
29 | backrefs==5.9
30 | # via mkdocs-material
31 | black==25.1.0
32 | # via midscene-python (pyproject.toml)
33 | cachetools==5.5.2
34 | # via google-auth
35 | certifi==2025.8.3
36 | # via
37 | # httpcore
38 | # httpx
39 | # requests
40 | # selenium
41 | cffi==1.17.1
42 | # via
43 | # cryptography
44 | # trio
45 | cfgv==3.4.0
46 | # via pre-commit
47 | charset-normalizer==3.4.3
48 | # via requests
49 | click==8.2.1
50 | # via
51 | # black
52 | # mkdocs
53 | # mkdocs-material
54 | # typer
55 | colorama==0.4.6
56 | # via
57 | # click
58 | # griffe
59 | # loguru
60 | # mkdocs
61 | # mkdocs-material
62 | # pytest
63 | # tqdm
64 | coverage==7.10.6
65 | # via pytest-cov
66 | cryptography==45.0.7
67 | # via dashscope
68 | dashscope==1.24.2
69 | # via midscene-python (pyproject.toml)
70 | distlib==0.4.0
71 | # via virtualenv
72 | distro==1.9.0
73 | # via
74 | # anthropic
75 | # openai
76 | filelock==3.19.1
77 | # via virtualenv
78 | frozenlist==1.7.0
79 | # via
80 | # aiohttp
81 | # aiosignal
82 | ghp-import==2.1.0
83 | # via mkdocs
84 | google-ai-generativelanguage==0.6.15
85 | # via google-generativeai
86 | google-api-core==2.25.1
87 | # via
88 | # google-ai-generativelanguage
89 | # google-api-python-client
90 | # google-generativeai
91 | google-api-python-client==2.179.0
92 | # via google-generativeai
93 | google-auth==2.40.3
94 | # via
95 | # google-ai-generativelanguage
96 | # google-api-core
97 | # google-api-python-client
98 | # google-auth-httplib2
99 | # google-generativeai
100 | google-auth-httplib2==0.2.0
101 | # via google-api-python-client
102 | google-generativeai==0.8.5
103 | # via midscene-python (pyproject.toml)
104 | googleapis-common-protos==1.70.0
105 | # via
106 | # google-api-core
107 | # grpcio-status
108 | greenlet==3.2.4
109 | # via playwright
110 | griffe==1.13.0
111 | # via mkdocstrings-python
112 | grpcio==1.74.0
113 | # via
114 | # google-api-core
115 | # grpcio-status
116 | grpcio-status==1.71.2
117 | # via google-api-core
118 | h11==0.16.0
119 | # via
120 | # httpcore
121 | # wsproto
122 | httpcore==1.0.9
123 | # via httpx
124 | httplib2==0.30.0
125 | # via
126 | # google-api-python-client
127 | # google-auth-httplib2
128 | httpx==0.28.1
129 | # via
130 | # midscene-python (pyproject.toml)
131 | # anthropic
132 | # openai
133 | identify==2.6.13
134 | # via pre-commit
135 | idna==3.10
136 | # via
137 | # anyio
138 | # httpx
139 | # requests
140 | # trio
141 | # yarl
142 | iniconfig==2.1.0
143 | # via pytest
144 | isort==6.0.1
145 | # via midscene-python (pyproject.toml)
146 | jinja2==3.1.6
147 | # via
148 | # midscene-python (pyproject.toml)
149 | # mkdocs
150 | # mkdocs-material
151 | # mkdocstrings
152 | jiter==0.10.0
153 | # via
154 | # anthropic
155 | # openai
156 | loguru==0.7.3
157 | # via midscene-python (pyproject.toml)
158 | markdown==3.8.2
159 | # via
160 | # mkdocs
161 | # mkdocs-autorefs
162 | # mkdocs-material
163 | # mkdocstrings
164 | # pymdown-extensions
165 | markdown-it-py==4.0.0
166 | # via rich
167 | markupsafe==3.0.2
168 | # via
169 | # jinja2
170 | # mkdocs
171 | # mkdocs-autorefs
172 | # mkdocstrings
173 | mdurl==0.1.2
174 | # via markdown-it-py
175 | mergedeep==1.3.4
176 | # via
177 | # mkdocs
178 | # mkdocs-get-deps
179 | mkdocs==1.6.1
180 | # via
181 | # midscene-python (pyproject.toml)
182 | # mkdocs-autorefs
183 | # mkdocs-material
184 | # mkdocstrings
185 | mkdocs-autorefs==1.4.3
186 | # via
187 | # mkdocstrings
188 | # mkdocstrings-python
189 | mkdocs-get-deps==0.2.0
190 | # via mkdocs
191 | mkdocs-material==9.6.18
192 | # via midscene-python (pyproject.toml)
193 | mkdocs-material-extensions==1.3.1
194 | # via mkdocs-material
195 | mkdocstrings==0.30.0
196 | # via
197 | # midscene-python (pyproject.toml)
198 | # mkdocstrings-python
199 | mkdocstrings-python==1.18.2
200 | # via mkdocstrings
201 | multidict==6.6.4
202 | # via
203 | # aiohttp
204 | # yarl
205 | mypy==1.17.1
206 | # via midscene-python (pyproject.toml)
207 | mypy-extensions==1.1.0
208 | # via
209 | # black
210 | # mypy
211 | nodeenv==1.9.1
212 | # via pre-commit
213 | numpy==1.26.4
214 | # via
215 | # midscene-python (pyproject.toml)
216 | # opencv-python
217 | openai==1.102.0
218 | # via midscene-python (pyproject.toml)
219 | opencv-python==4.11.0.86
220 | # via midscene-python (pyproject.toml)
221 | outcome==1.3.0.post0
222 | # via
223 | # trio
224 | # trio-websocket
225 | packaging==25.0
226 | # via
227 | # black
228 | # mkdocs
229 | # pytest
230 | paginate==0.5.7
231 | # via mkdocs-material
232 | paho-mqtt==2.1.0
233 | # via asyncio-mqtt
234 | pathspec==0.12.1
235 | # via
236 | # black
237 | # mkdocs
238 | # mypy
239 | pillow==10.4.0
240 | # via midscene-python (pyproject.toml)
241 | platformdirs==4.4.0
242 | # via
243 | # black
244 | # mkdocs-get-deps
245 | # virtualenv
246 | playwright==1.55.0
247 | # via midscene-python (pyproject.toml)
248 | pluggy==1.6.0
249 | # via
250 | # pytest
251 | # pytest-cov
252 | pre-commit==4.3.0
253 | # via midscene-python (pyproject.toml)
254 | propcache==0.3.2
255 | # via
256 | # aiohttp
257 | # yarl
258 | proto-plus==1.26.1
259 | # via
260 | # google-ai-generativelanguage
261 | # google-api-core
262 | protobuf==5.29.5
263 | # via
264 | # google-ai-generativelanguage
265 | # google-api-core
266 | # google-generativeai
267 | # googleapis-common-protos
268 | # grpcio-status
269 | # proto-plus
270 | pure-python-adb==0.3.0.dev0
271 | # via midscene-python (pyproject.toml)
272 | pyasn1==0.6.1
273 | # via
274 | # pyasn1-modules
275 | # rsa
276 | pyasn1-modules==0.4.2
277 | # via google-auth
278 | pycparser==2.22
279 | # via cffi
280 | pydantic==2.11.7
281 | # via
282 | # midscene-python (pyproject.toml)
283 | # anthropic
284 | # google-generativeai
285 | # openai
286 | pydantic-core==2.33.2
287 | # via pydantic
288 | pyee==13.0.0
289 | # via playwright
290 | pygments==2.19.2
291 | # via
292 | # mkdocs-material
293 | # pytest
294 | # rich
295 | pymdown-extensions==10.16.1
296 | # via
297 | # mkdocs-material
298 | # mkdocstrings
299 | pyparsing==3.2.3
300 | # via httplib2
301 | pysocks==1.7.1
302 | # via urllib3
303 | pytest==8.4.1
304 | # via
305 | # midscene-python (pyproject.toml)
306 | # pytest-asyncio
307 | # pytest-cov
308 | pytest-asyncio==1.1.0
309 | # via midscene-python (pyproject.toml)
310 | pytest-cov==6.2.1
311 | # via midscene-python (pyproject.toml)
312 | python-dateutil==2.9.0.post0
313 | # via ghp-import
314 | pyyaml==6.0.2
315 | # via
316 | # midscene-python (pyproject.toml)
317 | # mkdocs
318 | # mkdocs-get-deps
319 | # pre-commit
320 | # pymdown-extensions
321 | # pyyaml-env-tag
322 | pyyaml-env-tag==1.1
323 | # via mkdocs
324 | requests==2.32.5
325 | # via
326 | # dashscope
327 | # google-api-core
328 | # mkdocs-material
329 | rich==14.1.0
330 | # via typer
331 | rsa==4.9.1
332 | # via google-auth
333 | ruff==0.12.11
334 | # via midscene-python (pyproject.toml)
335 | selenium==4.35.0
336 | # via midscene-python (pyproject.toml)
337 | shellingham==1.5.4
338 | # via typer
339 | six==1.17.0
340 | # via python-dateutil
341 | sniffio==1.3.1
342 | # via
343 | # anthropic
344 | # anyio
345 | # openai
346 | # trio
347 | sortedcontainers==2.4.0
348 | # via trio
349 | tqdm==4.67.1
350 | # via
351 | # google-generativeai
352 | # openai
353 | trio==0.30.0
354 | # via
355 | # selenium
356 | # trio-websocket
357 | trio-websocket==0.12.2
358 | # via selenium
359 | typer==0.17.3
360 | # via midscene-python (pyproject.toml)
361 | typing-extensions==4.14.1
362 | # via
363 | # aiosignal
364 | # anthropic
365 | # anyio
366 | # google-generativeai
367 | # mypy
368 | # openai
369 | # pydantic
370 | # pydantic-core
371 | # pyee
372 | # selenium
373 | # typer
374 | # typing-inspection
375 | typing-inspection==0.4.1
376 | # via pydantic
377 | uritemplate==4.2.0
378 | # via google-api-python-client
379 | urllib3==2.5.0
380 | # via
381 | # requests
382 | # selenium
383 | virtualenv==20.34.0
384 | # via pre-commit
385 | watchdog==6.0.0
386 | # via mkdocs
387 | websocket-client==1.8.0
388 | # via
389 | # dashscope
390 | # selenium
391 | win32-setctime==1.2.0
392 | # via loguru
393 | wsproto==1.2.0
394 | # via trio-websocket
395 | yarl==1.20.1
396 | # via aiohttp
397 |
--------------------------------------------------------------------------------
/midscene/shared/cache.py:
--------------------------------------------------------------------------------
1 | """
2 | Task caching system for performance optimization
3 | """
4 |
5 | import json
6 | import hashlib
7 | import pickle
8 | from datetime import datetime, timedelta
9 | from pathlib import Path
10 | from typing import Any, Dict, List, Optional, Union
11 |
12 | from loguru import logger
13 | from pydantic import BaseModel
14 |
15 |
16 | class CacheEntry(BaseModel):
17 | """Cache entry model"""
18 | key: str
19 | data: Any
20 | timestamp: datetime
21 | expires_at: Optional[datetime] = None
22 | metadata: Dict[str, Any] = {}
23 |
24 |
25 | class TaskCache:
26 | """Task caching system for storing and retrieving execution results"""
27 |
28 | def __init__(
29 | self,
30 | cache_id: str,
31 | enabled: bool = True,
32 | cache_dir: Optional[str] = None,
33 | max_age_hours: int = 24
34 | ):
35 | """Initialize task cache
36 |
37 | Args:
38 | cache_id: Unique cache identifier
39 | enabled: Whether caching is enabled
40 | cache_dir: Cache directory path
41 | max_age_hours: Maximum cache age in hours
42 | """
43 | self.cache_id = cache_id
44 | self.enabled = enabled
45 | self.max_age_hours = max_age_hours
46 |
47 | # Setup cache directory
48 | if cache_dir:
49 | self.cache_dir = Path(cache_dir)
50 | else:
51 | self.cache_dir = Path.home() / ".midscene" / "cache"
52 |
53 | self.cache_dir.mkdir(parents=True, exist_ok=True)
54 | self.cache_file = self.cache_dir / f"{cache_id}.json"
55 |
56 | # Load existing cache
57 | self._cache: Dict[str, CacheEntry] = {}
58 | self._load_cache()
59 |
60 | def _generate_key(self, data: Union[str, Dict, List]) -> str:
61 | """Generate cache key from data
62 |
63 | Args:
64 | data: Data to generate key from
65 |
66 | Returns:
67 | Cache key string
68 | """
69 | if isinstance(data, str):
70 | content = data
71 | else:
72 | content = json.dumps(data, sort_keys=True, ensure_ascii=False)
73 |
74 | return hashlib.md5(content.encode('utf-8')).hexdigest()
75 |
76 | def _load_cache(self) -> None:
77 | """Load cache from file"""
78 | if not self.enabled or not self.cache_file.exists():
79 | return
80 |
81 | try:
82 | with open(self.cache_file, 'r', encoding='utf-8') as f:
83 | cache_data = json.load(f)
84 |
85 | for key, entry_data in cache_data.items():
86 | # Convert datetime strings back to datetime objects
87 | entry_data['timestamp'] = datetime.fromisoformat(entry_data['timestamp'])
88 | if entry_data.get('expires_at'):
89 | entry_data['expires_at'] = datetime.fromisoformat(entry_data['expires_at'])
90 |
91 | self._cache[key] = CacheEntry(**entry_data)
92 |
93 | # Clean expired entries
94 | self._clean_expired()
95 |
96 | logger.debug(f"Loaded {len(self._cache)} cache entries")
97 |
98 | except Exception as e:
99 | logger.warning(f"Failed to load cache: {e}")
100 | self._cache = {}
101 |
102 | def _save_cache(self) -> None:
103 | """Save cache to file"""
104 | if not self.enabled:
105 | return
106 |
107 | try:
108 | cache_data = {}
109 | for key, entry in self._cache.items():
110 | entry_dict = entry.model_dump()
111 | # Convert datetime objects to strings
112 | entry_dict['timestamp'] = entry.timestamp.isoformat()
113 | if entry.expires_at:
114 | entry_dict['expires_at'] = entry.expires_at.isoformat()
115 |
116 | cache_data[key] = entry_dict
117 |
118 | with open(self.cache_file, 'w', encoding='utf-8') as f:
119 | json.dump(cache_data, f, ensure_ascii=False, indent=2)
120 |
121 | except Exception as e:
122 | logger.warning(f"Failed to save cache: {e}")
123 |
124 | def _clean_expired(self) -> None:
125 | """Clean expired cache entries"""
126 | now = datetime.now()
127 | expired_keys = []
128 |
129 | for key, entry in self._cache.items():
130 | # Check explicit expiration
131 | if entry.expires_at and entry.expires_at <= now:
132 | expired_keys.append(key)
133 | continue
134 |
135 | # Check age-based expiration
136 | age = now - entry.timestamp
137 | if age > timedelta(hours=self.max_age_hours):
138 | expired_keys.append(key)
139 |
140 | for key in expired_keys:
141 | del self._cache[key]
142 |
143 | if expired_keys:
144 | logger.debug(f"Cleaned {len(expired_keys)} expired cache entries")
145 |
146 | def get(self, key: str) -> Optional[Any]:
147 | """Get cached data by key
148 |
149 | Args:
150 | key: Cache key
151 |
152 | Returns:
153 | Cached data or None if not found
154 | """
155 | if not self.enabled:
156 | return None
157 |
158 | entry = self._cache.get(key)
159 | if not entry:
160 | return None
161 |
162 | # Check if expired
163 | now = datetime.now()
164 | if entry.expires_at and entry.expires_at <= now:
165 | del self._cache[key]
166 | return None
167 |
168 | # Check age
169 | age = now - entry.timestamp
170 | if age > timedelta(hours=self.max_age_hours):
171 | del self._cache[key]
172 | return None
173 |
174 | logger.debug(f"Cache hit for key: {key}")
175 | return entry.data
176 |
177 | def put(
178 | self,
179 | key: str,
180 | data: Any,
181 | expires_in_hours: Optional[int] = None,
182 | metadata: Optional[Dict[str, Any]] = None
183 | ) -> None:
184 | """Store data in cache
185 |
186 | Args:
187 | key: Cache key
188 | data: Data to cache
189 | expires_in_hours: Custom expiration time in hours
190 | metadata: Additional metadata
191 | """
192 | if not self.enabled:
193 | return
194 |
195 | now = datetime.now()
196 | expires_at = None
197 |
198 | if expires_in_hours:
199 | expires_at = now + timedelta(hours=expires_in_hours)
200 |
201 | entry = CacheEntry(
202 | key=key,
203 | data=data,
204 | timestamp=now,
205 | expires_at=expires_at,
206 | metadata=metadata or {}
207 | )
208 |
209 | self._cache[key] = entry
210 | self._save_cache()
211 |
212 | logger.debug(f"Cached data with key: {key}")
213 |
214 | def get_by_data(self, data: Union[str, Dict, List]) -> Optional[Any]:
215 | """Get cached data by input data
216 |
217 | Args:
218 | data: Input data to generate key from
219 |
220 | Returns:
221 | Cached result or None
222 | """
223 | key = self._generate_key(data)
224 | return self.get(key)
225 |
226 | def put_by_data(
227 | self,
228 | input_data: Union[str, Dict, List],
229 | result_data: Any,
230 | expires_in_hours: Optional[int] = None,
231 | metadata: Optional[Dict[str, Any]] = None
232 | ) -> None:
233 | """Store data in cache by input data
234 |
235 | Args:
236 | input_data: Input data to generate key from
237 | result_data: Result data to cache
238 | expires_in_hours: Custom expiration time in hours
239 | metadata: Additional metadata
240 | """
241 | key = self._generate_key(input_data)
242 | self.put(key, result_data, expires_in_hours, metadata)
243 |
244 | def match_locate_cache(self, prompt: str) -> Optional[Dict[str, Any]]:
245 | """Match locate operation from cache
246 |
247 | Args:
248 | prompt: Locate prompt
249 |
250 | Returns:
251 | Cached locate result or None
252 | """
253 | cache_key = f"locate:{self._generate_key(prompt)}"
254 | return self.get(cache_key)
255 |
256 | def store_locate_result(
257 | self,
258 | prompt: str,
259 | result: Dict[str, Any],
260 | expires_in_hours: int = 24
261 | ) -> None:
262 | """Store locate result in cache
263 |
264 | Args:
265 | prompt: Locate prompt
266 | result: Locate result
267 | expires_in_hours: Expiration time in hours
268 | """
269 | cache_key = f"locate:{self._generate_key(prompt)}"
270 | self.put(cache_key, result, expires_in_hours, {"type": "locate"})
271 |
272 | def clear(self) -> None:
273 | """Clear all cache entries"""
274 | self._cache.clear()
275 | if self.cache_file.exists():
276 | self.cache_file.unlink()
277 | logger.info("Cache cleared")
278 |
279 | def get_stats(self) -> Dict[str, Any]:
280 | """Get cache statistics
281 |
282 | Returns:
283 | Cache statistics
284 | """
285 | now = datetime.now()
286 | total_entries = len(self._cache)
287 |
288 | expired_count = 0
289 | for entry in self._cache.values():
290 | if entry.expires_at and entry.expires_at <= now:
291 | expired_count += 1
292 | elif (now - entry.timestamp) > timedelta(hours=self.max_age_hours):
293 | expired_count += 1
294 |
295 | return {
296 | "total_entries": total_entries,
297 | "expired_entries": expired_count,
298 | "cache_file": str(self.cache_file),
299 | "cache_size_mb": self.cache_file.stat().st_size / 1024 / 1024 if self.cache_file.exists() else 0,
300 | "enabled": self.enabled
301 | }
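302 | 
303 | 
304 | if __name__ == "__main__":
305 |     # Illustrative usage sketch (hypothetical demo, not part of the public API):
306 |     # cache a locate result keyed by its prompt, then inspect cache statistics.
307 |     cache = TaskCache(cache_id="demo", max_age_hours=1)
308 |     if cache.match_locate_cache("blue login button") is None:
309 |         cache.store_locate_result(
310 |             "blue login button",
311 |             {"rect": {"left": 10, "top": 20, "width": 80, "height": 30}},
312 |             expires_in_hours=1,
313 |         )
314 |     print(cache.get_stats())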
--------------------------------------------------------------------------------
/wiki/核心概念/Agent核心控制器.md:
--------------------------------------------------------------------------------
1 | # Agent 核心控制器
2 |
3 | Agent 是 Midscene Python 的核心控制器,为用户提供统一的自动化操作接口。它充当用户代码与底层平台之间的桥梁,通过 AI 理解用户意图并执行相应的操作。
4 |
5 | ## 🎯 设计理念
6 |
7 | ### 统一接口设计
8 | Agent 为不同平台(Web、Android)提供完全一致的编程接口,用户无需学习不同平台的特定 API:
9 |
10 | ```python
11 | # Web 和 Android 使用相同的接口
12 | web_agent = Agent(selenium_page)
13 | android_agent = Agent(android_device)
14 |
15 | # 相同的操作方法
16 | await web_agent.ai_action("点击登录按钮")
17 | await android_agent.ai_action("点击登录按钮")
18 | ```
19 |
20 | ### AI 驱动的智能操作
21 | Agent 将自然语言指令转换为具体的操作步骤,让自动化变得更加直观:
22 |
23 | ```python
24 | # 传统方式需要精确的选择器
25 | element = driver.find_element(By.CSS_SELECTOR, "#login-form button[type='submit']")
26 | element.click()
27 |
28 | # Agent 方式使用自然语言
29 | await agent.ai_action("点击登录表单的提交按钮")
30 | ```
31 |
32 | ## 🏗️ 架构设计
33 |
34 | ### 核心组件
35 |
36 | ```mermaid
37 | graph TB
38 | A[Agent] --> B[TaskExecutor]
39 | A --> C[Insight Engine]
40 | A --> D[AI Service]
41 | A --> E[Platform Interface]
42 |
43 | B --> C
44 | B --> E
45 | C --> D
46 |
47 | subgraph "Agent 核心"
48 | A
49 | B
50 | end
51 |
52 | subgraph "AI 理解层"
53 | C
54 | D
55 | end
56 |
57 | subgraph "平台抽象层"
58 | E
59 | end
60 | ```
61 |
62 | ### Agent 类结构
63 |
64 | ```python
65 | class Agent:
66 | """Core Agent class that orchestrates AI model and device interactions"""
67 |
68 | def __init__(
69 | self,
70 | interface: AbstractInterface,
71 | options: Optional[AgentOptions] = None
72 | ):
73 | self.interface = interface # 平台接口
74 | self.options = options or AgentOptions() # 配置选项
75 | self.ai_service = AIModelService() # AI 服务
76 | self.insight = Insight(...) # UI 理解引擎
77 | self.task_executor = TaskExecutor(...) # 任务执行器
78 | ```
79 |
80 | ## 🎮 主要功能
81 |
82 | ### 1. AI 驱动的操作 (ai_action)
83 |
84 | `ai_action` 是 Agent 最核心的方法,支持各种自然语言驱动的操作:
85 |
86 | ```python
87 | # 基础交互
88 | await agent.ai_action("点击登录按钮")
89 | await agent.ai_action("在用户名框输入 'admin'")
90 | await agent.ai_action("选择下拉菜单中的第二个选项")
91 |
92 | # 复杂操作
93 | await agent.ai_action("滚动到页面底部并点击加载更多按钮")
94 | await agent.ai_action("在搜索框输入'Python'并按回车搜索")
95 |
96 | # 条件操作
97 | await agent.ai_action("如果页面显示错误信息,点击确定按钮")
98 | ```
99 |
100 | #### 工作流程
101 |
102 | 1. **指令解析**: 将自然语言转换为操作意图
103 | 2. **页面分析**: 获取当前页面的截图和上下文信息
104 | 3. **计划生成**: AI 生成详细的执行计划
105 | 4. **步骤执行**: 逐步执行计划中的每个操作
106 | 5. **结果验证**: 验证操作是否成功完成
107 |
108 | ```python
109 | async def ai_action(self, prompt: TUserPrompt, **kwargs) -> None:
110 | """Execute AI-driven action"""
111 | self._ensure_not_destroyed()
112 |
113 | # 委托给任务执行器
114 | result = await self.task_executor.execute_ai_action(prompt, **kwargs)
115 |
116 | if not result.success:
117 | raise Exception(f"Action failed: {result.error}")
118 | ```
119 |
120 | ### 2. 智能元素定位 (ai_locate)
121 |
122 | 精确定位页面元素,支持各种描述方式:
123 |
124 | ```python
125 | # 基础定位
126 | login_btn = await agent.ai_locate("登录按钮")
127 | search_box = await agent.ai_locate("搜索输入框")
128 |
129 | # 描述性定位
130 | submit_btn = await agent.ai_locate("蓝色的提交按钮")
131 | user_avatar = await agent.ai_locate("页面右上角的用户头像")
132 |
133 | # 相对定位
134 | next_btn = await agent.ai_locate("位于分页控件中的下一页按钮")
135 | ```
136 |
137 | #### 定位策略
138 |
139 | Agent 使用多种策略进行元素定位:
140 |
141 | 1. **视觉识别**: 基于截图进行 AI 视觉识别
142 | 2. **语义理解**: 理解元素的功能和上下文
143 | 3. **多重验证**: 结合多种信息确保定位准确性
144 | 4. **容错机制**: 支持页面变化和布局调整
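145 | 
146 | 结合上述策略,下面给出一个示意性片段,演示先定位、再操作的常见写法(假设 `ai_locate` 的返回对象沿用 `core.types` 中 `BaseElement` 的字段,如 `content`、`center`,具体以实际 API 为准):
147 | 
148 | ```python
149 | # 示意:先定位元素,确认命中后再执行后续动作(返回值形态与字段名为假设)
150 | login_btn = await agent.ai_locate("页面右上角的登录按钮")
151 | if login_btn is not None:
152 |     print(f"命中元素: {login_btn.content}, 中心点: {login_btn.center}")
153 |     await agent.ai_action("点击页面右上角的登录按钮")
154 | else:
155 |     # 未命中时换用更具体的描述重试
156 |     login_btn = await agent.ai_locate("导航栏中标有'登录'字样的按钮")
157 | ```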
145 |
146 | ### 3. 数据提取 (ai_extract)
147 |
148 | 从页面提取结构化数据:
149 |
150 | ```python
151 | # 提取单个对象
152 | user_info = await agent.ai_extract({
153 | "name": "用户姓名",
154 | "email": "邮箱地址",
155 | "role": "用户角色"
156 | })
157 |
158 | # 提取列表数据
159 | products = await agent.ai_extract({
160 | "products": [
161 | {
162 | "name": "商品名称",
163 | "price": "价格",
164 | "rating": "评分",
165 | "in_stock": "是否有货"
166 | }
167 | ]
168 | })
169 |
170 | # 复杂嵌套结构
171 | order_data = await agent.ai_extract({
172 | "order_id": "订单号",
173 | "customer": {
174 | "name": "客户姓名",
175 | "address": "送货地址"
176 | },
177 | "items": [
178 | {
179 | "product": "商品名称",
180 | "quantity": "数量",
181 | "price": "单价"
182 | }
183 | ],
184 | "total": "总金额"
185 | })
186 | ```
187 |
188 | ### 4. 智能断言 (ai_assert)
189 |
190 | 验证页面状态和内容:
191 |
192 | ```python
193 | # 状态验证
194 | await agent.ai_assert("用户已成功登录")
195 | await agent.ai_assert("页面显示错误信息")
196 | await agent.ai_assert("表单验证通过")
197 |
198 | # 内容验证
199 | await agent.ai_assert("搜索结果包含'Python 教程'")
200 | await agent.ai_assert("购物车中有 3 件商品")
201 | await agent.ai_assert("订单状态为已发货")
202 |
203 | # 条件验证
204 | await agent.ai_assert("如果是新用户,显示欢迎向导")
205 | ```
206 |
207 | ## ⚙️ 配置选项
208 |
209 | ### AgentOptions 配置
210 |
211 | ```python
212 | from midscene.core import AgentOptions
213 |
214 | options = AgentOptions(
215 | # 超时设置
216 | timeout=30, # 操作超时时间(秒)
217 |
218 | # 重试机制
219 | retry_count=3, # 失败重试次数
220 | retry_delay=1.0, # 重试间隔(秒)
221 |
222 | # 调试选项
223 | screenshot_on_error=True, # 错误时自动截图
224 | save_execution_logs=True, # 保存执行日志
225 |
226 | # 性能优化
227 | cache_enabled=True, # 启用智能缓存
228 | parallel_execution=False, # 并行执行(实验性)
229 |
230 | # AI 模型设置
231 | model_temperature=0.1, # AI 响应随机性
232 | max_tokens=1000, # 最大 token 数
233 | )
234 |
235 | agent = Agent(page, options=options)
236 | ```
237 |
238 | ### 运行时配置
239 |
240 | ```python
241 | # 临时修改超时时间
242 | await agent.ai_action("点击按钮", timeout=60)
243 |
244 | # 禁用缓存的单次操作
245 | await agent.ai_extract(schema, use_cache=False)
246 |
247 | # 自定义重试策略
248 | await agent.ai_action("提交表单", retry_count=5, retry_delay=2.0)
249 | ```
250 |
251 | ## 🔄 生命周期管理
252 |
253 | ### 初始化和销毁
254 |
255 | ```python
256 | # 方式1: 手动管理
257 | agent = Agent(page)
258 | try:
259 | await agent.ai_action("执行操作")
260 | finally:
261 | await agent.destroy()
262 |
263 | # 方式2: 上下文管理器(推荐)
264 | async with Agent(page) as agent:
265 | await agent.ai_action("执行操作")
266 | # 自动调用 destroy()
267 | ```
268 |
269 | ### 状态冻结
270 |
271 | ```python
272 | # 冻结当前页面状态(用于调试)
273 | await agent.freeze()
274 |
275 | # 在冻结状态下进行多次操作
276 | await agent.ai_extract(schema1)
277 | await agent.ai_extract(schema2)
278 |
279 | # 解除冻结
280 | await agent.unfreeze()
281 | ```
282 |
283 | ## 🔧 高级特性
284 |
285 | ### 1. 自定义 AI 模型
286 |
287 | ```python
288 | from midscene.core.ai_model import AIModelConfig
289 |
290 | # 自定义模型配置
291 | ai_config = AIModelConfig(
292 | provider="openai",
293 | model="gpt-4-vision-preview",
294 | temperature=0.0,
295 | max_tokens=2000,
296 | api_key="your_api_key"
297 | )
298 |
299 | agent = Agent(page, ai_config=ai_config)
300 | ```
301 |
302 | ### 2. 操作链式调用
303 |
304 | ```python
305 | # 链式操作
306 | await (agent
307 | .ai_action("点击登录")
308 | .ai_action("输入用户名")
309 | .ai_action("输入密码")
310 | .ai_action("点击提交"))
311 | ```
312 |
313 | ### 3. 事件监听
314 |
315 | ```python
316 | # 操作前后的钩子函数
317 | @agent.on_before_action
318 | async def before_action(prompt: str, context: UIContext):
319 | print(f"即将执行: {prompt}")
320 |
321 | @agent.on_after_action
322 | async def after_action(prompt: str, result: ExecutionResult):
323 | print(f"执行完成: {prompt}, 结果: {result.success}")
324 | ```
325 |
326 | ### 4. 批量操作
327 |
328 | ```python
329 | # 批量执行多个操作
330 | actions = [
331 | "点击菜单按钮",
332 | "选择设置选项",
333 | "修改用户信息",
334 | "保存更改"
335 | ]
336 |
337 | results = await agent.batch_execute(actions)
338 | ```
339 |
340 | ## 📊 性能优化
341 |
342 | ### 智能缓存
343 |
344 | Agent 内置智能缓存机制,避免重复的 AI 调用:
345 |
346 | ```python
347 | # 首次调用会请求 AI 模型
348 | result1 = await agent.ai_extract(schema)
349 |
350 | # 相同 schema 和页面状态会使用缓存
351 | result2 = await agent.ai_extract(schema) # 使用缓存,更快
352 |
353 | # 强制禁用缓存
354 | result3 = await agent.ai_extract(schema, use_cache=False)
355 | ```
356 |
357 | ### 并发控制
358 |
359 | ```python
360 | # 控制并发数量,避免过多 AI 请求
361 | agent.set_concurrency_limit(3)
362 |
363 | # 异步执行多个独立操作
364 | import asyncio
365 |
366 | tasks = [
367 | agent.ai_extract(schema1),
368 | agent.ai_extract(schema2),
369 | agent.ai_extract(schema3)
370 | ]
371 |
372 | results = await asyncio.gather(*tasks)
373 | ```
374 |
375 | ## 🚨 错误处理
376 |
377 | ### 异常类型
378 |
379 | ```python
380 | from midscene.core.exceptions import (
381 | AgentError,
382 | ElementNotFoundError,
383 | OperationTimeoutError,
384 | AIServiceError
385 | )
386 |
387 | try:
388 | await agent.ai_action("点击不存在的按钮")
389 | except ElementNotFoundError as e:
390 | print(f"元素未找到: {e}")
391 | except OperationTimeoutError as e:
392 | print(f"操作超时: {e}")
393 | except AIServiceError as e:
394 | print(f"AI 服务错误: {e}")
395 | ```
396 |
397 | ### 重试机制
398 |
399 | ```python
400 | # 自动重试配置
401 | options = AgentOptions(
402 | retry_count=3,
403 | retry_delay=1.0,
404 | retry_on_errors=[ElementNotFoundError, OperationTimeoutError]
405 | )
406 |
407 | # 手动重试
408 | from midscene.shared.retry import retry_async
409 |
410 | @retry_async(max_attempts=3, delay=1.0)
411 | async def robust_action():
412 | await agent.ai_action("点击可能不稳定的元素")
413 | ```
414 |
415 | ## 🔍 调试和诊断
416 |
417 | ### 详细日志
418 |
419 | ```python
420 | import logging
421 | from midscene.shared.logger import setup_logger
422 |
423 | # 启用详细日志
424 | setup_logger(level=logging.DEBUG)
425 |
426 | # 操作执行时会输出详细信息
427 | await agent.ai_action("点击按钮")
428 | ```
429 |
430 | ### 执行报告
431 |
432 | ```python
433 | # 生成详细的执行报告
434 | report = await agent.generate_report()
435 | print(f"总操作数: {report.total_actions}")
436 | print(f"成功率: {report.success_rate}")
437 | print(f"平均执行时间: {report.avg_execution_time}")
438 |
439 | # 保存报告到文件
440 | await report.save_to_file("execution_report.html")
441 | ```
442 |
443 | ### 手动调试
444 |
445 | ```python
446 | # 获取当前页面状态
447 | context = await agent.get_current_context()
448 | print(f"页面标题: {context.page_title}")
449 | print(f"页面 URL: {context.url}")
450 |
451 | # 手动截图
452 | screenshot = await agent.screenshot()
453 | with open("debug.png", "wb") as f:
454 | f.write(screenshot)
455 |
456 | # 获取页面元素信息
457 | elements = await agent.get_all_elements()
458 | for element in elements:
459 | print(f"元素: {element.tag_name}, 文本: {element.text}")
460 | ```
461 |
462 | ## 🎯 最佳实践
463 |
464 | ### 1. 清晰的操作描述
465 | ```python
466 | # ❌ 模糊的描述
467 | await agent.ai_action("点击按钮")
468 |
469 | # ✅ 具体的描述
470 | await agent.ai_action("点击页面右上角的蓝色登录按钮")
471 | ```
472 |
473 | ### 2. 合理的超时设置
474 | ```python
475 | # 根据操作复杂度设置超时
476 | await agent.ai_action("点击按钮", timeout=10) # 简单操作
477 | await agent.ai_action("等待页面加载完成", timeout=30) # 复杂操作
478 | ```
479 |
480 | ### 3. 错误处理
481 | ```python
482 | # 优雅的错误处理
483 | try:
484 | await agent.ai_action("尝试点击可能不存在的按钮")
485 | except ElementNotFoundError:
486 | # 执行备选方案
487 | await agent.ai_action("点击替代按钮")
488 | ```
489 |
490 | ### 4. 资源管理
491 | ```python
492 | # 使用上下文管理器确保资源释放
493 | async with Agent(page) as agent:
494 | await agent.ai_action("执行操作")
495 | # 自动清理资源
496 | ```
497 |
498 | ## 🔗 相关文档
499 |
500 | - **API 参考**: [Agent API 完整文档](../API参考/Agent-API.md)
501 | - **UI 理解**: [Insight UI理解引擎](Insight-UI理解引擎.md)
502 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md)
503 | - **示例代码**: [基础示例](../示例和教程/基础示例.md)
504 |
505 | ---
506 |
507 | Agent 是 Midscene Python 的核心,掌握了 Agent 的使用就掌握了框架的精髓。继续探索其他核心概念来深入理解整个框架的工作原理!
--------------------------------------------------------------------------------
/midscene/web/bridge.py:
--------------------------------------------------------------------------------
1 | """
2 | Bridge mode implementation for Chrome extension integration
3 | """
4 |
5 | import asyncio
6 | import json
7 | import websockets
8 | from typing import Any, Dict, List, Optional
9 | from loguru import logger
10 |
11 | from ..core.types import (
12 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree,
13 | Size, Rect, Point, NodeType
14 | )
15 |
16 |
17 | class BridgeElement(BaseElement):
18 | """Bridge element wrapper"""
19 |
20 | def __init__(self, bridge: 'BridgeWebPage', **kwargs):
21 | self._bridge = bridge
22 | super().__init__(**kwargs)
23 |
24 | async def tap(self) -> None:
25 | """Click this element"""
26 | try:
27 | await self._bridge.send_command({
28 | "action": "click",
29 | "target": {"id": self.id}
30 | })
31 | except Exception as e:
32 | logger.error(f"Failed to click element: {e}")
33 | raise
34 |
35 | async def input_text(self, text: str) -> None:
36 | """Input text to this element"""
37 | try:
38 | await self._bridge.send_command({
39 | "action": "input",
40 | "target": {"id": self.id},
41 | "text": text
42 | })
43 | except Exception as e:
44 | logger.error(f"Failed to input text: {e}")
45 | raise
46 |
47 |
48 | class BridgeWebPage(AbstractInterface):
49 | """Bridge mode page interface for Chrome extension communication"""
50 |
51 | def __init__(self, websocket_url: str = "ws://localhost:8765"):
52 | """Initialize bridge connection
53 |
54 | Args:
55 | websocket_url: WebSocket server URL
56 | """
57 | self.websocket_url = websocket_url
58 | self.websocket: Optional[websockets.WebSocketServerProtocol] = None
59 | self._command_id = 0
60 | self._response_futures: Dict[int, asyncio.Future] = {}
61 |
62 | @classmethod
63 | async def create(
64 | cls,
65 | websocket_url: str = "ws://localhost:8765",
66 | wait_for_connection: bool = True
67 | ) -> 'BridgeWebPage':
68 | """Create bridge connection
69 |
70 | Args:
71 | websocket_url: WebSocket server URL
72 | wait_for_connection: Wait for extension to connect
73 |
74 | Returns:
75 | BridgeWebPage instance
76 | """
77 | bridge = cls(websocket_url)
78 |
79 | if wait_for_connection:
80 | await bridge.connect()
81 |
82 | return bridge
83 |
84 |     async def connect(self, timeout: float = 30.0) -> None:
85 |         """Connect to Chrome extension"""
86 |         from urllib.parse import urlparse
87 |         try:
88 |             logger.info(f"Waiting for extension connection on {self.websocket_url}")
89 | 
90 |             # Start a WebSocket server on the host/port from websocket_url (not hard-coded)
91 |             parsed = urlparse(self.websocket_url)
92 |             self._server = await websockets.serve(
93 |                 self._handle_connection, parsed.hostname or "localhost", parsed.port or 8765
94 |             )
95 | 
96 |             # Wait for connection with timeout
97 |             start_time = asyncio.get_event_loop().time()
98 |             while not self.websocket and (asyncio.get_event_loop().time() - start_time) < timeout:
99 |                 await asyncio.sleep(0.1)
100 | 
101 |             if not self.websocket:
102 |                 raise TimeoutError("Extension connection timeout")
103 | 
104 |             logger.info("Extension connected successfully")
105 | 
106 |         except Exception as e:
107 |             logger.error(f"Failed to connect to extension: {e}")
108 |             raise
109 |
110 | async def _handle_connection(self, websocket, path):
111 | """Handle WebSocket connection from extension"""
112 | self.websocket = websocket
113 | logger.info("Extension connected")
114 |
115 | try:
116 | async for message in websocket:
117 | await self._handle_message(message)
118 | except websockets.exceptions.ConnectionClosed:
119 | logger.info("Extension disconnected")
120 | self.websocket = None
121 | except Exception as e:
122 | logger.error(f"WebSocket error: {e}")
123 |
124 | async def _handle_message(self, message: str) -> None:
125 | """Handle message from extension"""
126 | try:
127 | data = json.loads(message)
128 |
129 | if "id" in data and data["id"] in self._response_futures:
130 | # Response to command
131 | future = self._response_futures.pop(data["id"])
132 | if not future.done():
133 | future.set_result(data)
134 | else:
135 | # Unsolicited message from extension
136 | logger.debug(f"Received message: {data}")
137 |
138 | except Exception as e:
139 | logger.error(f"Failed to handle message: {e}")
140 |
141 | async def send_command(self, command: Dict[str, Any]) -> Dict[str, Any]:
142 | """Send command to extension and wait for response"""
143 | if not self.websocket:
144 | raise RuntimeError("Extension not connected")
145 |
146 | command_id = self._command_id
147 | self._command_id += 1
148 |
149 | command["id"] = command_id
150 |
151 | # Create future for response
152 | future = asyncio.Future()
153 | self._response_futures[command_id] = future
154 |
155 | try:
156 | # Send command
157 | await self.websocket.send(json.dumps(command))
158 |
159 | # Wait for response
160 | response = await asyncio.wait_for(future, timeout=30.0)
161 |
162 | if response.get("error"):
163 | raise Exception(f"Command failed: {response['error']}")
164 |
165 | return response
166 |
167 | except asyncio.TimeoutError:
168 | self._response_futures.pop(command_id, None)
169 | raise TimeoutError("Command timeout")
170 | except Exception as e:
171 | self._response_futures.pop(command_id, None)
172 | raise
173 |
174 | @property
175 | def interface_type(self) -> InterfaceType:
176 | """Get interface type"""
177 | return InterfaceType.WEB
178 |
179 | async def get_context(self) -> UIContext:
180 | """Get current UI context"""
181 | try:
182 | response = await self.send_command({"action": "getContext"})
183 |
184 | # Parse context data
185 | context_data = response["data"]
186 |
187 | # Convert to UIContext
188 | screenshot_base64 = context_data["screenshot"]
189 | size = Size(**context_data["size"])
190 |
191 | elements = []
192 | for elem_data in context_data["elements"]:
193 | rect = Rect(**elem_data["rect"])
194 | node_type = NodeType(elem_data.get("nodeType", "other"))
195 |
196 | element = BridgeElement(
197 | bridge=self,
198 | id=elem_data["id"],
199 | content=elem_data["content"],
200 | rect=rect,
201 | center=tuple(elem_data["center"]),
202 | node_type=node_type,
203 | attributes=elem_data.get("attributes", {}),
204 | is_visible=elem_data.get("isVisible", True)
205 | )
206 | elements.append(element)
207 |
208 | # Build tree
209 | tree_data = context_data.get("tree", {})
210 | tree = self._build_tree_from_data(tree_data)
211 |
212 | return UIContext(
213 | screenshot_base64=screenshot_base64,
214 | size=size,
215 | content=elements,
216 | tree=tree
217 | )
218 |
219 | except Exception as e:
220 | logger.error(f"Failed to get context: {e}")
221 | raise
222 |
223 | async def action_space(self) -> List[str]:
224 | """Get available actions"""
225 | return [
226 | "tap", "click", "double_click", "right_click",
227 | "input", "type", "clear",
228 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right",
229 | "hover", "drag", "key_press", "navigate"
230 | ]
231 |
232 | async def tap(self, x: float, y: float) -> None:
233 | """Tap at coordinates"""
234 | try:
235 | await self.send_command({
236 | "action": "tap",
237 | "coordinates": {"x": x, "y": y}
238 | })
239 | except Exception as e:
240 | logger.error(f"Failed to tap at ({x}, {y}): {e}")
241 | raise
242 |
243 | async def input_text(self, text: str) -> None:
244 | """Input text to focused element"""
245 | try:
246 | await self.send_command({
247 | "action": "inputText",
248 | "text": text
249 | })
250 | except Exception as e:
251 | logger.error(f"Failed to input text: {e}")
252 | raise
253 |
254 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None:
255 | """Scroll in direction"""
256 | try:
257 | await self.send_command({
258 | "action": "scroll",
259 | "direction": direction,
260 | "distance": distance or 500
261 | })
262 | except Exception as e:
263 | logger.error(f"Failed to scroll {direction}: {e}")
264 | raise
265 |
266 | async def navigate_to(self, url: str) -> None:
267 | """Navigate to URL"""
268 | try:
269 | await self.send_command({
270 | "action": "navigate",
271 | "url": url
272 | })
273 | except Exception as e:
274 | logger.error(f"Failed to navigate to {url}: {e}")
275 | raise
276 |
277 | def _build_tree_from_data(self, tree_data: Dict[str, Any]) -> UITree:
278 | """Build UITree from extension data"""
279 | if not tree_data:
280 | # Return minimal tree
281 | root_node = UINode(
282 | id="root",
283 | content="",
284 | rect=Rect(left=0, top=0, width=1920, height=1080),
285 | center=(960, 540),
286 | node_type=NodeType.CONTAINER,
287 | attributes={},
288 | is_visible=True,
289 | children=[]
290 | )
291 | return UITree(node=root_node, children=[])
292 |
293 | # Convert tree data to UINode
294 | node_data = tree_data["node"]
295 | node = UINode(
296 | id=node_data["id"],
297 | content=node_data["content"],
298 | rect=Rect(**node_data["rect"]),
299 | center=tuple(node_data["center"]),
300 | node_type=NodeType(node_data.get("nodeType", "other")),
301 | attributes=node_data.get("attributes", {}),
302 | is_visible=node_data.get("isVisible", True),
303 | children=[]
304 | )
305 |
306 | # Build children recursively
307 | children = []
308 | for child_data in tree_data.get("children", []):
309 | child_tree = self._build_tree_from_data(child_data)
310 | children.append(child_tree)
311 |
312 | return UITree(node=node, children=children)
313 |
314 |     async def close(self) -> None:
315 |         """Close bridge connection and stop the local WebSocket server"""
316 |         if self.websocket:
317 |             await self.websocket.close()
318 |             self.websocket = None
319 |         # Shut down the server created in connect(), if any
320 |         if getattr(self, "_server", None):
321 |             self._server.close()
322 |             self._server = None
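323 | 
324 | 
325 | # Illustrative usage sketch (commented out). It assumes the companion Chrome
326 | # extension is running and connects back to the default ws://localhost:8765:
327 | #
328 | #     from midscene.core import Agent
329 | #
330 | #     async def main():
331 | #         bridge = await BridgeWebPage.create()
332 | #         agent = Agent(bridge)
333 | #         await agent.ai_action("click the login button")
334 | #         await bridge.close()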
--------------------------------------------------------------------------------
/midscene/core/ai_model/providers.py:
--------------------------------------------------------------------------------
1 | """
2 | AI Model Providers - Implementations for different AI services
3 | """
4 |
5 | import json
6 | from typing import Any, Dict, List, Optional, Type
7 |
8 | import httpx
9 | from loguru import logger
10 | from pydantic import BaseModel
11 |
12 | from .service import AIProvider, AIModelConfig, parse_json_response, create_usage_info
13 |
14 |
15 | class OpenAIProvider(AIProvider):
16 | """OpenAI API provider"""
17 |
18 | async def call(
19 | self,
20 | messages: List[Dict[str, Any]],
21 | config: AIModelConfig,
22 | response_schema: Optional[Type[BaseModel]] = None,
23 | **kwargs
24 | ) -> Dict[str, Any]:
25 | """Call OpenAI API"""
26 | headers = {
27 | "Authorization": f"Bearer {config.api_key}",
28 | "Content-Type": "application/json"
29 | }
30 |
31 | payload = {
32 | "model": config.model,
33 | "messages": messages,
34 | "max_tokens": config.max_tokens,
35 | "temperature": config.temperature
36 | }
37 |
38 | # Support structured output for compatible models
39 | if response_schema and "gpt-4" in config.model:
40 | payload["response_format"] = {
41 | "type": "json_schema",
42 | "json_schema": {
43 | "name": response_schema.__name__,
44 | "schema": response_schema.model_json_schema()
45 | }
46 | }
47 |
48 | base_url = config.base_url or "https://api.openai.com"
49 | url = f"{base_url}/v1/chat/completions"
50 |
51 | async with httpx.AsyncClient(timeout=config.timeout) as client:
52 | response = await client.post(url, headers=headers, json=payload)
53 | response.raise_for_status()
54 |
55 | result = response.json()
56 | content = result['choices'][0]['message']['content']
57 |
58 | if response_schema:
59 | try:
60 | parsed = parse_json_response(content)
61 | validated = response_schema(**parsed)
62 | return {
63 | "content": validated.model_dump(),
64 | "usage": create_usage_info(result.get('usage', {}))
65 | }
66 | except Exception as e:
67 | logger.warning(f"Failed to parse structured response: {e}")
68 | return {
69 | "content": {"error": str(e), "raw_content": content},
70 | "usage": create_usage_info(result.get('usage', {}))
71 | }
72 |
73 | return {
74 | "content": content,
75 | "usage": create_usage_info(result.get('usage', {}))
76 | }
77 |
78 |
79 | class AnthropicProvider(AIProvider):
80 | """Anthropic Claude API provider"""
81 |
82 | async def call(
83 | self,
84 | messages: List[Dict[str, Any]],
85 | config: AIModelConfig,
86 | response_schema: Optional[Type[BaseModel]] = None,
87 | **kwargs
88 | ) -> Dict[str, Any]:
89 | """Call Anthropic API"""
90 | headers = {
91 | "x-api-key": config.api_key,
92 | "Content-Type": "application/json",
93 | "anthropic-version": "2023-06-01"
94 | }
95 |
96 | # Convert messages format for Anthropic
97 | system_message = ""
98 | anthropic_messages = []
99 |
100 | for msg in messages:
101 | if msg["role"] == "system":
102 | system_message = msg["content"]
103 | else:
104 | anthropic_messages.append(msg)
105 |
106 | payload = {
107 | "model": config.model,
108 | "max_tokens": config.max_tokens,
109 | "temperature": config.temperature,
110 | "messages": anthropic_messages
111 | }
112 |
113 | if system_message:
114 | payload["system"] = system_message
115 |
116 | base_url = config.base_url or "https://api.anthropic.com"
117 | url = f"{base_url}/v1/messages"
118 |
119 | async with httpx.AsyncClient(timeout=config.timeout) as client:
120 | response = await client.post(url, headers=headers, json=payload)
121 | response.raise_for_status()
122 |
123 | result = response.json()
124 | content = result['content'][0]['text']
125 |
126 | if response_schema:
127 | try:
128 | parsed = parse_json_response(content)
129 | validated = response_schema(**parsed)
130 | return {
131 | "content": validated.model_dump(),
132 | "usage": create_usage_info(result.get('usage', {}))
133 | }
134 | except Exception as e:
135 | logger.warning(f"Failed to parse structured response: {e}")
136 | return {
137 | "content": {"error": str(e), "raw_content": content},
138 | "usage": create_usage_info(result.get('usage', {}))
139 | }
140 |
141 | return {
142 | "content": content,
143 | "usage": create_usage_info(result.get('usage', {}))
144 | }
145 |
146 |
147 | class QwenProvider(AIProvider):
148 | """Alibaba Qwen API provider"""
149 |
150 | async def call(
151 | self,
152 | messages: List[Dict[str, Any]],
153 | config: AIModelConfig,
154 | response_schema: Optional[Type[BaseModel]] = None,
155 | **kwargs
156 | ) -> Dict[str, Any]:
157 | """Call Qwen API"""
158 | try:
159 | import dashscope
160 | except ImportError:
161 | raise ImportError("dashscope is required for Qwen provider. Install with: pip install dashscope")
162 |
163 | dashscope.api_key = config.api_key
164 |
165 | # Convert messages for Qwen
166 | qwen_messages = []
167 | for msg in messages:
168 | qwen_messages.append({
169 | "role": msg["role"],
170 | "content": msg["content"]
171 | })
172 |
173 | response = await dashscope.Generation.acall(
174 | model=config.model,
175 | messages=qwen_messages,
176 | max_tokens=config.max_tokens,
177 | temperature=config.temperature,
178 | result_format='message'
179 | )
180 |
181 | if response.status_code == 200:
182 | content = response.output.choices[0]['message']['content']
183 |
184 | if response_schema:
185 | try:
186 | parsed = parse_json_response(content)
187 | validated = response_schema(**parsed)
188 | return {
189 | "content": validated.model_dump(),
190 | "usage": create_usage_info(response.usage)
191 | }
192 | except Exception as e:
193 | logger.warning(f"Failed to parse structured response: {e}")
194 | return {
195 | "content": {"error": str(e), "raw_content": content},
196 | "usage": create_usage_info(response.usage)
197 | }
198 |
199 | return {
200 | "content": content,
201 | "usage": create_usage_info(response.usage)
202 | }
203 | else:
204 | raise Exception(f"Qwen API error: {response.message}")
205 |
206 |
207 | class GeminiProvider(AIProvider):
208 | """Google Gemini API provider"""
209 |
210 | async def call(
211 | self,
212 | messages: List[Dict[str, Any]],
213 | config: AIModelConfig,
214 | response_schema: Optional[Type[BaseModel]] = None,
215 | **kwargs
216 | ) -> Dict[str, Any]:
217 | """Call Gemini API"""
218 | try:
219 | import google.generativeai as genai
220 | except ImportError:
221 | raise ImportError("google-generativeai is required for Gemini provider. Install with: pip install google-generativeai")
222 |
223 | genai.configure(api_key=config.api_key)
224 | model = genai.GenerativeModel(config.model)
225 |
226 |         # Convert messages format for Gemini (no system role: prepend it to the user turn)
227 |         gemini_messages = []
228 |         system_prefix = ""
229 |         for msg in messages:
230 |             if msg["role"] == "system":
231 |                 system_prefix = msg["content"] + "\n\n"
232 |             elif msg["role"] == "user":
233 |                 if isinstance(msg["content"], list):
234 |                     # Handle multimodal content
235 |                     parts = [system_prefix] if system_prefix else []
236 |                     system_prefix = ""
237 |                     for part in msg["content"]:
238 |                         if part["type"] == "text":
239 |                             parts.append(part["text"])
240 |                         elif part["type"] == "image_url":
241 |                             # Convert base64 data URL to a PIL image for Gemini
242 |                             import base64
243 |                             import io
244 |                             from PIL import Image
245 |                             image_data = part["image_url"]["url"]
246 |                             if image_data.startswith("data:image"):
247 |                                 image_data = image_data.split(",")[1]
248 | 
249 |                             image_bytes = base64.b64decode(image_data)
250 |                             image = Image.open(io.BytesIO(image_bytes))
251 |                             parts.append(image)
252 |                     gemini_messages.append({"role": "user", "parts": parts})
253 |                 else:
254 |                     gemini_messages.append({"role": "user", "parts": [system_prefix + msg["content"]]})
255 |                     system_prefix = ""
256 |             elif msg["role"] == "assistant":
257 |                 gemini_messages.append({"role": "model", "parts": [msg["content"]]})
258 |
259 | generation_config = genai.types.GenerationConfig(
260 | max_output_tokens=config.max_tokens,
261 | temperature=config.temperature
262 | )
263 |
264 | response = await model.generate_content_async(
265 | gemini_messages,
266 | generation_config=generation_config
267 | )
268 |
269 | content = response.text
270 |
271 | if response_schema:
272 | try:
273 | parsed = parse_json_response(content)
274 | validated = response_schema(**parsed)
275 | return {
276 | "content": validated.model_dump(),
277 | "usage": create_usage_info({
278 | "prompt_tokens": response.usage_metadata.prompt_token_count,
279 | "completion_tokens": response.usage_metadata.candidates_token_count,
280 | "total_tokens": response.usage_metadata.total_token_count
281 | })
282 | }
283 | except Exception as e:
284 | logger.warning(f"Failed to parse structured response: {e}")
285 | return {
286 | "content": {"error": str(e), "raw_content": content},
287 | "usage": create_usage_info({
288 | "prompt_tokens": response.usage_metadata.prompt_token_count,
289 | "completion_tokens": response.usage_metadata.candidates_token_count,
290 | "total_tokens": response.usage_metadata.total_token_count
291 | })
292 | }
293 |
294 | return {
295 | "content": content,
296 | "usage": create_usage_info({
297 | "prompt_tokens": response.usage_metadata.prompt_token_count,
298 | "completion_tokens": response.usage_metadata.candidates_token_count,
299 | "total_tokens": response.usage_metadata.total_token_count
300 | })
301 | }
--------------------------------------------------------------------------------
/wiki/核心概念/Insight-UI理解引擎.md:
--------------------------------------------------------------------------------
1 | # Insight UI理解引擎
2 |
3 | Insight 是 Midscene Python 的 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。它是连接 AI 模型与实际操作的核心组件。
4 |
5 | ## 🧠 设计理念
6 |
7 | ### AI 驱动的视觉理解
8 | Insight 利用先进的视觉语言模型(VLM)来理解页面内容:
9 |
10 | ```python
11 | # Insight 不依赖传统的选择器
12 | # 而是通过 AI 视觉理解来定位元素
13 | element = await insight.locate("蓝色的登录按钮")
14 | element = await insight.locate("位于页面右上角的搜索图标")
15 | ```
16 |
17 | ### 上下文感知决策
18 | Insight 结合页面状态、用户意图和历史操作来做出智能决策:
19 |
20 | ```python
21 | # 同样的描述在不同上下文下可能指向不同元素
22 | await insight.locate("确定按钮") # 对话框中的确定按钮
23 | await insight.locate("确定按钮") # 表单中的确定按钮
24 | ```
25 |
26 | ## 🏗️ 架构设计
27 |
28 | ### 核心组件
29 |
30 | ```mermaid
31 | graph TB
32 | A[Insight Engine] --> B[Context Provider]
33 | A --> C[AI Model Service]
34 | A --> D[Response Processor]
35 | A --> E[Dump Subscribers]
36 |
37 | B --> F[UI Context]
38 | C --> G[Multi-Model Support]
39 | D --> H[Element Processing]
40 | D --> I[Data Validation]
41 |
42 | subgraph "输入处理"
43 | B
44 | F
45 | end
46 |
47 | subgraph "AI 推理"
48 | C
49 | G
50 | end
51 |
52 | subgraph "结果处理"
53 | D
54 | H
55 | I
56 | end
57 | ```
58 |
59 | ### Insight 类结构
60 |
61 | ```python
62 | class Insight:
63 | """AI-powered UI understanding and reasoning engine"""
64 |
65 | def __init__(
66 | self,
67 | context_provider: Union[UIContext, Callable],
68 | ai_service: Optional[AIModelService] = None,
69 | model_config: Optional[AIModelConfig] = None
70 | ):
71 | self.context_provider = context_provider # 上下文提供者
72 | self.ai_service = ai_service # AI 模型服务
73 | self.model_config = model_config # 模型配置
74 | self._dump_subscribers = [] # 调试订阅者
75 | ```
76 |
77 | ## 🎯 核心功能
78 |
79 | ### 1. 智能元素定位 (locate)
80 |
81 | Insight 的核心能力是通过自然语言精确定位页面元素:
82 |
83 | ```python
84 | # 基础定位
85 | login_btn = await insight.locate("登录按钮")
86 | search_box = await insight.locate("搜索输入框")
87 |
88 | # 描述性定位
89 | submit_btn = await insight.locate("绿色的提交按钮")
90 | close_icon = await insight.locate("模态对话框右上角的关闭图标")
91 |
92 | # 相对定位
93 | next_page = await insight.locate("分页器中的下一页按钮")
94 | first_item = await insight.locate("列表中的第一个商品")
95 |
96 | # 条件定位
97 | error_msg = await insight.locate("如果存在错误信息的提示框")
98 | ```
99 |
100 | #### 定位策略
101 |
102 | Insight 使用多层次的定位策略:
103 |
104 | 1. **视觉识别**: 分析截图中的视觉元素
105 | 2. **语义理解**: 理解元素的功能和语义
106 | 3. **布局分析**: 考虑元素的位置关系
107 | 4. **上下文感知**: 结合页面状态和操作历史
108 |
109 | ```python
110 | class LocateResponse(BaseModel):
111 | """AI locate response schema"""
112 | elements: List[Dict[str, Any]] # 找到的元素列表
113 | reasoning: str # AI 推理过程
114 | confidence: float # 置信度
115 | errors: List[str] = [] # 错误信息
116 | ```
117 |
118 | #### 定位选项
119 |
120 | ```python
121 | from midscene.core.types import LocateOption
122 |
123 | options = LocateOption(
124 | multiple=True, # 查找多个匹配的元素
125 | timeout=10, # 定位超时时间
126 | wait_for_visible=True, # 等待元素可见
127 | confidence_threshold=0.8 # 最小置信度阈值
128 | )
129 |
130 | elements = await insight.locate("商品卡片", options)
131 | ```
132 |
133 | ### 2. 数据提取 (extract)
134 |
135 | 从页面提取结构化数据:
136 |
137 | ```python
138 | # 简单数据提取
139 | user_info = await insight.extract({
140 | "name": "用户姓名",
141 | "email": "邮箱地址",
142 | "role": "用户角色"
143 | })
144 |
145 | # 复杂列表数据
146 | products = await insight.extract({
147 | "products": [
148 | {
149 | "name": "商品名称",
150 | "price": "价格",
151 | "rating": "评分",
152 | "description": "商品描述",
153 | "in_stock": "是否有库存"
154 | }
155 | ]
156 | })
157 |
158 | # 嵌套结构数据
159 | page_data = await insight.extract({
160 | "header": {
161 | "title": "页面标题",
162 | "user": "当前用户名"
163 | },
164 | "content": {
165 | "articles": [
166 | {
167 | "title": "文章标题",
168 | "author": "作者",
169 | "date": "发布日期"
170 | }
171 | ]
172 | },
173 | "footer": {
174 | "copyright": "版权信息"
175 | }
176 | })
177 | ```
178 |
179 | #### 提取选项
180 |
181 | ```python
182 | from midscene.core.types import ExtractOption
183 |
184 | options = ExtractOption(
185 | return_thought=True, # 返回 AI 的思考过程
186 | schema_validation=True, # 启用数据结构验证
187 | timeout=30 # 提取超时时间
188 | )
189 |
190 | result = await insight.extract(schema, options)
191 | print(result["thought"]) # AI 的推理过程
192 | print(result["data"]) # 提取的数据
193 | ```
194 |
195 | ### 3. 智能断言 (assert_condition)
196 |
197 | 验证页面状态和条件:
198 |
199 | ```python
200 | # 状态断言
201 | result = await insight.assert_condition("用户已成功登录")
202 | assert result.passed, result.message
203 |
204 | # 内容断言
205 | result = await insight.assert_condition("页面显示了 5 个搜索结果")
206 | assert result.passed
207 |
208 | # 复杂条件断言
209 | result = await insight.assert_condition(
210 | "如果是新用户,页面应该显示欢迎指引"
211 | )
212 |
213 | # 否定断言
214 | result = await insight.assert_condition("页面没有显示错误信息")
215 | ```
216 |
217 | #### 断言结果
218 |
219 | ```python
220 | class AssertResult:
221 | passed: bool # 断言是否通过
222 | reasoning: str # AI 推理过程
223 | confidence: float # 置信度
224 | message: str # 详细消息
225 | ```
226 |
227 | ## 🔧 上下文管理
228 |
229 | ### 上下文提供者
230 |
231 | Insight 通过上下文提供者获取页面信息:
232 |
233 | ```python
234 | # 静态上下文
235 | context = UIContext(
236 | screenshot_base64="...",
237 | page_title="登录页面",
238 | url="https://example.com/login"
239 | )
240 | insight = Insight(context)
241 |
242 | # 动态上下文
243 | async def get_context(action: InsightAction) -> UIContext:
244 | # 根据操作类型获取不同的上下文信息
245 | if action == InsightAction.LOCATE:
246 | return await page.get_locate_context()
247 | elif action == InsightAction.EXTRACT:
248 | return await page.get_extract_context()
249 | else:
250 | return await page.get_default_context()
251 |
252 | insight = Insight(get_context)
253 | ```
254 |
255 | ### 上下文类型
256 |
257 | ```python
258 | class UIContext(BaseModel):
259 | """UI context information"""
260 | screenshot_base64: str # 页面截图(Base64 编码)
261 | page_title: str # 页面标题
262 | url: str # 页面 URL
263 | viewport_size: tuple # 视口大小
264 | device_pixel_ratio: float # 设备像素比
265 | elements: List[BaseElement] # 页面元素信息
266 | timestamp: float # 时间戳
267 | ```
268 |
269 | ## 🎨 AI 消息构建
270 |
271 | ### 定位消息
272 |
273 | Insight 为不同操作构建专门的 AI 消息:
274 |
275 | ```python
276 | def _build_locate_messages(
277 | self,
278 | prompt: str,
279 | context: UIContext,
280 | options: LocateOption
281 | ) -> List[Dict]:
282 | """构建元素定位的 AI 消息"""
283 | return [
284 | {
285 | "role": "system",
286 | "content": self._get_locate_system_prompt()
287 | },
288 | {
289 | "role": "user",
290 | "content": [
291 | {
292 | "type": "text",
293 | "text": f"请在页面中定位:{prompt}"
294 | },
295 | {
296 | "type": "image_url",
297 | "image_url": {
298 | "url": f"data:image/png;base64,{context.screenshot_base64}"
299 | }
300 | }
301 | ]
302 | }
303 | ]
304 | ```
305 |
306 | ### 系统提示词
307 |
308 | ```python
309 | def _get_locate_system_prompt(self) -> str:
310 | """获取元素定位的系统提示词"""
311 | return """
312 | 你是一个专业的UI元素定位专家。请分析页面截图,根据用户描述精确定位目标元素。
313 |
314 | 定位原则:
315 | 1. 优先考虑功能语义而非视觉外观
316 | 2. 结合上下文理解元素关系
317 | 3. 对于模糊描述,选择最可能的候选元素
318 | 4. 提供详细的定位推理过程
319 |
320 | 返回格式:
321 | {
322 | "elements": [
323 | {
324 | "rect": {"x": 0, "y": 0, "width": 100, "height": 30},
325 | "text": "元素文本",
326 | "tag": "元素标签",
327 | "attributes": {"id": "...", "class": "..."},
328 | "confidence": 0.95
329 | }
330 | ],
331 | "reasoning": "定位推理过程",
332 | "confidence": 0.9
333 | }
334 | """.strip()
335 | ```
336 |
337 | ## 📊 响应处理
338 |
339 | ### 元素处理
340 |
341 | ```python
342 | def _process_locate_response(
343 | self,
344 | response: LocateResponse,
345 | context: UIContext
346 | ) -> Optional[BaseElement]:
347 | """处理定位响应,返回最佳匹配元素"""
348 |
349 | if not response.elements:
350 | return None
351 |
352 | # 选择置信度最高的元素
353 | best_element = max(
354 | response.elements,
355 | key=lambda e: e.get("confidence", 0)
356 | )
357 |
358 | # 创建元素对象
359 | element = BaseElement(
360 | rect=best_element["rect"],
361 | text=best_element.get("text", ""),
362 | tag_name=best_element.get("tag", ""),
363 | attributes=best_element.get("attributes", {})
364 | )
365 |
366 | return element
367 | ```
368 |
369 | ### 数据验证
370 |
371 | ```python
372 | def _validate_extract_response(
373 | self,
374 | response: ExtractResponse,
375 | schema: Dict
376 | ) -> bool:
377 | """验证提取数据的结构是否符合预期"""
378 |
379 | try:
380 | # 使用 Pydantic 进行结构验证
381 | from pydantic import create_model
382 |
383 | # 动态创建验证模型
384 | validator = create_model("ExtractValidator", **schema)
385 | validator(**response.data)
386 |
387 | return True
388 | except Exception as e:
389 | logger.warning(f"Data validation failed: {e}")
390 | return False
391 | ```
392 |
393 | ## 🔍 调试和监控
394 |
395 | ### 调试订阅者
396 |
397 | Insight 支持调试订阅者来监控执行过程:
398 |
399 | ```python
400 | async def debug_subscriber(dump_data: Dict):
401 | """调试订阅者函数"""
402 | operation = dump_data["type"]
403 | prompt = dump_data.get("prompt", "")
404 |
405 | print(f"🔍 操作: {operation}")
406 | print(f"📝 提示: {prompt}")
407 |
408 | if "error" in dump_data:
409 | print(f"❌ 错误: {dump_data['error']}")
410 | else:
411 | print(f"✅ 成功")
412 |
413 | # 保存调试信息到文件
414 | with open(f"debug_{operation}.json", "w") as f:
415 | json.dump(dump_data, f, indent=2)
416 |
417 | # 注册调试订阅者
418 | insight.subscribe_to_dump(debug_subscriber)
419 | ```
420 |
421 | ### 执行统计
422 |
423 | ```python
424 | class InsightMetrics:
425 | """Insight 执行统计"""
426 |
427 | def __init__(self):
428 | self.operation_count = 0
429 | self.total_time = 0
430 | self.success_count = 0
431 | self.ai_tokens_used = 0
432 |
433 | def record_operation(self, operation: str, duration: float, success: bool, tokens: int):
434 | self.operation_count += 1
435 | self.total_time += duration
436 | if success:
437 | self.success_count += 1
438 | self.ai_tokens_used += tokens
439 |
440 | @property
441 | def success_rate(self) -> float:
442 | return self.success_count / self.operation_count if self.operation_count > 0 else 0
443 |
444 | @property
445 | def avg_time(self) -> float:
446 | return self.total_time / self.operation_count if self.operation_count > 0 else 0
447 |
448 | # 使用统计
449 | metrics = InsightMetrics()
450 | insight.set_metrics_collector(metrics)
451 | ```
452 |
453 | ## ⚙️ 高级配置
454 |
455 | ### 模型配置
456 |
457 | ```python
458 | from midscene.core.ai_model import AIModelConfig
459 |
460 | # 针对不同操作使用不同配置
461 | locate_config = AIModelConfig(
462 | provider="openai",
463 | model="gpt-4-vision-preview",
464 | temperature=0.1, # 定位需要更确定性
465 | max_tokens=500
466 | )
467 |
468 | extract_config = AIModelConfig(
469 | provider="claude",
470 | model="claude-3-sonnet-20240229",
471 | temperature=0.2, # 提取允许更多创造性
472 | max_tokens=2000
473 | )
474 |
475 | # 创建专门的 Insight 实例
476 | locate_insight = Insight(context_provider, model_config=locate_config)
477 | extract_insight = Insight(context_provider, model_config=extract_config)
478 | ```
479 |
480 | ### 缓存配置
481 |
482 | ```python
483 | # 启用智能缓存
484 | insight.enable_cache(
485 | cache_size=1000, # 缓存条目数
486 | ttl=3600, # 缓存过期时间(秒)
487 | hash_screenshot=True, # 基于截图内容生成缓存键
488 | cache_ai_responses=True # 缓存 AI 响应
489 | )
490 |
491 | # 缓存策略配置
492 | insight.set_cache_strategy(
493 | locate_cache_enabled=True, # 定位操作缓存
494 | extract_cache_enabled=True, # 提取操作缓存
495 | assert_cache_enabled=False # 断言操作不缓存(实时性要求高)
496 | )
497 | ```
498 |
499 | ## 🚀 性能优化
500 |
501 | ### 批量操作
502 |
503 | ```python
504 | # 批量定位多个元素
505 | elements = await insight.batch_locate([
506 | "登录按钮",
507 | "注册链接",
508 | "忘记密码链接"
509 | ])
510 |
511 | # 批量提取多个数据块
512 | data_blocks = await insight.batch_extract([
513 | {"user_info": {"name": "姓名", "email": "邮箱"}},
514 | {"product_list": [{"name": "商品名", "price": "价格"}]},
515 | {"navigation": {"items": ["导航项目"]}}
516 | ])
517 | ```
518 |
519 | ### 并发控制
520 |
521 | ```python
522 | # 设置并发限制
523 | insight.set_concurrency_limit(3)
524 |
525 | # 异步并发执行
526 | import asyncio
527 |
528 | async def parallel_operations():
529 | tasks = [
530 | insight.locate("按钮1"),
531 | insight.locate("按钮2"),
532 | insight.extract(schema1),
533 | insight.extract(schema2)
534 | ]
535 |
536 | results = await asyncio.gather(*tasks, return_exceptions=True)
537 | return results
538 | ```
539 |
540 | ## 🎯 最佳实践
541 |
542 | ### 1. 清晰的描述
543 | ```python
544 | # ❌ 模糊描述
545 | await insight.locate("按钮")
546 |
547 | # ✅ 具体描述
548 | await insight.locate("页面右上角的蓝色登录按钮")
549 | ```
550 |
551 | ### 2. 合理的置信度阈值
552 | ```python
553 | # 根据场景调整置信度要求
554 | options = LocateOption(
555 | confidence_threshold=0.9 # 高要求场景
556 | )
557 | element = await insight.locate("重要操作按钮", options)
558 | ```
559 |
560 | ### 3. 错误处理和重试
561 | ```python
562 | async def robust_locate(prompt: str, max_retries: int = 3):
563 | for attempt in range(max_retries):
564 | try:
565 | result = await insight.locate(prompt)
566 | if result.element:
567 | return result
568 | except Exception as e:
569 | if attempt == max_retries - 1:
570 | raise
571 | await asyncio.sleep(1) # 等待后重试
572 |
573 | raise ElementNotFoundError(f"Element not found after {max_retries} attempts")
574 | ```
575 |
576 | ### 4. 上下文优化
577 | ```python
578 | # 为不同操作提供优化的上下文
579 | async def optimized_context_provider(action: InsightAction) -> UIContext:
580 | base_context = await page.get_context()
581 |
582 | if action == InsightAction.LOCATE:
583 | # 定位操作需要更详细的元素信息
584 | base_context.elements = await page.get_all_elements()
585 | elif action == InsightAction.EXTRACT:
586 | # 提取操作需要更完整的页面内容
587 | base_context.page_content = await page.get_page_content()
588 |
589 | return base_context
590 | ```
591 |
592 | ## 🔗 相关文档
593 |
594 | - **Agent integration**: [Agent Core Controller](Agent核心控制器.md)
595 | - **AI models**: [AI Model Service Abstraction Layer](AI模型服务抽象层.md)
596 | - **Data types**: [UI Context and Data Models](UI上下文与数据模型.md)
597 | - **API reference**: [Insight API](../API参考/Insight-API.md)
598 |
599 | ---
600 |
601 | Insight is the intelligent core of Midscene Python: it is what lets the AI genuinely "see" and understand a user interface. Mastering Insight will significantly improve both the intelligence and the stability of your automation scripts!
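602 |
603 | As a closing example, here is a minimal end-to-end sketch that combines the practices above: a specific description, an explicit confidence threshold, and bounded retries. It assumes the `insight` instance and the `LocateOption` class shown earlier on this page; adapt the names to the APIs available in your installed version.
604 |
605 | ```python
606 | import asyncio
607 |
608 | # Assumes `insight` was constructed earlier, e.g. Insight(context_provider, model_config=...)
609 | options = LocateOption(confidence_threshold=0.9)
610 |
611 | async def locate_with_retry(prompt: str, retries: int = 3):
612 |     """Locate an element, retrying a few times before giving up."""
613 |     for attempt in range(retries):
614 |         result = await insight.locate(prompt, options)
615 |         if result.element:
616 |             return result.element
617 |         await asyncio.sleep(1)  # brief pause before the next attempt
618 |     raise RuntimeError(f"Could not locate: {prompt}")
619 |
620 | async def main():
621 |     button = await locate_with_retry("the blue login button in the top-right corner of the page")
622 |     await button.tap()
623 |
624 | asyncio.run(main())
625 | ```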
--------------------------------------------------------------------------------
/midscene/shared/report.py:
--------------------------------------------------------------------------------
1 | """
2 | Report generation and visualization
3 | """
4 |
5 | import json
6 | from datetime import datetime
7 | from pathlib import Path
8 | from typing import Any, Dict, List, Optional
9 |
10 | from jinja2 import Template
11 | from loguru import logger
12 |
13 |
14 | class ExecutionReport:
15 | """Execution report data model"""
16 |
17 | def __init__(self):
18 | self.start_time = datetime.now()
19 | self.end_time: Optional[datetime] = None
20 | self.success = True
21 | self.error: Optional[str] = None
22 | self.tasks: List[Dict[str, Any]] = []
23 | self.metadata: Dict[str, Any] = {}
24 | self.screenshots: List[Dict[str, Any]] = []
25 | self.ai_usage: Dict[str, Any] = {}
26 |
27 | def add_task(self, task_data: Dict[str, Any]) -> None:
28 | """Add task execution data"""
29 | self.tasks.append({
30 | **task_data,
31 | "timestamp": datetime.now().isoformat()
32 | })
33 |
34 | def add_screenshot(self, screenshot_base64: str, description: str = "") -> None:
35 | """Add screenshot to report"""
36 | self.screenshots.append({
37 | "image": screenshot_base64,
38 | "description": description,
39 | "timestamp": datetime.now().isoformat()
40 | })
41 |
42 | def update_ai_usage(self, usage_data: Dict[str, Any]) -> None:
43 | """Update AI usage statistics"""
44 | for key, value in usage_data.items():
45 | if key in self.ai_usage:
46 | if isinstance(value, (int, float)):
47 | self.ai_usage[key] += value
48 | else:
49 | self.ai_usage[key] = value
50 | else:
51 | self.ai_usage[key] = value
52 |
53 | def finalize(self, success: bool = True, error: Optional[str] = None) -> None:
54 | """Finalize report"""
55 | self.end_time = datetime.now()
56 | self.success = success
57 | self.error = error
58 |
59 | def to_dict(self) -> Dict[str, Any]:
60 | """Convert to dictionary"""
61 | duration = None
62 | if self.end_time:
63 | duration = (self.end_time - self.start_time).total_seconds()
64 |
65 | return {
66 | "start_time": self.start_time.isoformat(),
67 | "end_time": self.end_time.isoformat() if self.end_time else None,
68 | "duration_seconds": duration,
69 | "success": self.success,
70 | "error": self.error,
71 | "tasks": self.tasks,
72 | "metadata": self.metadata,
73 | "screenshots": self.screenshots,
74 | "ai_usage": self.ai_usage,
75 | "summary": {
76 | "total_tasks": len(self.tasks),
77 | "successful_tasks": len([t for t in self.tasks if t.get("success", True)]),
78 | "failed_tasks": len([t for t in self.tasks if not t.get("success", True)]),
79 | "total_screenshots": len(self.screenshots)
80 | }
81 | }
82 |
83 |
84 | class ReportGenerator:
85 | """Generate execution reports in various formats"""
86 |
87 | def __init__(self, output_dir: str = "./reports"):
88 | """Initialize report generator
89 |
90 | Args:
91 | output_dir: Output directory for reports
92 | """
93 | self.output_dir = Path(output_dir)
94 | self.output_dir.mkdir(parents=True, exist_ok=True)
95 |
96 | def generate_html_report(
97 | self,
98 | report: ExecutionReport,
99 | template_path: Optional[str] = None
100 | ) -> str:
101 | """Generate HTML report
102 |
103 | Args:
104 | report: Execution report data
105 | template_path: Custom template path
106 |
107 | Returns:
108 | Path to generated HTML file
109 | """
110 | if template_path:
111 | with open(template_path, 'r', encoding='utf-8') as f:
112 | template_content = f.read()
113 | else:
114 | template_content = self._get_default_html_template()
115 |
116 | template = Template(template_content)
117 |
118 | # Generate report
119 | html_content = template.render(
120 | report=report.to_dict(),
121 | generated_at=datetime.now().isoformat()
122 | )
123 |
124 | # Save to file
125 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
126 | filename = f"midscene_report_{timestamp}.html"
127 | file_path = self.output_dir / filename
128 |
129 | with open(file_path, 'w', encoding='utf-8') as f:
130 | f.write(html_content)
131 |
132 | logger.info(f"HTML report generated: {file_path}")
133 | return str(file_path)
134 |
135 | def generate_json_report(self, report: ExecutionReport) -> str:
136 | """Generate JSON report
137 |
138 | Args:
139 | report: Execution report data
140 |
141 | Returns:
142 | Path to generated JSON file
143 | """
144 | # Save to file
145 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
146 | filename = f"midscene_report_{timestamp}.json"
147 | file_path = self.output_dir / filename
148 |
149 | with open(file_path, 'w', encoding='utf-8') as f:
150 | json.dump(report.to_dict(), f, ensure_ascii=False, indent=2)
151 |
152 | logger.info(f"JSON report generated: {file_path}")
153 | return str(file_path)
154 |
155 | def _get_default_html_template(self) -> str:
156 | """Get default HTML template"""
157 | return """
158 | <!DOCTYPE html>
159 | <html lang="en">
160 | <head>
161 | <meta charset="utf-8">
162 | <meta name="viewport" content="width=device-width, initial-scale=1">
163 | <title>Midscene Execution Report</title>
164 | <style>
165 | body { font-family: sans-serif; margin: 2rem; color: #222; }
166 | .summary { display: flex; gap: 1rem; margin: 1.5rem 0; }
167 | .card { flex: 1; border: 1px solid #ddd; border-radius: 6px; padding: 1rem; text-align: center; }
168 | .section { margin: 1.5rem 0; }
169 | .task { border: 1px solid #eee; border-radius: 6px; padding: .75rem; margin-bottom: .5rem; }
170 | .error { color: #c0392b; }
171 | .timestamp { color: #888; font-size: .85em; }
172 | img { max-width: 100%; border: 1px solid #ddd; }
173 | </style>
174 | </head>
175 | <body>
176 | <h1>Midscene Execution Report</h1>
177 | <p>
178 | Status: {% if report.success %}✅ Success{% else %}❌ Failed{% endif %} |
179 | Started: {{ report.start_time }}{% if report.end_time %} | Finished: {{ report.end_time }}{% endif %}
180 | </p>
181 | {% if report.error %}
182 | <p class="error">Error: {{ report.error }}</p>
183 | {% endif %}
184 |
185 | <div class="summary">
186 | <div class="card">
187 | <h3>Duration</h3>
188 | {% if report.duration_seconds %}
189 | <p>{{ "%.1f"|format(report.duration_seconds) }}s</p>
190 | {% else %}
191 | <p>-</p>
192 | {% endif %}
193 | </div>
194 | <div class="card">
195 | <h3>Total Tasks</h3>
196 | <p>{{ report.summary.total_tasks }}</p>
197 | </div>
198 | <div class="card">
199 | <h3>Successful</h3>
200 | <p>{{ report.summary.successful_tasks }}</p>
201 | </div>
202 | <div class="card">
203 | <h3>Failed</h3>
204 | <p>{{ report.summary.failed_tasks }}</p>
205 | </div>
206 | </div>
207 |
208 | {% if report.tasks %}
209 | <div class="section">
210 | <h2>📋 Task Execution</h2>
211 | {% for task in report.tasks %}
212 | <div class="task">
213 | {% if task.get('error') %}
214 | <p class="error">Error: {{ task.error }}</p>
215 | {% endif %}
216 | {% if task.get('result') %}
217 | <p>Result: {{ task.result }}</p>
218 | {% endif %}
219 | <p class="timestamp">{{ task.timestamp }}</p>
220 | </div>
221 | {% endfor %}
222 | </div>
223 | {% endif %}
224 |
225 | {% if report.ai_usage %}
226 | <div class="section">
227 | <h2>🧠 AI Usage Statistics</h2>
228 | <ul>
229 | {% for key, value in report.ai_usage.items() %}
230 | <li>{{ key.replace('_', ' ').title() }}: {{ value }}</li>
231 | {% endfor %}
232 | </ul>
233 | </div>
234 | {% endif %}
235 |
236 | {% if report.screenshots %}
237 | <div class="section">
238 | <h2>📸 Screenshots</h2>
239 | {% for screenshot in report.screenshots %}
240 | <div>
241 | <img src="data:image/png;base64,{{ screenshot.image }}" alt="Screenshot">
242 | {% if screenshot.description %}
243 | <p>{{ screenshot.description }}</p>
244 | {% endif %}
245 | </div>
246 | {% endfor %}
247 | </div>
248 | {% endif %}
249 |
250 | <footer class="timestamp">Generated at {{ generated_at }}</footer>
251 | </body>
252 | </html>
253 | """.strip()
433 |
434 |
435 | def create_report() -> ExecutionReport:
436 | """Create new execution report
437 |
438 | Returns:
439 | ExecutionReport instance
440 | """
441 | return ExecutionReport()
--------------------------------------------------------------------------------
/midscene/web/playwright_page.py:
--------------------------------------------------------------------------------
1 | """
2 | Playwright integration for Midscene
3 | """
4 |
5 | import base64
6 | import json
7 | from typing import Any, Dict, List, Optional
8 |
9 | from playwright.async_api import async_playwright, Page, Browser, BrowserContext
10 | from loguru import logger
11 |
12 | from ..core.types import (
13 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree,
14 | Size, Rect, Point, NodeType
15 | )
16 |
17 |
18 | class PlaywrightElement(BaseElement):
19 | """Playwright element wrapper"""
20 |
21 | def __init__(self, page: Page, selector: str, **kwargs):
22 | self._page = page
23 | self._selector = selector
24 | super().__init__(**kwargs)
25 |
26 | async def tap(self) -> None:
27 | """Click this element"""
28 | try:
29 | await self._page.click(self._selector)
30 | except Exception as e:
31 | logger.error(f"Failed to click element: {e}")
32 | raise
33 |
34 | async def input_text(self, text: str) -> None:
35 | """Input text to this element"""
36 | try:
37 | await self._page.fill(self._selector, text)
38 | except Exception as e:
39 | logger.error(f"Failed to input text: {e}")
40 | raise
41 |
42 |
43 | class PlaywrightWebPage(AbstractInterface):
44 | """Playwright page interface"""
45 |
46 | def __init__(self, page: Page, context: BrowserContext, browser: Browser):
47 | """Initialize with Playwright page
48 |
49 | Args:
50 | page: Playwright page instance
51 | context: Browser context
52 | browser: Browser instance
53 | """
54 | self.page = page
55 | self.context = context
56 | self.browser = browser
57 |
58 | @classmethod
59 | async def create(
60 | cls,
61 | headless: bool = False,
62 | viewport_size: tuple[int, int] = (1920, 1080),
63 | user_data_dir: Optional[str] = None,
64 | **browser_options
65 | ) -> 'PlaywrightWebPage':
66 | """Create new Playwright page instance
67 |
68 | Args:
69 | headless: Run in headless mode
70 | viewport_size: Browser viewport size
71 | user_data_dir: Browser user data directory
72 | **browser_options: Additional browser options
73 |
74 | Returns:
75 | PlaywrightWebPage instance
76 | """
77 | playwright = await async_playwright().start()
78 |
79 | viewport = {"width": viewport_size[0], "height": viewport_size[1]}
80 |
81 | if user_data_dir:
82 | # A persistent profile must be opened with launch_persistent_context(),
83 | # which returns a BrowserContext directly (its .browser may be None).
84 | context = await playwright.chromium.launch_persistent_context(
85 | user_data_dir, headless=headless, viewport=viewport, **browser_options
86 | )
87 | browser = context.browser
88 | else:
89 | browser = await playwright.chromium.launch(
90 | headless=headless, **browser_options
91 | )
92 | context = await browser.new_context(viewport=viewport)
93 |
94 | page = await context.new_page()
95 | return cls(page, context, browser)
96 |
97 | @property
98 | def interface_type(self) -> InterfaceType:
99 | """Get interface type"""
100 | return InterfaceType.WEB
101 |
102 | async def get_context(self) -> UIContext:
103 | """Get current UI context"""
104 | try:
105 | # Take screenshot
106 | screenshot_base64 = await self._take_screenshot()
107 |
108 | # Get page size
109 | size = await self._get_page_size()
110 |
111 | # Extract DOM elements
112 | elements = await self._extract_elements()
113 |
114 | # Build UI tree
115 | tree = await self._build_ui_tree()
116 |
117 | return UIContext(
118 | screenshot_base64=screenshot_base64,
119 | size=size,
120 | content=elements,
121 | tree=tree
122 | )
123 |
124 | except Exception as e:
125 | logger.error(f"Failed to get context: {e}")
126 | raise
127 |
128 | async def action_space(self) -> List[str]:
129 | """Get available actions"""
130 | return [
131 | "tap", "click", "double_click", "right_click",
132 | "input", "type", "fill", "clear",
133 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right",
134 | "hover", "drag", "key_press", "navigate", "reload",
135 | "go_back", "go_forward"
136 | ]
137 |
138 | async def tap(self, x: float, y: float) -> None:
139 | """Tap at coordinates"""
140 | try:
141 | await self.page.mouse.click(x, y)
142 | except Exception as e:
143 | logger.error(f"Failed to tap at ({x}, {y}): {e}")
144 | raise
145 |
146 | async def input_text(self, text: str) -> None:
147 | """Input text to focused element"""
148 | try:
149 | await self.page.keyboard.type(text)
150 | except Exception as e:
151 | logger.error(f"Failed to input text: {e}")
152 | raise
153 |
154 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None:
155 | """Scroll in direction"""
156 | try:
157 | distance = distance or 500
158 |
159 | if direction == "down":
160 | await self.page.mouse.wheel(0, distance)
161 | elif direction == "up":
162 | await self.page.mouse.wheel(0, -distance)
163 | elif direction == "right":
164 | await self.page.mouse.wheel(distance, 0)
165 | elif direction == "left":
166 | await self.page.mouse.wheel(-distance, 0)
167 | else:
168 | raise ValueError(f"Invalid scroll direction: {direction}")
169 |
170 | except Exception as e:
171 | logger.error(f"Failed to scroll {direction}: {e}")
172 | raise
173 |
174 | async def navigate_to(self, url: str) -> None:
175 | """Navigate to URL"""
176 | try:
177 | await self.page.goto(url, wait_until="networkidle")
178 | except Exception as e:
179 | logger.error(f"Failed to navigate to {url}: {e}")
180 | raise
181 |
182 | async def wait_for_element(
183 | self,
184 | selector: str,
185 | timeout: float = 10000
186 | ) -> None:
187 | """Wait for element to be present"""
188 | try:
189 | await self.page.wait_for_selector(selector, timeout=timeout)
190 | except Exception as e:
191 | raise TimeoutError(f"Element not found: {selector}") from e
192 |
193 | async def evaluate_script(self, script: str, *args) -> Any:
194 | """Evaluate JavaScript"""
195 | return await self.page.evaluate(script, *args)
196 |
197 | async def close(self) -> None:
198 | """Close the browser"""
199 | try:
200 | await self.context.close()
201 | if self.browser: await self.browser.close()  # browser may be None for persistent contexts
202 | except Exception as e:
203 | logger.warning(f"Error closing browser: {e}")
204 |
205 | async def _take_screenshot(self) -> str:
206 | """Take screenshot and return base64 string"""
207 | try:
208 | # Take screenshot as bytes
209 | screenshot_bytes = await self.page.screenshot(type="png")
210 |
211 | # Convert to base64
212 | screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
213 |
214 | return screenshot_base64
215 |
216 | except Exception as e:
217 | logger.error(f"Failed to take screenshot: {e}")
218 | raise
219 |
220 | async def _get_page_size(self) -> Size:
221 | """Get page viewport size"""
222 | try:
223 | viewport_size = await self.page.evaluate("""
224 | () => ({
225 | width: window.innerWidth,
226 | height: window.innerHeight
227 | })
228 | """)
229 |
230 | return Size(
231 | width=viewport_size['width'],
232 | height=viewport_size['height']
233 | )
234 |
235 | except Exception as e:
236 | logger.error(f"Failed to get page size: {e}")
237 | return Size(width=1920, height=1080)
238 |
239 | async def _extract_elements(self) -> List[PlaywrightElement]:
240 | """Extract all visible elements from page"""
241 | try:
242 | # Use JavaScript to extract element information
243 | element_data = await self.page.evaluate("""
244 | () => {
245 | const elements = [];
246 | const allElements = document.querySelectorAll('*');
247 |
248 | allElements.forEach((el, index) => {
249 | const rect = el.getBoundingClientRect();
250 |
251 | // Skip elements that are not visible
252 | if (rect.width === 0 || rect.height === 0 ||
253 | rect.top < 0 || rect.left < 0 ||
254 | getComputedStyle(el).visibility === 'hidden' ||
255 | getComputedStyle(el).display === 'none') {
256 | return;
257 | }
258 |
259 | // Generate a selector for this element
260 | const selector = generateSelector(el);
261 |
262 | elements.push({
263 | id: `element_${index}`,
264 | selector: selector,
265 | tagName: el.tagName.toLowerCase(),
266 | content: el.textContent?.trim() || el.getAttribute('alt') || el.getAttribute('title') || '',
267 | rect: {
268 | left: rect.left,
269 | top: rect.top,
270 | width: rect.width,
271 | height: rect.height
272 | },
273 | center: [rect.left + rect.width / 2, rect.top + rect.height / 2],
274 | attributes: {
275 | id: el.id,
276 | className: el.className,
277 | type: el.type,
278 | name: el.name,
279 | href: el.href,
280 | src: el.src,
281 | value: el.value,
282 | placeholder: el.placeholder
283 | }
284 | });
285 | });
286 |
287 | function generateSelector(element) {
288 | if (element.id) {
289 | return `#${element.id}`;
290 | }
291 |
292 | let path = element.tagName.toLowerCase();
293 | let parent = element.parentElement;
294 |
295 | while (parent && parent !== document.body) {
296 | const siblings = Array.from(parent.children);
297 | const index = siblings.indexOf(element) + 1;
298 | path = `${parent.tagName.toLowerCase()}:nth-child(${index}) > ${path}`;
299 | element = parent;
300 | parent = element.parentElement;
301 | }
302 |
303 | return path;
304 | }
305 |
306 | return elements;
307 | }
308 | """)
309 |
310 | elements = []
311 | for data in element_data:
312 | rect_data = data['rect']
313 | rect = Rect(
314 | left=rect_data['left'],
315 | top=rect_data['top'],
316 | width=rect_data['width'],
317 | height=rect_data['height']
318 | )
319 |
320 | # Determine node type
321 | tag_name = data['tagName']
322 | node_type = self._get_node_type(tag_name, data['attributes'])
323 |
324 | element = PlaywrightElement(
325 | page=self.page,
326 | selector=data['selector'],
327 | id=data['id'],
328 | content=data['content'],
329 | rect=rect,
330 | center=tuple(data['center']),
331 | node_type=node_type,
332 | attributes=data['attributes'],
333 | is_visible=True
334 | )
335 |
336 | elements.append(element)
337 |
338 | return elements
339 |
340 | except Exception as e:
341 | logger.error(f"Failed to extract elements: {e}")
342 | return []
343 |
344 | def _get_node_type(self, tag_name: str, attributes: Dict[str, Any]) -> NodeType:
345 | """Determine node type from tag name and attributes"""
346 | if tag_name in ['input', 'textarea']:
347 | input_type = attributes.get('type', '').lower()
348 | if input_type in ['text', 'password', 'email', 'search', 'url', 'tel']:
349 | return NodeType.INPUT
350 | elif input_type in ['button', 'submit', 'reset']:
351 | return NodeType.BUTTON
352 | elif tag_name in ['button']:
353 | return NodeType.BUTTON
354 | elif tag_name in ['a']:
355 | return NodeType.LINK
356 | elif tag_name in ['img']:
357 | return NodeType.IMAGE
358 | elif tag_name in ['div', 'span', 'section', 'article', 'header', 'footer', 'nav']:
359 | return NodeType.CONTAINER
360 | elif tag_name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'td', 'th']:
361 | return NodeType.TEXT
362 | else:
363 | return NodeType.OTHER
364 |
365 | async def _build_ui_tree(self) -> UITree:
366 | """Build UI tree structure"""
367 | try:
368 | # Simplified tree building - just create a root container
369 | # In a full implementation, we would parse the actual DOM tree
370 | root_node = UINode(
371 | id="root",
372 | content="",
373 | rect=Rect(left=0, top=0, width=1920, height=1080),
374 | center=(960, 540),
375 | node_type=NodeType.CONTAINER,
376 | attributes={},
377 | is_visible=True,
378 | children=[]
379 | )
380 |
381 | return UITree(node=root_node, children=[])
382 |
383 | except Exception as e:
384 | logger.error(f"Failed to build UI tree: {e}")
385 | # Return minimal tree
386 | root_node = UINode(
387 | id="root",
388 | content="",
389 | rect=Rect(left=0, top=0, width=1920, height=1080),
390 | center=(960, 540),
391 | node_type=NodeType.CONTAINER,
392 | attributes={},
393 | is_visible=True,
394 | children=[]
395 | )
396 | return UITree(node=root_node, children=[])
397 |
398 | async def __aenter__(self):
399 | return self
400 |
401 | async def __aexit__(self, exc_type, exc_val, exc_tb):
402 | await self.close()
--------------------------------------------------------------------------------