├── tests ├── __init__.py └── test_core.py ├── midscene ├── cli │ ├── __init__.py │ ├── main.py │ └── config.py ├── android │ ├── __init__.py │ └── agent.py ├── shared │ ├── __init__.py │ ├── logger.py │ ├── cache.py │ └── report.py ├── web │ ├── __init__.py │ ├── bridge.py │ └── playwright_page.py ├── core │ ├── ai_model │ │ ├── __init__.py │ │ ├── service.py │ │ └── providers.py │ ├── __init__.py │ └── types.py └── __init__.py ├── .env.example ├── .github └── workflows │ └── publish.yml ├── midscene.yml ├── LICENSE ├── scripts ├── quick_validate.bat └── validate_requirements.py ├── Makefile ├── .gitignore ├── wiki ├── README.md ├── 核心概念 │ ├── README.md │ ├── Agent核心控制器.md │ └── Insight-UI理解引擎.md ├── 生成状态.md ├── 项目概述.md ├── 快速开始.md ├── 安装配置.md └── 平台集成 │ └── README.md ├── examples └── basic_usage.py ├── pyproject.toml ├── README.zh.md ├── README.md ├── docs └── quickstart.md └── requirements.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package for Midscene Python""" -------------------------------------------------------------------------------- /midscene/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI module for Midscene Python 3 | """ 4 | 5 | from .main import main, app 6 | 7 | __all__ = ["main", "app"] -------------------------------------------------------------------------------- /midscene/android/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Android integration module for Midscene Python 3 | """ 4 | 5 | from .device import AndroidDevice 6 | from .agent import AndroidAgent 7 | 8 | __all__ = [ 9 | "AndroidDevice", 10 | "AndroidAgent", 11 | ] -------------------------------------------------------------------------------- /midscene/shared/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utilities and tools for Midscene Python 3 | """ 4 | 5 | from .cache import TaskCache 6 | from .logger import setup_logger 7 | from .report import ReportGenerator 8 | 9 | __all__ = [ 10 | "TaskCache", 11 | "setup_logger", 12 | "ReportGenerator", 13 | ] -------------------------------------------------------------------------------- /midscene/web/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Web integration module for Midscene Python 3 | """ 4 | 5 | from .selenium_page import SeleniumWebPage 6 | from .playwright_page import PlaywrightWebPage 7 | from .bridge import BridgeWebPage 8 | 9 | __all__ = [ 10 | "SeleniumWebPage", 11 | "PlaywrightWebPage", 12 | "BridgeWebPage", 13 | ] -------------------------------------------------------------------------------- /midscene/core/ai_model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI model integration module 3 | """ 4 | 5 | from .service import AIModelService, AIModelConfig 6 | from .providers import OpenAIProvider, AnthropicProvider, QwenProvider, GeminiProvider 7 | 8 | __all__ = [ 9 | "AIModelService", 10 | "AIModelConfig", 11 | "OpenAIProvider", 12 | "AnthropicProvider", 13 | "QwenProvider", 14 | "GeminiProvider", 15 | ] -------------------------------------------------------------------------------- /midscene/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core module for Midscene Python 3 | """ 4 | 5 | from .agent import 
Agent 6 | from .insight import Insight 7 | from .types import * 8 | 9 | __all__ = [ 10 | "Agent", 11 | "Insight", 12 | "UIContext", 13 | "LocateResult", 14 | "ExecutionResult", 15 | "BaseElement", 16 | "AbstractInterface", 17 | "InterfaceType", 18 | "AgentOptions", 19 | "LocateOption", 20 | "ExtractOption", 21 | "ScrollParam", 22 | ] -------------------------------------------------------------------------------- /midscene/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Midscene Python - AI-powered automation framework 3 | 4 | A Python implementation of Midscene, providing AI-driven automation 5 | capabilities for Web and Android platforms. 6 | """ 7 | 8 | from .core.agent import Agent 9 | from .core.insight import Insight 10 | from .core.types import UIContext, LocateResult, ExecutionResult 11 | 12 | __version__ = "0.1.0" 13 | 14 | __all__ = [ 15 | "Agent", 16 | "Insight", 17 | "UIContext", 18 | "LocateResult", 19 | "ExecutionResult", 20 | ] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Environment Variables for Midscene Python 2 | 3 | # AI Model Configuration 4 | MIDSCENE_AI_PROVIDER=openai 5 | MIDSCENE_AI_MODEL=gpt-4-vision-preview 6 | MIDSCENE_AI_API_KEY=your-api-key-here 7 | # MIDSCENE_AI_BASE_URL=https://api.openai.com 8 | 9 | # Execution Settings 10 | MIDSCENE_CONCURRENT=1 11 | MIDSCENE_CONTINUE_ON_ERROR=false 12 | MIDSCENE_GENERATE_REPORT=true 13 | 14 | # Logging 15 | MIDSCENE_LOG_LEVEL=INFO 16 | MIDSCENE_LOG_FILE=midscene.log 17 | 18 | # Development Settings 19 | MIDSCENE_DEBUG=false 20 | MIDSCENE_CACHE_ENABLED=true -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # 当推送以 'v' 开头的标签时触发 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: windows-latest # 使用 Windows 环境 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.x' 19 | 20 | - name: Install uv 21 | run: | 22 | powershell -c "irm https://astral.sh/uv/install.sh | iex" 23 | echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append 24 | 25 | - name: Build package 26 | run: uv build 27 | 28 | - name: Publish to PyPI 29 | env: 30 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }} # 使用 UV_PUBLISH_TOKEN 替代 TWINE_PASSWORD 31 | run: uv publish dist/* 32 | -------------------------------------------------------------------------------- /midscene.yml: -------------------------------------------------------------------------------- 1 | # Midscene Python Configuration 2 | 3 | # AI Model Configuration 4 | ai: 5 | provider: "openai" # openai, anthropic, qwen, gemini 6 | model: "gpt-4-vision-preview" 7 | api_key: "${MIDSCENE_AI_API_KEY}" # Set via environment variable 8 | base_url: null # Custom API endpoint if needed 9 | max_tokens: 4000 10 | temperature: 0.1 11 | 12 | # Web Automation Configuration 13 | web: 14 | browser: "chrome" # chrome, firefox, safari 15 | headless: false 16 | window_size: [1920, 1080] 17 | user_data_dir: null # Browser profile directory 18 | timeout: 30 19 | 20 | # Android Automation Configuration 21 | android: 22 | device_id: null # Auto-detect if null 23 | 
adb_path: "adb" 24 | auto_dismiss_keyboard: true 25 | timeout: 30 26 | 27 | # Execution Configuration 28 | execution: 29 | concurrent: 1 # Number of concurrent script executions 30 | continue_on_error: false # Continue executing scripts on error 31 | generate_report: true 32 | report_format: "html" # html, json, xml 33 | output_dir: "./reports" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Python51888 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /midscene/cli/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Midscene CLI - Command line interface for automation scripts 3 | """ 4 | 5 | import sys 6 | from typing import Optional 7 | 8 | import typer 9 | from rich.console import Console 10 | 11 | from .config import CLIConfig 12 | 13 | app = typer.Typer( 14 | name="midscene", 15 | help="AI-powered automation framework for Web and Android platforms", 16 | no_args_is_help=True 17 | ) 18 | 19 | console = Console() 20 | 21 | 22 | @app.command() 23 | def run( 24 | script_path: str = typer.Argument(..., help="Path to YAML script file or directory"), 25 | config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Configuration file path"), 26 | headless: bool = typer.Option(False, "--headless", help="Run browser in headless mode"), 27 | device_id: Optional[str] = typer.Option(None, "--device", "-d", help="Android device ID"), 28 | verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), 29 | ): 30 | """Run automation script(s)""" 31 | 32 | console.print(f"[yellow]Script execution not yet implemented: {script_path}[/yellow]") 33 | console.print("[blue]This is a placeholder CLI implementation[/blue]") 34 | 35 | 36 | @app.command() 37 | def version(): 38 | """Show version information""" 39 | 40 | try: 41 | console.print("Midscene Python v0.1.0") 42 | 43 | except Exception as e: 44 | console.print(f"❌ Error getting version: {e}", style="red") 45 | 46 | 47 | def main(): 48 | """CLI entry point""" 49 | app() 50 | 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /scripts/quick_validate.bat: 
-------------------------------------------------------------------------------- 1 | @echo off 2 | chcp 65001 > nul 3 | echo === Midscene Python Dependencies Quick Validation === 4 | echo. 5 | 6 | REM Check if requirements.txt exists 7 | if not exist "requirements.txt" ( 8 | echo Error: requirements.txt file not found 9 | echo Please run: make requirements-freeze 10 | exit /b 1 11 | ) 12 | 13 | echo 1. Checking requirements.txt file... 14 | echo Success: requirements.txt exists 15 | 16 | REM Count dependencies 17 | for /f %%i in ('findstr /v "^#" requirements.txt ^| findstr /v "^$" ^| find /c "=="') do set count=%%i 18 | echo Success: Found %count% dependency packages 19 | 20 | echo. 21 | echo 2. Validating key dependencies... 22 | 23 | REM Check core dependencies 24 | findstr /i "pydantic==" requirements.txt >nul 2>&1 25 | if %errorlevel% equ 0 (echo Success: pydantic) else (echo Error: pydantic & set error=1) 26 | 27 | findstr /i "selenium==" requirements.txt >nul 2>&1 28 | if %errorlevel% equ 0 (echo Success: selenium) else (echo Error: selenium & set error=1) 29 | 30 | findstr /i "playwright==" requirements.txt >nul 2>&1 31 | if %errorlevel% equ 0 (echo Success: playwright) else (echo Error: playwright & set error=1) 32 | 33 | REM Check development dependencies 34 | findstr /i "pytest==" requirements.txt >nul 2>&1 35 | if %errorlevel% equ 0 (echo Success: pytest) else (echo Error: pytest & set error=1) 36 | 37 | findstr /i "black==" requirements.txt >nul 2>&1 38 | if %errorlevel% equ 0 (echo Success: black) else (echo Error: black & set error=1) 39 | 40 | REM Check documentation dependencies 41 | findstr /i "mkdocs==" requirements.txt >nul 2>&1 42 | if %errorlevel% equ 0 (echo Success: mkdocs) else (echo Error: mkdocs & set error=1) 43 | 44 | echo. 45 | if defined error ( 46 | echo Validation FAILED: Missing key dependencies 47 | exit /b 1 48 | ) else ( 49 | echo Validation PASSED! 
50 | echo requirements.txt contains all key dependencies 51 | ) -------------------------------------------------------------------------------- /midscene/shared/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for Midscene Python 3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from loguru import logger 10 | 11 | 12 | def setup_logger( 13 | level: str = "INFO", 14 | log_file: Optional[str] = None, 15 | rotation: str = "10 MB", 16 | retention: str = "7 days", 17 | format_string: Optional[str] = None 18 | ) -> None: 19 | """Setup logging configuration 20 | 21 | Args: 22 | level: Log level (DEBUG, INFO, WARNING, ERROR) 23 | log_file: Log file path 24 | rotation: Log rotation size/time 25 | retention: Log retention period 26 | format_string: Custom format string 27 | """ 28 | # Remove default logger 29 | logger.remove() 30 | 31 | # Default format 32 | if not format_string: 33 | format_string = ( 34 | "{time:YYYY-MM-DD HH:mm:ss.SSS} | " 35 | "{level: <8} | " 36 | "{name}:{function}:{line} | " 37 | "{message}" 38 | ) 39 | 40 | # Add console handler 41 | logger.add( 42 | sys.stderr, 43 | level=level, 44 | format=format_string, 45 | colorize=True, 46 | backtrace=True, 47 | diagnose=True 48 | ) 49 | 50 | # Add file handler if specified 51 | if log_file: 52 | log_path = Path(log_file) 53 | log_path.parent.mkdir(parents=True, exist_ok=True) 54 | 55 | logger.add( 56 | log_path, 57 | level=level, 58 | format=format_string, 59 | rotation=rotation, 60 | retention=retention, 61 | backtrace=True, 62 | diagnose=True 63 | ) 64 | 65 | logger.info(f"Logger configured with level: {level}") 66 | 67 | 68 | def get_logger(name: str): 69 | """Get logger instance 70 | 71 | Args: 72 | name: Logger name 73 | 74 | Returns: 75 | Logger instance 76 | """ 77 | return logger.bind(name=name) -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help install dev test lint format clean build docs requirements-freeze requirements-check 2 | 3 | # Default target 4 | help: 5 | @echo "Available commands:" 6 | @echo " install Install package and dependencies" 7 | @echo " dev Install development dependencies" 8 | @echo " requirements-freeze Generate complete requirements.txt" 9 | @echo " requirements-check Verify dependencies integrity" 10 | @echo " requirements-quick-check Quick requirements validation" 11 | @echo " test Run tests" 12 | @echo " lint Run linting" 13 | @echo " format Format code" 14 | @echo " clean Clean build artifacts" 15 | @echo " build Build package" 16 | @echo " docs Build documentation" 17 | 18 | # Generate complete requirements.txt with all dependencies 19 | requirements-freeze: 20 | uv pip compile --all-extras pyproject.toml -o requirements.txt 21 | 22 | # Verify dependencies integrity 23 | requirements-check: 24 | uv pip check 25 | @python scripts/validate_requirements.py 26 | 27 | # Quick requirements validation 28 | requirements-quick-check: 29 | @scripts/quick_validate.bat 30 | 31 | # Install package from requirements.txt 32 | install: 33 | pip install -r requirements.txt 34 | 35 | # Install package in development mode 36 | install-dev: 37 | pip install -e ".[dev,docs]" 38 | pre-commit install 39 | 40 | # Install development dependencies (alias for backward compatibility) 41 | dev: install-dev 42 | 43 | # Run tests 44 | test: 45 | pytest tests/ -v 
--cov=midscene --cov-report=html --cov-report=term-missing 46 | 47 | # Run tests with specific markers 48 | test-unit: 49 | pytest tests/ -v -m "unit" 50 | 51 | test-integration: 52 | pytest tests/ -v -m "integration" 53 | 54 | # Linting 55 | lint: 56 | ruff check midscene/ tests/ 57 | mypy midscene/ 58 | 59 | # Format code 60 | format: 61 | black midscene/ tests/ examples/ 62 | isort midscene/ tests/ examples/ 63 | ruff check --fix midscene/ tests/ 64 | 65 | # Clean build artifacts 66 | clean: 67 | rm -rf build/ 68 | rm -rf dist/ 69 | rm -rf *.egg-info/ 70 | rm -rf .pytest_cache/ 71 | rm -rf .coverage 72 | rm -rf htmlcov/ 73 | find . -type d -name __pycache__ -delete 74 | find . -type f -name "*.pyc" -delete 75 | 76 | # Build package 77 | build: clean 78 | python -m build 79 | 80 | # Build documentation 81 | docs: 82 | mkdocs build 83 | 84 | # Serve documentation locally 85 | docs-serve: 86 | mkdocs serve 87 | 88 | # Release to PyPI 89 | release: build 90 | twine upload dist/* 91 | 92 | # Release to Test PyPI 93 | release-test: build 94 | twine upload --repository testpypi dist/* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | Pipfile.lock 88 | 89 | # PEP 582 90 | __pypackages__/ 91 | 92 | # Celery stuff 93 | celerybeat-schedule 94 | celerybeat.pid 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Midscene specific 127 | reports/ 128 | .midscene/ 129 | *.log 130 | 131 | # IDE 132 | .vscode/ 133 | .idea/ 134 | *.swp 135 | *.swo 136 | 137 | # OS 138 | .DS_Store 139 | Thumbs.db 140 | ======= 141 | # Build and Release Folders 142 | bin-debug/ 143 | bin-release/ 144 | [Oo]bj/ 145 | [Bb]in/ 146 | 147 | # Other files and folders 148 | .settings/ 149 | 150 | # 
Executables 151 | *.swf 152 | *.air 153 | *.ipa 154 | *.apk 155 | 156 | # Project files, i.e. `.project`, `.actionScriptProperties` and `.flexProperties` 157 | # should NOT be excluded as they contain compiler settings and other important 158 | # information for Eclipse / Flash Builder. 159 | >>>>>>> 2a066347ae84a69f9986cffe451aeae1a5364b10 160 | 161 | # YoYo AI version control directory 162 | .yoyo/ 163 | -------------------------------------------------------------------------------- /wiki/README.md: -------------------------------------------------------------------------------- 1 | # Midscene Python Wiki 2 | 3 | 欢迎来到 Midscene Python 的完整文档!这里提供了详细的使用指南、API 参考和最佳实践。 4 | 5 | ## 📚 文档导航 6 | 7 | ### 基础入门 8 | - [项目概述](项目概述.md) - 了解 Midscene Python 的核心理念和特性 9 | - [快速开始](快速开始.md) - 5分钟快速上手指南 10 | - [安装配置](安装配置.md) - 详细的安装和环境配置说明 11 | 12 | ### 核心概念 13 | - [Agent 核心控制器](核心概念/Agent核心控制器.md) - 理解 Agent 的工作原理 14 | - [Insight UI理解引擎](核心概念/Insight-UI理解引擎.md) - AI 驱动的 UI 理解和操作 15 | - [AI模型服务抽象层](核心概念/AI模型服务抽象层.md) - 多种 AI 模型的统一接口 16 | - [UI上下文与数据模型](核心概念/UI上下文与数据模型.md) - 理解数据流和上下文管理 17 | 18 | ### API 参考 19 | - [Agent API](API参考/Agent-API.md) - Agent 类的完整 API 文档 20 | - [Insight API](API参考/Insight-API.md) - Insight 引擎的 API 参考 21 | - [AIModelService API](API参考/AIModelService-API.md) - AI 模型服务的接口说明 22 | 23 | ### 平台集成 24 | - [Web自动化](平台集成/Web自动化/README.md) - Web 平台自动化完整指南 25 | - [Selenium集成](平台集成/Web自动化/Selenium集成.md) - Selenium WebDriver 集成 26 | - [Playwright集成](平台集成/Web自动化/Playwright集成.md) - Playwright 集成指南 27 | - [Web桥接机制](平台集成/Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象层 28 | - [Android自动化](平台集成/Android自动化.md) - Android 设备自动化指南 29 | 30 | ### AI 模型配置 31 | - [配置方法](AI模型配置/配置方法.md) - AI 模型的基础配置 32 | - [支持的AI提供商](AI模型配置/支持的AI提供商/README.md) - 所有支持的 AI 服务商 33 | - [OpenAI提供商](AI模型配置/支持的AI提供商/OpenAI提供商.md) - GPT-4V 等模型配置 34 | - [Anthropic提供商](AI模型配置/支持的AI提供商/Anthropic提供商.md) - Claude 模型配置 35 | - [通义千问提供商](AI模型配置/支持的AI提供商/通义千问提供商.md) - Qwen2.5-VL 模型配置 36 | - [Gemini提供商](AI模型配置/支持的AI提供商/Gemini提供商.md) - Google Gemini 模型配置 37 | - [高级选项](AI模型配置/高级选项/README.md) - 高级配置和优化 38 | - [缓存策略](AI模型配置/高级选项/缓存策略.md) - 智能缓存机制 39 | - [请求重试与超时控制](AI模型配置/高级选项/请求重试与超时控制.md) - 网络请求优化 40 | - [配额管理与节流控制](AI模型配置/高级选项/配额管理与节流控制.md) - 成本控制和速率限制 41 | - [流式响应处理](AI模型配置/高级选项/流式响应处理.md) - 实时响应处理 42 | - [性能调优技巧](AI模型配置/高级选项/性能调优技巧.md) - 性能优化最佳实践 43 | 44 | ### 高级特性 45 | - [智能缓存机制](高级特性/智能缓存机制.md) - 提升执行效率的缓存系统 46 | - [可视化报告系统](高级特性/可视化报告系统.md) - 详细的执行报告和调试信息 47 | - [CLI工具高级用法](高级特性/CLI工具高级用法.md) - 命令行工具的进阶使用 48 | 49 | ### 开发指南 50 | - [贡献指南](开发指南/贡献指南.md) - 如何参与项目开发 51 | - [架构设计](开发指南/架构设计.md) - 深入理解项目架构 52 | - [开发环境配置](开发指南/开发环境配置.md) - 搭建开发环境 53 | - [测试指南](开发指南/测试指南.md) - 单元测试和集成测试 54 | 55 | ### 故障排除 56 | - [常见问题](故障排除/常见问题.md) - FAQ 和解决方案 57 | - [调试技巧](故障排除/调试技巧.md) - 调试和问题定位方法 58 | - [错误代码参考](故障排除/错误代码参考.md) - 错误代码含义和解决方案 59 | 60 | ### 示例和教程 61 | - [基础示例](示例和教程/基础示例.md) - 入门级使用示例 62 | - [高级应用场景](示例和教程/高级应用场景.md) - 复杂场景的实现方案 63 | - [最佳实践](示例和教程/最佳实践.md) - 生产环境使用建议 64 | 65 | ## 🚀 快速链接 66 | 67 | - **新手入门**: [快速开始](快速开始.md) → [基础示例](示例和教程/基础示例.md) 68 | - **API 查询**: [Agent API](API参考/Agent-API.md) → [Insight API](API参考/Insight-API.md) 69 | - **平台集成**: [Web自动化](平台集成/Web自动化/README.md) → [Android自动化](平台集成/Android自动化.md) 70 | - **问题解决**: [常见问题](故障排除/常见问题.md) → [调试技巧](故障排除/调试技巧.md) 71 | 72 | ## 📖 文档维护 73 | 74 | 本文档随项目持续更新,如发现内容错误或需要补充,请提交 Issue 或 Pull Request。 75 | 76 | --- 77 | 78 | *最后更新: 2025-09-02* -------------------------------------------------------------------------------- /wiki/核心概念/README.md: 
-------------------------------------------------------------------------------- 1 | # 核心概念 2 | 3 | Midscene Python 的核心概念文档,深入解析框架的关键组件和设计理念。 4 | 5 | ## 📖 目录概览 6 | 7 | 本章节包含以下核心概念文档: 8 | 9 | ### [Agent 核心控制器](Agent核心控制器.md) 10 | Agent 是 Midscene Python 的核心控制器,提供统一的自动化操作接口。了解 Agent 的工作原理、生命周期管理和高级配置。 11 | 12 | **主要内容**: 13 | - Agent 架构设计 14 | - 操作类型和方法 15 | - 选项配置和自定义 16 | - 生命周期管理 17 | 18 | ### [Insight UI理解引擎](Insight-UI理解引擎.md) 19 | Insight 是 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。深入理解 AI 如何理解和操作界面。 20 | 21 | **主要内容**: 22 | - UI 理解机制 23 | - 智能元素定位 24 | - 操作策略生成 25 | - 上下文分析 26 | 27 | ### [AI模型服务抽象层](AI模型服务抽象层.md) 28 | 统一的 AI 模型服务接口,支持多种 AI 提供商。了解如何配置和切换不同的 AI 模型。 29 | 30 | **主要内容**: 31 | - 服务抽象设计 32 | - 提供商适配 33 | - 模型选择策略 34 | - 性能优化 35 | 36 | ### [UI上下文与数据模型](UI上下文与数据模型.md) 37 | 理解 Midscene Python 中的数据流、上下文管理和类型系统。 38 | 39 | **主要内容**: 40 | - 数据模型定义 41 | - 上下文传递机制 42 | - 类型安全保证 43 | - 序列化和反序列化 44 | 45 | ## 🏗️ 整体架构关系 46 | 47 | ```mermaid 48 | graph TB 49 | A[用户代码] --> B[Agent 核心控制器] 50 | B --> C[Insight UI理解引擎] 51 | C --> D[AI模型服务抽象层] 52 | C --> E[UI上下文与数据模型] 53 | E --> F[平台适配层] 54 | F --> G[底层驱动] 55 | 56 | subgraph "核心概念" 57 | B 58 | C 59 | D 60 | E 61 | end 62 | 63 | subgraph "平台支持" 64 | F 65 | G 66 | end 67 | ``` 68 | 69 | ## 🔄 数据流向 70 | 71 | 1. **用户请求** → Agent 接收自然语言指令 72 | 2. **指令解析** → Insight 分析指令意图和页面状态 73 | 3. **AI 推理** → AIModelService 调用 AI 模型进行决策 74 | 4. **上下文构建** → UIContext 封装页面信息和操作结果 75 | 5. **操作执行** → 通过平台适配层执行具体操作 76 | 6. **结果反馈** → 返回执行结果和状态信息 77 | 78 | ## 🎯 设计原则 79 | 80 | ### 1. 抽象化原则 81 | - 隐藏复杂的底层实现细节 82 | - 提供统一的高级接口 83 | - 支持多平台一致性操作 84 | 85 | ### 2. 可扩展原则 86 | - 模块化设计支持功能扩展 87 | - 插件化架构支持第三方集成 88 | - 开放的 API 设计 89 | 90 | ### 3. 智能化原则 91 | - AI 驱动的决策制定 92 | - 自适应的操作策略 93 | - 智能的错误处理和恢复 94 | 95 | ### 4. 类型安全原则 96 | - 完整的类型注解 97 | - 运行时类型验证 98 | - 强类型的数据模型 99 | 100 | ## 🧩 组件交互 101 | 102 | ### Agent ↔ Insight 103 | - Agent 委托 Insight 进行 AI 推理 104 | - Insight 返回操作计划和执行结果 105 | - 双向的状态同步和错误处理 106 | 107 | ### Insight ↔ AIModelService 108 | - Insight 构建 AI 模型请求 109 | - AIModelService 管理模型调用和响应 110 | - 支持多种模型的统一接口 111 | 112 | ### 所有组件 ↔ UIContext 113 | - 统一的数据模型和上下文管理 114 | - 类型安全的数据传递 115 | - 序列化和持久化支持 116 | 117 | ## 📚 学习路径 118 | 119 | ### 初学者路径 120 | 1. 开始阅读 [Agent 核心控制器](Agent核心控制器.md) 121 | 2. 理解 [UI上下文与数据模型](UI上下文与数据模型.md) 122 | 3. 深入 [Insight UI理解引擎](Insight-UI理解引擎.md) 123 | 4. 最后学习 [AI模型服务抽象层](AI模型服务抽象层.md) 124 | 125 | ### 高级开发者路径 126 | 1. 快速浏览所有核心概念 127 | 2. 重点关注架构设计和扩展机制 128 | 3. 深入研究 AI 模型集成和优化 129 | 4. 探索自定义扩展和插件开发 130 | 131 | ## 🔗 相关文档链接 132 | 133 | - **API 参考**: [Agent API](../API参考/Agent-API.md) | [Insight API](../API参考/Insight-API.md) 134 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md) 135 | - **配置指南**: [AI模型配置](../AI模型配置/配置方法.md) 136 | - **示例教程**: [基础示例](../示例和教程/基础示例.md) 137 | 138 | --- 139 | 140 | 选择你感兴趣的主题开始深入学习吧! 
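## 💡 附:数据流最小示例

下面用一段示意代码串联上文"数据流向"描述的过程(用户请求 → Agent → Insight/AI 模型 → 操作执行 → 结果反馈)。代码基于仓库中已有的 `Agent`、`SeleniumWebPage` 接口与环境变量配置方式,站点地址与指令内容仅为占位示例,具体参数请以 API 参考文档为准。

```python
import asyncio

from midscene import Agent
from midscene.web import SeleniumWebPage


async def demo() -> None:
    # AI 模型通过环境变量配置:MIDSCENE_AI_PROVIDER / MIDSCENE_AI_MODEL / MIDSCENE_AI_API_KEY
    with SeleniumWebPage.create(headless=True) as page:  # 平台适配层(Web / Selenium)
        agent = Agent(page)  # Agent 核心控制器,统一操作入口

        await page.navigate_to("https://example.com")  # 占位站点,仅作演示

        # Agent 委托 Insight 进行推理,Insight 经 AIModelService 调用具体 AI 模型
        await agent.ai_action("点击登录按钮")

        # 提取结果以结构化数据(UIContext/数据模型)形式返回
        title = await agent.ai_extract({"title": "页面标题"})
        print(title)

        # 断言由 AI 基于当前页面状态判断
        await agent.ai_assert("页面加载完成")


if __name__ == "__main__":
    asyncio.run(demo())
```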
-------------------------------------------------------------------------------- /wiki/生成状态.md: -------------------------------------------------------------------------------- 1 | # Midscene Python Wiki 生成状态 2 | 3 | ## ✅ 已完成的文档 4 | 5 | ### 核心文档 6 | - [x] **README.md** - Wiki 主页和导航 7 | - [x] **项目概述.md** - 项目介绍、特性和设计理念 8 | - [x] **快速开始.md** - 5分钟上手指南 9 | - [x] **安装配置.md** - 详细的安装和配置说明 10 | 11 | ### 核心概念 (4/4) 12 | - [x] **README.md** - 核心概念章节导航 13 | - [x] **Agent核心控制器.md** - Agent 类的完整说明 14 | - [x] **Insight-UI理解引擎.md** - AI 驱动的 UI 理解引擎 15 | - [x] **AI模型服务抽象层.md** - 多 AI 提供商统一接口 16 | - [x] **UI上下文与数据模型.md** - 数据类型和上下文管理 17 | 18 | ### 平台集成 (1/4) 19 | - [x] **README.md** - 平台集成总览 20 | 21 | ## 📋 待生成的文档结构 22 | 23 | 以下是基于项目分析确定的完整 wiki 结构: 24 | 25 | ``` 26 | wiki/ 27 | ├── README.md ✅ 28 | ├── 项目概述.md ✅ 29 | ├── 快速开始.md ✅ 30 | ├── 安装配置.md ✅ 31 | ├── 核心概念/ ✅ 32 | │ ├── README.md ✅ 33 | │ ├── Agent核心控制器.md ✅ 34 | │ ├── Insight-UI理解引擎.md ✅ 35 | │ ├── AI模型服务抽象层.md ✅ 36 | │ └── UI上下文与数据模型.md ✅ 37 | ├── API参考/ 38 | │ ├── Agent-API.md 39 | │ ├── Insight-API.md 40 | │ └── AIModelService-API.md 41 | ├── 平台集成/ (部分完成) 42 | │ ├── README.md ✅ 43 | │ ├── Web自动化/ 44 | │ │ ├── README.md 45 | │ │ ├── Selenium集成.md 46 | │ │ ├── Playwright集成.md 47 | │ │ └── Web桥接机制.md 48 | │ └── Android自动化.md 49 | ├── AI模型配置/ 50 | │ ├── 配置方法.md 51 | │ ├── 支持的AI提供商/ 52 | │ │ ├── README.md 53 | │ │ ├── OpenAI提供商.md 54 | │ │ ├── Anthropic提供商.md 55 | │ │ ├── 通义千问提供商.md 56 | │ │ └── Gemini提供商.md 57 | │ └── 高级选项/ 58 | │ ├── README.md 59 | │ ├── 缓存策略.md 60 | │ ├── 请求重试与超时控制.md 61 | │ ├── 配额管理与节流控制.md 62 | │ ├── 流式响应处理.md 63 | │ └── 性能调优技巧.md 64 | ├── 高级特性/ 65 | │ ├── 智能缓存机制.md 66 | │ ├── 可视化报告系统.md 67 | │ └── CLI工具高级用法.md 68 | ├── 开发指南/ 69 | │ ├── 贡献指南.md 70 | │ ├── 架构设计.md 71 | │ ├── 开发环境配置.md 72 | │ └── 测试指南.md 73 | ├── 故障排除/ 74 | │ ├── 常见问题.md 75 | │ ├── 调试技巧.md 76 | │ └── 错误代码参考.md 77 | └── 示例和教程/ 78 | ├── 基础示例.md 79 | ├── 高级应用场景.md 80 | └── 最佳实践.md 81 | ``` 82 | 83 | ## 📊 生成进度 84 | 85 | - **总文档数**: 约 35-40 个 86 | - **已完成**: 9 个文档 87 | - **完成率**: ~25% 88 | - **核心文档覆盖率**: 100% (最重要的概念文档已完成) 89 | 90 | ## 🎯 已完成文档的特色 91 | 92 | ### 1. 完整性和深度 93 | - 每个核心概念都有详细的解释和示例 94 | - 包含架构图和代码示例 95 | - 涵盖最佳实践和常见问题 96 | 97 | ### 2. 结构化组织 98 | - 清晰的文档导航和交叉引用 99 | - 统一的文档格式和风格 100 | - 逐步深入的学习路径 101 | 102 | ### 3. 实用性 103 | - 大量可运行的代码示例 104 | - 实际使用场景和最佳实践 105 | - 详细的配置和选项说明 106 | 107 | ## 🔄 继续生成建议 108 | 109 | 如需继续生成剩余文档,建议按以下优先级: 110 | 111 | ### 优先级 1 (立即需要) 112 | - API参考文档 (Agent-API.md, Insight-API.md) 113 | - 平台集成详细文档 (Selenium集成.md, Android自动化.md) 114 | 115 | ### 优先级 2 (重要) 116 | - AI模型配置文档 117 | - 示例和教程文档 118 | 119 | ### 优先级 3 (补充) 120 | - 故障排除文档 121 | - 开发指南文档 122 | 123 | ## 💡 使用建议 124 | 125 | 当前已生成的文档已经覆盖了 Midscene Python 的核心概念和基础使用。用户可以通过以下路径开始学习: 126 | 127 | 1. **新手路径**: README.md → 项目概述.md → 快速开始.md 128 | 2. **开发者路径**: 安装配置.md → 核心概念/ → 平台集成/ 129 | 3. 
**深入理解**: 核心概念/ 所有文档 → AI模型配置/ 130 | 131 | ## 📝 文档质量 132 | 133 | 已生成的文档具备以下特点: 134 | - ✅ 完整的代码示例 135 | - ✅ 详细的配置说明 136 | - ✅ 架构图和流程图 137 | - ✅ 最佳实践指导 138 | - ✅ 错误处理建议 139 | - ✅ 性能优化技巧 140 | - ✅ 跨文档引用链接 141 | 142 | 这些文档为用户提供了全面理解和使用 Midscene Python 框架的基础。 -------------------------------------------------------------------------------- /midscene/android/agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Android Agent implementation 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from ..core.agent import Agent, AgentOptions 8 | from .device import AndroidDevice 9 | 10 | 11 | class AndroidAgent(Agent[AndroidDevice]): 12 | """Android-specific agent implementation""" 13 | 14 | def __init__(self, device: AndroidDevice, options: Optional[AgentOptions] = None): 15 | """Initialize Android agent 16 | 17 | Args: 18 | device: AndroidDevice instance 19 | options: Agent options 20 | """ 21 | super().__init__(device, options) 22 | 23 | # Validate that we have vision language model support for Android 24 | # Android requires VL models for UI understanding 25 | 26 | @classmethod 27 | async def create( 28 | cls, 29 | device_id: Optional[str] = None, 30 | options: Optional[AgentOptions] = None 31 | ) -> 'AndroidAgent': 32 | """Create Android agent with device 33 | 34 | Args: 35 | device_id: Android device ID, if None uses first available 36 | options: Agent options 37 | 38 | Returns: 39 | AndroidAgent instance 40 | """ 41 | device = await AndroidDevice.create(device_id) 42 | return cls(device, options) 43 | 44 | async def launch_app(self, package_name: str, activity: Optional[str] = None) -> None: 45 | """Launch Android app 46 | 47 | Args: 48 | package_name: App package name 49 | activity: Optional activity name 50 | """ 51 | await self.interface.launch_app(package_name, activity) 52 | 53 | async def stop_app(self, package_name: str) -> None: 54 | """Stop Android app 55 | 56 | Args: 57 | package_name: App package name 58 | """ 59 | await self.interface.stop_app(package_name) 60 | 61 | async def install_app(self, apk_path: str) -> None: 62 | """Install Android app 63 | 64 | Args: 65 | apk_path: Path to APK file 66 | """ 67 | await self.interface.install_app(apk_path) 68 | 69 | async def back(self) -> None: 70 | """Press back button""" 71 | await self.interface.back() 72 | 73 | async def home(self) -> None: 74 | """Press home button""" 75 | await self.interface.home() 76 | 77 | async def recent(self) -> None: 78 | """Press recent apps button""" 79 | await self.interface.recent() 80 | 81 | async def swipe( 82 | self, 83 | start_x: float, start_y: float, 84 | end_x: float, end_y: float, 85 | duration: int = 300 86 | ) -> None: 87 | """Swipe gesture 88 | 89 | Args: 90 | start_x: Start X coordinate 91 | start_y: Start Y coordinate 92 | end_x: End X coordinate 93 | end_y: End Y coordinate 94 | duration: Swipe duration in milliseconds 95 | """ 96 | await self.interface.swipe(start_x, start_y, end_x, end_y, duration) 97 | 98 | async def long_press(self, x: float, y: float, duration: int = 1000) -> None: 99 | """Long press gesture 100 | 101 | Args: 102 | x: X coordinate 103 | y: Y coordinate 104 | duration: Press duration in milliseconds 105 | """ 106 | await self.interface.long_press(x, y, duration) -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic usage examples for Midscene Python 3 | """ 4 | 5 | import 
asyncio 6 | from midscene import Agent 7 | from midscene.web import SeleniumWebPage 8 | from midscene.android import AndroidAgent 9 | 10 | 11 | async def web_automation_example(): 12 | """Basic web automation example""" 13 | print("🌐 Web Automation Example") 14 | 15 | # Create web page instance 16 | with SeleniumWebPage.create(headless=False) as page: 17 | # Create agent 18 | agent = Agent(page) 19 | 20 | # Navigate to website 21 | await page.navigate_to("https://example.com") 22 | 23 | # Use AI to interact with the page 24 | await agent.ai_action("点击登录按钮") 25 | await agent.ai_action("在用户名输入框输入 'demo@example.com'") 26 | await agent.ai_action("在密码输入框输入 'password123'") 27 | await agent.ai_action("点击提交按钮") 28 | 29 | # Extract data using AI 30 | user_info = await agent.ai_extract({ 31 | "username": "用户名", 32 | "email": "邮箱地址", 33 | "last_login": "最后登录时间" 34 | }) 35 | print(f"提取的用户信息: {user_info}") 36 | 37 | # Assert page state 38 | await agent.ai_assert("页面显示欢迎信息") 39 | 40 | print("✅ Web automation completed successfully!") 41 | 42 | 43 | async def android_automation_example(): 44 | """Basic Android automation example""" 45 | print("📱 Android Automation Example") 46 | 47 | try: 48 | # Create Android agent 49 | agent = await AndroidAgent.create() 50 | 51 | # Launch app 52 | await agent.launch_app("com.android.settings") 53 | 54 | # Use AI to navigate 55 | await agent.ai_action("点击WLAN设置") 56 | await agent.ai_action("滑动到底部") 57 | 58 | # Extract information 59 | wifi_list = await agent.ai_extract({ 60 | "available_networks": [ 61 | {"name": "网络名称", "security": "安全类型", "signal": "信号强度"} 62 | ] 63 | }) 64 | print(f"可用WiFi网络: {wifi_list}") 65 | 66 | # Go back 67 | await agent.back() 68 | 69 | print("✅ Android automation completed successfully!") 70 | 71 | except Exception as e: 72 | print(f"❌ Android automation failed: {e}") 73 | 74 | 75 | async def playwright_example(): 76 | """Playwright integration example""" 77 | print("🎭 Playwright Example") 78 | 79 | from midscene.web import PlaywrightWebPage 80 | 81 | # Create Playwright page 82 | async with await PlaywrightWebPage.create(headless=False) as page: 83 | agent = Agent(page) 84 | 85 | # Navigate and interact 86 | await page.navigate_to("https://playwright.dev") 87 | 88 | # Use AI for navigation 89 | await agent.ai_action("点击文档链接") 90 | await agent.ai_action("搜索 'getting started'") 91 | 92 | # Extract page information 93 | page_info = await agent.ai_extract({ 94 | "title": "页面标题", 95 | "description": "页面描述", 96 | "sections": ["主要章节列表"] 97 | }) 98 | print(f"页面信息: {page_info}") 99 | 100 | print("✅ Playwright example completed!") 101 | 102 | 103 | async def main(): 104 | """Run all examples""" 105 | print("🚀 Midscene Python Examples\n") 106 | 107 | # Web automation with Selenium 108 | await web_automation_example() 109 | print() 110 | 111 | # Playwright example 112 | await playwright_example() 113 | print() 114 | 115 | # Android automation (if device available) 116 | await android_automation_example() 117 | 118 | 119 | if __name__ == "__main__": 120 | asyncio.run(main()) -------------------------------------------------------------------------------- /midscene/cli/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI configuration management 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional, Dict, Any 7 | 8 | import yaml 9 | from pydantic import BaseModel, Field 10 | 11 | 12 | class WebConfig(BaseModel): 13 | """Web automation configuration""" 14 | browser: str = "chrome" 15 
| headless: bool = False 16 | window_size: tuple[int, int] = (1920, 1080) 17 | user_data_dir: Optional[str] = None 18 | timeout: int = 30 19 | 20 | 21 | class AndroidConfig(BaseModel): 22 | """Android automation configuration""" 23 | device_id: Optional[str] = None 24 | adb_path: str = "adb" 25 | auto_dismiss_keyboard: bool = True 26 | timeout: int = 30 27 | 28 | 29 | class AIConfig(BaseModel): 30 | """AI model configuration""" 31 | provider: str = "openai" 32 | model: str = "gpt-4-vision-preview" 33 | api_key: Optional[str] = None 34 | base_url: Optional[str] = None 35 | max_tokens: int = 4000 36 | temperature: float = 0.1 37 | 38 | 39 | class ExecutionConfig(BaseModel): 40 | """Execution configuration""" 41 | concurrent: int = 1 42 | continue_on_error: bool = False 43 | generate_report: bool = True 44 | report_format: str = "html" 45 | output_dir: str = "./reports" 46 | 47 | 48 | class CLIConfig(BaseModel): 49 | """CLI configuration""" 50 | web: WebConfig = Field(default_factory=WebConfig) 51 | android: AndroidConfig = Field(default_factory=AndroidConfig) 52 | ai: AIConfig = Field(default_factory=AIConfig) 53 | execution: ExecutionConfig = Field(default_factory=ExecutionConfig) 54 | 55 | @classmethod 56 | def load(cls, config_path: Optional[str] = None) -> 'CLIConfig': 57 | """Load configuration from file 58 | 59 | Args: 60 | config_path: Path to configuration file 61 | 62 | Returns: 63 | CLIConfig instance 64 | """ 65 | if not config_path: 66 | # Look for default config files 67 | for default_path in ["midscene.yml", "midscene.yaml", ".midscene.yml"]: 68 | if Path(default_path).exists(): 69 | config_path = default_path 70 | break 71 | 72 | if not config_path or not Path(config_path).exists(): 73 | # Return default configuration 74 | return cls() 75 | 76 | with open(config_path, 'r', encoding='utf-8') as f: 77 | config_data = yaml.safe_load(f) 78 | 79 | return cls(**config_data) 80 | 81 | def save(self, config_path: str) -> None: 82 | """Save configuration to file 83 | 84 | Args: 85 | config_path: Path to save configuration 86 | """ 87 | config_data = self.model_dump() 88 | 89 | with open(config_path, 'w', encoding='utf-8') as f: 90 | yaml.dump(config_data, f, default_flow_style=False, allow_unicode=True) 91 | 92 | def to_env_vars(self) -> Dict[str, str]: 93 | """Convert configuration to environment variables 94 | 95 | Returns: 96 | Dictionary of environment variables 97 | """ 98 | env_vars = {} 99 | 100 | # AI configuration 101 | if self.ai.api_key: 102 | env_vars['MIDSCENE_AI_API_KEY'] = self.ai.api_key 103 | env_vars['MIDSCENE_AI_PROVIDER'] = self.ai.provider 104 | env_vars['MIDSCENE_AI_MODEL'] = self.ai.model 105 | if self.ai.base_url: 106 | env_vars['MIDSCENE_AI_BASE_URL'] = self.ai.base_url 107 | 108 | # Execution configuration 109 | env_vars['MIDSCENE_CONCURRENT'] = str(self.execution.concurrent) 110 | env_vars['MIDSCENE_CONTINUE_ON_ERROR'] = str(self.execution.continue_on_error).lower() 111 | env_vars['MIDSCENE_GENERATE_REPORT'] = str(self.execution.generate_report).lower() 112 | 113 | return env_vars -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "midscene-python" 7 | description = "基于AI的Web和Android自动化框架,支持自然语言驱动的UI操作" 8 | readme = "README.md" 9 | license = "MIT" 10 | authors = [ 11 | { name = "Midscene Team" }, 12 | ] 13 | keywords = 
["automation", "AI", "web", "android", "testing", "ui", "selenium", "playwright"] 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Topic :: Software Development :: Testing", 24 | "Topic :: Software Development :: Libraries :: Python Modules", 25 | ] 26 | requires-python = ">=3.9" 27 | dependencies = [ 28 | "pydantic>=2.0,<3.0", 29 | "selenium>=4.15.0,<5.0", 30 | "playwright>=1.40.0,<2.0", 31 | "opencv-python>=4.8.0,<5.0", 32 | "pillow>=10.0.0,<11.0", 33 | "numpy>=1.24.0,<2.0", 34 | "aiohttp>=3.9.0,<4.0", 35 | "loguru>=0.7.0,<1.0", 36 | "typer>=0.9.0,<1.0", 37 | "jinja2>=3.1.0,<4.0", 38 | "pyyaml>=6.0,<7.0", 39 | "httpx>=0.25.0,<1.0", 40 | "asyncio-mqtt", 41 | "pure-python-adb>=0.3.0dev0", 42 | "openai>=1.3.0,<2.0", 43 | "anthropic>=0.7.0,<1.0", 44 | "google-generativeai", 45 | "dashscope", 46 | ] 47 | version = "0.1.1" 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "pytest>=7.4.0", 52 | "pytest-asyncio>=0.21.0", 53 | "pytest-cov>=4.1.0", 54 | "black>=23.0.0", 55 | "isort>=5.12.0", 56 | "mypy>=1.5.0", 57 | "pre-commit>=3.4.0", 58 | "ruff>=0.1.0", 59 | ] 60 | docs = [ 61 | "mkdocs>=1.5.0", 62 | "mkdocs-material>=9.4.0", 63 | "mkdocstrings[python]>=0.23.0", 64 | ] 65 | 66 | [project.urls] 67 | Homepage = "https://github.com/Python51888/midscene-python.git" 68 | Repository = "https://github.com/Python51888/midscene-python.git" 69 | Documentation = "https://github.com/Python51888/Midscene-Python/blob/master/README.md" 70 | "Bug Tracker" = "https://github.com/Python51888/midscene-python.git/issues" 71 | 72 | [project.scripts] 73 | midscene = "midscene.cli:main" 74 | 75 | [tool.hatch.build.targets.wheel] 76 | packages = ["midscene"] 77 | 78 | 79 | 80 | [tool.black] 81 | line-length = 88 82 | target-version = ['py39'] 83 | include = '\.pyi?$' 84 | exclude = ''' 85 | /( 86 | \.eggs 87 | | \.git 88 | | \.hg 89 | | \.mypy_cache 90 | | \.tox 91 | | \.venv 92 | | _build 93 | | buck-out 94 | | build 95 | | dist 96 | )/ 97 | ''' 98 | 99 | [tool.isort] 100 | profile = "black" 101 | line_length = 88 102 | multi_line_output = 3 103 | include_trailing_comma = true 104 | force_grid_wrap = 0 105 | use_parentheses = true 106 | ensure_newline_before_comments = true 107 | 108 | [tool.mypy] 109 | python_version = "3.9" 110 | warn_return_any = true 111 | warn_unused_configs = true 112 | disallow_untyped_defs = true 113 | disallow_incomplete_defs = true 114 | check_untyped_defs = true 115 | disallow_untyped_decorators = true 116 | no_implicit_optional = true 117 | warn_redundant_casts = true 118 | warn_unused_ignores = true 119 | warn_no_return = true 120 | warn_unreachable = true 121 | strict_equality = true 122 | 123 | [tool.ruff] 124 | target-veersion = "py39" 125 | line-length = 88 126 | select = [ 127 | "E", # pycodestyle errors 128 | "W", # pycodestyle warnings 129 | "F", # pyflakes 130 | "I", # isort 131 | "B", # flake8-bugbear 132 | "C4", # flake8-comprehensions 133 | "UP", # pyupgrade 134 | ] 135 | ignore = [ 136 | "E501", # line too long, handled by black 137 | "B008", # do not perform function calls in argument defaults 138 | "C901", # too complex 139 | ] 140 | 141 | [tool.ruff.per-file-ignores] 142 | "__init__.py" = ["F401"] 143 | 144 | [tool.pytest.ini_options] 
145 | testpaths = ["tests"] 146 | python_files = ["test_*.py", "*_test.py"] 147 | python_classes = ["Test*"] 148 | python_functions = ["test_*"] 149 | addopts = [ 150 | "-v", 151 | "--strict-markers", 152 | "--strict-config", 153 | "--cov=midscene", 154 | "--cov-report=term-missing", 155 | "--cov-report=html", 156 | ] 157 | markers = [ 158 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 159 | "integration: marks tests as integration tests", 160 | "unit: marks tests as unit tests", 161 | ] -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 | # Midscene Python [![zread](https://img.shields.io/badge/Ask_Zread-_.svg?style=flat&color=00b0aa&labelColor=000000&logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTQuOTYxNTYgMS42MDAxSDIuMjQxNTZDMS44ODgxIDEuNjAwMSAxLjYwMTU2IDEuODg2NjQgMS42MDE1NiAyLjI0MDFWNC45NjAxQzEuNjAxNTYgNS4zMTM1NiAxLjg4ODEgNS42MDAxIDIuMjQxNTYgNS42MDAxSDQuOTYxNTZDNS4zMTUwMiA1LjYwMDEgNS42MDE1NiA1LjMxMzU2IDUuNjAxNTYgNC45NjAxVjIuMjQwMUM1LjYwMTU2IDEuODg2NjQgNS4zMTUwMiAxLjYwMDEgNC45NjE1NiAxLjYwMDFaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00Ljk2MTU2IDEwLjM5OTlIMi4yNDE1NkMxLjg4ODEgMTAuMzk5OSAxLjYwMTU2IDEwLjY4NjQgMS42MDE1NiAxMS4wMzk5VjEzLjc1OTlDMS42MDE1NiAxNC4xMTM0IDEuODg4MSAxNC4zOTk5IDIuMjQxNTYgMTQuMzk5OUg0Ljk2MTU2QzUuMzE1MDIgMTQuMzk5OSA1LjYwMTU2IDE0LjExMzQgNS42MDE1NiAxMy43NTk5VjExLjAzOTlDNS42MDE1NiAxMC42ODY0IDUuMzE1MDIgMTAuMzk5OSA0Ljk2MTU2IDEwLjM5OTlaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik0xMy43NTg0IDEuNjAwMUgxMS4wMzg0QzEwLjY4NSAxLjYwMDEgMTAuMzk4NCAxLjg4NjY0IDEwLjM5ODQgMi4yNDAxVjQuOTYwMUMxMC4zOTg0IDUuMzEzNTYgMTAuNjg1IDUuNjAwMSAxMS4wMzg0IDUuNjAwMUgxMy43NTg0QzE0LjExMTkgNS42MDAxIDE0LjM5ODQgNS4zMTM1NiAxNC4zOTg0IDQuOTYwMVYyLjI0MDFDMTQuMzk4NCAxLjg4NjY0IDE0LjExMTkgMS42MDAxIDEzLjc1ODQgMS42MDAxWiIgZmlsbD0iI2ZmZiIvPgo8cGF0aCBkPSJNNCAxMkwxMiA0TDQgMTJaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00IDEyTDEyIDQiIHN0cm9rZT0iI2ZmZiIgc3Ryb2tlLXdpZHRoPSIxLjUiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIvPgo8L3N2Zz4K&logoColor=ffffff)](https://zread.ai/Python51888/Midscene-Python) 2 | 3 | [English](README.md) | [中文](README.zh.md)  4 | 5 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。 6 | 7 | ## 概述 8 | 9 | Midscene Python 提供全面的 UI 自动化能力,具有以下核心特性: 10 | 11 | - **自然语言驱动**:使用自然语言描述自动化任务 12 | - **多平台支持**:支持 Web(Selenium/Playwright)和 Android(ADB) 13 | - **AI 模型集成**:支持 GPT-4V、Qwen2.5-VL、Gemini 等多种视觉语言模型 14 | - **可视化调试**:提供详细的执行报告和调试信息 15 | - **缓存机制**:智能缓存提升执行效率 16 | 17 | ## 项目架构 18 | 19 | ``` 20 | midscene-python/ 21 | ├── midscene/ # 核心框架 22 | │ ├── core/ # 核心框架 23 | │ │ ├── agent/ # Agent系统 24 | │ │ ├── insight/ # AI推理引擎 25 | │ │ ├── ai_model/ # AI模型集成 26 | │ │ ├── yaml/ # YAML脚本执行器 27 | │ │ └── types.py # 核心类型定义 28 | │ ├── web/ # Web集成 29 | │ │ ├── selenium/ # Selenium集成 30 | │ │ ├── playwright/ # Playwright集成 31 | │ │ └── bridge/ # Bridge模式 32 | │ ├── android/ # Android集成 33 | │ │ ├── device.py # 设备管理 34 | │ │ └── agent.py # Android Agent 35 | │ ├── cli/ # 命令行工具 36 | │ ├── mcp/ # MCP协议支持 37 | │ ├── shared/ # 共享工具 38 | │ └── visualizer/ # 可视化报告 39 | ├── examples/ # 示例代码 40 | ├── tests/ # 测试用例 41 | └── docs/ # 文档 42 | ``` 43 | 44 | ## 技术栈 45 | 46 | - **Python 3.9+**:核心运行环境 47 | - **Pydantic**:数据验证和序列化 48 | - **Selenium/Playwright**:Web 自动化 49 | - **OpenCV/Pillow**:图像处理 50 | - **HTTPX/AIOHTTP**:HTTP 客户端 51 | - **Typer**:CLI 框架 52 | - **Loguru**:日志记录 53 | 
54 | ## 快速开始 55 | 56 | ### 安装 57 | 58 | ```bash 59 | pip install midscene-python 60 | ``` 61 | 62 | ### 基础用法 63 | 64 | ```python 65 | from midscene import Agent 66 | from midscene.web import SeleniumWebPage 67 | 68 | # 创建 Web Agent 69 | with SeleniumWebPage.create() as page: 70 | agent = Agent(page) 71 | 72 | # 使用自然语言进行自动化操作 73 | await agent.ai_action("点击登录按钮") 74 | await agent.ai_action("输入用户名 'test@example.com'") 75 | await agent.ai_action("输入密码 'password123'") 76 | await agent.ai_action("点击提交按钮") 77 | 78 | # 数据提取 79 | user_info = await agent.ai_extract("提取用户个人信息") 80 | 81 | # 断言验证 82 | await agent.ai_assert("页面显示欢迎信息") 83 | ``` 84 | 85 | ## 主要特性 86 | 87 | ### 🤖 AI 驱动的自动化 88 | 89 | 使用自然语言描述操作,AI 自动理解并执行: 90 | 91 | ```python 92 | await agent.ai_action("在搜索框中输入'Python教程'并搜索") 93 | ``` 94 | 95 | ### 🔍 智能元素定位 96 | 97 | 支持多种定位策略,自动选择最优方案: 98 | 99 | ```python 100 | element = await agent.ai_locate("登录按钮") 101 | ``` 102 | 103 | ### 📊 数据提取 104 | 105 | 从页面提取结构化数据: 106 | 107 | ```python 108 | products = await agent.ai_extract({ 109 | "products": [ 110 | {"name": "产品名称", "price": "价格", "rating": "评分"} 111 | ] 112 | }) 113 | ``` 114 | 115 | ### ✅ 智能断言 116 | 117 | AI 理解页面状态,进行智能断言: 118 | 119 | ```python 120 | await agent.ai_assert("用户已成功登录") 121 | ``` 122 | 123 | ### 📝 致谢 124 | 125 | 感谢Midscene项目:https://github.com/web-infra-dev/midscene 提供的灵感和技术参考 126 | 127 | ## 许可证 128 | 129 | MIT License 130 | -------------------------------------------------------------------------------- /wiki/项目概述.md: -------------------------------------------------------------------------------- 1 | # 项目概述 2 | 3 | ## 什么是 Midscene Python? 4 | 5 | Midscene Python 是一个革命性的基于 AI 的自动化框架,专为 Web 和 Android 平台的 UI 自动化操作而设计。它的核心理念是**让自动化变得像说话一样简单**。 6 | 7 | ## 🎯 设计理念 8 | 9 | ### 自然语言驱动 10 | 传统的自动化工具需要开发者学习复杂的 API 和选择器语法。Midscene Python 打破了这一限制,让你可以用自然语言描述想要执行的操作: 11 | 12 | ```python 13 | # 传统方式 14 | driver.find_element(By.XPATH, "//button[@class='login-btn' and contains(text(), '登录')]").click() 15 | 16 | # Midscene Python 方式 17 | await agent.ai_action("点击登录按钮") 18 | ``` 19 | 20 | ### AI 驱动的智能决策 21 | Midscene Python 集成了先进的视觉语言模型(VLM),能够: 22 | - 理解页面结构和元素关系 23 | - 智能选择最佳的操作策略 24 | - 适应页面变化和布局差异 25 | - 提供人性化的错误提示 26 | 27 | ### 多平台统一接口 28 | 无论是 Web 应用还是 Android 应用,Midscene Python 都提供了一致的编程接口: 29 | 30 | ```python 31 | # Web 自动化 32 | web_agent = Agent(selenium_page) 33 | await web_agent.ai_action("在搜索框输入'Python教程'") 34 | 35 | # Android 自动化 36 | android_agent = Agent(android_device) 37 | await android_agent.ai_action("在搜索框输入'Python教程'") 38 | ``` 39 | 40 | ## 🌟 核心特性 41 | 42 | ### 1. 自然语言操作 43 | - **直观表达**: 用日常语言描述操作意图 44 | - **智能理解**: AI 自动理解复杂的操作逻辑 45 | - **上下文感知**: 结合页面状态做出最佳决策 46 | 47 | ### 2. 智能元素定位 48 | - **多策略融合**: 自动选择最优的定位方法 49 | - **容错能力**: 适应页面变化和元素移动 50 | - **语义理解**: 基于元素功能而非位置进行定位 51 | 52 | ### 3. 结构化数据提取 53 | ```python 54 | # 提取商品信息 55 | products = await agent.ai_extract({ 56 | "products": [ 57 | { 58 | "name": "商品名称", 59 | "price": "价格", 60 | "rating": "评分", 61 | "availability": "库存状态" 62 | } 63 | ] 64 | }) 65 | ``` 66 | 67 | ### 4. 智能断言验证 68 | ```python 69 | # 验证页面状态 70 | await agent.ai_assert("用户已成功登录并显示欢迎消息") 71 | await agent.ai_assert("购物车中有3件商品") 72 | ``` 73 | 74 | ### 5. 
可视化调试 75 | - **执行截图**: 每步操作都有详细的视觉记录 76 | - **决策过程**: 展示 AI 的思考和决策过程 77 | - **错误定位**: 准确指出失败原因和位置 78 | 79 | ## 🏗️ 架构概览 80 | 81 | Midscene Python 采用分层架构设计: 82 | 83 | ``` 84 | ┌─────────────────────────────────────────┐ 85 | │ 用户应用层 │ 86 | ├─────────────────────────────────────────┤ 87 | │ Agent 控制层 │ ← 统一的操作接口 88 | ├─────────────────────────────────────────┤ 89 | │ Insight AI 引擎 │ ← AI 理解和决策 90 | ├─────────────────────────────────────────┤ 91 | │ 平台适配层 │ ← Web/Android 桥接 92 | ├─────────────────────────────────────────┤ 93 | │ 底层驱动层 │ ← Selenium/Playwright/ADB 94 | └─────────────────────────────────────────┘ 95 | ``` 96 | 97 | ### 核心组件 98 | 99 | - **Agent**: 用户操作的统一入口,提供高级 AI 驱动的 API 100 | - **Insight**: AI 理解引擎,负责页面分析和操作决策 101 | - **AIModelService**: AI 模型服务抽象层,支持多种 AI 提供商 102 | - **Platform Bridges**: 平台桥接层,统一不同平台的操作接口 103 | 104 | ## 🎮 使用场景 105 | 106 | ### 测试自动化 107 | ```python 108 | # E2E 测试 109 | await agent.ai_action("登录用户账号") 110 | await agent.ai_action("添加商品到购物车") 111 | await agent.ai_action("进入结算页面") 112 | await agent.ai_assert("显示正确的订单金额") 113 | ``` 114 | 115 | ### 数据爬取 116 | ```python 117 | # 智能数据提取 118 | news_data = await agent.ai_extract({ 119 | "articles": [ 120 | { 121 | "title": "标题", 122 | "author": "作者", 123 | "publish_date": "发布日期", 124 | "content_summary": "内容摘要" 125 | } 126 | ] 127 | }) 128 | ``` 129 | 130 | ### 业务流程自动化 131 | ```python 132 | # RPA 自动化 133 | await agent.ai_action("打开财务报表") 134 | await agent.ai_action("筛选本月数据") 135 | monthly_report = await agent.ai_extract("提取月度财务汇总数据") 136 | await agent.ai_action("生成并下载报告") 137 | ``` 138 | 139 | ### 应用监控 140 | ```python 141 | # 健康检查 142 | await agent.ai_assert("首页加载正常") 143 | await agent.ai_assert("用户登录功能正常") 144 | await agent.ai_assert("搜索功能返回结果") 145 | ``` 146 | 147 | ## 🆚 与传统工具的对比 148 | 149 | | 特性 | 传统自动化工具 | Midscene Python | 150 | |------|---------------|-----------------| 151 | | **学习曲线** | 陡峭,需要学习复杂 API | 平缓,自然语言驱动 | 152 | | **代码可读性** | 晦涩难懂 | 直观易懂 | 153 | | **维护成本** | 高,页面变化需要大量修改 | 低,AI 自动适应变化 | 154 | | **元素定位** | 手动编写选择器 | AI 智能定位 | 155 | | **错误处理** | 需要手动处理各种异常 | AI 自动重试和恢复 | 156 | | **跨平台** | 需要学习不同工具 | 统一接口 | 157 | 158 | ## 🛣️ 发展路线 159 | 160 | ### 当前版本 (v0.1.0) 161 | - ✅ 基础 Agent 和 Insight 功能 162 | - ✅ Web 平台支持 (Selenium/Playwright) 163 | - ✅ Android 平台支持 164 | - ✅ 多种 AI 模型集成 165 | - ✅ 基础缓存和报告功能 166 | 167 | ### 未来规划 168 | - 🔄 桌面应用自动化支持 169 | - 🔄 更多 AI 模型集成 170 | - 🔄 可视化测试编辑器 171 | - 🔄 云端执行服务 172 | - 🔄 团队协作功能 173 | 174 | ## 📈 性能特点 175 | 176 | - **执行效率**: 智能缓存机制减少重复的 AI 调用 177 | - **准确性**: 多重验证确保操作的可靠性 178 | - **稳定性**: 自动重试和错误恢复机制 179 | - **扩展性**: 模块化设计支持自定义扩展 180 | 181 | ## 🤝 社区与生态 182 | 183 | Midscene Python 是一个开源项目,欢迎社区贡献: 184 | 185 | - **GitHub**: [Python51888/midscene-python](https://github.com/Python51888/midscene-python.git) 186 | - **文档**: [Python51888/midscene-python](https://github.com/Python51888/Midscene-Python/blob/master/README.md) 187 | - **讨论**: GitHub Discussions 188 | - **问题反馈**: GitHub Issues 189 | 190 | --- 191 | 192 | 准备好开始你的 AI 自动化之旅了吗?查看 [快速开始](快速开始.md) 指南! 
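## 🧪 附:在 pytest 中运行的示意用例

上文"测试自动化"场景也可以直接写成 pytest 用例(项目开发依赖已包含 pytest 与 pytest-asyncio,pyproject 中也预定义了 integration 标记)。以下为最小示意,站点地址、账号与断言文案均为占位示例:

```python
import pytest

from midscene import Agent
from midscene.web import SeleniumWebPage


@pytest.mark.asyncio
@pytest.mark.integration
async def test_login_flow() -> None:
    # 占位站点与步骤,仅演示 ai_action / ai_assert 在测试中的用法
    with SeleniumWebPage.create(headless=True) as page:
        agent = Agent(page)

        await page.navigate_to("https://example.com/login")
        await agent.ai_action("输入用户名 'demo@example.com'")
        await agent.ai_action("输入密码 'password123'")
        await agent.ai_action("点击登录按钮")

        await agent.ai_assert("页面显示欢迎信息")
```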
-------------------------------------------------------------------------------- /midscene/core/ai_model/service.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI Model Service - Unified interface for different AI providers 3 | """ 4 | 5 | import json 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict, List, Optional, Type, Union 8 | 9 | import httpx 10 | from loguru import logger 11 | from pydantic import BaseModel 12 | 13 | from ..types import AIUsageInfo 14 | 15 | 16 | class AIModelConfig(BaseModel): 17 | """AI model configuration""" 18 | provider: str # openai, anthropic, qwen, gemini 19 | model: str 20 | api_key: str 21 | base_url: Optional[str] = None 22 | max_tokens: int = 4000 23 | temperature: float = 0.1 24 | timeout: int = 60 25 | 26 | 27 | class AIProvider(ABC): 28 | """Abstract base class for AI service providers""" 29 | 30 | @abstractmethod 31 | async def call( 32 | self, 33 | messages: List[Dict[str, Any]], 34 | config: AIModelConfig, 35 | response_schema: Optional[Type[BaseModel]] = None, 36 | **kwargs 37 | ) -> Dict[str, Any]: 38 | """Call AI service""" 39 | pass 40 | 41 | 42 | class AIModelService: 43 | """Unified AI model service interface""" 44 | 45 | def __init__(self): 46 | self.providers: Dict[str, AIProvider] = {} 47 | self._register_providers() 48 | 49 | def _register_providers(self): 50 | """Register available AI providers""" 51 | from .providers import ( 52 | OpenAIProvider, 53 | AnthropicProvider, 54 | QwenProvider, 55 | GeminiProvider 56 | ) 57 | 58 | self.providers['openai'] = OpenAIProvider() 59 | self.providers['anthropic'] = AnthropicProvider() 60 | self.providers['qwen'] = QwenProvider() 61 | self.providers['gemini'] = GeminiProvider() 62 | 63 | async def call_ai( 64 | self, 65 | messages: List[Dict[str, Any]], 66 | response_schema: Optional[Type[BaseModel]] = None, 67 | model_config: Optional[AIModelConfig] = None, 68 | **kwargs 69 | ) -> Dict[str, Any]: 70 | """Call AI model with unified interface""" 71 | config = model_config or self._get_default_config() 72 | provider = self.providers.get(config.provider) 73 | 74 | if not provider: 75 | raise ValueError(f"Unsupported provider: {config.provider}") 76 | 77 | try: 78 | logger.debug(f"Calling AI provider: {config.provider}") 79 | result = await provider.call( 80 | messages=messages, 81 | config=config, 82 | response_schema=response_schema, 83 | **kwargs 84 | ) 85 | return result 86 | except Exception as e: 87 | logger.error(f"AI call failed: {e}") 88 | raise 89 | 90 | def _get_default_config(self) -> AIModelConfig: 91 | """Get default configuration""" 92 | import os 93 | 94 | # Try to get from environment variables 95 | provider = os.getenv('MIDSCENE_AI_PROVIDER', 'openai') 96 | model = os.getenv('MIDSCENE_AI_MODEL', 'gpt-4-vision-preview') 97 | api_key = os.getenv('MIDSCENE_AI_API_KEY', '') 98 | base_url = os.getenv('MIDSCENE_AI_BASE_URL') 99 | 100 | if not api_key: 101 | raise ValueError( 102 | "AI API key not configured. Set MIDSCENE_AI_API_KEY environment variable." 
103 | ) 104 | 105 | return AIModelConfig( 106 | provider=provider, 107 | model=model, 108 | api_key=api_key, 109 | base_url=base_url 110 | ) 111 | 112 | 113 | def parse_json_response(content: str) -> Dict[str, Any]: 114 | """Parse JSON response from AI model""" 115 | try: 116 | # Try to parse as JSON directly 117 | return json.loads(content) 118 | except json.JSONDecodeError: 119 | # Try to extract JSON from code blocks 120 | import re 121 | json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL) 122 | if json_match: 123 | try: 124 | return json.loads(json_match.group(1)) 125 | except json.JSONDecodeError: 126 | pass 127 | 128 | # Try to find JSON-like content 129 | json_match = re.search(r'\{.*\}', content, re.DOTALL) 130 | if json_match: 131 | try: 132 | return json.loads(json_match.group(0)) 133 | except json.JSONDecodeError: 134 | pass 135 | 136 | raise ValueError(f"Failed to parse JSON from response: {content}") 137 | 138 | 139 | def create_usage_info(usage_data: Dict[str, Any]) -> AIUsageInfo: 140 | """Create AIUsageInfo from provider response""" 141 | return AIUsageInfo( 142 | prompt_tokens=usage_data.get('prompt_tokens', 0), 143 | completion_tokens=usage_data.get('completion_tokens', 0), 144 | total_tokens=usage_data.get('total_tokens', 0), 145 | cost=usage_data.get('cost') 146 | ) -------------------------------------------------------------------------------- /scripts/validate_requirements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | 依赖验证脚本 4 | 验证生成的requirements.txt文件是否包含所有必要依赖 5 | """ 6 | 7 | import subprocess 8 | import sys 9 | import tempfile 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def run_command(cmd, check=True, capture_output=True): 15 | """运行命令并返回结果""" 16 | try: 17 | result = subprocess.run( 18 | cmd, 19 | shell=True, 20 | check=check, 21 | capture_output=capture_output, 22 | text=True 23 | ) 24 | return result 25 | except subprocess.CalledProcessError as e: 26 | print(f"命令执行失败: {cmd}") 27 | print(f"错误输出: {e.stderr}") 28 | sys.exit(1) 29 | 30 | 31 | def create_test_environment(): 32 | """创建临时测试环境""" 33 | print("=== 创建临时测试环境 ===") 34 | 35 | # 创建临时目录 36 | temp_dir = tempfile.mkdtemp(prefix="midscene_test_") 37 | print(f"临时目录: {temp_dir}") 38 | 39 | # 创建虚拟环境 40 | venv_path = os.path.join(temp_dir, "test_env") 41 | print("创建虚拟环境...") 42 | run_command(f"python -m venv {venv_path}") 43 | 44 | # 获取虚拟环境的Python路径 45 | if sys.platform == "win32": 46 | python_path = os.path.join(venv_path, "Scripts", "python.exe") 47 | pip_path = os.path.join(venv_path, "Scripts", "pip.exe") 48 | else: 49 | python_path = os.path.join(venv_path, "bin", "python") 50 | pip_path = os.path.join(venv_path, "bin", "pip") 51 | 52 | return temp_dir, python_path, pip_path 53 | 54 | 55 | def install_requirements(pip_path, requirements_file): 56 | """在测试环境中安装依赖""" 57 | print("=== 安装依赖包 ===") 58 | print(f"使用requirements文件: {requirements_file}") 59 | 60 | # 升级pip 61 | run_command(f'"{pip_path}" install --upgrade pip') 62 | 63 | # 安装依赖 64 | run_command(f'"{pip_path}" install -r "{requirements_file}"') 65 | print("依赖安装完成") 66 | 67 | 68 | def validate_imports(python_path): 69 | """验证核心包导入""" 70 | print("=== 验证包导入 ===") 71 | 72 | test_imports = [ 73 | "import midscene", 74 | "import pydantic", 75 | "import selenium", 76 | "import playwright", 77 | "import pytest", 78 | "import black", 79 | "import mkdocs", 80 | "import numpy", 81 | "import cv2", 82 | "import PIL", 83 | "import loguru", 84 | "import 
typer", 85 | "import httpx", 86 | "import aiohttp", 87 | "import openai", 88 | "import anthropic", 89 | ] 90 | 91 | for import_stmt in test_imports: 92 | try: 93 | print(f"测试: {import_stmt}") 94 | run_command(f'"{python_path}" -c "{import_stmt}"') 95 | print(f"✓ {import_stmt} - 成功") 96 | except: 97 | print(f"✗ {import_stmt} - 失败") 98 | return False 99 | 100 | return True 101 | 102 | 103 | def validate_cli_tools(python_path): 104 | """验证CLI工具可用性""" 105 | print("=== 验证CLI工具 ===") 106 | 107 | cli_tests = [ 108 | (f'"{python_path}" -m pytest --version', "pytest"), 109 | (f'"{python_path}" -m black --version', "black"), 110 | (f'"{python_path}" -m mkdocs --version', "mkdocs"), 111 | ] 112 | 113 | for cmd, tool_name in cli_tests: 114 | try: 115 | print(f"测试: {tool_name}") 116 | result = run_command(cmd) 117 | print(f"✓ {tool_name} - 可用") 118 | except: 119 | print(f"✗ {tool_name} - 不可用") 120 | return False 121 | 122 | return True 123 | 124 | 125 | def cleanup(temp_dir): 126 | """清理临时文件""" 127 | print("=== 清理临时文件 ===") 128 | try: 129 | import shutil 130 | shutil.rmtree(temp_dir) 131 | print(f"已删除临时目录: {temp_dir}") 132 | except Exception as e: 133 | print(f"清理失败: {e}") 134 | 135 | 136 | def main(): 137 | """主函数""" 138 | print("=== Midscene Python 依赖验证 ===\n") 139 | 140 | # 检查requirements.txt是否存在 141 | requirements_file = Path("requirements.txt") 142 | if not requirements_file.exists(): 143 | print("错误: requirements.txt 文件不存在") 144 | print("请先运行: make requirements-freeze") 145 | sys.exit(1) 146 | 147 | temp_dir = None 148 | try: 149 | # 创建测试环境 150 | temp_dir, python_path, pip_path = create_test_environment() 151 | 152 | # 安装依赖 153 | install_requirements(pip_path, requirements_file) 154 | 155 | # 验证导入 156 | if not validate_imports(python_path): 157 | print("\n❌ 包导入验证失败") 158 | sys.exit(1) 159 | 160 | # 验证CLI工具 161 | if not validate_cli_tools(python_path): 162 | print("\n❌ CLI工具验证失败") 163 | sys.exit(1) 164 | 165 | print("\n✅ 所有依赖验证通过!") 166 | print("requirements.txt 文件完整且可用") 167 | 168 | except KeyboardInterrupt: 169 | print("\n用户中断验证过程") 170 | sys.exit(1) 171 | except Exception as e: 172 | print(f"\n验证过程中出现错误: {e}") 173 | sys.exit(1) 174 | finally: 175 | if temp_dir: 176 | cleanup(temp_dir) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Midscene Python 
[![zread](https://img.shields.io/badge/Ask_Zread-_.svg?style=flat&color=00b0aa&labelColor=000000&logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTQuOTYxNTYgMS42MDAxSDIuMjQxNTZDMS44ODgxIDEuNjAwMSAxLjYwMTU2IDEuODg2NjQgMS42MDE1NiAyLjI0MDFWNC45NjAxQzEuNjAxNTYgNS4zMTM1NiAxLjg4ODEgNS42MDAxIDIuMjQxNTYgNS42MDAxSDQuOTYxNTZDNS4zMTUwMiA1LjYwMDEgNS42MDE1NiA1LjMxMzU2IDUuNjAxNTYgNC45NjAxVjIuMjQwMUM1LjYwMTU2IDEuODg2NjQgNS4zMTUwMiAxLjYwMDEgNC45NjE1NiAxLjYwMDFaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00Ljk2MTU2IDEwLjM5OTlIMi4yNDE1NkMxLjg4ODEgMTAuMzk5OSAxLjYwMTU2IDEwLjY4NjQgMS42MDE1NiAxMS4wMzk5VjEzLjc1OTlDMS42MDE1NiAxNC4xMTM0IDEuODg4MSAxNC4zOTk5IDIuMjQxNTYgMTQuMzk5OUg0Ljk2MTU2QzUuMzE1MDIgMTQuMzk5OSA1LjYwMTU2IDE0LjExMzQgNS42MDE1NiAxMy43NTk5VjExLjAzOTlDNS42MDE1NiAxMC42ODY0IDUuMzE1MDIgMTAuMzk5OSA0Ljk2MTU2IDEwLjM5OTlaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik0xMy43NTg0IDEuNjAwMUgxMS4wMzg0QzEwLjY4NSAxLjYwMDEgMTAuMzk4NCAxLjg4NjY0IDEwLjM5ODQgMi4yNDAxVjQuOTYwMUMxMC4zOTg0IDUuMzEzNTYgMTAuNjg1IDUuNjAwMSAxMS4wMzg0IDUuNjAwMUgxMy43NTg0QzE0LjExMTkgNS42MDAxIDE0LjM5ODQgNS4zMTM1NiAxNC4zOTg0IDQuOTYwMVYyLjI0MDFDMTQuMzk4NCAxLjg4NjY0IDE0LjExMTkgMS42MDAxIDEzLjc1ODQgMS42MDAxWiIgZmlsbD0iI2ZmZiIvPgo8cGF0aCBkPSJNNCAxMkwxMiA0TDQgMTJaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00IDEyTDEyIDQiIHN0cm9rZT0iI2ZmZiIgc3Ryb2tlLXdpZHRoPSIxLjUiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIvPgo8L3N2Zz4K&logoColor=ffffff)](https://zread.ai/Python51888/Midscene-Python)               2 | [English](README.md) | [简体中文](README.zh.md) 3 | 4 | Midscene Python is an AI-based automation framework that supports UI automation operations on Web and Android platforms.    5 | 6 | ## Overview 7 | 8 | Midscene Python provides comprehensive UI automation capabilities with the following core features: 9 | 10 | - **Natural Language Driven**: Describe automation tasks using natural language 11 | - **Multi-platform Support**: Supports Web (Selenium/Playwright) and Android (ADB) 12 | - **AI Model Integration**: Supports multiple vision-language models such as GPT-4V, Qwen2.5-VL, and Gemini  13 | - **Visual Debugging**: Provides detailed execution reports and debugging information 14 | - **Caching Mechanism**: Intelligent caching to improve execution efficiency 15 | 16 | ## Project Architecture 17 | 18 | ``` 19 | midscene-python/ 20 | ├── midscene/ # Core framework 21 | │ ├── core/ # Core framework 22 | │ │ ├── agent/ # Agent system 23 | │ │ ├── insight/ # AI inference engine 24 | │ │ ├── ai_model/ # AI model integration 25 | │ │ ├── yaml/ # YAML script executor 26 | │ │ └── types.py # Core type definitions 27 | │ ├── web/ # Web integration 28 | │ │ ├── selenium/ # Selenium integration 29 | │ │ ├── playwright/ # Playwright integration 30 | │ │ └── bridge/ # Bridge mode 31 | │ ├── android/ # Android integration 32 | │ │ ├── device.py # Device management 33 | │ │ └── agent.py # Android Agent 34 | │ ├── cli/ # Command line tools 35 | │ ├── mcp/ # MCP protocol support 36 | │ ├── shared/ # Shared utilities 37 | │ └── visualizer/ # Visual reports 38 | ├── examples/ # Example code 39 | ├── tests/ # Test cases 40 | └── docs/ # Documentation 41 | ``` 42 | 43 | ## Tech Stack 44 | 45 | - **Python 3.9+**: Core runtime environment 46 | - **Pydantic**: Data validation and serialization 47 | - **Selenium/Playwright**: Web automation 48 | - **OpenCV/Pillow**: Image processing 49 | - **HTTPX/AIOHTTP**: HTTP client 50 | - **Typer**: CLI framework 51 | - **Loguru**: Logging 52 | 53 | ## 
Quick Start 54 | 55 | ### Installation 56 | 57 | ```bash 58 | pip install midscene-python 59 | ``` 60 | 61 | ### Basic Usage 62 | 63 | ```python 64 | from midscene import Agent 65 | from midscene.web import SeleniumWebPage 66 | 67 | # Create a Web Agent 68 | with SeleniumWebPage.create() as page: 69 | agent = Agent(page) 70 | 71 | # Perform automation operations using natural language 72 | await agent.ai_action("Click the login button") 73 | await agent.ai_action("Enter username 'test@example.com'") 74 | await agent.ai_action("Enter password 'password123'") 75 | await agent.ai_action("Click the submit button") 76 | 77 | # Data extraction 78 | user_info = await agent.ai_extract("Extract user personal information") 79 | 80 | # Assertion verification 81 | await agent.ai_assert("Page displays welcome message") 82 | ``` 83 | 84 | ## Key Features 85 | 86 | ### 🤖 AI-Driven Automation 87 | 88 | Describe operations using natural language, and AI automatically understands and executes: 89 | 90 | ```python 91 | await agent.ai_action("Enter 'Python tutorial' in the search box and search") 92 | ``` 93 | 94 | ### 🔍 Intelligent Element Location 95 | 96 | Supports multiple location strategies and automatically selects the optimal solution: 97 | 98 | ```python 99 | element = await agent.ai_locate("Login button") 100 | ``` 101 | 102 | ### 📊 Data Extraction 103 | 104 | Extract structured data from the page: 105 | 106 | ```python 107 | products = await agent.ai_extract({ 108 | "products": [ 109 | {"name": "Product Name", "price": "Price", "rating": "Rating"} 110 | ] 111 | }) 112 | ``` 113 | 114 | ### ✅ Intelligent Assertions 115 | 116 | AI understands page state and performs intelligent assertions: 117 | 118 | ```python 119 | await agent.ai_assert("User has successfully logged in") 120 | ``` 121 | 122 | ### 📝 Credits 123 | 124 | Thanks to Midscene Project: https://github.com/web-infra-dev/midscene for inspiration and technical references 125 | 126 | ## License 127 | 128 | MIT License 129 | -------------------------------------------------------------------------------- /midscene/core/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core types and interfaces for Midscene Python 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass, field 7 | from enum import Enum 8 | from typing import Any, Dict, List, Optional, Union, Callable, Awaitable, Generic, TypeVar 9 | from pydantic import BaseModel 10 | 11 | # Type variables 12 | ElementType = TypeVar('ElementType', bound='BaseElement') 13 | T = TypeVar('T') 14 | 15 | 16 | class InterfaceType(str, Enum): 17 | """Interface type enumeration""" 18 | WEB = "web" 19 | ANDROID = "android" 20 | 21 | 22 | class NodeType(str, Enum): 23 | """UI Node type enumeration""" 24 | CONTAINER = "container" 25 | TEXT = "text" 26 | INPUT = "input" 27 | BUTTON = "button" 28 | IMAGE = "image" 29 | LINK = "link" 30 | OTHER = "other" 31 | 32 | 33 | @dataclass 34 | class Point: 35 | """2D Point representation""" 36 | x: float 37 | y: float 38 | 39 | 40 | @dataclass 41 | class Size: 42 | """Size representation""" 43 | width: float 44 | height: float 45 | 46 | 47 | @dataclass 48 | class Rect: 49 | """Rectangle representation""" 50 | left: float 51 | top: float 52 | width: float 53 | height: float 54 | 55 | @property 56 | def right(self) -> float: 57 | return self.left + self.width 58 | 59 | @property 60 | def bottom(self) -> float: 61 | return self.top + self.height 62 | 63 | @property 64 | def center(self) 
-> Point: 65 | return Point( 66 | x=self.left + self.width / 2, 67 | y=self.top + self.height / 2 68 | ) 69 | 70 | 71 | class BaseElement(BaseModel): 72 | """Base UI element interface""" 73 | id: str 74 | content: str 75 | rect: Rect 76 | center: tuple[float, float] 77 | node_type: NodeType = NodeType.OTHER 78 | attributes: Dict[str, Any] = field(default_factory=dict) 79 | is_visible: bool = True 80 | xpaths: Optional[List[str]] = None 81 | 82 | async def tap(self) -> None: 83 | """Tap/click this element""" 84 | raise NotImplementedError 85 | 86 | async def input_text(self, text: str) -> None: 87 | """Input text to this element""" 88 | raise NotImplementedError 89 | 90 | 91 | class UINode(BaseModel): 92 | """UI tree node representation""" 93 | id: str 94 | content: str 95 | rect: Rect 96 | center: tuple[float, float] 97 | node_type: NodeType 98 | attributes: Dict[str, Any] = field(default_factory=dict) 99 | is_visible: bool = True 100 | children: List['UINode'] = field(default_factory=list) 101 | 102 | 103 | class UITree(BaseModel): 104 | """UI tree representation""" 105 | node: UINode 106 | children: List['UITree'] = field(default_factory=list) 107 | 108 | 109 | class UIContext(BaseModel, Generic[ElementType]): 110 | """UI context containing screenshot and element information""" 111 | screenshot_base64: str 112 | size: Size 113 | content: List[ElementType] 114 | tree: UITree 115 | 116 | 117 | class AIUsageInfo(BaseModel): 118 | """AI usage information""" 119 | prompt_tokens: int = 0 120 | completion_tokens: int = 0 121 | total_tokens: int = 0 122 | cost: Optional[float] = None 123 | 124 | 125 | class LocateResult(BaseModel): 126 | """Element locate result""" 127 | element: Optional[BaseElement] = None 128 | rect: Optional[Rect] = None 129 | 130 | 131 | class ExecutionResult(BaseModel, Generic[T]): 132 | """Generic execution result""" 133 | success: bool = True 134 | data: Optional[Any] = None 135 | error: Optional[str] = None 136 | usage: Optional[AIUsageInfo] = None 137 | 138 | 139 | class AssertResult(BaseModel): 140 | """Assertion result""" 141 | passed: bool 142 | thought: str = "" 143 | message: str = "" 144 | 145 | 146 | # Type aliases 147 | TUserPrompt = Union[str, Dict[str, Any]] 148 | ElementById = Callable[[str], Optional[BaseElement]] 149 | OnTaskStartTip = Callable[[str], Union[None, Awaitable[None]]] 150 | 151 | 152 | # Abstract interface for device/platform implementations 153 | class AbstractInterface(ABC): 154 | """Abstract interface for platform implementations""" 155 | 156 | @property 157 | @abstractmethod 158 | def interface_type(self) -> InterfaceType: 159 | """Get interface type""" 160 | pass 161 | 162 | @abstractmethod 163 | async def get_context(self) -> UIContext: 164 | """Get current UI context""" 165 | pass 166 | 167 | @abstractmethod 168 | async def action_space(self) -> List[str]: 169 | """Get available actions""" 170 | pass 171 | 172 | @abstractmethod 173 | async def tap(self, x: float, y: float) -> None: 174 | """Tap at coordinates""" 175 | pass 176 | 177 | @abstractmethod 178 | async def input_text(self, text: str) -> None: 179 | """Input text""" 180 | pass 181 | 182 | @abstractmethod 183 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 184 | """Scroll in direction""" 185 | pass 186 | 187 | 188 | class InsightAction(str, Enum): 189 | """Insight action types""" 190 | LOCATE = "locate" 191 | EXTRACT = "extract" 192 | ASSERT = "assert" 193 | 194 | 195 | @dataclass 196 | class AgentOptions: 197 | """Agent configuration options""" 
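    # Field notes (based on docs/quickstart.md and the wiki guides): cache_id
    # enables the task cache for repeated runs, while generate_report and
    # report_file_name control the generated HTML execution report.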
198 | test_id: Optional[str] = None 199 | cache_id: Optional[str] = None 200 | group_name: str = "Midscene Report" 201 | group_description: str = "" 202 | generate_report: bool = True 203 | auto_print_report_msg: bool = True 204 | ai_action_context: Optional[str] = None 205 | report_file_name: Optional[str] = None 206 | model_config: Optional[Callable] = None 207 | 208 | 209 | @dataclass 210 | class LocateOption: 211 | """Locate operation options""" 212 | prompt: Optional[TUserPrompt] = None 213 | deep_think: bool = False 214 | cacheable: bool = True 215 | xpath: Optional[str] = None 216 | ui_context: Optional[UIContext] = None 217 | 218 | 219 | @dataclass 220 | class ExtractOption: 221 | """Extract operation options""" 222 | dom_included: Union[bool, str] = False # False, True, or 'visible-only' 223 | screenshot_included: bool = True 224 | return_thought: bool = False 225 | is_wait_for_assert: bool = False 226 | do_not_throw_error: bool = False 227 | 228 | 229 | class ScrollParam(BaseModel): 230 | """Scroll parameters""" 231 | direction: str # 'down', 'up', 'left', 'right' 232 | scroll_type: str # 'once', 'untilBottom', 'untilTop', 'untilLeft', 'untilRight' 233 | distance: Optional[int] = None # distance in pixels -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # 快速开始 - Midscene Python 2 | 3 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。 4 | 5 | ## 安装 6 | 7 | ```bash 8 | pip install midscene-python 9 | ``` 10 | 11 | ## 基本配置 12 | 13 | ### 1. 配置 AI 模型 14 | 15 | 设置环境变量: 16 | 17 | ```bash 18 | export MIDSCENE_AI_PROVIDER=openai 19 | export MIDSCENE_AI_MODEL=gpt-4-vision-preview 20 | export MIDSCENE_AI_API_KEY=your-api-key-here 21 | ``` 22 | 23 | 或创建配置文件 `midscene.yml`: 24 | 25 | ```yaml 26 | ai: 27 | provider: "openai" 28 | model: "gpt-4-vision-preview" 29 | api_key: "your-api-key-here" 30 | ``` 31 | 32 | ### 2. 
支持的 AI 提供商 33 | 34 | - **OpenAI**: GPT-4V, GPT-4o 35 | - **Anthropic**: Claude 3.5 Sonnet 36 | - **阿里云**: Qwen2.5-VL 37 | - **Google**: Gemini Pro Vision 38 | 39 | ## Web 自动化 40 | 41 | ### Selenium 示例 42 | 43 | ```python 44 | import asyncio 45 | from midscene import Agent 46 | from midscene.web import SeleniumWebPage 47 | 48 | async def web_automation(): 49 | # 创建浏览器实例 50 | with SeleniumWebPage.create(headless=False) as page: 51 | agent = Agent(page) 52 | 53 | # 导航到网站 54 | await page.navigate_to("https://example.com") 55 | 56 | # 使用自然语言进行操作 57 | await agent.ai_action("点击登录按钮") 58 | await agent.ai_action("在用户名框输入 'demo@example.com'") 59 | await agent.ai_action("在密码框输入 'password123'") 60 | await agent.ai_action("点击提交按钮") 61 | 62 | # 数据提取 63 | user_info = await agent.ai_extract({ 64 | "username": "用户名", 65 | "email": "邮箱地址" 66 | }) 67 | print(f"用户信息: {user_info}") 68 | 69 | # 断言验证 70 | await agent.ai_assert("页面显示欢迎信息") 71 | 72 | # 运行示例 73 | asyncio.run(web_automation()) 74 | ``` 75 | 76 | ### Playwright 示例 77 | 78 | ```python 79 | import asyncio 80 | from midscene import Agent 81 | from midscene.web import PlaywrightWebPage 82 | 83 | async def playwright_automation(): 84 | # 创建 Playwright 页面 85 | async with await PlaywrightWebPage.create() as page: 86 | agent = Agent(page) 87 | 88 | await page.navigate_to("https://playwright.dev") 89 | await agent.ai_action("点击文档链接") 90 | 91 | # 提取页面信息 92 | page_info = await agent.ai_extract({ 93 | "title": "页面标题", 94 | "sections": ["主要章节列表"] 95 | }) 96 | print(f"页面信息: {page_info}") 97 | 98 | asyncio.run(playwright_automation()) 99 | ``` 100 | 101 | ## Android 自动化 102 | 103 | ```python 104 | import asyncio 105 | from midscene.android import AndroidAgent 106 | 107 | async def android_automation(): 108 | # 创建 Android Agent(自动检测设备) 109 | agent = await AndroidAgent.create() 110 | 111 | # 启动应用 112 | await agent.launch_app("com.android.settings") 113 | 114 | # 使用自然语言导航 115 | await agent.ai_action("点击WLAN设置") 116 | await agent.ai_action("滑动到底部") 117 | 118 | # 提取信息 119 | wifi_list = await agent.ai_extract({ 120 | "networks": [ 121 | {"name": "网络名称", "security": "安全类型"} 122 | ] 123 | }) 124 | print(f"WiFi网络: {wifi_list}") 125 | 126 | # 返回 127 | await agent.back() 128 | 129 | asyncio.run(android_automation()) 130 | ``` 131 | 132 | ## 命令行工具 133 | 134 | ### 运行 YAML 脚本 135 | 136 | ```bash 137 | # 运行单个脚本 138 | midscene run script.yaml 139 | 140 | # 运行目录中的所有脚本 141 | midscene run scripts/ 142 | 143 | # 使用配置文件 144 | midscene run script.yaml --config midscene.yml 145 | 146 | # 并发执行 147 | midscene run scripts/ --concurrent 3 148 | 149 | # Android 设备指定 150 | midscene run android_script.yaml --device device_id 151 | ``` 152 | 153 | ### 列出 Android 设备 154 | 155 | ```bash 156 | midscene devices 157 | ``` 158 | 159 | ### 初始化项目 160 | 161 | ```bash 162 | midscene init my-project 163 | cd my-project 164 | ``` 165 | 166 | ## YAML 脚本格式 167 | 168 | 创建 `example.yaml`: 169 | 170 | ```yaml 171 | # Web 自动化脚本 172 | web: 173 | url: "https://example.com" 174 | browser: "chrome" 175 | headless: false 176 | 177 | tasks: 178 | - name: "登录操作" 179 | steps: 180 | - action: "ai_action" 181 | prompt: "点击登录按钮" 182 | 183 | - action: "ai_action" 184 | prompt: "输入用户名 'demo@example.com'" 185 | 186 | - action: "ai_action" 187 | prompt: "输入密码 'password123'" 188 | 189 | - action: "ai_action" 190 | prompt: "点击提交按钮" 191 | 192 | - name: "数据提取" 193 | steps: 194 | - action: "ai_extract" 195 | prompt: 196 | username: "用户名" 197 | email: "邮箱地址" 198 | save_to: "user_info" 199 | 200 | - name: "状态验证" 201 | steps: 202 | - 
action: "ai_assert" 203 | prompt: "页面显示欢迎信息" 204 | ``` 205 | 206 | ## 核心概念 207 | 208 | ### Agent 系统 209 | 210 | Agent 是自动化操作的核心控制器,协调 AI 模型与设备交互: 211 | 212 | ```python 213 | from midscene import Agent 214 | from midscene.web import SeleniumWebPage 215 | 216 | page = SeleniumWebPage.create() 217 | agent = Agent(page) 218 | ``` 219 | 220 | ### AI 操作类型 221 | 222 | 1. **ai_action**: 执行自然语言描述的操作 223 | 2. **ai_locate**: 定位 UI 元素 224 | 3. **ai_extract**: 提取结构化数据 225 | 4. **ai_assert**: 验证页面状态 226 | 227 | ### 缓存机制 228 | 229 | 启用缓存可以提升重复执行的效率: 230 | 231 | ```python 232 | from midscene.core import AgentOptions 233 | 234 | options = AgentOptions( 235 | cache_id="my_automation", 236 | generate_report=True 237 | ) 238 | agent = Agent(page, options) 239 | ``` 240 | 241 | ## 最佳实践 242 | 243 | ### 1. 错误处理 244 | 245 | ```python 246 | try: 247 | await agent.ai_action("点击不存在的按钮") 248 | except Exception as e: 249 | print(f"操作失败: {e}") 250 | ``` 251 | 252 | ### 2. 等待条件 253 | 254 | ```python 255 | # 等待元素出现 256 | await agent.ai_wait_for("登录成功页面出现", timeout_ms=10000) 257 | ``` 258 | 259 | ### 3. 数据验证 260 | 261 | ```python 262 | # 使用断言验证数据 263 | user_data = await agent.ai_extract({"username": "用户名"}) 264 | assert user_data["username"], "用户名不能为空" 265 | ``` 266 | 267 | ### 4. 截图和报告 268 | 269 | ```python 270 | # 生成执行报告 271 | options = AgentOptions( 272 | generate_report=True, 273 | report_file_name="automation_report" 274 | ) 275 | ``` 276 | 277 | ## 故障排除 278 | 279 | ### 常见问题 280 | 281 | 1. **AI API 密钥未设置** 282 | ``` 283 | ValueError: AI API key not configured 284 | ``` 285 | 解决:设置 `MIDSCENE_AI_API_KEY` 环境变量 286 | 287 | 2. **Chrome 浏览器未找到** 288 | ``` 289 | WebDriverException: chrome not found 290 | ``` 291 | 解决:安装 Chrome 浏览器或指定 Chrome 路径 292 | 293 | 3. **Android 设备连接失败** 294 | ``` 295 | RuntimeError: No Android devices found 296 | ``` 297 | 解决:确保设备已连接并启用 USB 调试 298 | 299 | ### 调试技巧 300 | 301 | 1. **启用详细日志** 302 | ```python 303 | from midscene.shared import setup_logger 304 | setup_logger(level="DEBUG") 305 | ``` 306 | 307 | 2. **查看生成的报告** 308 | 执行完成后检查 `./reports/` 目录中的 HTML 报告 309 | 310 | 3. **使用非无头模式** 311 | 设置 `headless=False` 观察浏览器操作过程 312 | 313 | ## 下一步 314 | 315 | - 查看 [API 文档](api.md) 了解详细接口 316 | - 浏览 [示例集合](examples/) 学习更多用法 317 | - 阅读 [配置指南](configuration.md) 了解高级配置 -------------------------------------------------------------------------------- /wiki/快速开始.md: -------------------------------------------------------------------------------- 1 | # 快速开始 2 | 3 | 欢迎使用 Midscene Python!本指南将帮助你在 5 分钟内上手 AI 驱动的自动化操作。 4 | 5 | ## 📋 前置要求 6 | 7 | 在开始之前,请确保你的环境满足以下要求: 8 | 9 | - **Python 3.9+** 10 | - **pip** 包管理器 11 | - **浏览器** (Chrome/Firefox/Edge,用于 Web 自动化) 12 | - **AI 模型 API Key** (OpenAI、Claude、Qwen 或 Gemini 任选其一) 13 | 14 | ## 🚀 快速安装 15 | 16 | ### 1. 安装 Midscene Python 17 | 18 | ```bash 19 | pip install midscene-python 20 | ``` 21 | 22 | ### 2. 安装浏览器驱动(可选) 23 | 24 | 如果你计划进行 Web 自动化,需要安装对应的浏览器驱动: 25 | 26 | ```bash 27 | # Selenium WebDriver 28 | pip install webdriver-manager 29 | 30 | # 或者 Playwright 31 | pip install playwright 32 | playwright install 33 | ``` 34 | 35 | ### 3. 
配置 AI 模型 36 | 37 | 创建 `.env` 文件配置 AI 模型(以 OpenAI 为例): 38 | 39 | ```bash 40 | # .env 41 | OPENAI_API_KEY=your_openai_api_key_here 42 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选,默认官方 API 43 | ``` 44 | 45 | ## 🎯 第一个示例 46 | 47 | 让我们从一个简单的 Web 自动化示例开始: 48 | 49 | ### 示例 1: 搜索操作 50 | 51 | ```python 52 | import asyncio 53 | from midscene import Agent 54 | from midscene.web import SeleniumWebPage 55 | 56 | async def search_example(): 57 | """在百度搜索 Python 教程""" 58 | 59 | # 创建 Web 页面实例 60 | with SeleniumWebPage.create() as page: 61 | # 创建 Agent 62 | agent = Agent(page) 63 | 64 | # 导航到网站 65 | await page.goto("https://www.baidu.com") 66 | 67 | # 使用自然语言进行搜索 68 | await agent.ai_action("在搜索框输入'Python 教程'") 69 | await agent.ai_action("点击搜索按钮") 70 | 71 | # 验证搜索结果 72 | await agent.ai_assert("页面显示了 Python 教程的搜索结果") 73 | 74 | print("✅ 搜索操作完成!") 75 | 76 | # 运行示例 77 | asyncio.run(search_example()) 78 | ``` 79 | 80 | ### 示例 2: 数据提取 81 | 82 | ```python 83 | import asyncio 84 | from midscene import Agent 85 | from midscene.web import SeleniumWebPage 86 | 87 | async def extract_example(): 88 | """提取新闻标题""" 89 | 90 | with SeleniumWebPage.create() as page: 91 | agent = Agent(page) 92 | 93 | # 访问新闻网站 94 | await page.goto("https://news.example.com") 95 | 96 | # 提取结构化数据 97 | news_data = await agent.ai_extract({ 98 | "articles": [ 99 | { 100 | "title": "新闻标题", 101 | "time": "发布时间", 102 | "summary": "新闻摘要" 103 | } 104 | ] 105 | }) 106 | 107 | # 输出结果 108 | for article in news_data["articles"]: 109 | print(f"📰 {article['title']}") 110 | print(f"⏰ {article['time']}") 111 | print(f"📄 {article['summary']}\n") 112 | 113 | # 运行示例 114 | asyncio.run(extract_example()) 115 | ``` 116 | 117 | ## 📱 Android 自动化示例 118 | 119 | ```python 120 | import asyncio 121 | from midscene import Agent 122 | from midscene.android import AndroidDevice 123 | 124 | async def android_example(): 125 | """Android 应用自动化""" 126 | 127 | # 连接 Android 设备 128 | device = AndroidDevice() 129 | await device.connect() 130 | 131 | # 创建 Agent 132 | agent = Agent(device) 133 | 134 | # 启动应用 135 | await device.start_app("com.example.app") 136 | 137 | # 自然语言操作 138 | await agent.ai_action("点击登录按钮") 139 | await agent.ai_action("输入用户名 'testuser'") 140 | await agent.ai_action("输入密码 'password123'") 141 | await agent.ai_action("点击确认登录") 142 | 143 | # 验证登录状态 144 | await agent.ai_assert("显示用户已登录") 145 | 146 | print("✅ Android 自动化完成!") 147 | 148 | # 运行示例 149 | asyncio.run(android_example()) 150 | ``` 151 | 152 | ## 🎛️ 配置选项 153 | 154 | ### AI 模型配置 155 | 156 | ```python 157 | from midscene.core.ai_model import AIModelConfig 158 | 159 | # 自定义 AI 配置 160 | config = AIModelConfig( 161 | provider="openai", # 或 "claude", "qwen", "gemini" 162 | model="gpt-4-vision-preview", 163 | temperature=0.1, 164 | max_tokens=1000 165 | ) 166 | 167 | agent = Agent(page, ai_config=config) 168 | ``` 169 | 170 | ### Agent 选项 171 | 172 | ```python 173 | from midscene.core import AgentOptions 174 | 175 | # 自定义 Agent 选项 176 | options = AgentOptions( 177 | timeout=30, # 操作超时时间(秒) 178 | retry_count=3, # 重试次数 179 | screenshot_on_error=True, # 错误时自动截图 180 | cache_enabled=True # 启用智能缓存 181 | ) 182 | 183 | agent = Agent(page, options=options) 184 | ``` 185 | 186 | ## 🔧 常用操作 187 | 188 | ### 基础交互 189 | 190 | ```python 191 | # 点击操作 192 | await agent.ai_action("点击提交按钮") 193 | await agent.ai_action("点击页面右上角的用户头像") 194 | 195 | # 输入操作 196 | await agent.ai_action("在用户名框输入 'admin'") 197 | await agent.ai_action("在密码框输入密码") 198 | 199 | # 滚动操作 200 | await agent.ai_action("向下滚动查看更多内容") 201 | await 
agent.ai_action("滚动到页面底部") 202 | 203 | # 等待操作 204 | await agent.ai_action("等待页面加载完成") 205 | ``` 206 | 207 | ### 元素定位 208 | 209 | ```python 210 | # 精确定位元素 211 | element = await agent.ai_locate("登录按钮") 212 | await element.click() 213 | 214 | # 定位多个元素 215 | elements = await agent.ai_locate_all("商品卡片") 216 | for element in elements: 217 | await element.hover() 218 | ``` 219 | 220 | ### 条件断言 221 | 222 | ```python 223 | # 页面状态验证 224 | await agent.ai_assert("用户已成功登录") 225 | await agent.ai_assert("购物车显示 3 件商品") 226 | await agent.ai_assert("页面不包含错误信息") 227 | 228 | # 元素存在性验证 229 | await agent.ai_assert("页面包含搜索结果") 230 | await agent.ai_assert("显示用户个人信息") 231 | ``` 232 | 233 | ## 📊 查看执行报告 234 | 235 | Midscene Python 自动生成详细的执行报告: 236 | 237 | ```python 238 | # 运行后,检查生成的报告文件 239 | # 报告位置: ./midscene_reports/ 240 | # - execution_report.html # 可视化报告 241 | # - screenshots/ # 执行截图 242 | # - logs/ # 详细日志 243 | ``` 244 | 245 | ## 🔍 调试技巧 246 | 247 | ### 启用详细日志 248 | 249 | ```python 250 | import logging 251 | from midscene.shared.logger import setup_logger 252 | 253 | # 启用调试日志 254 | setup_logger(level=logging.DEBUG) 255 | ``` 256 | 257 | ### 截图调试 258 | 259 | ```python 260 | # 手动截图 261 | screenshot = await page.screenshot() 262 | with open("debug.png", "wb") as f: 263 | f.write(screenshot) 264 | 265 | # 获取页面信息 266 | context = await page.get_context() 267 | print(f"页面标题: {context.page_title}") 268 | print(f"页面 URL: {context.url}") 269 | ``` 270 | 271 | ## 🚨 常见问题 272 | 273 | ### 1. AI 模型调用失败 274 | ```python 275 | # 检查 API Key 配置 276 | import os 277 | print(f"API Key: {os.getenv('OPENAI_API_KEY')[:10]}...") 278 | ``` 279 | 280 | ### 2. 元素定位失败 281 | ```python 282 | # 使用更具体的描述 283 | await agent.ai_action("点击页面左上角的蓝色登录按钮") 284 | ``` 285 | 286 | ### 3. 页面加载问题 287 | ```python 288 | # 添加等待时间 289 | await page.wait_for_page_load() 290 | await agent.ai_action("等待 3 秒让页面完全加载") 291 | ``` 292 | 293 | ## 🎓 下一步 294 | 295 | 恭喜!你已经掌握了 Midscene Python 的基础用法。接下来可以: 296 | 297 | 1. 📖 深入学习 [核心概念](核心概念/Agent核心控制器.md) 298 | 2. 🔧 查看 [API 参考](API参考/Agent-API.md) 299 | 3. 🌐 了解 [Web 自动化](平台集成/Web自动化/README.md) 高级特性 300 | 4. 📱 探索 [Android 自动化](平台集成/Android自动化.md) 301 | 5. 
🎯 参考 [最佳实践](示例和教程/最佳实践.md) 302 | 303 | ## 💡 小贴士 304 | 305 | - 使用具体、清晰的自然语言描述能获得更好的执行效果 306 | - 定期查看执行报告来优化自动化脚本 307 | - 善用缓存机制来提升执行效率 308 | - 为不同环境配置不同的 AI 模型 309 | 310 | --- 311 | 312 | *准备好探索更多功能了吗?查看我们的 [示例集合](示例和教程/基础示例.md)!* -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for Midscene Python core functionality 3 | """ 4 | 5 | import pytest 6 | import asyncio 7 | from unittest.mock import Mock, AsyncMock 8 | 9 | from midscene.core.types import UIContext, Size, Rect, BaseElement, NodeType 10 | from midscene.core.insight import Insight 11 | from midscene.core.agent import Agent 12 | 13 | 14 | class MockInterface: 15 | """Mock interface for testing""" 16 | 17 | def __init__(self): 18 | self.interface_type = "mock" 19 | self._context = None 20 | 21 | async def get_context(self): 22 | if self._context: 23 | return self._context 24 | 25 | # Return mock context 26 | return UIContext( 27 | screenshot_base64="mock_screenshot", 28 | size=Size(width=1920, height=1080), 29 | content=[ 30 | BaseElement( 31 | id="test_element", 32 | content="Test Button", 33 | rect=Rect(left=100, top=100, width=200, height=50), 34 | center=(200, 125), 35 | node_type=NodeType.BUTTON 36 | ) 37 | ], 38 | tree=Mock() 39 | ) 40 | 41 | async def action_space(self): 42 | return ["tap", "input", "scroll"] 43 | 44 | async def tap(self, x, y): 45 | pass 46 | 47 | async def input_text(self, text): 48 | pass 49 | 50 | async def scroll(self, direction, distance=None): 51 | pass 52 | 53 | 54 | @pytest.fixture 55 | def mock_interface(): 56 | """Mock interface fixture""" 57 | return MockInterface() 58 | 59 | 60 | @pytest.fixture 61 | def mock_ai_service(): 62 | """Mock AI service fixture""" 63 | ai_service = Mock() 64 | ai_service.call_ai = AsyncMock(return_value={ 65 | "content": { 66 | "elements": [{"id": "test_element", "reason": "test"}], 67 | "reasoning": "test reasoning", 68 | "confidence": 0.9, 69 | "errors": [] 70 | }, 71 | "usage": {"total_tokens": 100} 72 | }) 73 | return ai_service 74 | 75 | 76 | class TestInsight: 77 | """Test Insight AI engine""" 78 | 79 | @pytest.mark.asyncio 80 | async def test_locate_element(self, mock_interface, mock_ai_service): 81 | """Test element location""" 82 | insight = Insight( 83 | context_provider=mock_interface.get_context, 84 | ai_service=mock_ai_service 85 | ) 86 | 87 | result = await insight.locate("test button") 88 | 89 | assert result.element is not None 90 | assert result.element.id == "test_element" 91 | mock_ai_service.call_ai.assert_called_once() 92 | 93 | @pytest.mark.asyncio 94 | async def test_extract_data(self, mock_interface, mock_ai_service): 95 | """Test data extraction""" 96 | # Mock extract response 97 | mock_ai_service.call_ai.return_value = { 98 | "content": { 99 | "data": {"title": "Test Page", "items": ["item1", "item2"]}, 100 | "reasoning": "extracted data", 101 | "confidence": 0.9, 102 | "errors": [] 103 | }, 104 | "usage": {"total_tokens": 150} 105 | } 106 | 107 | insight = Insight( 108 | context_provider=mock_interface.get_context, 109 | ai_service=mock_ai_service 110 | ) 111 | 112 | result = await insight.extract("extract page data") 113 | 114 | assert result["data"]["title"] == "Test Page" 115 | assert len(result["data"]["items"]) == 2 116 | 117 | @pytest.mark.asyncio 118 | async def test_assert_condition(self, mock_interface, mock_ai_service): 119 | """Test condition assertion""" 120 | # Mock assert 
response 121 | mock_ai_service.call_ai.return_value = { 122 | "content": { 123 | "passed": True, 124 | "reasoning": "condition is met", 125 | "confidence": 0.95, 126 | "message": "success" 127 | }, 128 | "usage": {"total_tokens": 80} 129 | } 130 | 131 | insight = Insight( 132 | context_provider=mock_interface.get_context, 133 | ai_service=mock_ai_service 134 | ) 135 | 136 | result = await insight.assert_condition("page is loaded") 137 | 138 | assert result.passed is True 139 | assert result.thought == "condition is met" 140 | 141 | 142 | class TestAgent: 143 | """Test Agent functionality""" 144 | 145 | @pytest.mark.asyncio 146 | async def test_agent_creation(self, mock_interface): 147 | """Test agent creation""" 148 | agent = Agent(mock_interface) 149 | 150 | assert agent.interface == mock_interface 151 | assert agent.insight is not None 152 | assert agent.task_executor is not None 153 | assert agent.destroyed is False 154 | 155 | @pytest.mark.asyncio 156 | async def test_ai_locate(self, mock_interface, mock_ai_service): 157 | """Test AI locate through agent""" 158 | agent = Agent(mock_interface) 159 | agent.insight.ai_service = mock_ai_service 160 | 161 | result = await agent.ai_locate("test button") 162 | 163 | assert result.element is not None 164 | assert result.element.id == "test_element" 165 | 166 | @pytest.mark.asyncio 167 | async def test_ai_extract(self, mock_interface, mock_ai_service): 168 | """Test AI extract through agent""" 169 | # Mock extract response 170 | mock_ai_service.call_ai.return_value = { 171 | "content": { 172 | "data": {"username": "testuser"}, 173 | "reasoning": "extracted username", 174 | "confidence": 0.9, 175 | "errors": [] 176 | }, 177 | "usage": {"total_tokens": 100} 178 | } 179 | 180 | agent = Agent(mock_interface) 181 | agent.insight.ai_service = mock_ai_service 182 | 183 | result = await agent.ai_extract("extract username") 184 | 185 | assert result["username"] == "testuser" 186 | 187 | @pytest.mark.asyncio 188 | async def test_ai_assert_success(self, mock_interface, mock_ai_service): 189 | """Test AI assert success""" 190 | # Mock assert response 191 | mock_ai_service.call_ai.return_value = { 192 | "content": { 193 | "passed": True, 194 | "reasoning": "condition met", 195 | "confidence": 0.9, 196 | "message": "success" 197 | }, 198 | "usage": {"total_tokens": 80} 199 | } 200 | 201 | agent = Agent(mock_interface) 202 | agent.insight.ai_service = mock_ai_service 203 | 204 | # Should not raise exception 205 | await agent.ai_assert("page is loaded") 206 | 207 | @pytest.mark.asyncio 208 | async def test_ai_assert_failure(self, mock_interface, mock_ai_service): 209 | """Test AI assert failure""" 210 | # Mock assert response 211 | mock_ai_service.call_ai.return_value = { 212 | "content": { 213 | "passed": False, 214 | "reasoning": "condition not met", 215 | "confidence": 0.9, 216 | "message": "login failed" 217 | }, 218 | "usage": {"total_tokens": 80} 219 | } 220 | 221 | agent = Agent(mock_interface) 222 | agent.insight.ai_service = mock_ai_service 223 | 224 | # Should raise AssertionError 225 | with pytest.raises(AssertionError): 226 | await agent.ai_assert("user is logged in") 227 | 228 | @pytest.mark.asyncio 229 | async def test_basic_actions(self, mock_interface): 230 | """Test basic agent actions""" 231 | agent = Agent(mock_interface) 232 | 233 | # Test tap 234 | await agent.tap(100, 200) 235 | 236 | # Test input 237 | await agent.input_text("test text") 238 | 239 | # Test scroll 240 | from midscene.core.types import ScrollParam 241 | scroll_param = 
ScrollParam(direction="down", scroll_type="once", distance=500) 242 | await agent.scroll(scroll_param) 243 | 244 | @pytest.mark.asyncio 245 | async def test_agent_destroy(self, mock_interface): 246 | """Test agent destruction""" 247 | agent = Agent(mock_interface) 248 | 249 | await agent.destroy() 250 | 251 | assert agent.destroyed is True 252 | 253 | # Should raise error when using destroyed agent 254 | with pytest.raises(RuntimeError): 255 | await agent.ai_locate("test") 256 | 257 | 258 | if __name__ == "__main__": 259 | pytest.main([__file__, "-v"]) -------------------------------------------------------------------------------- /wiki/安装配置.md: -------------------------------------------------------------------------------- 1 | # 安装配置 2 | 3 | 本章节详细介绍 Midscene Python 的安装步骤、环境配置和依赖管理。 4 | 5 | ## 📋 系统要求 6 | 7 | ### 基础要求 8 | - **Python**: 3.9 或更高版本 9 | - **操作系统**: Windows 10+, macOS 10.14+, Linux (Ubuntu 18.04+) 10 | - **内存**: 最少 4GB RAM(推荐 8GB+) 11 | - **网络**: 稳定的互联网连接(用于 AI 模型调用) 12 | 13 | ### AI 模型要求 14 | 至少需要以下 AI 服务之一的 API 访问权限: 15 | - OpenAI GPT-4V 16 | - Anthropic Claude 3 17 | - 阿里云通义千问 VL 18 | - Google Gemini Pro Vision 19 | 20 | ## 🚀 快速安装 21 | 22 | ### 方式一:使用 pip 安装(推荐) 23 | ```bash 24 | # 安装最新版本 25 | pip install midscene-python 26 | 27 | # 或指定版本 28 | pip install midscene-python==0.1.0 29 | ``` 30 | 31 | ### 方式二:从源码安装 32 | ```bash 33 | # 克隆仓库 34 | git clone https://gitee.com/Python51888/midscene-python.git 35 | cd midscene-python 36 | 37 | # 安装依赖并安装 38 | pip install -e . 39 | ``` 40 | 41 | ### 方式三:开发者安装 42 | ```bash 43 | # 克隆仓库 44 | git clone https://gitee.com/Python51888/midscene-python.git 45 | cd midscene-python 46 | 47 | # 安装开发依赖 48 | pip install -e ".[dev,docs]" 49 | 50 | # 安装 pre-commit hooks 51 | pre-commit install 52 | ``` 53 | 54 | ## 🔧 平台特定配置 55 | 56 | ### Web 自动化配置 57 | 58 | #### Selenium 配置 59 | ```bash 60 | # 安装 Selenium 和 WebDriver 管理器 61 | pip install selenium webdriver-manager 62 | 63 | # Python 代码中自动管理驱动 64 | from selenium import webdriver 65 | from webdriver_manager.chrome import ChromeDriverManager 66 | from selenium.webdriver.chrome.service import Service 67 | 68 | service = Service(ChromeDriverManager().install()) 69 | driver = webdriver.Chrome(service=service) 70 | ``` 71 | 72 | #### Playwright 配置 73 | ```bash 74 | # 安装 Playwright 75 | pip install playwright 76 | 77 | # 安装浏览器 78 | playwright install 79 | 80 | # 仅安装 Chromium(节省空间) 81 | playwright install chromium 82 | ``` 83 | 84 | ### Android 自动化配置 85 | 86 | #### ADB 设置 87 | ```bash 88 | # 安装 ADB(Ubuntu/Debian) 89 | sudo apt-get install android-tools-adb 90 | 91 | # 安装 ADB(macOS) 92 | brew install android-platform-tools 93 | 94 | # 安装 ADB(Windows) 95 | # 下载 Android SDK Platform Tools 96 | # 添加到系统 PATH 97 | ``` 98 | 99 | #### 设备连接 100 | ```bash 101 | # 启用开发者选项和 USB 调试 102 | # 连接设备后验证 103 | adb devices 104 | 105 | # 预期输出 106 | List of devices attached 107 | DEVICE_ID device 108 | ``` 109 | 110 | ## 🔑 AI 模型配置 111 | 112 | ### 环境变量配置 113 | 创建 `.env` 文件: 114 | 115 | ```bash 116 | # OpenAI 配置 117 | OPENAI_API_KEY=sk-your-openai-api-key 118 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选 119 | 120 | # Anthropic 配置 121 | ANTHROPIC_API_KEY=sk-ant-your-anthropic-key 122 | 123 | # 通义千问配置 124 | DASHSCOPE_API_KEY=sk-your-dashscope-key 125 | 126 | # Gemini 配置 127 | GOOGLE_API_KEY=AIza-your-google-api-key 128 | 129 | # 默认模型配置 130 | MIDSCENE_AI_PROVIDER=openai 131 | MIDSCENE_AI_MODEL=gpt-4-vision-preview 132 | ``` 133 | 134 | ### 代码配置 135 | ```python 136 | from midscene.core.ai_model import AIModelConfig 137 | 138 | # 多个 AI 
提供商配置 139 | configs = { 140 | "openai": AIModelConfig( 141 | provider="openai", 142 | model="gpt-4-vision-preview", 143 | api_key="your-openai-key", 144 | temperature=0.1 145 | ), 146 | "claude": AIModelConfig( 147 | provider="anthropic", 148 | model="claude-3-sonnet-20240229", 149 | api_key="your-claude-key", 150 | temperature=0.1 151 | ) 152 | } 153 | ``` 154 | 155 | ## 📦 依赖管理 156 | 157 | ### 核心依赖 158 | ```toml 159 | # pyproject.toml 中的核心依赖 160 | [project] 161 | dependencies = [ 162 | "pydantic>=2.0,<3.0", 163 | "selenium>=4.15.0,<5.0", 164 | "playwright>=1.40.0,<2.0", 165 | "opencv-python>=4.8.0,<5.0", 166 | "pillow>=10.0.0,<11.0", 167 | "aiohttp>=3.9.0,<4.0", 168 | "loguru>=0.7.0,<1.0", 169 | "typer>=0.9.0,<1.0", 170 | "httpx>=0.25.0,<1.0", 171 | "openai>=1.3.0,<2.0", 172 | "anthropic>=0.7.0,<1.0" 173 | ] 174 | ``` 175 | 176 | ### 可选依赖 177 | ```bash 178 | # 开发工具 179 | pip install "midscene-python[dev]" 180 | 181 | # 文档工具 182 | pip install "midscene-python[docs]" 183 | 184 | # 全部依赖 185 | pip install "midscene-python[dev,docs]" 186 | ``` 187 | 188 | ## 🔍 验证安装 189 | 190 | ### 基础验证 191 | ```python 192 | # test_installation.py 193 | import asyncio 194 | from midscene import Agent 195 | from midscene.core.ai_model import AIModelService 196 | 197 | async def test_installation(): 198 | """测试安装是否成功""" 199 | 200 | # 测试导入 201 | print("✓ 导入模块成功") 202 | 203 | # 测试 AI 服务配置 204 | try: 205 | ai_service = AIModelService() 206 | print("✓ AI 服务初始化成功") 207 | except Exception as e: 208 | print(f"✗ AI 服务初始化失败: {e}") 209 | 210 | print("🎉 安装验证完成!") 211 | 212 | # 运行测试 213 | asyncio.run(test_installation()) 214 | ``` 215 | 216 | ### Web 平台验证 217 | ```python 218 | # test_web.py 219 | import asyncio 220 | from midscene import Agent 221 | from midscene.web import SeleniumWebPage 222 | 223 | async def test_web(): 224 | """测试 Web 平台功能""" 225 | try: 226 | with SeleniumWebPage.create() as page: 227 | agent = Agent(page) 228 | await page.goto("https://www.example.com") 229 | print("✓ Web 自动化测试成功") 230 | except Exception as e: 231 | print(f"✗ Web 自动化测试失败: {e}") 232 | 233 | asyncio.run(test_web()) 234 | ``` 235 | 236 | ### Android 平台验证 237 | ```python 238 | # test_android.py 239 | import asyncio 240 | from midscene import Agent 241 | from midscene.android import AndroidDevice 242 | 243 | async def test_android(): 244 | """测试 Android 平台功能""" 245 | try: 246 | device = AndroidDevice() 247 | await device.connect() 248 | agent = Agent(device) 249 | print("✓ Android 自动化测试成功") 250 | except Exception as e: 251 | print(f"✗ Android 自动化测试失败: {e}") 252 | 253 | asyncio.run(test_android()) 254 | ``` 255 | 256 | ## 🔧 常见问题解决 257 | 258 | ### Python 版本问题 259 | ```bash 260 | # 检查 Python 版本 261 | python --version 262 | 263 | # 如果版本低于 3.9,安装新版本 264 | # Ubuntu/Debian 265 | sudo apt-get install python3.9 266 | 267 | # macOS 268 | brew install python@3.9 269 | 270 | # Windows 271 | # 从 python.org 下载安装 272 | ``` 273 | 274 | ### 依赖冲突解决 275 | ```bash 276 | # 创建虚拟环境(推荐) 277 | python -m venv midscene-env 278 | source midscene-env/bin/activate # Linux/macOS 279 | # 或 280 | midscene-env\Scripts\activate # Windows 281 | 282 | # 在虚拟环境中安装 283 | pip install midscene-python 284 | ``` 285 | 286 | ### 网络连接问题 287 | ```bash 288 | # 使用国内镜像源 289 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple midscene-python 290 | 291 | # 或配置永久镜像源 292 | pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 293 | ``` 294 | 295 | ### AI API 连接问题 296 | ```python 297 | # 测试 API 连接 298 | import os 299 | import httpx 300 | 301 | async def 
test_openai_connection(): 302 | api_key = os.getenv("OPENAI_API_KEY") 303 | if not api_key: 304 | print("❌ 未设置 OPENAI_API_KEY") 305 | return 306 | 307 | async with httpx.AsyncClient() as client: 308 | try: 309 | response = await client.get( 310 | "https://api.openai.com/v1/models", 311 | headers={"Authorization": f"Bearer {api_key}"} 312 | ) 313 | if response.status_code == 200: 314 | print("✅ OpenAI API 连接正常") 315 | else: 316 | print(f"❌ OpenAI API 连接失败: {response.status_code}") 317 | except Exception as e: 318 | print(f"❌ 网络连接错误: {e}") 319 | ``` 320 | 321 | ## 🚀 性能优化配置 322 | 323 | ### 系统级优化 324 | ```bash 325 | # 增加文件描述符限制(Linux/macOS) 326 | ulimit -n 65536 327 | 328 | # 设置环境变量优化 329 | export PYTHONUNBUFFERED=1 330 | export PYTHONDONTWRITEBYTECODE=1 331 | ``` 332 | 333 | ### Python 配置优化 334 | ```python 335 | # config.py 336 | import asyncio 337 | 338 | # 设置异步事件循环策略 339 | if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'): 340 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 341 | 342 | # 配置日志级别 343 | import logging 344 | logging.getLogger("httpx").setLevel(logging.WARNING) 345 | logging.getLogger("selenium").setLevel(logging.WARNING) 346 | ``` 347 | 348 | ## 📋 配置检查清单 349 | 350 | ### 安装完成检查 351 | - [ ] Python 3.9+ 已安装 352 | - [ ] midscene-python 包已安装 353 | - [ ] 至少一个 AI 提供商已配置 354 | - [ ] Web 驱动程序已安装(如果使用 Web 自动化) 355 | - [ ] ADB 已安装并设备已连接(如果使用 Android 自动化) 356 | 357 | ### 环境配置检查 358 | - [ ] 环境变量已设置 359 | - [ ] API 密钥有效且有足够额度 360 | - [ ] 网络连接正常 361 | - [ ] 防火墙和代理配置正确 362 | 363 | ### 功能测试检查 364 | - [ ] 基础导入测试通过 365 | - [ ] AI 服务初始化成功 366 | - [ ] 选择的平台(Web/Android)测试通过 367 | - [ ] 示例代码可以正常运行 368 | 369 | ## 🔄 升级和维护 370 | 371 | ### 版本升级 372 | ```bash 373 | # 检查当前版本 374 | pip show midscene-python 375 | 376 | # 升级到最新版本 377 | pip install --upgrade midscene-python 378 | 379 | # 升级特定版本 380 | pip install midscene-python==0.2.0 381 | ``` 382 | 383 | ### 配置备份 384 | ```bash 385 | # 备份配置文件 386 | cp .env .env.backup 387 | cp pyproject.toml pyproject.toml.backup 388 | 389 | # 导出依赖列表 390 | pip freeze > requirements.txt 391 | ``` 392 | 393 | ### 清理和重装 394 | ```bash 395 | # 卸载当前版本 396 | pip uninstall midscene-python 397 | 398 | # 清理缓存 399 | pip cache purge 400 | 401 | # 重新安装 402 | pip install midscene-python 403 | ``` 404 | 405 | --- 406 | 407 | 完成配置后,您就可以开始使用 Midscene Python 进行 AI 驱动的自动化了!接下来推荐阅读 [快速开始](快速开始.md) 指南。 -------------------------------------------------------------------------------- /wiki/平台集成/README.md: -------------------------------------------------------------------------------- 1 | # 平台集成 2 | 3 | Midscene Python 支持多个平台的 UI 自动化,提供统一的编程接口和一致的操作体验。 4 | 5 | ## 🏗️ 架构概览 6 | 7 | ```mermaid 8 | graph TB 9 | A[Agent 统一接口] --> B[平台抽象层] 10 | B --> C[Web 自动化] 11 | B --> D[Android 自动化] 12 | 13 | C --> E[Selenium 集成] 14 | C --> F[Playwright 集成] 15 | C --> G[Web 桥接机制] 16 | 17 | D --> H[ADB 设备管理] 18 | D --> I[Android Agent] 19 | 20 | E --> J[ChromeDriver] 21 | E --> K[FirefoxDriver] 22 | F --> L[Chromium] 23 | F --> M[Firefox] 24 | F --> N[Safari] 25 | 26 | H --> O[USB 设备] 27 | H --> P[网络设备] 28 | H --> Q[模拟器] 29 | ``` 30 | 31 | ## 📱 支持的平台 32 | 33 | ### Web 自动化 34 | - **Selenium WebDriver**: 支持 Chrome、Firefox、Safari、Edge 35 | - **Playwright**: 支持 Chromium、Firefox、WebKit 36 | - **统一桥接**: 提供一致的 API 接口 37 | 38 | ### Android 自动化 39 | - **真实设备**: 通过 USB 或 WiFi 连接 40 | - **Android 模拟器**: 支持各种 AVD 配置 41 | - **云设备**: 支持云端设备服务 42 | 43 | ## 🌐 Web 自动化 44 | 45 | ### 快速开始 46 | ```python 47 | import asyncio 48 | from midscene import Agent 49 | from midscene.web import SeleniumWebPage, 
PlaywrightPage 50 | 51 | # Selenium 示例 52 | async def selenium_example(): 53 | with SeleniumWebPage.create() as page: 54 | agent = Agent(page) 55 | await page.goto("https://example.com") 56 | await agent.ai_action("点击登录按钮") 57 | 58 | # Playwright 示例 59 | async def playwright_example(): 60 | async with PlaywrightPage.create() as page: 61 | agent = Agent(page) 62 | await page.goto("https://example.com") 63 | await agent.ai_action("点击登录按钮") 64 | ``` 65 | 66 | ### 高级配置 67 | ```python 68 | from midscene.web import SeleniumWebPage 69 | from selenium.webdriver.chrome.options import Options 70 | 71 | # 自定义浏览器选项 72 | chrome_options = Options() 73 | chrome_options.add_argument("--headless") 74 | chrome_options.add_argument("--no-sandbox") 75 | 76 | page = SeleniumWebPage.create( 77 | browser="chrome", 78 | options=chrome_options, 79 | window_size=(1920, 1080) 80 | ) 81 | ``` 82 | 83 | ### 详细文档 84 | - [Selenium集成](Web自动化/Selenium集成.md) - Selenium WebDriver 完整指南 85 | - [Playwright集成](Web自动化/Playwright集成.md) - Playwright 集成和配置 86 | - [Web桥接机制](Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象 87 | 88 | ## 📱 Android 自动化 89 | 90 | ### 快速开始 91 | ```python 92 | import asyncio 93 | from midscene import Agent 94 | from midscene.android import AndroidDevice 95 | 96 | async def android_example(): 97 | # 连接设备 98 | device = AndroidDevice() 99 | await device.connect() 100 | 101 | # 创建 Agent 102 | agent = Agent(device) 103 | 104 | # 启动应用 105 | await device.start_app("com.example.app") 106 | 107 | # AI 操作 108 | await agent.ai_action("点击登录按钮") 109 | await agent.ai_action("输入用户名 'testuser'") 110 | await agent.ai_action("点击提交") 111 | ``` 112 | 113 | ### 设备管理 114 | ```python 115 | from midscene.android import AndroidDevice, DeviceManager 116 | 117 | # 连接特定设备 118 | device = AndroidDevice(device_id="emulator-5554") 119 | 120 | # 设备管理器 121 | manager = DeviceManager() 122 | devices = await manager.list_devices() 123 | for device in devices: 124 | print(f"设备: {device.id}, 状态: {device.status}") 125 | ``` 126 | 127 | ### 详细文档 128 | - [Android自动化](Android自动化.md) - Android 平台完整指南 129 | 130 | ## 🔄 统一操作接口 131 | 132 | 无论使用哪个平台,Midscene Python 都提供一致的操作接口: 133 | 134 | ### Agent 操作 135 | ```python 136 | # Web 和 Android 使用相同的方法 137 | await agent.ai_action("点击按钮") 138 | await agent.ai_action("输入文本 'hello'") 139 | await agent.ai_action("滚动到底部") 140 | 141 | # 数据提取 142 | data = await agent.ai_extract({ 143 | "title": "页面标题", 144 | "items": ["列表项目"] 145 | }) 146 | 147 | # 状态断言 148 | await agent.ai_assert("页面显示成功消息") 149 | ``` 150 | 151 | ### 页面操作 152 | ```python 153 | # 统一的页面操作 154 | await page.goto("https://example.com") # Web 155 | await device.start_app("com.app") # Android 156 | 157 | # 截图 158 | screenshot = await page.screenshot() # Web 159 | screenshot = await device.screenshot() # Android 160 | 161 | # 获取上下文 162 | context = await page.get_context() # Web 163 | context = await device.get_context() # Android 164 | ``` 165 | 166 | ## 🔧 平台适配机制 167 | 168 | ### AbstractInterface 抽象基类 169 | ```python 170 | from midscene.core.types import AbstractInterface, InterfaceType 171 | 172 | class CustomPlatform(AbstractInterface): 173 | @property 174 | def interface_type(self) -> InterfaceType: 175 | return InterfaceType.WEB # 或 InterfaceType.ANDROID 176 | 177 | async def get_context(self) -> UIContext: 178 | # 实现获取页面/屏幕上下文 179 | pass 180 | 181 | async def tap(self, x: float, y: float) -> None: 182 | # 实现点击操作 183 | pass 184 | 185 | async def input_text(self, text: str) -> None: 186 | # 实现文本输入 187 | pass 188 | ``` 189 | 190 | ### 桥接模式实现 191 | ```python 192 | # Web 
桥接示例 193 | class WebBridge: 194 | def __init__(self, driver_type: str): 195 | if driver_type == "selenium": 196 | self.driver = SeleniumWebDriver() 197 | elif driver_type == "playwright": 198 | self.driver = PlaywrightDriver() 199 | 200 | async def unified_action(self, action: str, **kwargs): 201 | # 统一的操作接口 202 | return await self.driver.execute_action(action, **kwargs) 203 | ``` 204 | 205 | ## 🚀 平台选择指南 206 | 207 | ### Web 平台选择 208 | 209 | #### Selenium 210 | **适用场景**: 211 | - 需要支持多种浏览器 212 | - 与现有 Selenium 项目集成 213 | - 需要特定的 WebDriver 功能 214 | 215 | **优势**: 216 | - 成熟稳定,社区支持好 217 | - 支持的浏览器最多 218 | - 与 Selenium Grid 集成 219 | 220 | **劣势**: 221 | - 性能相对较慢 222 | - API 相对复杂 223 | 224 | #### Playwright 225 | **适用场景**: 226 | - 需要高性能的自动化 227 | - 现代 Web 应用测试 228 | - 需要网络拦截等高级功能 229 | 230 | **优势**: 231 | - 性能优异 232 | - 现代化的 API 设计 233 | - 内置等待和重试机制 234 | 235 | **劣势**: 236 | - 相对较新,生态系统较小 237 | - 学习成本稍高 238 | 239 | ### Android 平台特点 240 | 241 | **适用场景**: 242 | - 移动应用 UI 测试 243 | - 移动端业务流程自动化 244 | - 跨平台应用测试 245 | 246 | **优势**: 247 | - 直接操作原生 Android 界面 248 | - 支持各种 Android 版本 249 | - 可以测试真实设备体验 250 | 251 | **注意事项**: 252 | - 需要 ADB 环境配置 253 | - 设备连接稳定性要求高 254 | - 权限和安全限制较多 255 | 256 | ## 📊 性能对比 257 | 258 | | 特性 | Selenium | Playwright | Android | 259 | |------|----------|------------|---------| 260 | | **启动速度** | 中等 | 快 | 较慢 | 261 | | **执行速度** | 中等 | 快 | 取决于设备 | 262 | | **资源占用** | 中等 | 低 | 高 | 263 | | **稳定性** | 高 | 高 | 中等 | 264 | | **调试难度** | 中等 | 低 | 高 | 265 | 266 | ## 🔗 跨平台最佳实践 267 | 268 | ### 1. 统一测试脚本 269 | ```python 270 | async def universal_test(platform: str): 271 | """跨平台测试脚本""" 272 | 273 | if platform == "web": 274 | page = SeleniumWebPage.create() 275 | agent = Agent(page) 276 | await page.goto("https://app.example.com") 277 | 278 | elif platform == "android": 279 | device = AndroidDevice() 280 | await device.connect() 281 | agent = Agent(device) 282 | await device.start_app("com.example.app") 283 | 284 | # 统一的测试步骤 285 | await agent.ai_action("点击登录按钮") 286 | await agent.ai_action("输入用户名 'test'") 287 | await agent.ai_action("输入密码 'password'") 288 | await agent.ai_action("点击提交") 289 | 290 | # 统一的验证 291 | await agent.ai_assert("显示欢迎页面") 292 | ``` 293 | 294 | ### 2. 配置管理 295 | ```python 296 | # config.py 297 | PLATFORM_CONFIGS = { 298 | "web": { 299 | "browser": "chrome", 300 | "headless": False, 301 | "window_size": (1920, 1080) 302 | }, 303 | "android": { 304 | "device_id": None, # 自动选择 305 | "app_package": "com.example.app", 306 | "timeout": 30 307 | } 308 | } 309 | 310 | def get_platform_config(platform: str) -> dict: 311 | return PLATFORM_CONFIGS.get(platform, {}) 312 | ``` 313 | 314 | ### 3. 
错误处理 315 | ```python 316 | async def robust_platform_operation(agent: Agent, action: str): 317 | """跨平台的健壮操作""" 318 | 319 | max_retries = 3 320 | for attempt in range(max_retries): 321 | try: 322 | await agent.ai_action(action) 323 | return 324 | except Exception as e: 325 | if attempt == max_retries - 1: 326 | raise 327 | 328 | # 根据平台类型进行特定的恢复操作 329 | platform_type = agent.interface.interface_type 330 | if platform_type == InterfaceType.WEB: 331 | await handle_web_error(agent, e) 332 | elif platform_type == InterfaceType.ANDROID: 333 | await handle_android_error(agent, e) 334 | 335 | await asyncio.sleep(1) # 等待后重试 336 | ``` 337 | 338 | ## 🔍 调试和诊断 339 | 340 | ### 统一调试接口 341 | ```python 342 | async def debug_platform_info(agent: Agent): 343 | """获取平台调试信息""" 344 | 345 | interface = agent.interface 346 | platform_type = interface.interface_type 347 | 348 | print(f"平台类型: {platform_type}") 349 | 350 | if platform_type == InterfaceType.WEB: 351 | context = await interface.get_context() 352 | print(f"页面标题: {context.page_title}") 353 | print(f"页面 URL: {context.url}") 354 | print(f"视口大小: {context.size}") 355 | 356 | elif platform_type == InterfaceType.ANDROID: 357 | context = await interface.get_context() 358 | print(f"屏幕尺寸: {context.size}") 359 | print(f"当前活动: {context.current_activity}") 360 | print(f"设备信息: {context.device_info}") 361 | ``` 362 | 363 | ### 跨平台截图 364 | ```python 365 | async def take_debug_screenshot(agent: Agent, filename: str): 366 | """跨平台截图功能""" 367 | 368 | interface = agent.interface 369 | screenshot = await interface.screenshot() 370 | 371 | # 添加平台标识 372 | platform_type = interface.interface_type.value 373 | timestamped_filename = f"{platform_type}_{filename}_{int(time.time())}.png" 374 | 375 | with open(timestamped_filename, "wb") as f: 376 | f.write(screenshot) 377 | 378 | print(f"截图已保存: {timestamped_filename}") 379 | ``` 380 | 381 | --- 382 | 383 | 通过 Midscene Python 的平台集成能力,你可以用统一的方式处理不同平台的自动化需求。选择适合你项目需求的平台,并利用统一的 API 来简化开发和维护工作! 
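
## 🧩 附录：统一入口示例

下面补充一个最小的入口草稿，把上文的统一测试脚本与配置管理两个小节串起来，通过环境变量在 Web 与 Android 之间切换。注意：这只是示意性草稿，其中的环境变量名 MIDSCENE_PLATFORM 属于示例假设，universal_test 与 get_platform_config 也假定已按上文示例在同一文件（或你自己的模块）中定义，请按项目实际情况调整。

```python
import asyncio
import os

# 假设 universal_test 与 get_platform_config 已按上文示例定义，
# 这里直接复用，不再重复实现。


async def main() -> None:
    # 通过环境变量选择目标平台，默认运行 Web 自动化
    # （MIDSCENE_PLATFORM 这个变量名仅为示例假设）
    platform = os.getenv("MIDSCENE_PLATFORM", "web")

    # 读取该平台的配置（见上文的配置管理小节），便于启动前确认
    config = get_platform_config(platform)
    print(f"目标平台: {platform}, 配置: {config}")

    # 复用上文统一测试脚本中的 universal_test，执行统一的登录流程
    await universal_test(platform)


if __name__ == "__main__":
    asyncio.run(main())
```

这样即可在命令行通过设置环境变量来切换目标平台，而无需修改脚本本身。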
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile --all-extras pyproject.toml -o requirements.txt 3 | aiohappyeyeballs==2.6.1 4 | # via aiohttp 5 | aiohttp==3.12.15 6 | # via 7 | # midscene-python (pyproject.toml) 8 | # dashscope 9 | aiosignal==1.4.0 10 | # via aiohttp 11 | annotated-types==0.7.0 12 | # via pydantic 13 | anthropic==0.64.0 14 | # via midscene-python (pyproject.toml) 15 | anyio==4.10.0 16 | # via 17 | # anthropic 18 | # httpx 19 | # openai 20 | asyncio-mqtt==0.16.2 21 | # via midscene-python (pyproject.toml) 22 | attrs==25.3.0 23 | # via 24 | # aiohttp 25 | # outcome 26 | # trio 27 | babel==2.17.0 28 | # via mkdocs-material 29 | backrefs==5.9 30 | # via mkdocs-material 31 | black==25.1.0 32 | # via midscene-python (pyproject.toml) 33 | cachetools==5.5.2 34 | # via google-auth 35 | certifi==2025.8.3 36 | # via 37 | # httpcore 38 | # httpx 39 | # requests 40 | # selenium 41 | cffi==1.17.1 42 | # via 43 | # cryptography 44 | # trio 45 | cfgv==3.4.0 46 | # via pre-commit 47 | charset-normalizer==3.4.3 48 | # via requests 49 | click==8.2.1 50 | # via 51 | # black 52 | # mkdocs 53 | # mkdocs-material 54 | # typer 55 | colorama==0.4.6 56 | # via 57 | # click 58 | # griffe 59 | # loguru 60 | # mkdocs 61 | # mkdocs-material 62 | # pytest 63 | # tqdm 64 | coverage==7.10.6 65 | # via pytest-cov 66 | cryptography==45.0.7 67 | # via dashscope 68 | dashscope==1.24.2 69 | # via midscene-python (pyproject.toml) 70 | distlib==0.4.0 71 | # via virtualenv 72 | distro==1.9.0 73 | # via 74 | # anthropic 75 | # openai 76 | filelock==3.19.1 77 | # via virtualenv 78 | frozenlist==1.7.0 79 | # via 80 | # aiohttp 81 | # aiosignal 82 | ghp-import==2.1.0 83 | # via mkdocs 84 | google-ai-generativelanguage==0.6.15 85 | # via google-generativeai 86 | google-api-core==2.25.1 87 | # via 88 | # google-ai-generativelanguage 89 | # google-api-python-client 90 | # google-generativeai 91 | google-api-python-client==2.179.0 92 | # via google-generativeai 93 | google-auth==2.40.3 94 | # via 95 | # google-ai-generativelanguage 96 | # google-api-core 97 | # google-api-python-client 98 | # google-auth-httplib2 99 | # google-generativeai 100 | google-auth-httplib2==0.2.0 101 | # via google-api-python-client 102 | google-generativeai==0.8.5 103 | # via midscene-python (pyproject.toml) 104 | googleapis-common-protos==1.70.0 105 | # via 106 | # google-api-core 107 | # grpcio-status 108 | greenlet==3.2.4 109 | # via playwright 110 | griffe==1.13.0 111 | # via mkdocstrings-python 112 | grpcio==1.74.0 113 | # via 114 | # google-api-core 115 | # grpcio-status 116 | grpcio-status==1.71.2 117 | # via google-api-core 118 | h11==0.16.0 119 | # via 120 | # httpcore 121 | # wsproto 122 | httpcore==1.0.9 123 | # via httpx 124 | httplib2==0.30.0 125 | # via 126 | # google-api-python-client 127 | # google-auth-httplib2 128 | httpx==0.28.1 129 | # via 130 | # midscene-python (pyproject.toml) 131 | # anthropic 132 | # openai 133 | identify==2.6.13 134 | # via pre-commit 135 | idna==3.10 136 | # via 137 | # anyio 138 | # httpx 139 | # requests 140 | # trio 141 | # yarl 142 | iniconfig==2.1.0 143 | # via pytest 144 | isort==6.0.1 145 | # via midscene-python (pyproject.toml) 146 | jinja2==3.1.6 147 | # via 148 | # midscene-python (pyproject.toml) 149 | # mkdocs 150 | # mkdocs-material 151 | # mkdocstrings 152 | 
jiter==0.10.0 153 | # via 154 | # anthropic 155 | # openai 156 | loguru==0.7.3 157 | # via midscene-python (pyproject.toml) 158 | markdown==3.8.2 159 | # via 160 | # mkdocs 161 | # mkdocs-autorefs 162 | # mkdocs-material 163 | # mkdocstrings 164 | # pymdown-extensions 165 | markdown-it-py==4.0.0 166 | # via rich 167 | markupsafe==3.0.2 168 | # via 169 | # jinja2 170 | # mkdocs 171 | # mkdocs-autorefs 172 | # mkdocstrings 173 | mdurl==0.1.2 174 | # via markdown-it-py 175 | mergedeep==1.3.4 176 | # via 177 | # mkdocs 178 | # mkdocs-get-deps 179 | mkdocs==1.6.1 180 | # via 181 | # midscene-python (pyproject.toml) 182 | # mkdocs-autorefs 183 | # mkdocs-material 184 | # mkdocstrings 185 | mkdocs-autorefs==1.4.3 186 | # via 187 | # mkdocstrings 188 | # mkdocstrings-python 189 | mkdocs-get-deps==0.2.0 190 | # via mkdocs 191 | mkdocs-material==9.6.18 192 | # via midscene-python (pyproject.toml) 193 | mkdocs-material-extensions==1.3.1 194 | # via mkdocs-material 195 | mkdocstrings==0.30.0 196 | # via 197 | # midscene-python (pyproject.toml) 198 | # mkdocstrings-python 199 | mkdocstrings-python==1.18.2 200 | # via mkdocstrings 201 | multidict==6.6.4 202 | # via 203 | # aiohttp 204 | # yarl 205 | mypy==1.17.1 206 | # via midscene-python (pyproject.toml) 207 | mypy-extensions==1.1.0 208 | # via 209 | # black 210 | # mypy 211 | nodeenv==1.9.1 212 | # via pre-commit 213 | numpy==1.26.4 214 | # via 215 | # midscene-python (pyproject.toml) 216 | # opencv-python 217 | openai==1.102.0 218 | # via midscene-python (pyproject.toml) 219 | opencv-python==4.11.0.86 220 | # via midscene-python (pyproject.toml) 221 | outcome==1.3.0.post0 222 | # via 223 | # trio 224 | # trio-websocket 225 | packaging==25.0 226 | # via 227 | # black 228 | # mkdocs 229 | # pytest 230 | paginate==0.5.7 231 | # via mkdocs-material 232 | paho-mqtt==2.1.0 233 | # via asyncio-mqtt 234 | pathspec==0.12.1 235 | # via 236 | # black 237 | # mkdocs 238 | # mypy 239 | pillow==10.4.0 240 | # via midscene-python (pyproject.toml) 241 | platformdirs==4.4.0 242 | # via 243 | # black 244 | # mkdocs-get-deps 245 | # virtualenv 246 | playwright==1.55.0 247 | # via midscene-python (pyproject.toml) 248 | pluggy==1.6.0 249 | # via 250 | # pytest 251 | # pytest-cov 252 | pre-commit==4.3.0 253 | # via midscene-python (pyproject.toml) 254 | propcache==0.3.2 255 | # via 256 | # aiohttp 257 | # yarl 258 | proto-plus==1.26.1 259 | # via 260 | # google-ai-generativelanguage 261 | # google-api-core 262 | protobuf==5.29.5 263 | # via 264 | # google-ai-generativelanguage 265 | # google-api-core 266 | # google-generativeai 267 | # googleapis-common-protos 268 | # grpcio-status 269 | # proto-plus 270 | pure-python-adb==0.3.0.dev0 271 | # via midscene-python (pyproject.toml) 272 | pyasn1==0.6.1 273 | # via 274 | # pyasn1-modules 275 | # rsa 276 | pyasn1-modules==0.4.2 277 | # via google-auth 278 | pycparser==2.22 279 | # via cffi 280 | pydantic==2.11.7 281 | # via 282 | # midscene-python (pyproject.toml) 283 | # anthropic 284 | # google-generativeai 285 | # openai 286 | pydantic-core==2.33.2 287 | # via pydantic 288 | pyee==13.0.0 289 | # via playwright 290 | pygments==2.19.2 291 | # via 292 | # mkdocs-material 293 | # pytest 294 | # rich 295 | pymdown-extensions==10.16.1 296 | # via 297 | # mkdocs-material 298 | # mkdocstrings 299 | pyparsing==3.2.3 300 | # via httplib2 301 | pysocks==1.7.1 302 | # via urllib3 303 | pytest==8.4.1 304 | # via 305 | # midscene-python (pyproject.toml) 306 | # pytest-asyncio 307 | # pytest-cov 308 | pytest-asyncio==1.1.0 309 | # via 
midscene-python (pyproject.toml) 310 | pytest-cov==6.2.1 311 | # via midscene-python (pyproject.toml) 312 | python-dateutil==2.9.0.post0 313 | # via ghp-import 314 | pyyaml==6.0.2 315 | # via 316 | # midscene-python (pyproject.toml) 317 | # mkdocs 318 | # mkdocs-get-deps 319 | # pre-commit 320 | # pymdown-extensions 321 | # pyyaml-env-tag 322 | pyyaml-env-tag==1.1 323 | # via mkdocs 324 | requests==2.32.5 325 | # via 326 | # dashscope 327 | # google-api-core 328 | # mkdocs-material 329 | rich==14.1.0 330 | # via typer 331 | rsa==4.9.1 332 | # via google-auth 333 | ruff==0.12.11 334 | # via midscene-python (pyproject.toml) 335 | selenium==4.35.0 336 | # via midscene-python (pyproject.toml) 337 | shellingham==1.5.4 338 | # via typer 339 | six==1.17.0 340 | # via python-dateutil 341 | sniffio==1.3.1 342 | # via 343 | # anthropic 344 | # anyio 345 | # openai 346 | # trio 347 | sortedcontainers==2.4.0 348 | # via trio 349 | tqdm==4.67.1 350 | # via 351 | # google-generativeai 352 | # openai 353 | trio==0.30.0 354 | # via 355 | # selenium 356 | # trio-websocket 357 | trio-websocket==0.12.2 358 | # via selenium 359 | typer==0.17.3 360 | # via midscene-python (pyproject.toml) 361 | typing-extensions==4.14.1 362 | # via 363 | # aiosignal 364 | # anthropic 365 | # anyio 366 | # google-generativeai 367 | # mypy 368 | # openai 369 | # pydantic 370 | # pydantic-core 371 | # pyee 372 | # selenium 373 | # typer 374 | # typing-inspection 375 | typing-inspection==0.4.1 376 | # via pydantic 377 | uritemplate==4.2.0 378 | # via google-api-python-client 379 | urllib3==2.5.0 380 | # via 381 | # requests 382 | # selenium 383 | virtualenv==20.34.0 384 | # via pre-commit 385 | watchdog==6.0.0 386 | # via mkdocs 387 | websocket-client==1.8.0 388 | # via 389 | # dashscope 390 | # selenium 391 | win32-setctime==1.2.0 392 | # via loguru 393 | wsproto==1.2.0 394 | # via trio-websocket 395 | yarl==1.20.1 396 | # via aiohttp 397 | -------------------------------------------------------------------------------- /midscene/shared/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | Task caching system for performance optimization 3 | """ 4 | 5 | import json 6 | import hashlib 7 | import pickle 8 | from datetime import datetime, timedelta 9 | from pathlib import Path 10 | from typing import Any, Dict, List, Optional, Union 11 | 12 | from loguru import logger 13 | from pydantic import BaseModel 14 | 15 | 16 | class CacheEntry(BaseModel): 17 | """Cache entry model""" 18 | key: str 19 | data: Any 20 | timestamp: datetime 21 | expires_at: Optional[datetime] = None 22 | metadata: Dict[str, Any] = {} 23 | 24 | 25 | class TaskCache: 26 | """Task caching system for storing and retrieving execution results""" 27 | 28 | def __init__( 29 | self, 30 | cache_id: str, 31 | enabled: bool = True, 32 | cache_dir: Optional[str] = None, 33 | max_age_hours: int = 24 34 | ): 35 | """Initialize task cache 36 | 37 | Args: 38 | cache_id: Unique cache identifier 39 | enabled: Whether caching is enabled 40 | cache_dir: Cache directory path 41 | max_age_hours: Maximum cache age in hours 42 | """ 43 | self.cache_id = cache_id 44 | self.enabled = enabled 45 | self.max_age_hours = max_age_hours 46 | 47 | # Setup cache directory 48 | if cache_dir: 49 | self.cache_dir = Path(cache_dir) 50 | else: 51 | self.cache_dir = Path.home() / ".midscene" / "cache" 52 | 53 | self.cache_dir.mkdir(parents=True, exist_ok=True) 54 | self.cache_file = self.cache_dir / f"{cache_id}.json" 55 | 56 | # Load existing cache 57 | 
self._cache: Dict[str, CacheEntry] = {} 58 | self._load_cache() 59 | 60 | def _generate_key(self, data: Union[str, Dict, List]) -> str: 61 | """Generate cache key from data 62 | 63 | Args: 64 | data: Data to generate key from 65 | 66 | Returns: 67 | Cache key string 68 | """ 69 | if isinstance(data, str): 70 | content = data 71 | else: 72 | content = json.dumps(data, sort_keys=True, ensure_ascii=False) 73 | 74 | return hashlib.md5(content.encode('utf-8')).hexdigest() 75 | 76 | def _load_cache(self) -> None: 77 | """Load cache from file""" 78 | if not self.enabled or not self.cache_file.exists(): 79 | return 80 | 81 | try: 82 | with open(self.cache_file, 'r', encoding='utf-8') as f: 83 | cache_data = json.load(f) 84 | 85 | for key, entry_data in cache_data.items(): 86 | # Convert datetime strings back to datetime objects 87 | entry_data['timestamp'] = datetime.fromisoformat(entry_data['timestamp']) 88 | if entry_data.get('expires_at'): 89 | entry_data['expires_at'] = datetime.fromisoformat(entry_data['expires_at']) 90 | 91 | self._cache[key] = CacheEntry(**entry_data) 92 | 93 | # Clean expired entries 94 | self._clean_expired() 95 | 96 | logger.debug(f"Loaded {len(self._cache)} cache entries") 97 | 98 | except Exception as e: 99 | logger.warning(f"Failed to load cache: {e}") 100 | self._cache = {} 101 | 102 | def _save_cache(self) -> None: 103 | """Save cache to file""" 104 | if not self.enabled: 105 | return 106 | 107 | try: 108 | cache_data = {} 109 | for key, entry in self._cache.items(): 110 | entry_dict = entry.model_dump() 111 | # Convert datetime objects to strings 112 | entry_dict['timestamp'] = entry.timestamp.isoformat() 113 | if entry.expires_at: 114 | entry_dict['expires_at'] = entry.expires_at.isoformat() 115 | 116 | cache_data[key] = entry_dict 117 | 118 | with open(self.cache_file, 'w', encoding='utf-8') as f: 119 | json.dump(cache_data, f, ensure_ascii=False, indent=2) 120 | 121 | except Exception as e: 122 | logger.warning(f"Failed to save cache: {e}") 123 | 124 | def _clean_expired(self) -> None: 125 | """Clean expired cache entries""" 126 | now = datetime.now() 127 | expired_keys = [] 128 | 129 | for key, entry in self._cache.items(): 130 | # Check explicit expiration 131 | if entry.expires_at and entry.expires_at <= now: 132 | expired_keys.append(key) 133 | continue 134 | 135 | # Check age-based expiration 136 | age = now - entry.timestamp 137 | if age > timedelta(hours=self.max_age_hours): 138 | expired_keys.append(key) 139 | 140 | for key in expired_keys: 141 | del self._cache[key] 142 | 143 | if expired_keys: 144 | logger.debug(f"Cleaned {len(expired_keys)} expired cache entries") 145 | 146 | def get(self, key: str) -> Optional[Any]: 147 | """Get cached data by key 148 | 149 | Args: 150 | key: Cache key 151 | 152 | Returns: 153 | Cached data or None if not found 154 | """ 155 | if not self.enabled: 156 | return None 157 | 158 | entry = self._cache.get(key) 159 | if not entry: 160 | return None 161 | 162 | # Check if expired 163 | now = datetime.now() 164 | if entry.expires_at and entry.expires_at <= now: 165 | del self._cache[key] 166 | return None 167 | 168 | # Check age 169 | age = now - entry.timestamp 170 | if age > timedelta(hours=self.max_age_hours): 171 | del self._cache[key] 172 | return None 173 | 174 | logger.debug(f"Cache hit for key: {key}") 175 | return entry.data 176 | 177 | def put( 178 | self, 179 | key: str, 180 | data: Any, 181 | expires_in_hours: Optional[int] = None, 182 | metadata: Optional[Dict[str, Any]] = None 183 | ) -> None: 184 | """Store 
data in cache 185 | 186 | Args: 187 | key: Cache key 188 | data: Data to cache 189 | expires_in_hours: Custom expiration time in hours 190 | metadata: Additional metadata 191 | """ 192 | if not self.enabled: 193 | return 194 | 195 | now = datetime.now() 196 | expires_at = None 197 | 198 | if expires_in_hours: 199 | expires_at = now + timedelta(hours=expires_in_hours) 200 | 201 | entry = CacheEntry( 202 | key=key, 203 | data=data, 204 | timestamp=now, 205 | expires_at=expires_at, 206 | metadata=metadata or {} 207 | ) 208 | 209 | self._cache[key] = entry 210 | self._save_cache() 211 | 212 | logger.debug(f"Cached data with key: {key}") 213 | 214 | def get_by_data(self, data: Union[str, Dict, List]) -> Optional[Any]: 215 | """Get cached data by input data 216 | 217 | Args: 218 | data: Input data to generate key from 219 | 220 | Returns: 221 | Cached result or None 222 | """ 223 | key = self._generate_key(data) 224 | return self.get(key) 225 | 226 | def put_by_data( 227 | self, 228 | input_data: Union[str, Dict, List], 229 | result_data: Any, 230 | expires_in_hours: Optional[int] = None, 231 | metadata: Optional[Dict[str, Any]] = None 232 | ) -> None: 233 | """Store data in cache by input data 234 | 235 | Args: 236 | input_data: Input data to generate key from 237 | result_data: Result data to cache 238 | expires_in_hours: Custom expiration time in hours 239 | metadata: Additional metadata 240 | """ 241 | key = self._generate_key(input_data) 242 | self.put(key, result_data, expires_in_hours, metadata) 243 | 244 | def match_locate_cache(self, prompt: str) -> Optional[Dict[str, Any]]: 245 | """Match locate operation from cache 246 | 247 | Args: 248 | prompt: Locate prompt 249 | 250 | Returns: 251 | Cached locate result or None 252 | """ 253 | cache_key = f"locate:{self._generate_key(prompt)}" 254 | return self.get(cache_key) 255 | 256 | def store_locate_result( 257 | self, 258 | prompt: str, 259 | result: Dict[str, Any], 260 | expires_in_hours: int = 24 261 | ) -> None: 262 | """Store locate result in cache 263 | 264 | Args: 265 | prompt: Locate prompt 266 | result: Locate result 267 | expires_in_hours: Expiration time in hours 268 | """ 269 | cache_key = f"locate:{self._generate_key(prompt)}" 270 | self.put(cache_key, result, expires_in_hours, {"type": "locate"}) 271 | 272 | def clear(self) -> None: 273 | """Clear all cache entries""" 274 | self._cache.clear() 275 | if self.cache_file.exists(): 276 | self.cache_file.unlink() 277 | logger.info("Cache cleared") 278 | 279 | def get_stats(self) -> Dict[str, Any]: 280 | """Get cache statistics 281 | 282 | Returns: 283 | Cache statistics 284 | """ 285 | now = datetime.now() 286 | total_entries = len(self._cache) 287 | 288 | expired_count = 0 289 | for entry in self._cache.values(): 290 | if entry.expires_at and entry.expires_at <= now: 291 | expired_count += 1 292 | elif (now - entry.timestamp) > timedelta(hours=self.max_age_hours): 293 | expired_count += 1 294 | 295 | return { 296 | "total_entries": total_entries, 297 | "expired_entries": expired_count, 298 | "cache_file": str(self.cache_file), 299 | "cache_size_mb": self.cache_file.stat().st_size / 1024 / 1024 if self.cache_file.exists() else 0, 300 | "enabled": self.enabled 301 | } -------------------------------------------------------------------------------- /wiki/核心概念/Agent核心控制器.md: -------------------------------------------------------------------------------- 1 | # Agent 核心控制器 2 | 3 | Agent 是 Midscene Python 的核心控制器,为用户提供统一的自动化操作接口。它充当用户代码与底层平台之间的桥梁,通过 AI 理解用户意图并执行相应的操作。 4 | 5 | ## 🎯 设计理念 6 | 
7 | ### 统一接口设计 8 | Agent 为不同平台(Web、Android)提供完全一致的编程接口,用户无需学习不同平台的特定 API: 9 | 10 | ```python 11 | # Web 和 Android 使用相同的接口 12 | web_agent = Agent(selenium_page) 13 | android_agent = Agent(android_device) 14 | 15 | # 相同的操作方法 16 | await web_agent.ai_action("点击登录按钮") 17 | await android_agent.ai_action("点击登录按钮") 18 | ``` 19 | 20 | ### AI 驱动的智能操作 21 | Agent 将自然语言指令转换为具体的操作步骤,让自动化变得更加直观: 22 | 23 | ```python 24 | # 传统方式需要精确的选择器 25 | element = driver.find_element(By.CSS_SELECTOR, "#login-form button[type='submit']") 26 | element.click() 27 | 28 | # Agent 方式使用自然语言 29 | await agent.ai_action("点击登录表单的提交按钮") 30 | ``` 31 | 32 | ## 🏗️ 架构设计 33 | 34 | ### 核心组件 35 | 36 | ```mermaid 37 | graph TB 38 | A[Agent] --> B[TaskExecutor] 39 | A --> C[Insight Engine] 40 | A --> D[AI Service] 41 | A --> E[Platform Interface] 42 | 43 | B --> C 44 | B --> E 45 | C --> D 46 | 47 | subgraph "Agent 核心" 48 | A 49 | B 50 | end 51 | 52 | subgraph "AI 理解层" 53 | C 54 | D 55 | end 56 | 57 | subgraph "平台抽象层" 58 | E 59 | end 60 | ``` 61 | 62 | ### Agent 类结构 63 | 64 | ```python 65 | class Agent: 66 | """Core Agent class that orchestrates AI model and device interactions""" 67 | 68 | def __init__( 69 | self, 70 | interface: AbstractInterface, 71 | options: Optional[AgentOptions] = None 72 | ): 73 | self.interface = interface # 平台接口 74 | self.options = options or AgentOptions() # 配置选项 75 | self.ai_service = AIModelService() # AI 服务 76 | self.insight = Insight(...) # UI 理解引擎 77 | self.task_executor = TaskExecutor(...) # 任务执行器 78 | ``` 79 | 80 | ## 🎮 主要功能 81 | 82 | ### 1. AI 驱动的操作 (ai_action) 83 | 84 | `ai_action` 是 Agent 最核心的方法,支持各种自然语言驱动的操作: 85 | 86 | ```python 87 | # 基础交互 88 | await agent.ai_action("点击登录按钮") 89 | await agent.ai_action("在用户名框输入 'admin'") 90 | await agent.ai_action("选择下拉菜单中的第二个选项") 91 | 92 | # 复杂操作 93 | await agent.ai_action("滚动到页面底部并点击加载更多按钮") 94 | await agent.ai_action("在搜索框输入'Python'并按回车搜索") 95 | 96 | # 条件操作 97 | await agent.ai_action("如果页面显示错误信息,点击确定按钮") 98 | ``` 99 | 100 | #### 工作流程 101 | 102 | 1. **指令解析**: 将自然语言转换为操作意图 103 | 2. **页面分析**: 获取当前页面的截图和上下文信息 104 | 3. **计划生成**: AI 生成详细的执行计划 105 | 4. **步骤执行**: 逐步执行计划中的每个操作 106 | 5. **结果验证**: 验证操作是否成功完成 107 | 108 | ```python 109 | async def ai_action(self, prompt: TUserPrompt, **kwargs) -> None: 110 | """Execute AI-driven action""" 111 | self._ensure_not_destroyed() 112 | 113 | # 委托给任务执行器 114 | result = await self.task_executor.execute_ai_action(prompt, **kwargs) 115 | 116 | if not result.success: 117 | raise Exception(f"Action failed: {result.error}") 118 | ``` 119 | 120 | ### 2. 智能元素定位 (ai_locate) 121 | 122 | 精确定位页面元素,支持各种描述方式: 123 | 124 | ```python 125 | # 基础定位 126 | login_btn = await agent.ai_locate("登录按钮") 127 | search_box = await agent.ai_locate("搜索输入框") 128 | 129 | # 描述性定位 130 | submit_btn = await agent.ai_locate("蓝色的提交按钮") 131 | user_avatar = await agent.ai_locate("页面右上角的用户头像") 132 | 133 | # 相对定位 134 | next_btn = await agent.ai_locate("位于分页控件中的下一页按钮") 135 | ``` 136 | 137 | #### 定位策略 138 | 139 | Agent 使用多种策略进行元素定位: 140 | 141 | 1. **视觉识别**: 基于截图进行 AI 视觉识别 142 | 2. **语义理解**: 理解元素的功能和上下文 143 | 3. **多重验证**: 结合多种信息确保定位准确性 144 | 4. **容错机制**: 支持页面变化和布局调整 145 | 146 | ### 3. 
数据提取 (ai_extract) 147 | 148 | 从页面提取结构化数据: 149 | 150 | ```python 151 | # 提取单个对象 152 | user_info = await agent.ai_extract({ 153 | "name": "用户姓名", 154 | "email": "邮箱地址", 155 | "role": "用户角色" 156 | }) 157 | 158 | # 提取列表数据 159 | products = await agent.ai_extract({ 160 | "products": [ 161 | { 162 | "name": "商品名称", 163 | "price": "价格", 164 | "rating": "评分", 165 | "in_stock": "是否有货" 166 | } 167 | ] 168 | }) 169 | 170 | # 复杂嵌套结构 171 | order_data = await agent.ai_extract({ 172 | "order_id": "订单号", 173 | "customer": { 174 | "name": "客户姓名", 175 | "address": "送货地址" 176 | }, 177 | "items": [ 178 | { 179 | "product": "商品名称", 180 | "quantity": "数量", 181 | "price": "单价" 182 | } 183 | ], 184 | "total": "总金额" 185 | }) 186 | ``` 187 | 188 | ### 4. 智能断言 (ai_assert) 189 | 190 | 验证页面状态和内容: 191 | 192 | ```python 193 | # 状态验证 194 | await agent.ai_assert("用户已成功登录") 195 | await agent.ai_assert("页面显示错误信息") 196 | await agent.ai_assert("表单验证通过") 197 | 198 | # 内容验证 199 | await agent.ai_assert("搜索结果包含'Python 教程'") 200 | await agent.ai_assert("购物车中有 3 件商品") 201 | await agent.ai_assert("订单状态为已发货") 202 | 203 | # 条件验证 204 | await agent.ai_assert("如果是新用户,显示欢迎向导") 205 | ``` 206 | 207 | ## ⚙️ 配置选项 208 | 209 | ### AgentOptions 配置 210 | 211 | ```python 212 | from midscene.core import AgentOptions 213 | 214 | options = AgentOptions( 215 | # 超时设置 216 | timeout=30, # 操作超时时间(秒) 217 | 218 | # 重试机制 219 | retry_count=3, # 失败重试次数 220 | retry_delay=1.0, # 重试间隔(秒) 221 | 222 | # 调试选项 223 | screenshot_on_error=True, # 错误时自动截图 224 | save_execution_logs=True, # 保存执行日志 225 | 226 | # 性能优化 227 | cache_enabled=True, # 启用智能缓存 228 | parallel_execution=False, # 并行执行(实验性) 229 | 230 | # AI 模型设置 231 | model_temperature=0.1, # AI 响应随机性 232 | max_tokens=1000, # 最大 token 数 233 | ) 234 | 235 | agent = Agent(page, options=options) 236 | ``` 237 | 238 | ### 运行时配置 239 | 240 | ```python 241 | # 临时修改超时时间 242 | await agent.ai_action("点击按钮", timeout=60) 243 | 244 | # 禁用缓存的单次操作 245 | await agent.ai_extract(schema, use_cache=False) 246 | 247 | # 自定义重试策略 248 | await agent.ai_action("提交表单", retry_count=5, retry_delay=2.0) 249 | ``` 250 | 251 | ## 🔄 生命周期管理 252 | 253 | ### 初始化和销毁 254 | 255 | ```python 256 | # 方式1: 手动管理 257 | agent = Agent(page) 258 | try: 259 | await agent.ai_action("执行操作") 260 | finally: 261 | await agent.destroy() 262 | 263 | # 方式2: 上下文管理器(推荐) 264 | async with Agent(page) as agent: 265 | await agent.ai_action("执行操作") 266 | # 自动调用 destroy() 267 | ``` 268 | 269 | ### 状态冻结 270 | 271 | ```python 272 | # 冻结当前页面状态(用于调试) 273 | await agent.freeze() 274 | 275 | # 在冻结状态下进行多次操作 276 | await agent.ai_extract(schema1) 277 | await agent.ai_extract(schema2) 278 | 279 | # 解除冻结 280 | await agent.unfreeze() 281 | ``` 282 | 283 | ## 🔧 高级特性 284 | 285 | ### 1. 自定义 AI 模型 286 | 287 | ```python 288 | from midscene.core.ai_model import AIModelConfig 289 | 290 | # 自定义模型配置 291 | ai_config = AIModelConfig( 292 | provider="openai", 293 | model="gpt-4-vision-preview", 294 | temperature=0.0, 295 | max_tokens=2000, 296 | api_key="your_api_key" 297 | ) 298 | 299 | agent = Agent(page, ai_config=ai_config) 300 | ``` 301 | 302 | ### 2. 操作链式调用 303 | 304 | ```python 305 | # 链式操作 306 | await (agent 307 | .ai_action("点击登录") 308 | .ai_action("输入用户名") 309 | .ai_action("输入密码") 310 | .ai_action("点击提交")) 311 | ``` 312 | 313 | ### 3. 
事件监听 314 | 315 | ```python 316 | # 操作前后的钩子函数 317 | @agent.on_before_action 318 | async def before_action(prompt: str, context: UIContext): 319 | print(f"即将执行: {prompt}") 320 | 321 | @agent.on_after_action 322 | async def after_action(prompt: str, result: ExecutionResult): 323 | print(f"执行完成: {prompt}, 结果: {result.success}") 324 | ``` 325 | 326 | ### 4. 批量操作 327 | 328 | ```python 329 | # 批量执行多个操作 330 | actions = [ 331 | "点击菜单按钮", 332 | "选择设置选项", 333 | "修改用户信息", 334 | "保存更改" 335 | ] 336 | 337 | results = await agent.batch_execute(actions) 338 | ``` 339 | 340 | ## 📊 性能优化 341 | 342 | ### 智能缓存 343 | 344 | Agent 内置智能缓存机制,避免重复的 AI 调用: 345 | 346 | ```python 347 | # 首次调用会请求 AI 模型 348 | result1 = await agent.ai_extract(schema) 349 | 350 | # 相同 schema 和页面状态会使用缓存 351 | result2 = await agent.ai_extract(schema) # 使用缓存,更快 352 | 353 | # 强制禁用缓存 354 | result3 = await agent.ai_extract(schema, use_cache=False) 355 | ``` 356 | 357 | ### 并发控制 358 | 359 | ```python 360 | # 控制并发数量,避免过多 AI 请求 361 | agent.set_concurrency_limit(3) 362 | 363 | # 异步执行多个独立操作 364 | import asyncio 365 | 366 | tasks = [ 367 | agent.ai_extract(schema1), 368 | agent.ai_extract(schema2), 369 | agent.ai_extract(schema3) 370 | ] 371 | 372 | results = await asyncio.gather(*tasks) 373 | ``` 374 | 375 | ## 🚨 错误处理 376 | 377 | ### 异常类型 378 | 379 | ```python 380 | from midscene.core.exceptions import ( 381 | AgentError, 382 | ElementNotFoundError, 383 | OperationTimeoutError, 384 | AIServiceError 385 | ) 386 | 387 | try: 388 | await agent.ai_action("点击不存在的按钮") 389 | except ElementNotFoundError as e: 390 | print(f"元素未找到: {e}") 391 | except OperationTimeoutError as e: 392 | print(f"操作超时: {e}") 393 | except AIServiceError as e: 394 | print(f"AI 服务错误: {e}") 395 | ``` 396 | 397 | ### 重试机制 398 | 399 | ```python 400 | # 自动重试配置 401 | options = AgentOptions( 402 | retry_count=3, 403 | retry_delay=1.0, 404 | retry_on_errors=[ElementNotFoundError, OperationTimeoutError] 405 | ) 406 | 407 | # 手动重试 408 | from midscene.shared.retry import retry_async 409 | 410 | @retry_async(max_attempts=3, delay=1.0) 411 | async def robust_action(): 412 | await agent.ai_action("点击可能不稳定的元素") 413 | ``` 414 | 415 | ## 🔍 调试和诊断 416 | 417 | ### 详细日志 418 | 419 | ```python 420 | import logging 421 | from midscene.shared.logger import setup_logger 422 | 423 | # 启用详细日志 424 | setup_logger(level=logging.DEBUG) 425 | 426 | # 操作执行时会输出详细信息 427 | await agent.ai_action("点击按钮") 428 | ``` 429 | 430 | ### 执行报告 431 | 432 | ```python 433 | # 生成详细的执行报告 434 | report = await agent.generate_report() 435 | print(f"总操作数: {report.total_actions}") 436 | print(f"成功率: {report.success_rate}") 437 | print(f"平均执行时间: {report.avg_execution_time}") 438 | 439 | # 保存报告到文件 440 | await report.save_to_file("execution_report.html") 441 | ``` 442 | 443 | ### 手动调试 444 | 445 | ```python 446 | # 获取当前页面状态 447 | context = await agent.get_current_context() 448 | print(f"页面标题: {context.page_title}") 449 | print(f"页面 URL: {context.url}") 450 | 451 | # 手动截图 452 | screenshot = await agent.screenshot() 453 | with open("debug.png", "wb") as f: 454 | f.write(screenshot) 455 | 456 | # 获取页面元素信息 457 | elements = await agent.get_all_elements() 458 | for element in elements: 459 | print(f"元素: {element.tag_name}, 文本: {element.text}") 460 | ``` 461 | 462 | ## 🎯 最佳实践 463 | 464 | ### 1. 清晰的操作描述 465 | ```python 466 | # ❌ 模糊的描述 467 | await agent.ai_action("点击按钮") 468 | 469 | # ✅ 具体的描述 470 | await agent.ai_action("点击页面右上角的蓝色登录按钮") 471 | ``` 472 | 473 | ### 2. 
合理的超时设置 474 | ```python 475 | # 根据操作复杂度设置超时 476 | await agent.ai_action("点击按钮", timeout=10) # 简单操作 477 | await agent.ai_action("等待页面加载完成", timeout=30) # 复杂操作 478 | ``` 479 | 480 | ### 3. 错误处理 481 | ```python 482 | # 优雅的错误处理 483 | try: 484 | await agent.ai_action("尝试点击可能不存在的按钮") 485 | except ElementNotFoundError: 486 | # 执行备选方案 487 | await agent.ai_action("点击替代按钮") 488 | ``` 489 | 490 | ### 4. 资源管理 491 | ```python 492 | # 使用上下文管理器确保资源释放 493 | async with Agent(page) as agent: 494 | await agent.ai_action("执行操作") 495 | # 自动清理资源 496 | ``` 497 | 498 | ## 🔗 相关文档 499 | 500 | - **API 参考**: [Agent API 完整文档](../API参考/Agent-API.md) 501 | - **UI 理解**: [Insight UI理解引擎](Insight-UI理解引擎.md) 502 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md) 503 | - **示例代码**: [基础示例](../示例和教程/基础示例.md) 504 | 505 | --- 506 | 507 | Agent 是 Midscene Python 的核心,掌握了 Agent 的使用就掌握了框架的精髓。继续探索其他核心概念来深入理解整个框架的工作原理! -------------------------------------------------------------------------------- /midscene/web/bridge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bridge mode implementation for Chrome extension integration 3 | """ 4 | 5 | import asyncio 6 | import json 7 | import websockets 8 | from typing import Any, Dict, List, Optional 9 | from loguru import logger 10 | 11 | from ..core.types import ( 12 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree, 13 | Size, Rect, Point, NodeType 14 | ) 15 | 16 | 17 | class BridgeElement(BaseElement): 18 | """Bridge element wrapper""" 19 | 20 | def __init__(self, bridge: 'BridgeWebPage', **kwargs): 21 | self._bridge = bridge 22 | super().__init__(**kwargs) 23 | 24 | async def tap(self) -> None: 25 | """Click this element""" 26 | try: 27 | await self._bridge.send_command({ 28 | "action": "click", 29 | "target": {"id": self.id} 30 | }) 31 | except Exception as e: 32 | logger.error(f"Failed to click element: {e}") 33 | raise 34 | 35 | async def input_text(self, text: str) -> None: 36 | """Input text to this element""" 37 | try: 38 | await self._bridge.send_command({ 39 | "action": "input", 40 | "target": {"id": self.id}, 41 | "text": text 42 | }) 43 | except Exception as e: 44 | logger.error(f"Failed to input text: {e}") 45 | raise 46 | 47 | 48 | class BridgeWebPage(AbstractInterface): 49 | """Bridge mode page interface for Chrome extension communication""" 50 | 51 | def __init__(self, websocket_url: str = "ws://localhost:8765"): 52 | """Initialize bridge connection 53 | 54 | Args: 55 | websocket_url: WebSocket server URL 56 | """ 57 | self.websocket_url = websocket_url 58 | self.websocket: Optional[websockets.WebSocketServerProtocol] = None 59 | self._command_id = 0 60 | self._response_futures: Dict[int, asyncio.Future] = {} 61 | 62 | @classmethod 63 | async def create( 64 | cls, 65 | websocket_url: str = "ws://localhost:8765", 66 | wait_for_connection: bool = True 67 | ) -> 'BridgeWebPage': 68 | """Create bridge connection 69 | 70 | Args: 71 | websocket_url: WebSocket server URL 72 | wait_for_connection: Wait for extension to connect 73 | 74 | Returns: 75 | BridgeWebPage instance 76 | """ 77 | bridge = cls(websocket_url) 78 | 79 | if wait_for_connection: 80 | await bridge.connect() 81 | 82 | return bridge 83 | 84 | async def connect(self, timeout: float = 30.0) -> None: 85 | """Connect to Chrome extension""" 86 | try: 87 | logger.info(f"Waiting for extension connection on {self.websocket_url}") 88 | 89 | # Start WebSocket server and wait for extension to connect 90 | server = await 
websockets.serve( 91 | self._handle_connection, 92 | "localhost", 93 | 8765 94 | ) 95 | 96 | # Wait for connection with timeout 97 | start_time = asyncio.get_event_loop().time() 98 | while not self.websocket and (asyncio.get_event_loop().time() - start_time) < timeout: 99 | await asyncio.sleep(0.1) 100 | 101 | if not self.websocket: 102 | raise TimeoutError("Extension connection timeout") 103 | 104 | logger.info("Extension connected successfully") 105 | 106 | except Exception as e: 107 | logger.error(f"Failed to connect to extension: {e}") 108 | raise 109 | 110 | async def _handle_connection(self, websocket, path): 111 | """Handle WebSocket connection from extension""" 112 | self.websocket = websocket 113 | logger.info("Extension connected") 114 | 115 | try: 116 | async for message in websocket: 117 | await self._handle_message(message) 118 | except websockets.exceptions.ConnectionClosed: 119 | logger.info("Extension disconnected") 120 | self.websocket = None 121 | except Exception as e: 122 | logger.error(f"WebSocket error: {e}") 123 | 124 | async def _handle_message(self, message: str) -> None: 125 | """Handle message from extension""" 126 | try: 127 | data = json.loads(message) 128 | 129 | if "id" in data and data["id"] in self._response_futures: 130 | # Response to command 131 | future = self._response_futures.pop(data["id"]) 132 | if not future.done(): 133 | future.set_result(data) 134 | else: 135 | # Unsolicited message from extension 136 | logger.debug(f"Received message: {data}") 137 | 138 | except Exception as e: 139 | logger.error(f"Failed to handle message: {e}") 140 | 141 | async def send_command(self, command: Dict[str, Any]) -> Dict[str, Any]: 142 | """Send command to extension and wait for response""" 143 | if not self.websocket: 144 | raise RuntimeError("Extension not connected") 145 | 146 | command_id = self._command_id 147 | self._command_id += 1 148 | 149 | command["id"] = command_id 150 | 151 | # Create future for response 152 | future = asyncio.Future() 153 | self._response_futures[command_id] = future 154 | 155 | try: 156 | # Send command 157 | await self.websocket.send(json.dumps(command)) 158 | 159 | # Wait for response 160 | response = await asyncio.wait_for(future, timeout=30.0) 161 | 162 | if response.get("error"): 163 | raise Exception(f"Command failed: {response['error']}") 164 | 165 | return response 166 | 167 | except asyncio.TimeoutError: 168 | self._response_futures.pop(command_id, None) 169 | raise TimeoutError("Command timeout") 170 | except Exception as e: 171 | self._response_futures.pop(command_id, None) 172 | raise 173 | 174 | @property 175 | def interface_type(self) -> InterfaceType: 176 | """Get interface type""" 177 | return InterfaceType.WEB 178 | 179 | async def get_context(self) -> UIContext: 180 | """Get current UI context""" 181 | try: 182 | response = await self.send_command({"action": "getContext"}) 183 | 184 | # Parse context data 185 | context_data = response["data"] 186 | 187 | # Convert to UIContext 188 | screenshot_base64 = context_data["screenshot"] 189 | size = Size(**context_data["size"]) 190 | 191 | elements = [] 192 | for elem_data in context_data["elements"]: 193 | rect = Rect(**elem_data["rect"]) 194 | node_type = NodeType(elem_data.get("nodeType", "other")) 195 | 196 | element = BridgeElement( 197 | bridge=self, 198 | id=elem_data["id"], 199 | content=elem_data["content"], 200 | rect=rect, 201 | center=tuple(elem_data["center"]), 202 | node_type=node_type, 203 | attributes=elem_data.get("attributes", {}), 204 | 
is_visible=elem_data.get("isVisible", True) 205 | ) 206 | elements.append(element) 207 | 208 | # Build tree 209 | tree_data = context_data.get("tree", {}) 210 | tree = self._build_tree_from_data(tree_data) 211 | 212 | return UIContext( 213 | screenshot_base64=screenshot_base64, 214 | size=size, 215 | content=elements, 216 | tree=tree 217 | ) 218 | 219 | except Exception as e: 220 | logger.error(f"Failed to get context: {e}") 221 | raise 222 | 223 | async def action_space(self) -> List[str]: 224 | """Get available actions""" 225 | return [ 226 | "tap", "click", "double_click", "right_click", 227 | "input", "type", "clear", 228 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right", 229 | "hover", "drag", "key_press", "navigate" 230 | ] 231 | 232 | async def tap(self, x: float, y: float) -> None: 233 | """Tap at coordinates""" 234 | try: 235 | await self.send_command({ 236 | "action": "tap", 237 | "coordinates": {"x": x, "y": y} 238 | }) 239 | except Exception as e: 240 | logger.error(f"Failed to tap at ({x}, {y}): {e}") 241 | raise 242 | 243 | async def input_text(self, text: str) -> None: 244 | """Input text to focused element""" 245 | try: 246 | await self.send_command({ 247 | "action": "inputText", 248 | "text": text 249 | }) 250 | except Exception as e: 251 | logger.error(f"Failed to input text: {e}") 252 | raise 253 | 254 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 255 | """Scroll in direction""" 256 | try: 257 | await self.send_command({ 258 | "action": "scroll", 259 | "direction": direction, 260 | "distance": distance or 500 261 | }) 262 | except Exception as e: 263 | logger.error(f"Failed to scroll {direction}: {e}") 264 | raise 265 | 266 | async def navigate_to(self, url: str) -> None: 267 | """Navigate to URL""" 268 | try: 269 | await self.send_command({ 270 | "action": "navigate", 271 | "url": url 272 | }) 273 | except Exception as e: 274 | logger.error(f"Failed to navigate to {url}: {e}") 275 | raise 276 | 277 | def _build_tree_from_data(self, tree_data: Dict[str, Any]) -> UITree: 278 | """Build UITree from extension data""" 279 | if not tree_data: 280 | # Return minimal tree 281 | root_node = UINode( 282 | id="root", 283 | content="", 284 | rect=Rect(left=0, top=0, width=1920, height=1080), 285 | center=(960, 540), 286 | node_type=NodeType.CONTAINER, 287 | attributes={}, 288 | is_visible=True, 289 | children=[] 290 | ) 291 | return UITree(node=root_node, children=[]) 292 | 293 | # Convert tree data to UINode 294 | node_data = tree_data["node"] 295 | node = UINode( 296 | id=node_data["id"], 297 | content=node_data["content"], 298 | rect=Rect(**node_data["rect"]), 299 | center=tuple(node_data["center"]), 300 | node_type=NodeType(node_data.get("nodeType", "other")), 301 | attributes=node_data.get("attributes", {}), 302 | is_visible=node_data.get("isVisible", True), 303 | children=[] 304 | ) 305 | 306 | # Build children recursively 307 | children = [] 308 | for child_data in tree_data.get("children", []): 309 | child_tree = self._build_tree_from_data(child_data) 310 | children.append(child_tree) 311 | 312 | return UITree(node=node, children=children) 313 | 314 | async def close(self) -> None: 315 | """Close bridge connection""" 316 | if self.websocket: 317 | await self.websocket.close() 318 | self.websocket = None -------------------------------------------------------------------------------- /midscene/core/ai_model/providers.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI 
Model Providers - Implementations for different AI services 3 | """ 4 | 5 | import json 6 | from typing import Any, Dict, List, Optional, Type 7 | 8 | import httpx 9 | from loguru import logger 10 | from pydantic import BaseModel 11 | 12 | from .service import AIProvider, AIModelConfig, parse_json_response, create_usage_info 13 | 14 | 15 | class OpenAIProvider(AIProvider): 16 | """OpenAI API provider""" 17 | 18 | async def call( 19 | self, 20 | messages: List[Dict[str, Any]], 21 | config: AIModelConfig, 22 | response_schema: Optional[Type[BaseModel]] = None, 23 | **kwargs 24 | ) -> Dict[str, Any]: 25 | """Call OpenAI API""" 26 | headers = { 27 | "Authorization": f"Bearer {config.api_key}", 28 | "Content-Type": "application/json" 29 | } 30 | 31 | payload = { 32 | "model": config.model, 33 | "messages": messages, 34 | "max_tokens": config.max_tokens, 35 | "temperature": config.temperature 36 | } 37 | 38 | # Support structured output for compatible models 39 | if response_schema and "gpt-4" in config.model: 40 | payload["response_format"] = { 41 | "type": "json_schema", 42 | "json_schema": { 43 | "name": response_schema.__name__, 44 | "schema": response_schema.model_json_schema() 45 | } 46 | } 47 | 48 | base_url = config.base_url or "https://api.openai.com" 49 | url = f"{base_url}/v1/chat/completions" 50 | 51 | async with httpx.AsyncClient(timeout=config.timeout) as client: 52 | response = await client.post(url, headers=headers, json=payload) 53 | response.raise_for_status() 54 | 55 | result = response.json() 56 | content = result['choices'][0]['message']['content'] 57 | 58 | if response_schema: 59 | try: 60 | parsed = parse_json_response(content) 61 | validated = response_schema(**parsed) 62 | return { 63 | "content": validated.model_dump(), 64 | "usage": create_usage_info(result.get('usage', {})) 65 | } 66 | except Exception as e: 67 | logger.warning(f"Failed to parse structured response: {e}") 68 | return { 69 | "content": {"error": str(e), "raw_content": content}, 70 | "usage": create_usage_info(result.get('usage', {})) 71 | } 72 | 73 | return { 74 | "content": content, 75 | "usage": create_usage_info(result.get('usage', {})) 76 | } 77 | 78 | 79 | class AnthropicProvider(AIProvider): 80 | """Anthropic Claude API provider""" 81 | 82 | async def call( 83 | self, 84 | messages: List[Dict[str, Any]], 85 | config: AIModelConfig, 86 | response_schema: Optional[Type[BaseModel]] = None, 87 | **kwargs 88 | ) -> Dict[str, Any]: 89 | """Call Anthropic API""" 90 | headers = { 91 | "x-api-key": config.api_key, 92 | "Content-Type": "application/json", 93 | "anthropic-version": "2023-06-01" 94 | } 95 | 96 | # Convert messages format for Anthropic 97 | system_message = "" 98 | anthropic_messages = [] 99 | 100 | for msg in messages: 101 | if msg["role"] == "system": 102 | system_message = msg["content"] 103 | else: 104 | anthropic_messages.append(msg) 105 | 106 | payload = { 107 | "model": config.model, 108 | "max_tokens": config.max_tokens, 109 | "temperature": config.temperature, 110 | "messages": anthropic_messages 111 | } 112 | 113 | if system_message: 114 | payload["system"] = system_message 115 | 116 | base_url = config.base_url or "https://api.anthropic.com" 117 | url = f"{base_url}/v1/messages" 118 | 119 | async with httpx.AsyncClient(timeout=config.timeout) as client: 120 | response = await client.post(url, headers=headers, json=payload) 121 | response.raise_for_status() 122 | 123 | result = response.json() 124 | content = result['content'][0]['text'] 125 | 126 | if response_schema: 127 | try: 
128 | parsed = parse_json_response(content) 129 | validated = response_schema(**parsed) 130 | return { 131 | "content": validated.model_dump(), 132 | "usage": create_usage_info(result.get('usage', {})) 133 | } 134 | except Exception as e: 135 | logger.warning(f"Failed to parse structured response: {e}") 136 | return { 137 | "content": {"error": str(e), "raw_content": content}, 138 | "usage": create_usage_info(result.get('usage', {})) 139 | } 140 | 141 | return { 142 | "content": content, 143 | "usage": create_usage_info(result.get('usage', {})) 144 | } 145 | 146 | 147 | class QwenProvider(AIProvider): 148 | """Alibaba Qwen API provider""" 149 | 150 | async def call( 151 | self, 152 | messages: List[Dict[str, Any]], 153 | config: AIModelConfig, 154 | response_schema: Optional[Type[BaseModel]] = None, 155 | **kwargs 156 | ) -> Dict[str, Any]: 157 | """Call Qwen API""" 158 | try: 159 | import dashscope 160 | except ImportError: 161 | raise ImportError("dashscope is required for Qwen provider. Install with: pip install dashscope") 162 | 163 | dashscope.api_key = config.api_key 164 | 165 | # Convert messages for Qwen 166 | qwen_messages = [] 167 | for msg in messages: 168 | qwen_messages.append({ 169 | "role": msg["role"], 170 | "content": msg["content"] 171 | }) 172 | 173 | response = await dashscope.Generation.acall( 174 | model=config.model, 175 | messages=qwen_messages, 176 | max_tokens=config.max_tokens, 177 | temperature=config.temperature, 178 | result_format='message' 179 | ) 180 | 181 | if response.status_code == 200: 182 | content = response.output.choices[0]['message']['content'] 183 | 184 | if response_schema: 185 | try: 186 | parsed = parse_json_response(content) 187 | validated = response_schema(**parsed) 188 | return { 189 | "content": validated.model_dump(), 190 | "usage": create_usage_info(response.usage) 191 | } 192 | except Exception as e: 193 | logger.warning(f"Failed to parse structured response: {e}") 194 | return { 195 | "content": {"error": str(e), "raw_content": content}, 196 | "usage": create_usage_info(response.usage) 197 | } 198 | 199 | return { 200 | "content": content, 201 | "usage": create_usage_info(response.usage) 202 | } 203 | else: 204 | raise Exception(f"Qwen API error: {response.message}") 205 | 206 | 207 | class GeminiProvider(AIProvider): 208 | """Google Gemini API provider""" 209 | 210 | async def call( 211 | self, 212 | messages: List[Dict[str, Any]], 213 | config: AIModelConfig, 214 | response_schema: Optional[Type[BaseModel]] = None, 215 | **kwargs 216 | ) -> Dict[str, Any]: 217 | """Call Gemini API""" 218 | try: 219 | import google.generativeai as genai 220 | except ImportError: 221 | raise ImportError("google-generativeai is required for Gemini provider. 
Install with: pip install google-generativeai") 222 | 223 | genai.configure(api_key=config.api_key) 224 | model = genai.GenerativeModel(config.model) 225 | 226 | # Convert messages format for Gemini 227 | gemini_messages = [] 228 | for msg in messages: 229 | if msg["role"] == "system": 230 | # Gemini doesn't have system role, prepend to first user message 231 | continue 232 | elif msg["role"] == "user": 233 | if isinstance(msg["content"], list): 234 | # Handle multimodal content 235 | parts = [] 236 | for part in msg["content"]: 237 | if part["type"] == "text": 238 | parts.append(part["text"]) 239 | elif part["type"] == "image_url": 240 | # Convert base64 image to Gemini format 241 | import base64 242 | import io 243 | from PIL import Image 244 | 245 | image_data = part["image_url"]["url"] 246 | if image_data.startswith("data:image"): 247 | image_data = image_data.split(",")[1] 248 | 249 | image_bytes = base64.b64decode(image_data) 250 | image = Image.open(io.BytesIO(image_bytes)) 251 | parts.append(image) 252 | 253 | gemini_messages.append({"role": "user", "parts": parts}) 254 | else: 255 | gemini_messages.append({"role": "user", "parts": [msg["content"]]}) 256 | elif msg["role"] == "assistant": 257 | gemini_messages.append({"role": "model", "parts": [msg["content"]]}) 258 | 259 | generation_config = genai.types.GenerationConfig( 260 | max_output_tokens=config.max_tokens, 261 | temperature=config.temperature 262 | ) 263 | 264 | response = await model.generate_content_async( 265 | gemini_messages, 266 | generation_config=generation_config 267 | ) 268 | 269 | content = response.text 270 | 271 | if response_schema: 272 | try: 273 | parsed = parse_json_response(content) 274 | validated = response_schema(**parsed) 275 | return { 276 | "content": validated.model_dump(), 277 | "usage": create_usage_info({ 278 | "prompt_tokens": response.usage_metadata.prompt_token_count, 279 | "completion_tokens": response.usage_metadata.candidates_token_count, 280 | "total_tokens": response.usage_metadata.total_token_count 281 | }) 282 | } 283 | except Exception as e: 284 | logger.warning(f"Failed to parse structured response: {e}") 285 | return { 286 | "content": {"error": str(e), "raw_content": content}, 287 | "usage": create_usage_info({ 288 | "prompt_tokens": response.usage_metadata.prompt_token_count, 289 | "completion_tokens": response.usage_metadata.candidates_token_count, 290 | "total_tokens": response.usage_metadata.total_token_count 291 | }) 292 | } 293 | 294 | return { 295 | "content": content, 296 | "usage": create_usage_info({ 297 | "prompt_tokens": response.usage_metadata.prompt_token_count, 298 | "completion_tokens": response.usage_metadata.candidates_token_count, 299 | "total_tokens": response.usage_metadata.total_token_count 300 | }) 301 | } -------------------------------------------------------------------------------- /wiki/核心概念/Insight-UI理解引擎.md: -------------------------------------------------------------------------------- 1 | # Insight UI理解引擎 2 | 3 | Insight 是 Midscene Python 的 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。它是连接 AI 模型与实际操作的核心组件。 4 | 5 | ## 🧠 设计理念 6 | 7 | ### AI 驱动的视觉理解 8 | Insight 利用先进的视觉语言模型(VLM)来理解页面内容: 9 | 10 | ```python 11 | # Insight 不依赖传统的选择器 12 | # 而是通过 AI 视觉理解来定位元素 13 | element = await insight.locate("蓝色的登录按钮") 14 | element = await insight.locate("位于页面右上角的搜索图标") 15 | ``` 16 | 17 | ### 上下文感知决策 18 | Insight 结合页面状态、用户意图和历史操作来做出智能决策: 19 | 20 | ```python 21 | # 同样的描述在不同上下文下可能指向不同元素 22 | await insight.locate("确定按钮") # 对话框中的确定按钮 23 | await insight.locate("确定按钮") # 表单中的确定按钮 24 | ``` 
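
把这一点落到调用层面:当以动态函数作为上下文提供者时(见下文"上下文管理"一节),每次定位都会基于最新的页面截图进行推理,因此同一段描述在不同页面状态下会命中不同的元素。下面是一个简单示意,触发对话框的业务操作由调用方完成,此处省略:

```python
# 仅为示意:页面状态不同,同一描述会被解析到不同元素
confirm_in_form = await insight.locate("确定按钮")    # 页面上只有表单时 → 表单中的确定按钮

# …… 某个业务操作触发了确认对话框(此处省略) ……

confirm_in_dialog = await insight.locate("确定按钮")  # 对话框出现后 → 对话框中的确定按钮
```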
25 | 26 | ## 🏗️ 架构设计 27 | 28 | ### 核心组件 29 | 30 | ```mermaid 31 | graph TB 32 | A[Insight Engine] --> B[Context Provider] 33 | A --> C[AI Model Service] 34 | A --> D[Response Processor] 35 | A --> E[Dump Subscribers] 36 | 37 | B --> F[UI Context] 38 | C --> G[Multi-Model Support] 39 | D --> H[Element Processing] 40 | D --> I[Data Validation] 41 | 42 | subgraph "输入处理" 43 | B 44 | F 45 | end 46 | 47 | subgraph "AI 推理" 48 | C 49 | G 50 | end 51 | 52 | subgraph "结果处理" 53 | D 54 | H 55 | I 56 | end 57 | ``` 58 | 59 | ### Insight 类结构 60 | 61 | ```python 62 | class Insight: 63 | """AI-powered UI understanding and reasoning engine""" 64 | 65 | def __init__( 66 | self, 67 | context_provider: Union[UIContext, Callable], 68 | ai_service: Optional[AIModelService] = None, 69 | model_config: Optional[AIModelConfig] = None 70 | ): 71 | self.context_provider = context_provider # 上下文提供者 72 | self.ai_service = ai_service # AI 模型服务 73 | self.model_config = model_config # 模型配置 74 | self._dump_subscribers = [] # 调试订阅者 75 | ``` 76 | 77 | ## 🎯 核心功能 78 | 79 | ### 1. 智能元素定位 (locate) 80 | 81 | Insight 的核心能力是通过自然语言精确定位页面元素: 82 | 83 | ```python 84 | # 基础定位 85 | login_btn = await insight.locate("登录按钮") 86 | search_box = await insight.locate("搜索输入框") 87 | 88 | # 描述性定位 89 | submit_btn = await insight.locate("绿色的提交按钮") 90 | close_icon = await insight.locate("模态对话框右上角的关闭图标") 91 | 92 | # 相对定位 93 | next_page = await insight.locate("分页器中的下一页按钮") 94 | first_item = await insight.locate("列表中的第一个商品") 95 | 96 | # 条件定位 97 | error_msg = await insight.locate("如果存在错误信息的提示框") 98 | ``` 99 | 100 | #### 定位策略 101 | 102 | Insight 使用多层次的定位策略: 103 | 104 | 1. **视觉识别**: 分析截图中的视觉元素 105 | 2. **语义理解**: 理解元素的功能和语义 106 | 3. **布局分析**: 考虑元素的位置关系 107 | 4. **上下文感知**: 结合页面状态和操作历史 108 | 109 | ```python 110 | class LocateResponse(BaseModel): 111 | """AI locate response schema""" 112 | elements: List[Dict[str, Any]] # 找到的元素列表 113 | reasoning: str # AI 推理过程 114 | confidence: float # 置信度 115 | errors: List[str] = [] # 错误信息 116 | ``` 117 | 118 | #### 定位选项 119 | 120 | ```python 121 | from midscene.core.types import LocateOption 122 | 123 | options = LocateOption( 124 | multiple=True, # 查找多个匹配的元素 125 | timeout=10, # 定位超时时间 126 | wait_for_visible=True, # 等待元素可见 127 | confidence_threshold=0.8 # 最小置信度阈值 128 | ) 129 | 130 | elements = await insight.locate("商品卡片", options) 131 | ``` 132 | 133 | ### 2. 数据提取 (extract) 134 | 135 | 从页面提取结构化数据: 136 | 137 | ```python 138 | # 简单数据提取 139 | user_info = await insight.extract({ 140 | "name": "用户姓名", 141 | "email": "邮箱地址", 142 | "role": "用户角色" 143 | }) 144 | 145 | # 复杂列表数据 146 | products = await insight.extract({ 147 | "products": [ 148 | { 149 | "name": "商品名称", 150 | "price": "价格", 151 | "rating": "评分", 152 | "description": "商品描述", 153 | "in_stock": "是否有库存" 154 | } 155 | ] 156 | }) 157 | 158 | # 嵌套结构数据 159 | page_data = await insight.extract({ 160 | "header": { 161 | "title": "页面标题", 162 | "user": "当前用户名" 163 | }, 164 | "content": { 165 | "articles": [ 166 | { 167 | "title": "文章标题", 168 | "author": "作者", 169 | "date": "发布日期" 170 | } 171 | ] 172 | }, 173 | "footer": { 174 | "copyright": "版权信息" 175 | } 176 | }) 177 | ``` 178 | 179 | #### 提取选项 180 | 181 | ```python 182 | from midscene.core.types import ExtractOption 183 | 184 | options = ExtractOption( 185 | return_thought=True, # 返回 AI 的思考过程 186 | schema_validation=True, # 启用数据结构验证 187 | timeout=30 # 提取超时时间 188 | ) 189 | 190 | result = await insight.extract(schema, options) 191 | print(result["thought"]) # AI 的推理过程 192 | print(result["data"]) # 提取的数据 193 | ``` 194 | 195 | ### 3. 
智能断言 (assert_condition) 196 | 197 | 验证页面状态和条件: 198 | 199 | ```python 200 | # 状态断言 201 | result = await insight.assert_condition("用户已成功登录") 202 | assert result.passed, result.message 203 | 204 | # 内容断言 205 | result = await insight.assert_condition("页面显示了 5 个搜索结果") 206 | assert result.passed 207 | 208 | # 复杂条件断言 209 | result = await insight.assert_condition( 210 | "如果是新用户,页面应该显示欢迎指引" 211 | ) 212 | 213 | # 否定断言 214 | result = await insight.assert_condition("页面没有显示错误信息") 215 | ``` 216 | 217 | #### 断言结果 218 | 219 | ```python 220 | class AssertResult: 221 | passed: bool # 断言是否通过 222 | reasoning: str # AI 推理过程 223 | confidence: float # 置信度 224 | message: str # 详细消息 225 | ``` 226 | 227 | ## 🔧 上下文管理 228 | 229 | ### 上下文提供者 230 | 231 | Insight 通过上下文提供者获取页面信息: 232 | 233 | ```python 234 | # 静态上下文 235 | context = UIContext( 236 | screenshot_base64="...", 237 | page_title="登录页面", 238 | url="https://example.com/login" 239 | ) 240 | insight = Insight(context) 241 | 242 | # 动态上下文 243 | async def get_context(action: InsightAction) -> UIContext: 244 | # 根据操作类型获取不同的上下文信息 245 | if action == InsightAction.LOCATE: 246 | return await page.get_locate_context() 247 | elif action == InsightAction.EXTRACT: 248 | return await page.get_extract_context() 249 | else: 250 | return await page.get_default_context() 251 | 252 | insight = Insight(get_context) 253 | ``` 254 | 255 | ### 上下文类型 256 | 257 | ```python 258 | class UIContext(BaseModel): 259 | """UI context information""" 260 | screenshot_base64: str # 页面截图(Base64 编码) 261 | page_title: str # 页面标题 262 | url: str # 页面 URL 263 | viewport_size: tuple # 视口大小 264 | device_pixel_ratio: float # 设备像素比 265 | elements: List[BaseElement] # 页面元素信息 266 | timestamp: float # 时间戳 267 | ``` 268 | 269 | ## 🎨 AI 消息构建 270 | 271 | ### 定位消息 272 | 273 | Insight 为不同操作构建专门的 AI 消息: 274 | 275 | ```python 276 | def _build_locate_messages( 277 | self, 278 | prompt: str, 279 | context: UIContext, 280 | options: LocateOption 281 | ) -> List[Dict]: 282 | """构建元素定位的 AI 消息""" 283 | return [ 284 | { 285 | "role": "system", 286 | "content": self._get_locate_system_prompt() 287 | }, 288 | { 289 | "role": "user", 290 | "content": [ 291 | { 292 | "type": "text", 293 | "text": f"请在页面中定位:{prompt}" 294 | }, 295 | { 296 | "type": "image_url", 297 | "image_url": { 298 | "url": f"data:image/png;base64,{context.screenshot_base64}" 299 | } 300 | } 301 | ] 302 | } 303 | ] 304 | ``` 305 | 306 | ### 系统提示词 307 | 308 | ```python 309 | def _get_locate_system_prompt(self) -> str: 310 | """获取元素定位的系统提示词""" 311 | return """ 312 | 你是一个专业的UI元素定位专家。请分析页面截图,根据用户描述精确定位目标元素。 313 | 314 | 定位原则: 315 | 1. 优先考虑功能语义而非视觉外观 316 | 2. 结合上下文理解元素关系 317 | 3. 对于模糊描述,选择最可能的候选元素 318 | 4. 
提供详细的定位推理过程 319 | 320 | 返回格式: 321 | { 322 | "elements": [ 323 | { 324 | "rect": {"x": 0, "y": 0, "width": 100, "height": 30}, 325 | "text": "元素文本", 326 | "tag": "元素标签", 327 | "attributes": {"id": "...", "class": "..."}, 328 | "confidence": 0.95 329 | } 330 | ], 331 | "reasoning": "定位推理过程", 332 | "confidence": 0.9 333 | } 334 | """.strip() 335 | ``` 336 | 337 | ## 📊 响应处理 338 | 339 | ### 元素处理 340 | 341 | ```python 342 | def _process_locate_response( 343 | self, 344 | response: LocateResponse, 345 | context: UIContext 346 | ) -> Optional[BaseElement]: 347 | """处理定位响应,返回最佳匹配元素""" 348 | 349 | if not response.elements: 350 | return None 351 | 352 | # 选择置信度最高的元素 353 | best_element = max( 354 | response.elements, 355 | key=lambda e: e.get("confidence", 0) 356 | ) 357 | 358 | # 创建元素对象 359 | element = BaseElement( 360 | rect=best_element["rect"], 361 | text=best_element.get("text", ""), 362 | tag_name=best_element.get("tag", ""), 363 | attributes=best_element.get("attributes", {}) 364 | ) 365 | 366 | return element 367 | ``` 368 | 369 | ### 数据验证 370 | 371 | ```python 372 | def _validate_extract_response( 373 | self, 374 | response: ExtractResponse, 375 | schema: Dict 376 | ) -> bool: 377 | """验证提取数据的结构是否符合预期""" 378 | 379 | try: 380 | # 使用 Pydantic 进行结构验证 381 | from pydantic import create_model 382 | 383 | # 动态创建验证模型 384 | validator = create_model("ExtractValidator", **schema) 385 | validator(**response.data) 386 | 387 | return True 388 | except Exception as e: 389 | logger.warning(f"Data validation failed: {e}") 390 | return False 391 | ``` 392 | 393 | ## 🔍 调试和监控 394 | 395 | ### 调试订阅者 396 | 397 | Insight 支持调试订阅者来监控执行过程: 398 | 399 | ```python 400 | async def debug_subscriber(dump_data: Dict): 401 | """调试订阅者函数""" 402 | operation = dump_data["type"] 403 | prompt = dump_data.get("prompt", "") 404 | 405 | print(f"🔍 操作: {operation}") 406 | print(f"📝 提示: {prompt}") 407 | 408 | if "error" in dump_data: 409 | print(f"❌ 错误: {dump_data['error']}") 410 | else: 411 | print(f"✅ 成功") 412 | 413 | # 保存调试信息到文件 414 | with open(f"debug_{operation}.json", "w") as f: 415 | json.dump(dump_data, f, indent=2) 416 | 417 | # 注册调试订阅者 418 | insight.subscribe_to_dump(debug_subscriber) 419 | ``` 420 | 421 | ### 执行统计 422 | 423 | ```python 424 | class InsightMetrics: 425 | """Insight 执行统计""" 426 | 427 | def __init__(self): 428 | self.operation_count = 0 429 | self.total_time = 0 430 | self.success_count = 0 431 | self.ai_tokens_used = 0 432 | 433 | def record_operation(self, operation: str, duration: float, success: bool, tokens: int): 434 | self.operation_count += 1 435 | self.total_time += duration 436 | if success: 437 | self.success_count += 1 438 | self.ai_tokens_used += tokens 439 | 440 | @property 441 | def success_rate(self) -> float: 442 | return self.success_count / self.operation_count if self.operation_count > 0 else 0 443 | 444 | @property 445 | def avg_time(self) -> float: 446 | return self.total_time / self.operation_count if self.operation_count > 0 else 0 447 | 448 | # 使用统计 449 | metrics = InsightMetrics() 450 | insight.set_metrics_collector(metrics) 451 | ``` 452 | 453 | ## ⚙️ 高级配置 454 | 455 | ### 模型配置 456 | 457 | ```python 458 | from midscene.core.ai_model import AIModelConfig 459 | 460 | # 针对不同操作使用不同配置 461 | locate_config = AIModelConfig( 462 | provider="openai", 463 | model="gpt-4-vision-preview", 464 | temperature=0.1, # 定位需要更确定性 465 | max_tokens=500 466 | ) 467 | 468 | extract_config = AIModelConfig( 469 | provider="claude", 470 | model="claude-3-sonnet-20240229", 471 | temperature=0.2, # 提取允许更多创造性 472 | 
max_tokens=2000 473 | ) 474 | 475 | # 创建专门的 Insight 实例 476 | locate_insight = Insight(context_provider, model_config=locate_config) 477 | extract_insight = Insight(context_provider, model_config=extract_config) 478 | ``` 479 | 480 | ### 缓存配置 481 | 482 | ```python 483 | # 启用智能缓存 484 | insight.enable_cache( 485 | cache_size=1000, # 缓存条目数 486 | ttl=3600, # 缓存过期时间(秒) 487 | hash_screenshot=True, # 基于截图内容生成缓存键 488 | cache_ai_responses=True # 缓存 AI 响应 489 | ) 490 | 491 | # 缓存策略配置 492 | insight.set_cache_strategy( 493 | locate_cache_enabled=True, # 定位操作缓存 494 | extract_cache_enabled=True, # 提取操作缓存 495 | assert_cache_enabled=False # 断言操作不缓存(实时性要求高) 496 | ) 497 | ``` 498 | 499 | ## 🚀 性能优化 500 | 501 | ### 批量操作 502 | 503 | ```python 504 | # 批量定位多个元素 505 | elements = await insight.batch_locate([ 506 | "登录按钮", 507 | "注册链接", 508 | "忘记密码链接" 509 | ]) 510 | 511 | # 批量提取多个数据块 512 | data_blocks = await insight.batch_extract([ 513 | {"user_info": {"name": "姓名", "email": "邮箱"}}, 514 | {"product_list": [{"name": "商品名", "price": "价格"}]}, 515 | {"navigation": {"items": ["导航项目"]}} 516 | ]) 517 | ``` 518 | 519 | ### 并发控制 520 | 521 | ```python 522 | # 设置并发限制 523 | insight.set_concurrency_limit(3) 524 | 525 | # 异步并发执行 526 | import asyncio 527 | 528 | async def parallel_operations(): 529 | tasks = [ 530 | insight.locate("按钮1"), 531 | insight.locate("按钮2"), 532 | insight.extract(schema1), 533 | insight.extract(schema2) 534 | ] 535 | 536 | results = await asyncio.gather(*tasks, return_exceptions=True) 537 | return results 538 | ``` 539 | 540 | ## 🎯 最佳实践 541 | 542 | ### 1. 清晰的描述 543 | ```python 544 | # ❌ 模糊描述 545 | await insight.locate("按钮") 546 | 547 | # ✅ 具体描述 548 | await insight.locate("页面右上角的蓝色登录按钮") 549 | ``` 550 | 551 | ### 2. 合理的置信度阈值 552 | ```python 553 | # 根据场景调整置信度要求 554 | options = LocateOption( 555 | confidence_threshold=0.9 # 高要求场景 556 | ) 557 | element = await insight.locate("重要操作按钮", options) 558 | ``` 559 | 560 | ### 3. 错误处理和重试 561 | ```python 562 | async def robust_locate(prompt: str, max_retries: int = 3): 563 | for attempt in range(max_retries): 564 | try: 565 | result = await insight.locate(prompt) 566 | if result.element: 567 | return result 568 | except Exception as e: 569 | if attempt == max_retries - 1: 570 | raise 571 | await asyncio.sleep(1) # 等待后重试 572 | 573 | raise ElementNotFoundError(f"Element not found after {max_retries} attempts") 574 | ``` 575 | 576 | ### 4. 上下文优化 577 | ```python 578 | # 为不同操作提供优化的上下文 579 | async def optimized_context_provider(action: InsightAction) -> UIContext: 580 | base_context = await page.get_context() 581 | 582 | if action == InsightAction.LOCATE: 583 | # 定位操作需要更详细的元素信息 584 | base_context.elements = await page.get_all_elements() 585 | elif action == InsightAction.EXTRACT: 586 | # 提取操作需要更完整的页面内容 587 | base_context.page_content = await page.get_page_content() 588 | 589 | return base_context 590 | ``` 591 | 592 | ## 🔗 相关文档 593 | 594 | - **Agent 集成**: [Agent 核心控制器](Agent核心控制器.md) 595 | - **AI 模型**: [AI模型服务抽象层](AI模型服务抽象层.md) 596 | - **数据类型**: [UI上下文与数据模型](UI上下文与数据模型.md) 597 | - **API 参考**: [Insight API](../API参考/Insight-API.md) 598 | 599 | --- 600 | 601 | Insight 是 Midscene Python 的智能核心,它让 AI 能够真正"看懂"和"理解"用户界面。掌握 Insight 的使用将大大提升你的自动化脚本的智能程度和稳定性! 
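
### 附录:最小组合示例(示意)

作为补充,下面把动态上下文提供者与 `locate` / `extract` 串成一个最小示例。其中 `page` 代表任一实现了 `get_context()` 的平台接口(Playwright / Selenium / Android 均可),`Insight` 的导入路径为假设写法,请以项目实际代码为准:

```python
from midscene.core import Insight   # 导入路径为假设,以实际代码为准


async def main(page) -> None:
    # 每次 AI 调用前都抓取最新页面状态,让定位与提取基于当前截图
    async def context_provider(action):
        return await page.get_context()

    insight = Insight(context_provider)

    login_btn = await insight.locate("页面右上角的蓝色登录按钮")
    user_info = await insight.extract({"name": "用户姓名", "email": "邮箱地址"})
    print(login_btn, user_info)


# 运行方式:asyncio.run(main(page)),page 由具体平台接口创建后传入
```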
-------------------------------------------------------------------------------- /midscene/shared/report.py: -------------------------------------------------------------------------------- 1 | """ 2 | Report generation and visualization 3 | """ 4 | 5 | import json 6 | from datetime import datetime 7 | from pathlib import Path 8 | from typing import Any, Dict, List, Optional 9 | 10 | from jinja2 import Template 11 | from loguru import logger 12 | 13 | 14 | class ExecutionReport: 15 | """Execution report data model""" 16 | 17 | def __init__(self): 18 | self.start_time = datetime.now() 19 | self.end_time: Optional[datetime] = None 20 | self.success = True 21 | self.error: Optional[str] = None 22 | self.tasks: List[Dict[str, Any]] = [] 23 | self.metadata: Dict[str, Any] = {} 24 | self.screenshots: List[str] = [] 25 | self.ai_usage: Dict[str, Any] = {} 26 | 27 | def add_task(self, task_data: Dict[str, Any]) -> None: 28 | """Add task execution data""" 29 | self.tasks.append({ 30 | **task_data, 31 | "timestamp": datetime.now().isoformat() 32 | }) 33 | 34 | def add_screenshot(self, screenshot_base64: str, description: str = "") -> None: 35 | """Add screenshot to report""" 36 | self.screenshots.append({ 37 | "image": screenshot_base64, 38 | "description": description, 39 | "timestamp": datetime.now().isoformat() 40 | }) 41 | 42 | def update_ai_usage(self, usage_data: Dict[str, Any]) -> None: 43 | """Update AI usage statistics""" 44 | for key, value in usage_data.items(): 45 | if key in self.ai_usage: 46 | if isinstance(value, (int, float)): 47 | self.ai_usage[key] += value 48 | else: 49 | self.ai_usage[key] = value 50 | else: 51 | self.ai_usage[key] = value 52 | 53 | def finalize(self, success: bool = True, error: Optional[str] = None) -> None: 54 | """Finalize report""" 55 | self.end_time = datetime.now() 56 | self.success = success 57 | self.error = error 58 | 59 | def to_dict(self) -> Dict[str, Any]: 60 | """Convert to dictionary""" 61 | duration = None 62 | if self.end_time: 63 | duration = (self.end_time - self.start_time).total_seconds() 64 | 65 | return { 66 | "start_time": self.start_time.isoformat(), 67 | "end_time": self.end_time.isoformat() if self.end_time else None, 68 | "duration_seconds": duration, 69 | "success": self.success, 70 | "error": self.error, 71 | "tasks": self.tasks, 72 | "metadata": self.metadata, 73 | "screenshots": self.screenshots, 74 | "ai_usage": self.ai_usage, 75 | "summary": { 76 | "total_tasks": len(self.tasks), 77 | "successful_tasks": len([t for t in self.tasks if t.get("success", True)]), 78 | "failed_tasks": len([t for t in self.tasks if not t.get("success", True)]), 79 | "total_screenshots": len(self.screenshots) 80 | } 81 | } 82 | 83 | 84 | class ReportGenerator: 85 | """Generate execution reports in various formats""" 86 | 87 | def __init__(self, output_dir: str = "./reports"): 88 | """Initialize report generator 89 | 90 | Args: 91 | output_dir: Output directory for reports 92 | """ 93 | self.output_dir = Path(output_dir) 94 | self.output_dir.mkdir(parents=True, exist_ok=True) 95 | 96 | def generate_html_report( 97 | self, 98 | report: ExecutionReport, 99 | template_path: Optional[str] = None 100 | ) -> str: 101 | """Generate HTML report 102 | 103 | Args: 104 | report: Execution report data 105 | template_path: Custom template path 106 | 107 | Returns: 108 | Path to generated HTML file 109 | """ 110 | if template_path: 111 | with open(template_path, 'r', encoding='utf-8') as f: 112 | template_content = f.read() 113 | else: 114 | template_content = 
self._get_default_html_template() 115 | 116 | template = Template(template_content) 117 | 118 | # Generate report 119 | html_content = template.render( 120 | report=report.to_dict(), 121 | generated_at=datetime.now().isoformat() 122 | ) 123 | 124 | # Save to file 125 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 126 | filename = f"midscene_report_{timestamp}.html" 127 | file_path = self.output_dir / filename 128 | 129 | with open(file_path, 'w', encoding='utf-8') as f: 130 | f.write(html_content) 131 | 132 | logger.info(f"HTML report generated: {file_path}") 133 | return str(file_path) 134 | 135 | def generate_json_report(self, report: ExecutionReport) -> str: 136 | """Generate JSON report 137 | 138 | Args: 139 | report: Execution report data 140 | 141 | Returns: 142 | Path to generated JSON file 143 | """ 144 | # Save to file 145 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 146 | filename = f"midscene_report_{timestamp}.json" 147 | file_path = self.output_dir / filename 148 | 149 | with open(file_path, 'w', encoding='utf-8') as f: 150 | json.dump(report.to_dict(), f, ensure_ascii=False, indent=2) 151 | 152 | logger.info(f"JSON report generated: {file_path}") 153 | return str(file_path) 154 | 155 | def _get_default_html_template(self) -> str: 156 | """Get default HTML template""" 157 | return """ 158 | 159 | 160 | 161 | 162 | 163 | Midscene Execution Report 164 | 329 | 330 | 331 |
        <div class="header">
            <h1>🤖 Midscene Execution Report</h1>
            <div class="status">
                {{ '✅ Success' if report.success else '❌ Failed' }}
            </div>
            {% if report.error %}
            <div class="error">
                Error: {{ report.error }}
            </div>
            {% endif %}
        </div>

        <div class="summary">
            <div class="card">
                <h3>Duration</h3>
                <div class="value">
                    {% if report.duration_seconds %}
                    {{ "%.1f"|format(report.duration_seconds) }}s
                    {% else %}
                    -
                    {% endif %}
                </div>
            </div>
            <div class="card">
                <h3>Total Tasks</h3>
                <div class="value">{{ report.summary.total_tasks }}</div>
            </div>
            <div class="card">
                <h3>Successful</h3>
                <div class="value">{{ report.summary.successful_tasks }}</div>
            </div>
            <div class="card">
                <h3>Failed</h3>
                <div class="value">{{ report.summary.failed_tasks }}</div>
            </div>
        </div>

        {% if report.tasks %}
        <div class="section">
            <h2>📋 Task Execution</h2>
            {% for task in report.tasks %}
            <div class="task">
                <div class="task-header">
                    <span class="task-title">{{ task.get('type', 'Task') }}: {{ task.get('description', 'Unknown') }}</span>
                    <span class="task-status">
                        {{ 'Success' if task.get('success', True) else 'Failed' }}
                    </span>
                </div>
                {% if task.get('error') %}
                <div class="task-error">
                    Error: {{ task.error }}
                </div>
                {% endif %}
                {% if task.get('result') %}
                <div class="task-result">
                    Result: {{ task.result }}
                </div>
                {% endif %}
                <div class="task-timestamp">
                    {{ task.timestamp }}
                </div>
            </div>
            {% endfor %}
        </div>
        {% endif %}

        {% if report.ai_usage %}
        <div class="section">
            <h2>🧠 AI Usage Statistics</h2>
            <div class="usage-grid">
                {% for key, value in report.ai_usage.items() %}
                <div class="usage-item">
                    <div class="usage-label">{{ key.replace('_', ' ').title() }}</div>
                    <div class="usage-value">{{ value }}</div>
                </div>
                {% endfor %}
            </div>
        </div>
        {% endif %}

        {% if report.screenshots %}
        <div class="section">
            <h2>📸 Screenshots</h2>
            {% for screenshot in report.screenshots %}
            <div class="screenshot">
                <img src="data:image/png;base64,{{ screenshot.image }}" alt="Screenshot">
                {% if screenshot.description %}
                <div class="screenshot-description">{{ screenshot.description }}</div>
                {% endif %}
            </div>
            {% endfor %}
        </div>
        {% endif %}

        <div class="footer">
            Generated at {{ generated_at }}
        </div>
430 | 431 | 432 | """.strip() 433 | 434 | 435 | def create_report() -> ExecutionReport: 436 | """Create new execution report 437 | 438 | Returns: 439 | ExecutionReport instance 440 | """ 441 | return ExecutionReport() -------------------------------------------------------------------------------- /midscene/web/playwright_page.py: -------------------------------------------------------------------------------- 1 | """ 2 | Playwright integration for Midscene 3 | """ 4 | 5 | import base64 6 | import json 7 | from typing import Any, Dict, List, Optional 8 | 9 | from playwright.async_api import async_playwright, Page, Browser, BrowserContext 10 | from loguru import logger 11 | 12 | from ..core.types import ( 13 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree, 14 | Size, Rect, Point, NodeType 15 | ) 16 | 17 | 18 | class PlaywrightElement(BaseElement): 19 | """Playwright element wrapper""" 20 | 21 | def __init__(self, page: Page, selector: str, **kwargs): 22 | self._page = page 23 | self._selector = selector 24 | super().__init__(**kwargs) 25 | 26 | async def tap(self) -> None: 27 | """Click this element""" 28 | try: 29 | await self._page.click(self._selector) 30 | except Exception as e: 31 | logger.error(f"Failed to click element: {e}") 32 | raise 33 | 34 | async def input_text(self, text: str) -> None: 35 | """Input text to this element""" 36 | try: 37 | await self._page.fill(self._selector, text) 38 | except Exception as e: 39 | logger.error(f"Failed to input text: {e}") 40 | raise 41 | 42 | 43 | class PlaywrightWebPage(AbstractInterface): 44 | """Playwright page interface""" 45 | 46 | def __init__(self, page: Page, context: BrowserContext, browser: Browser): 47 | """Initialize with Playwright page 48 | 49 | Args: 50 | page: Playwright page instance 51 | context: Browser context 52 | browser: Browser instance 53 | """ 54 | self.page = page 55 | self.context = context 56 | self.browser = browser 57 | 58 | @classmethod 59 | async def create( 60 | cls, 61 | headless: bool = False, 62 | viewport_size: tuple[int, int] = (1920, 1080), 63 | user_data_dir: Optional[str] = None, 64 | **browser_options 65 | ) -> 'PlaywrightWebPage': 66 | """Create new Playwright page instance 67 | 68 | Args: 69 | headless: Run in headless mode 70 | viewport_size: Browser viewport size 71 | user_data_dir: Browser user data directory 72 | **browser_options: Additional browser options 73 | 74 | Returns: 75 | PlaywrightWebPage instance 76 | """ 77 | playwright = await async_playwright().start() 78 | 79 | launch_options = { 80 | "headless": headless, 81 | **browser_options 82 | } 83 | 84 | if user_data_dir: 85 | launch_options["user_data_dir"] = user_data_dir 86 | 87 | browser = await playwright.chromium.launch(**launch_options) 88 | 89 | context = await browser.new_context( 90 | viewport={"width": viewport_size[0], "height": viewport_size[1]} 91 | ) 92 | 93 | page = await context.new_page() 94 | 95 | return cls(page, context, browser) 96 | 97 | @property 98 | def interface_type(self) -> InterfaceType: 99 | """Get interface type""" 100 | return InterfaceType.WEB 101 | 102 | async def get_context(self) -> UIContext: 103 | """Get current UI context""" 104 | try: 105 | # Take screenshot 106 | screenshot_base64 = await self._take_screenshot() 107 | 108 | # Get page size 109 | size = await self._get_page_size() 110 | 111 | # Extract DOM elements 112 | elements = await self._extract_elements() 113 | 114 | # Build UI tree 115 | tree = await self._build_ui_tree() 116 | 117 | return UIContext( 118 | 
screenshot_base64=screenshot_base64, 119 | size=size, 120 | content=elements, 121 | tree=tree 122 | ) 123 | 124 | except Exception as e: 125 | logger.error(f"Failed to get context: {e}") 126 | raise 127 | 128 | async def action_space(self) -> List[str]: 129 | """Get available actions""" 130 | return [ 131 | "tap", "click", "double_click", "right_click", 132 | "input", "type", "fill", "clear", 133 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right", 134 | "hover", "drag", "key_press", "navigate", "reload", 135 | "go_back", "go_forward" 136 | ] 137 | 138 | async def tap(self, x: float, y: float) -> None: 139 | """Tap at coordinates""" 140 | try: 141 | await self.page.mouse.click(x, y) 142 | except Exception as e: 143 | logger.error(f"Failed to tap at ({x}, {y}): {e}") 144 | raise 145 | 146 | async def input_text(self, text: str) -> None: 147 | """Input text to focused element""" 148 | try: 149 | await self.page.keyboard.type(text) 150 | except Exception as e: 151 | logger.error(f"Failed to input text: {e}") 152 | raise 153 | 154 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 155 | """Scroll in direction""" 156 | try: 157 | distance = distance or 500 158 | 159 | if direction == "down": 160 | await self.page.mouse.wheel(0, distance) 161 | elif direction == "up": 162 | await self.page.mouse.wheel(0, -distance) 163 | elif direction == "right": 164 | await self.page.mouse.wheel(distance, 0) 165 | elif direction == "left": 166 | await self.page.mouse.wheel(-distance, 0) 167 | else: 168 | raise ValueError(f"Invalid scroll direction: {direction}") 169 | 170 | except Exception as e: 171 | logger.error(f"Failed to scroll {direction}: {e}") 172 | raise 173 | 174 | async def navigate_to(self, url: str) -> None: 175 | """Navigate to URL""" 176 | try: 177 | await self.page.goto(url, wait_until="networkidle") 178 | except Exception as e: 179 | logger.error(f"Failed to navigate to {url}: {e}") 180 | raise 181 | 182 | async def wait_for_element( 183 | self, 184 | selector: str, 185 | timeout: float = 10000 186 | ) -> None: 187 | """Wait for element to be present""" 188 | try: 189 | await self.page.wait_for_selector(selector, timeout=timeout) 190 | except Exception as e: 191 | raise TimeoutError(f"Element not found: {selector}") 192 | 193 | async def evaluate_script(self, script: str, *args) -> Any: 194 | """Evaluate JavaScript""" 195 | return await self.page.evaluate(script, *args) 196 | 197 | async def close(self) -> None: 198 | """Close the browser""" 199 | try: 200 | await self.context.close() 201 | await self.browser.close() 202 | except Exception as e: 203 | logger.warning(f"Error closing browser: {e}") 204 | 205 | async def _take_screenshot(self) -> str: 206 | """Take screenshot and return base64 string""" 207 | try: 208 | # Take screenshot as bytes 209 | screenshot_bytes = await self.page.screenshot(type="png") 210 | 211 | # Convert to base64 212 | screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8') 213 | 214 | return screenshot_base64 215 | 216 | except Exception as e: 217 | logger.error(f"Failed to take screenshot: {e}") 218 | raise 219 | 220 | async def _get_page_size(self) -> Size: 221 | """Get page viewport size""" 222 | try: 223 | viewport_size = await self.page.evaluate(""" 224 | () => ({ 225 | width: window.innerWidth, 226 | height: window.innerHeight 227 | }) 228 | """) 229 | 230 | return Size( 231 | width=viewport_size['width'], 232 | height=viewport_size['height'] 233 | ) 234 | 235 | except Exception as e: 236 | 
logger.error(f"Failed to get page size: {e}") 237 | return Size(width=1920, height=1080) 238 | 239 | async def _extract_elements(self) -> List[PlaywrightElement]: 240 | """Extract all visible elements from page""" 241 | try: 242 | # Use JavaScript to extract element information 243 | element_data = await self.page.evaluate(""" 244 | () => { 245 | const elements = []; 246 | const allElements = document.querySelectorAll('*'); 247 | 248 | allElements.forEach((el, index) => { 249 | const rect = el.getBoundingClientRect(); 250 | 251 | // Skip elements that are not visible 252 | if (rect.width === 0 || rect.height === 0 || 253 | rect.top < 0 || rect.left < 0 || 254 | getComputedStyle(el).visibility === 'hidden' || 255 | getComputedStyle(el).display === 'none') { 256 | return; 257 | } 258 | 259 | // Generate a selector for this element 260 | const selector = generateSelector(el); 261 | 262 | elements.push({ 263 | id: `element_${index}`, 264 | selector: selector, 265 | tagName: el.tagName.toLowerCase(), 266 | content: el.textContent?.trim() || el.getAttribute('alt') || el.getAttribute('title') || '', 267 | rect: { 268 | left: rect.left, 269 | top: rect.top, 270 | width: rect.width, 271 | height: rect.height 272 | }, 273 | center: [rect.left + rect.width / 2, rect.top + rect.height / 2], 274 | attributes: { 275 | id: el.id, 276 | className: el.className, 277 | type: el.type, 278 | name: el.name, 279 | href: el.href, 280 | src: el.src, 281 | value: el.value, 282 | placeholder: el.placeholder 283 | } 284 | }); 285 | }); 286 | 287 | function generateSelector(element) { 288 | if (element.id) { 289 | return `#${element.id}`; 290 | } 291 | 292 | let path = element.tagName.toLowerCase(); 293 | let parent = element.parentElement; 294 | 295 | while (parent && parent !== document.body) { 296 | const siblings = Array.from(parent.children); 297 | const index = siblings.indexOf(element) + 1; 298 | path = `${parent.tagName.toLowerCase()}:nth-child(${index}) > ${path}`; 299 | element = parent; 300 | parent = element.parentElement; 301 | } 302 | 303 | return path; 304 | } 305 | 306 | return elements; 307 | } 308 | """) 309 | 310 | elements = [] 311 | for data in element_data: 312 | rect_data = data['rect'] 313 | rect = Rect( 314 | left=rect_data['left'], 315 | top=rect_data['top'], 316 | width=rect_data['width'], 317 | height=rect_data['height'] 318 | ) 319 | 320 | # Determine node type 321 | tag_name = data['tagName'] 322 | node_type = self._get_node_type(tag_name, data['attributes']) 323 | 324 | element = PlaywrightElement( 325 | page=self.page, 326 | selector=data['selector'], 327 | id=data['id'], 328 | content=data['content'], 329 | rect=rect, 330 | center=tuple(data['center']), 331 | node_type=node_type, 332 | attributes=data['attributes'], 333 | is_visible=True 334 | ) 335 | 336 | elements.append(element) 337 | 338 | return elements 339 | 340 | except Exception as e: 341 | logger.error(f"Failed to extract elements: {e}") 342 | return [] 343 | 344 | def _get_node_type(self, tag_name: str, attributes: Dict[str, Any]) -> NodeType: 345 | """Determine node type from tag name and attributes""" 346 | if tag_name in ['input', 'textarea']: 347 | input_type = attributes.get('type', '').lower() 348 | if input_type in ['text', 'password', 'email', 'search', 'url', 'tel']: 349 | return NodeType.INPUT 350 | elif input_type in ['button', 'submit', 'reset']: 351 | return NodeType.BUTTON 352 | elif tag_name in ['button']: 353 | return NodeType.BUTTON 354 | elif tag_name in ['a']: 355 | return NodeType.LINK 356 | elif 
tag_name in ['img']: 357 | return NodeType.IMAGE 358 | elif tag_name in ['div', 'span', 'section', 'article', 'header', 'footer', 'nav']: 359 | return NodeType.CONTAINER 360 | elif tag_name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'td', 'th']: 361 | return NodeType.TEXT 362 | else: 363 | return NodeType.OTHER 364 | 365 | async def _build_ui_tree(self) -> UITree: 366 | """Build UI tree structure""" 367 | try: 368 | # Simplified tree building - just create a root container 369 | # In a full implementation, we would parse the actual DOM tree 370 | root_node = UINode( 371 | id="root", 372 | content="", 373 | rect=Rect(left=0, top=0, width=1920, height=1080), 374 | center=(960, 540), 375 | node_type=NodeType.CONTAINER, 376 | attributes={}, 377 | is_visible=True, 378 | children=[] 379 | ) 380 | 381 | return UITree(node=root_node, children=[]) 382 | 383 | except Exception as e: 384 | logger.error(f"Failed to build UI tree: {e}") 385 | # Return minimal tree 386 | root_node = UINode( 387 | id="root", 388 | content="", 389 | rect=Rect(left=0, top=0, width=1920, height=1080), 390 | center=(960, 540), 391 | node_type=NodeType.CONTAINER, 392 | attributes={}, 393 | is_visible=True, 394 | children=[] 395 | ) 396 | return UITree(node=root_node, children=[]) 397 | 398 | async def __aenter__(self): 399 | return self 400 | 401 | async def __aexit__(self, exc_type, exc_val, exc_tb): 402 | await self.close() --------------------------------------------------------------------------------
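A minimal usage sketch for the `PlaywrightWebPage` interface defined above. The import path follows the file location `/midscene/web/playwright_page.py`, and the URL is a placeholder; this is an illustrative example under those assumptions, not the repository's official example script.

```python
import asyncio

from midscene.web.playwright_page import PlaywrightWebPage  # assumed import path


async def main() -> None:
    # create() launches Chromium and returns a page wrapper; the async
    # context manager closes the browser context via __aexit__.
    async with await PlaywrightWebPage.create(headless=True) as page:
        await page.navigate_to("https://example.com")

        # Capture the current UI state: screenshot, viewport size,
        # extracted visible elements, and the (simplified) UI tree.
        context = await page.get_context()
        print(f"Viewport: {context.size.width}x{context.size.height}")
        print(f"Extracted {len(context.content)} visible elements")


if __name__ == "__main__":
    asyncio.run(main())
```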