├── tests ├── __init__.py └── test_core.py ├── midscene ├── cli │ ├── __init__.py │ ├── main.py │ └── config.py ├── android │ ├── __init__.py │ └── agent.py ├── shared │ ├── __init__.py │ ├── logger.py │ ├── cache.py │ └── report.py ├── web │ ├── __init__.py │ ├── bridge.py │ └── playwright_page.py ├── core │ ├── ai_model │ │ ├── __init__.py │ │ ├── service.py │ │ └── providers.py │ ├── __init__.py │ └── types.py └── __init__.py ├── .env.example ├── .github └── workflows │ └── publish.yml ├── midscene.yml ├── LICENSE ├── scripts ├── quick_validate.bat └── validate_requirements.py ├── Makefile ├── .gitignore ├── wiki ├── README.md ├── 核心概念 │ ├── README.md │ ├── Agent核心控制器.md │ └── Insight-UI理解引擎.md ├── 生成状态.md ├── 项目概述.md ├── 快速开始.md ├── 安装配置.md └── 平台集成 │ └── README.md ├── examples └── basic_usage.py ├── pyproject.toml ├── README.zh.md ├── README.md ├── docs └── quickstart.md └── requirements.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package for Midscene Python""" -------------------------------------------------------------------------------- /midscene/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI module for Midscene Python 3 | """ 4 | 5 | from .main import main, app 6 | 7 | __all__ = ["main", "app"] -------------------------------------------------------------------------------- /midscene/android/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Android integration module for Midscene Python 3 | """ 4 | 5 | from .device import AndroidDevice 6 | from .agent import AndroidAgent 7 | 8 | __all__ = [ 9 | "AndroidDevice", 10 | "AndroidAgent", 11 | ] -------------------------------------------------------------------------------- /midscene/shared/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utilities and tools for Midscene Python 3 | """ 4 | 5 | from .cache import TaskCache 6 | from .logger import setup_logger 7 | from .report import ReportGenerator 8 | 9 | __all__ = [ 10 | "TaskCache", 11 | "setup_logger", 12 | "ReportGenerator", 13 | ] -------------------------------------------------------------------------------- /midscene/web/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Web integration module for Midscene Python 3 | """ 4 | 5 | from .selenium_page import SeleniumWebPage 6 | from .playwright_page import PlaywrightWebPage 7 | from .bridge import BridgeWebPage 8 | 9 | __all__ = [ 10 | "SeleniumWebPage", 11 | "PlaywrightWebPage", 12 | "BridgeWebPage", 13 | ] -------------------------------------------------------------------------------- /midscene/core/ai_model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI model integration module 3 | """ 4 | 5 | from .service import AIModelService, AIModelConfig 6 | from .providers import OpenAIProvider, AnthropicProvider, QwenProvider, GeminiProvider 7 | 8 | __all__ = [ 9 | "AIModelService", 10 | "AIModelConfig", 11 | "OpenAIProvider", 12 | "AnthropicProvider", 13 | "QwenProvider", 14 | "GeminiProvider", 15 | ] -------------------------------------------------------------------------------- /midscene/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core module for Midscene Python 3 | """ 4 | 5 | from .agent import 
Agent 6 | from .insight import Insight 7 | from .types import * 8 | 9 | __all__ = [ 10 | "Agent", 11 | "Insight", 12 | "UIContext", 13 | "LocateResult", 14 | "ExecutionResult", 15 | "BaseElement", 16 | "AbstractInterface", 17 | "InterfaceType", 18 | "AgentOptions", 19 | "LocateOption", 20 | "ExtractOption", 21 | "ScrollParam", 22 | ] -------------------------------------------------------------------------------- /midscene/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Midscene Python - AI-powered automation framework 3 | 4 | A Python implementation of Midscene, providing AI-driven automation 5 | capabilities for Web and Android platforms. 6 | """ 7 | 8 | from .core.agent import Agent 9 | from .core.insight import Insight 10 | from .core.types import UIContext, LocateResult, ExecutionResult 11 | 12 | __version__ = "0.1.0" 13 | 14 | __all__ = [ 15 | "Agent", 16 | "Insight", 17 | "UIContext", 18 | "LocateResult", 19 | "ExecutionResult", 20 | ] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Environment Variables for Midscene Python 2 | 3 | # AI Model Configuration 4 | MIDSCENE_AI_PROVIDER=openai 5 | MIDSCENE_AI_MODEL=gpt-4-vision-preview 6 | MIDSCENE_AI_API_KEY=your-api-key-here 7 | # MIDSCENE_AI_BASE_URL=https://api.openai.com 8 | 9 | # Execution Settings 10 | MIDSCENE_CONCURRENT=1 11 | MIDSCENE_CONTINUE_ON_ERROR=false 12 | MIDSCENE_GENERATE_REPORT=true 13 | 14 | # Logging 15 | MIDSCENE_LOG_LEVEL=INFO 16 | MIDSCENE_LOG_FILE=midscene.log 17 | 18 | # Development Settings 19 | MIDSCENE_DEBUG=false 20 | MIDSCENE_CACHE_ENABLED=true -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # 当推送以 'v' 开头的标签时触发 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: windows-latest # 使用 Windows 环境 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.x' 19 | 20 | - name: Install uv 21 | run: | 22 | powershell -c "irm https://astral.sh/uv/install.sh | iex" 23 | echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append 24 | 25 | - name: Build package 26 | run: uv build 27 | 28 | - name: Publish to PyPI 29 | env: 30 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }} # 使用 UV_PUBLISH_TOKEN 替代 TWINE_PASSWORD 31 | run: uv publish dist/* 32 | -------------------------------------------------------------------------------- /midscene.yml: -------------------------------------------------------------------------------- 1 | # Midscene Python Configuration 2 | 3 | # AI Model Configuration 4 | ai: 5 | provider: "openai" # openai, anthropic, qwen, gemini 6 | model: "gpt-4-vision-preview" 7 | api_key: "${MIDSCENE_AI_API_KEY}" # Set via environment variable 8 | base_url: null # Custom API endpoint if needed 9 | max_tokens: 4000 10 | temperature: 0.1 11 | 12 | # Web Automation Configuration 13 | web: 14 | browser: "chrome" # chrome, firefox, safari 15 | headless: false 16 | window_size: [1920, 1080] 17 | user_data_dir: null # Browser profile directory 18 | timeout: 30 19 | 20 | # Android Automation Configuration 21 | android: 22 | device_id: null # Auto-detect if null 23 | 
adb_path: "adb" 24 | auto_dismiss_keyboard: true 25 | timeout: 30 26 | 27 | # Execution Configuration 28 | execution: 29 | concurrent: 1 # Number of concurrent script executions 30 | continue_on_error: false # Continue executing scripts on error 31 | generate_report: true 32 | report_format: "html" # html, json, xml 33 | output_dir: "./reports" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Python51888 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /midscene/cli/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Midscene CLI - Command line interface for automation scripts 3 | """ 4 | 5 | import sys 6 | from typing import Optional 7 | 8 | import typer 9 | from rich.console import Console 10 | 11 | from .config import CLIConfig 12 | 13 | app = typer.Typer( 14 | name="midscene", 15 | help="AI-powered automation framework for Web and Android platforms", 16 | no_args_is_help=True 17 | ) 18 | 19 | console = Console() 20 | 21 | 22 | @app.command() 23 | def run( 24 | script_path: str = typer.Argument(..., help="Path to YAML script file or directory"), 25 | config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Configuration file path"), 26 | headless: bool = typer.Option(False, "--headless", help="Run browser in headless mode"), 27 | device_id: Optional[str] = typer.Option(None, "--device", "-d", help="Android device ID"), 28 | verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), 29 | ): 30 | """Run automation script(s)""" 31 | 32 | console.print(f"[yellow]Script execution not yet implemented: {script_path}[/yellow]") 33 | console.print("[blue]This is a placeholder CLI implementation[/blue]") 34 | 35 | 36 | @app.command() 37 | def version(): 38 | """Show version information""" 39 | 40 | try: 41 | console.print("Midscene Python v0.1.0") 42 | 43 | except Exception as e: 44 | console.print(f"❌ Error getting version: {e}", style="red") 45 | 46 | 47 | def main(): 48 | """CLI entry point""" 49 | app() 50 | 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /scripts/quick_validate.bat: 
-------------------------------------------------------------------------------- 1 | @echo off 2 | chcp 65001 > nul 3 | echo === Midscene Python Dependencies Quick Validation === 4 | echo. 5 | 6 | REM Check if requirements.txt exists 7 | if not exist "requirements.txt" ( 8 | echo Error: requirements.txt file not found 9 | echo Please run: make requirements-freeze 10 | exit /b 1 11 | ) 12 | 13 | echo 1. Checking requirements.txt file... 14 | echo Success: requirements.txt exists 15 | 16 | REM Count dependencies 17 | for /f %%i in ('findstr /v "^#" requirements.txt ^| findstr /v "^$" ^| find /c "=="') do set count=%%i 18 | echo Success: Found %count% dependency packages 19 | 20 | echo. 21 | echo 2. Validating key dependencies... 22 | 23 | REM Check core dependencies 24 | findstr /i "pydantic==" requirements.txt >nul 2>&1 25 | if %errorlevel% equ 0 (echo Success: pydantic) else (echo Error: pydantic & set error=1) 26 | 27 | findstr /i "selenium==" requirements.txt >nul 2>&1 28 | if %errorlevel% equ 0 (echo Success: selenium) else (echo Error: selenium & set error=1) 29 | 30 | findstr /i "playwright==" requirements.txt >nul 2>&1 31 | if %errorlevel% equ 0 (echo Success: playwright) else (echo Error: playwright & set error=1) 32 | 33 | REM Check development dependencies 34 | findstr /i "pytest==" requirements.txt >nul 2>&1 35 | if %errorlevel% equ 0 (echo Success: pytest) else (echo Error: pytest & set error=1) 36 | 37 | findstr /i "black==" requirements.txt >nul 2>&1 38 | if %errorlevel% equ 0 (echo Success: black) else (echo Error: black & set error=1) 39 | 40 | REM Check documentation dependencies 41 | findstr /i "mkdocs==" requirements.txt >nul 2>&1 42 | if %errorlevel% equ 0 (echo Success: mkdocs) else (echo Error: mkdocs & set error=1) 43 | 44 | echo. 45 | if defined error ( 46 | echo Validation FAILED: Missing key dependencies 47 | exit /b 1 48 | ) else ( 49 | echo Validation PASSED! 
50 | echo requirements.txt contains all key dependencies 51 | ) -------------------------------------------------------------------------------- /midscene/shared/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration for Midscene Python 3 | """ 4 | 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from loguru import logger 10 | 11 | 12 | def setup_logger( 13 | level: str = "INFO", 14 | log_file: Optional[str] = None, 15 | rotation: str = "10 MB", 16 | retention: str = "7 days", 17 | format_string: Optional[str] = None 18 | ) -> None: 19 | """Setup logging configuration 20 | 21 | Args: 22 | level: Log level (DEBUG, INFO, WARNING, ERROR) 23 | log_file: Log file path 24 | rotation: Log rotation size/time 25 | retention: Log retention period 26 | format_string: Custom format string 27 | """ 28 | # Remove default logger 29 | logger.remove() 30 | 31 | # Default format 32 | if not format_string: 33 | format_string = ( 34 | "{time:YYYY-MM-DD HH:mm:ss.SSS} | " 35 | "{level: <8} | " 36 | "{name}:{function}:{line} | " 37 | "{message}" 38 | ) 39 | 40 | # Add console handler 41 | logger.add( 42 | sys.stderr, 43 | level=level, 44 | format=format_string, 45 | colorize=True, 46 | backtrace=True, 47 | diagnose=True 48 | ) 49 | 50 | # Add file handler if specified 51 | if log_file: 52 | log_path = Path(log_file) 53 | log_path.parent.mkdir(parents=True, exist_ok=True) 54 | 55 | logger.add( 56 | log_path, 57 | level=level, 58 | format=format_string, 59 | rotation=rotation, 60 | retention=retention, 61 | backtrace=True, 62 | diagnose=True 63 | ) 64 | 65 | logger.info(f"Logger configured with level: {level}") 66 | 67 | 68 | def get_logger(name: str): 69 | """Get logger instance 70 | 71 | Args: 72 | name: Logger name 73 | 74 | Returns: 75 | Logger instance 76 | """ 77 | return logger.bind(name=name) -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help install dev test lint format clean build docs requirements-freeze requirements-check 2 | 3 | # Default target 4 | help: 5 | @echo "Available commands:" 6 | @echo " install Install package and dependencies" 7 | @echo " dev Install development dependencies" 8 | @echo " requirements-freeze Generate complete requirements.txt" 9 | @echo " requirements-check Verify dependencies integrity" 10 | @echo " requirements-quick-check Quick requirements validation" 11 | @echo " test Run tests" 12 | @echo " lint Run linting" 13 | @echo " format Format code" 14 | @echo " clean Clean build artifacts" 15 | @echo " build Build package" 16 | @echo " docs Build documentation" 17 | 18 | # Generate complete requirements.txt with all dependencies 19 | requirements-freeze: 20 | uv pip compile --all-extras pyproject.toml -o requirements.txt 21 | 22 | # Verify dependencies integrity 23 | requirements-check: 24 | uv pip check 25 | @python scripts/validate_requirements.py 26 | 27 | # Quick requirements validation 28 | requirements-quick-check: 29 | @scripts/quick_validate.bat 30 | 31 | # Install package from requirements.txt 32 | install: 33 | pip install -r requirements.txt 34 | 35 | # Install package in development mode 36 | install-dev: 37 | pip install -e ".[dev,docs]" 38 | pre-commit install 39 | 40 | # Install development dependencies (alias for backward compatibility) 41 | dev: install-dev 42 | 43 | # Run tests 44 | test: 45 | pytest tests/ -v 
--cov=midscene --cov-report=html --cov-report=term-missing 46 | 47 | # Run tests with specific markers 48 | test-unit: 49 | pytest tests/ -v -m "unit" 50 | 51 | test-integration: 52 | pytest tests/ -v -m "integration" 53 | 54 | # Linting 55 | lint: 56 | ruff check midscene/ tests/ 57 | mypy midscene/ 58 | 59 | # Format code 60 | format: 61 | black midscene/ tests/ examples/ 62 | isort midscene/ tests/ examples/ 63 | ruff check --fix midscene/ tests/ 64 | 65 | # Clean build artifacts 66 | clean: 67 | rm -rf build/ 68 | rm -rf dist/ 69 | rm -rf *.egg-info/ 70 | rm -rf .pytest_cache/ 71 | rm -rf .coverage 72 | rm -rf htmlcov/ 73 | find . -type d -name __pycache__ -delete 74 | find . -type f -name "*.pyc" -delete 75 | 76 | # Build package 77 | build: clean 78 | python -m build 79 | 80 | # Build documentation 81 | docs: 82 | mkdocs build 83 | 84 | # Serve documentation locally 85 | docs-serve: 86 | mkdocs serve 87 | 88 | # Release to PyPI 89 | release: build 90 | twine upload dist/* 91 | 92 | # Release to Test PyPI 93 | release-test: build 94 | twine upload --repository testpypi dist/* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | Pipfile.lock 88 | 89 | # PEP 582 90 | __pypackages__/ 91 | 92 | # Celery stuff 93 | celerybeat-schedule 94 | celerybeat.pid 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Midscene specific 127 | reports/ 128 | .midscene/ 129 | *.log 130 | 131 | # IDE 132 | .vscode/ 133 | .idea/ 134 | *.swp 135 | *.swo 136 | 137 | # OS 138 | .DS_Store 139 | Thumbs.db 140 | ======= 141 | # Build and Release Folders 142 | bin-debug/ 143 | bin-release/ 144 | [Oo]bj/ 145 | [Bb]in/ 146 | 147 | # Other files and folders 148 | .settings/ 149 | 150 | # 
Executables 151 | *.swf 152 | *.air 153 | *.ipa 154 | *.apk 155 | 156 | # Project files, i.e. `.project`, `.actionScriptProperties` and `.flexProperties` 157 | # should NOT be excluded as they contain compiler settings and other important 158 | # information for Eclipse / Flash Builder. 159 | >>>>>>> 2a066347ae84a69f9986cffe451aeae1a5364b10 160 | 161 | # YoYo AI version control directory 162 | .yoyo/ 163 | -------------------------------------------------------------------------------- /wiki/README.md: -------------------------------------------------------------------------------- 1 | # Midscene Python Wiki 2 | 3 | 欢迎来到 Midscene Python 的完整文档!这里提供了详细的使用指南、API 参考和最佳实践。 4 | 5 | ## 📚 文档导航 6 | 7 | ### 基础入门 8 | - [项目概述](项目概述.md) - 了解 Midscene Python 的核心理念和特性 9 | - [快速开始](快速开始.md) - 5分钟快速上手指南 10 | - [安装配置](安装配置.md) - 详细的安装和环境配置说明 11 | 12 | ### 核心概念 13 | - [Agent 核心控制器](核心概念/Agent核心控制器.md) - 理解 Agent 的工作原理 14 | - [Insight UI理解引擎](核心概念/Insight-UI理解引擎.md) - AI 驱动的 UI 理解和操作 15 | - [AI模型服务抽象层](核心概念/AI模型服务抽象层.md) - 多种 AI 模型的统一接口 16 | - [UI上下文与数据模型](核心概念/UI上下文与数据模型.md) - 理解数据流和上下文管理 17 | 18 | ### API 参考 19 | - [Agent API](API参考/Agent-API.md) - Agent 类的完整 API 文档 20 | - [Insight API](API参考/Insight-API.md) - Insight 引擎的 API 参考 21 | - [AIModelService API](API参考/AIModelService-API.md) - AI 模型服务的接口说明 22 | 23 | ### 平台集成 24 | - [Web自动化](平台集成/Web自动化/README.md) - Web 平台自动化完整指南 25 | - [Selenium集成](平台集成/Web自动化/Selenium集成.md) - Selenium WebDriver 集成 26 | - [Playwright集成](平台集成/Web自动化/Playwright集成.md) - Playwright 集成指南 27 | - [Web桥接机制](平台集成/Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象层 28 | - [Android自动化](平台集成/Android自动化.md) - Android 设备自动化指南 29 | 30 | ### AI 模型配置 31 | - [配置方法](AI模型配置/配置方法.md) - AI 模型的基础配置 32 | - [支持的AI提供商](AI模型配置/支持的AI提供商/README.md) - 所有支持的 AI 服务商 33 | - [OpenAI提供商](AI模型配置/支持的AI提供商/OpenAI提供商.md) - GPT-4V 等模型配置 34 | - [Anthropic提供商](AI模型配置/支持的AI提供商/Anthropic提供商.md) - Claude 模型配置 35 | - [通义千问提供商](AI模型配置/支持的AI提供商/通义千问提供商.md) - Qwen2.5-VL 模型配置 36 | - [Gemini提供商](AI模型配置/支持的AI提供商/Gemini提供商.md) - Google Gemini 模型配置 37 | - [高级选项](AI模型配置/高级选项/README.md) - 高级配置和优化 38 | - [缓存策略](AI模型配置/高级选项/缓存策略.md) - 智能缓存机制 39 | - [请求重试与超时控制](AI模型配置/高级选项/请求重试与超时控制.md) - 网络请求优化 40 | - [配额管理与节流控制](AI模型配置/高级选项/配额管理与节流控制.md) - 成本控制和速率限制 41 | - [流式响应处理](AI模型配置/高级选项/流式响应处理.md) - 实时响应处理 42 | - [性能调优技巧](AI模型配置/高级选项/性能调优技巧.md) - 性能优化最佳实践 43 | 44 | ### 高级特性 45 | - [智能缓存机制](高级特性/智能缓存机制.md) - 提升执行效率的缓存系统 46 | - [可视化报告系统](高级特性/可视化报告系统.md) - 详细的执行报告和调试信息 47 | - [CLI工具高级用法](高级特性/CLI工具高级用法.md) - 命令行工具的进阶使用 48 | 49 | ### 开发指南 50 | - [贡献指南](开发指南/贡献指南.md) - 如何参与项目开发 51 | - [架构设计](开发指南/架构设计.md) - 深入理解项目架构 52 | - [开发环境配置](开发指南/开发环境配置.md) - 搭建开发环境 53 | - [测试指南](开发指南/测试指南.md) - 单元测试和集成测试 54 | 55 | ### 故障排除 56 | - [常见问题](故障排除/常见问题.md) - FAQ 和解决方案 57 | - [调试技巧](故障排除/调试技巧.md) - 调试和问题定位方法 58 | - [错误代码参考](故障排除/错误代码参考.md) - 错误代码含义和解决方案 59 | 60 | ### 示例和教程 61 | - [基础示例](示例和教程/基础示例.md) - 入门级使用示例 62 | - [高级应用场景](示例和教程/高级应用场景.md) - 复杂场景的实现方案 63 | - [最佳实践](示例和教程/最佳实践.md) - 生产环境使用建议 64 | 65 | ## 🚀 快速链接 66 | 67 | - **新手入门**: [快速开始](快速开始.md) → [基础示例](示例和教程/基础示例.md) 68 | - **API 查询**: [Agent API](API参考/Agent-API.md) → [Insight API](API参考/Insight-API.md) 69 | - **平台集成**: [Web自动化](平台集成/Web自动化/README.md) → [Android自动化](平台集成/Android自动化.md) 70 | - **问题解决**: [常见问题](故障排除/常见问题.md) → [调试技巧](故障排除/调试技巧.md) 71 | 72 | ## 📖 文档维护 73 | 74 | 本文档随项目持续更新,如发现内容错误或需要补充,请提交 Issue 或 Pull Request。 75 | 76 | --- 77 | 78 | *最后更新: 2025-09-02* -------------------------------------------------------------------------------- /wiki/核心概念/README.md: 
-------------------------------------------------------------------------------- 1 | # 核心概念 2 | 3 | Midscene Python 的核心概念文档,深入解析框架的关键组件和设计理念。 4 | 5 | ## 📖 目录概览 6 | 7 | 本章节包含以下核心概念文档: 8 | 9 | ### [Agent 核心控制器](Agent核心控制器.md) 10 | Agent 是 Midscene Python 的核心控制器,提供统一的自动化操作接口。了解 Agent 的工作原理、生命周期管理和高级配置。 11 | 12 | **主要内容**: 13 | - Agent 架构设计 14 | - 操作类型和方法 15 | - 选项配置和自定义 16 | - 生命周期管理 17 | 18 | ### [Insight UI理解引擎](Insight-UI理解引擎.md) 19 | Insight 是 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。深入理解 AI 如何理解和操作界面。 20 | 21 | **主要内容**: 22 | - UI 理解机制 23 | - 智能元素定位 24 | - 操作策略生成 25 | - 上下文分析 26 | 27 | ### [AI模型服务抽象层](AI模型服务抽象层.md) 28 | 统一的 AI 模型服务接口,支持多种 AI 提供商。了解如何配置和切换不同的 AI 模型。 29 | 30 | **主要内容**: 31 | - 服务抽象设计 32 | - 提供商适配 33 | - 模型选择策略 34 | - 性能优化 35 | 36 | ### [UI上下文与数据模型](UI上下文与数据模型.md) 37 | 理解 Midscene Python 中的数据流、上下文管理和类型系统。 38 | 39 | **主要内容**: 40 | - 数据模型定义 41 | - 上下文传递机制 42 | - 类型安全保证 43 | - 序列化和反序列化 44 | 45 | ## 🏗️ 整体架构关系 46 | 47 | ```mermaid 48 | graph TB 49 | A[用户代码] --> B[Agent 核心控制器] 50 | B --> C[Insight UI理解引擎] 51 | C --> D[AI模型服务抽象层] 52 | C --> E[UI上下文与数据模型] 53 | E --> F[平台适配层] 54 | F --> G[底层驱动] 55 | 56 | subgraph "核心概念" 57 | B 58 | C 59 | D 60 | E 61 | end 62 | 63 | subgraph "平台支持" 64 | F 65 | G 66 | end 67 | ``` 68 | 69 | ## 🔄 数据流向 70 | 71 | 1. **用户请求** → Agent 接收自然语言指令 72 | 2. **指令解析** → Insight 分析指令意图和页面状态 73 | 3. **AI 推理** → AIModelService 调用 AI 模型进行决策 74 | 4. **上下文构建** → UIContext 封装页面信息和操作结果 75 | 5. **操作执行** → 通过平台适配层执行具体操作 76 | 6. **结果反馈** → 返回执行结果和状态信息 77 | 78 | ## 🎯 设计原则 79 | 80 | ### 1. 抽象化原则 81 | - 隐藏复杂的底层实现细节 82 | - 提供统一的高级接口 83 | - 支持多平台一致性操作 84 | 85 | ### 2. 可扩展原则 86 | - 模块化设计支持功能扩展 87 | - 插件化架构支持第三方集成 88 | - 开放的 API 设计 89 | 90 | ### 3. 智能化原则 91 | - AI 驱动的决策制定 92 | - 自适应的操作策略 93 | - 智能的错误处理和恢复 94 | 95 | ### 4. 类型安全原则 96 | - 完整的类型注解 97 | - 运行时类型验证 98 | - 强类型的数据模型 99 | 100 | ## 🧩 组件交互 101 | 102 | ### Agent ↔ Insight 103 | - Agent 委托 Insight 进行 AI 推理 104 | - Insight 返回操作计划和执行结果 105 | - 双向的状态同步和错误处理 106 | 107 | ### Insight ↔ AIModelService 108 | - Insight 构建 AI 模型请求 109 | - AIModelService 管理模型调用和响应 110 | - 支持多种模型的统一接口 111 | 112 | ### 所有组件 ↔ UIContext 113 | - 统一的数据模型和上下文管理 114 | - 类型安全的数据传递 115 | - 序列化和持久化支持 116 | 117 | ## 📚 学习路径 118 | 119 | ### 初学者路径 120 | 1. 开始阅读 [Agent 核心控制器](Agent核心控制器.md) 121 | 2. 理解 [UI上下文与数据模型](UI上下文与数据模型.md) 122 | 3. 深入 [Insight UI理解引擎](Insight-UI理解引擎.md) 123 | 4. 最后学习 [AI模型服务抽象层](AI模型服务抽象层.md) 124 | 125 | ### 高级开发者路径 126 | 1. 快速浏览所有核心概念 127 | 2. 重点关注架构设计和扩展机制 128 | 3. 深入研究 AI 模型集成和优化 129 | 4. 探索自定义扩展和插件开发 130 | 131 | ## 🔗 相关文档链接 132 | 133 | - **API 参考**: [Agent API](../API参考/Agent-API.md) | [Insight API](../API参考/Insight-API.md) 134 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md) 135 | - **配置指南**: [AI模型配置](../AI模型配置/配置方法.md) 136 | - **示例教程**: [基础示例](../示例和教程/基础示例.md) 137 | 138 | --- 139 | 140 | 选择你感兴趣的主题开始深入学习吧! 
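## 💡 附:数据流最小示例

下面用一段示意代码串联上文"数据流向"描述的过程(用户请求 → Agent → Insight/AI 模型 → 操作执行 → 结果反馈)。代码基于仓库中已有的 `Agent`、`SeleniumWebPage` 接口与环境变量配置方式,站点地址与指令内容仅为占位示例,具体参数请以 API 参考文档为准。

```python
import asyncio

from midscene import Agent
from midscene.web import SeleniumWebPage


async def demo() -> None:
    # AI 模型通过环境变量配置:MIDSCENE_AI_PROVIDER / MIDSCENE_AI_MODEL / MIDSCENE_AI_API_KEY
    with SeleniumWebPage.create(headless=True) as page:  # 平台适配层(Web / Selenium)
        agent = Agent(page)  # Agent 核心控制器,统一操作入口

        await page.navigate_to("https://example.com")  # 占位站点,仅作演示

        # Agent 委托 Insight 进行推理,Insight 经 AIModelService 调用具体 AI 模型
        await agent.ai_action("点击登录按钮")

        # 提取结果以结构化数据(UIContext/数据模型)形式返回
        title = await agent.ai_extract({"title": "页面标题"})
        print(title)

        # 断言由 AI 基于当前页面状态判断
        await agent.ai_assert("页面加载完成")


if __name__ == "__main__":
    asyncio.run(demo())
```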
-------------------------------------------------------------------------------- /wiki/生成状态.md: -------------------------------------------------------------------------------- 1 | # Midscene Python Wiki 生成状态 2 | 3 | ## ✅ 已完成的文档 4 | 5 | ### 核心文档 6 | - [x] **README.md** - Wiki 主页和导航 7 | - [x] **项目概述.md** - 项目介绍、特性和设计理念 8 | - [x] **快速开始.md** - 5分钟上手指南 9 | - [x] **安装配置.md** - 详细的安装和配置说明 10 | 11 | ### 核心概念 (4/4) 12 | - [x] **README.md** - 核心概念章节导航 13 | - [x] **Agent核心控制器.md** - Agent 类的完整说明 14 | - [x] **Insight-UI理解引擎.md** - AI 驱动的 UI 理解引擎 15 | - [x] **AI模型服务抽象层.md** - 多 AI 提供商统一接口 16 | - [x] **UI上下文与数据模型.md** - 数据类型和上下文管理 17 | 18 | ### 平台集成 (1/4) 19 | - [x] **README.md** - 平台集成总览 20 | 21 | ## 📋 待生成的文档结构 22 | 23 | 以下是基于项目分析确定的完整 wiki 结构: 24 | 25 | ``` 26 | wiki/ 27 | ├── README.md ✅ 28 | ├── 项目概述.md ✅ 29 | ├── 快速开始.md ✅ 30 | ├── 安装配置.md ✅ 31 | ├── 核心概念/ ✅ 32 | │ ├── README.md ✅ 33 | │ ├── Agent核心控制器.md ✅ 34 | │ ├── Insight-UI理解引擎.md ✅ 35 | │ ├── AI模型服务抽象层.md ✅ 36 | │ └── UI上下文与数据模型.md ✅ 37 | ├── API参考/ 38 | │ ├── Agent-API.md 39 | │ ├── Insight-API.md 40 | │ └── AIModelService-API.md 41 | ├── 平台集成/ (部分完成) 42 | │ ├── README.md ✅ 43 | │ ├── Web自动化/ 44 | │ │ ├── README.md 45 | │ │ ├── Selenium集成.md 46 | │ │ ├── Playwright集成.md 47 | │ │ └── Web桥接机制.md 48 | │ └── Android自动化.md 49 | ├── AI模型配置/ 50 | │ ├── 配置方法.md 51 | │ ├── 支持的AI提供商/ 52 | │ │ ├── README.md 53 | │ │ ├── OpenAI提供商.md 54 | │ │ ├── Anthropic提供商.md 55 | │ │ ├── 通义千问提供商.md 56 | │ │ └── Gemini提供商.md 57 | │ └── 高级选项/ 58 | │ ├── README.md 59 | │ ├── 缓存策略.md 60 | │ ├── 请求重试与超时控制.md 61 | │ ├── 配额管理与节流控制.md 62 | │ ├── 流式响应处理.md 63 | │ └── 性能调优技巧.md 64 | ├── 高级特性/ 65 | │ ├── 智能缓存机制.md 66 | │ ├── 可视化报告系统.md 67 | │ └── CLI工具高级用法.md 68 | ├── 开发指南/ 69 | │ ├── 贡献指南.md 70 | │ ├── 架构设计.md 71 | │ ├── 开发环境配置.md 72 | │ └── 测试指南.md 73 | ├── 故障排除/ 74 | │ ├── 常见问题.md 75 | │ ├── 调试技巧.md 76 | │ └── 错误代码参考.md 77 | └── 示例和教程/ 78 | ├── 基础示例.md 79 | ├── 高级应用场景.md 80 | └── 最佳实践.md 81 | ``` 82 | 83 | ## 📊 生成进度 84 | 85 | - **总文档数**: 约 35-40 个 86 | - **已完成**: 9 个文档 87 | - **完成率**: ~25% 88 | - **核心文档覆盖率**: 100% (最重要的概念文档已完成) 89 | 90 | ## 🎯 已完成文档的特色 91 | 92 | ### 1. 完整性和深度 93 | - 每个核心概念都有详细的解释和示例 94 | - 包含架构图和代码示例 95 | - 涵盖最佳实践和常见问题 96 | 97 | ### 2. 结构化组织 98 | - 清晰的文档导航和交叉引用 99 | - 统一的文档格式和风格 100 | - 逐步深入的学习路径 101 | 102 | ### 3. 实用性 103 | - 大量可运行的代码示例 104 | - 实际使用场景和最佳实践 105 | - 详细的配置和选项说明 106 | 107 | ## 🔄 继续生成建议 108 | 109 | 如需继续生成剩余文档,建议按以下优先级: 110 | 111 | ### 优先级 1 (立即需要) 112 | - API参考文档 (Agent-API.md, Insight-API.md) 113 | - 平台集成详细文档 (Selenium集成.md, Android自动化.md) 114 | 115 | ### 优先级 2 (重要) 116 | - AI模型配置文档 117 | - 示例和教程文档 118 | 119 | ### 优先级 3 (补充) 120 | - 故障排除文档 121 | - 开发指南文档 122 | 123 | ## 💡 使用建议 124 | 125 | 当前已生成的文档已经覆盖了 Midscene Python 的核心概念和基础使用。用户可以通过以下路径开始学习: 126 | 127 | 1. **新手路径**: README.md → 项目概述.md → 快速开始.md 128 | 2. **开发者路径**: 安装配置.md → 核心概念/ → 平台集成/ 129 | 3. 
**深入理解**: 核心概念/ 所有文档 → AI模型配置/ 130 | 131 | ## 📝 文档质量 132 | 133 | 已生成的文档具备以下特点: 134 | - ✅ 完整的代码示例 135 | - ✅ 详细的配置说明 136 | - ✅ 架构图和流程图 137 | - ✅ 最佳实践指导 138 | - ✅ 错误处理建议 139 | - ✅ 性能优化技巧 140 | - ✅ 跨文档引用链接 141 | 142 | 这些文档为用户提供了全面理解和使用 Midscene Python 框架的基础。 -------------------------------------------------------------------------------- /midscene/android/agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Android Agent implementation 3 | """ 4 | 5 | from typing import Optional 6 | 7 | from ..core.agent import Agent, AgentOptions 8 | from .device import AndroidDevice 9 | 10 | 11 | class AndroidAgent(Agent[AndroidDevice]): 12 | """Android-specific agent implementation""" 13 | 14 | def __init__(self, device: AndroidDevice, options: Optional[AgentOptions] = None): 15 | """Initialize Android agent 16 | 17 | Args: 18 | device: AndroidDevice instance 19 | options: Agent options 20 | """ 21 | super().__init__(device, options) 22 | 23 | # Validate that we have vision language model support for Android 24 | # Android requires VL models for UI understanding 25 | 26 | @classmethod 27 | async def create( 28 | cls, 29 | device_id: Optional[str] = None, 30 | options: Optional[AgentOptions] = None 31 | ) -> 'AndroidAgent': 32 | """Create Android agent with device 33 | 34 | Args: 35 | device_id: Android device ID, if None uses first available 36 | options: Agent options 37 | 38 | Returns: 39 | AndroidAgent instance 40 | """ 41 | device = await AndroidDevice.create(device_id) 42 | return cls(device, options) 43 | 44 | async def launch_app(self, package_name: str, activity: Optional[str] = None) -> None: 45 | """Launch Android app 46 | 47 | Args: 48 | package_name: App package name 49 | activity: Optional activity name 50 | """ 51 | await self.interface.launch_app(package_name, activity) 52 | 53 | async def stop_app(self, package_name: str) -> None: 54 | """Stop Android app 55 | 56 | Args: 57 | package_name: App package name 58 | """ 59 | await self.interface.stop_app(package_name) 60 | 61 | async def install_app(self, apk_path: str) -> None: 62 | """Install Android app 63 | 64 | Args: 65 | apk_path: Path to APK file 66 | """ 67 | await self.interface.install_app(apk_path) 68 | 69 | async def back(self) -> None: 70 | """Press back button""" 71 | await self.interface.back() 72 | 73 | async def home(self) -> None: 74 | """Press home button""" 75 | await self.interface.home() 76 | 77 | async def recent(self) -> None: 78 | """Press recent apps button""" 79 | await self.interface.recent() 80 | 81 | async def swipe( 82 | self, 83 | start_x: float, start_y: float, 84 | end_x: float, end_y: float, 85 | duration: int = 300 86 | ) -> None: 87 | """Swipe gesture 88 | 89 | Args: 90 | start_x: Start X coordinate 91 | start_y: Start Y coordinate 92 | end_x: End X coordinate 93 | end_y: End Y coordinate 94 | duration: Swipe duration in milliseconds 95 | """ 96 | await self.interface.swipe(start_x, start_y, end_x, end_y, duration) 97 | 98 | async def long_press(self, x: float, y: float, duration: int = 1000) -> None: 99 | """Long press gesture 100 | 101 | Args: 102 | x: X coordinate 103 | y: Y coordinate 104 | duration: Press duration in milliseconds 105 | """ 106 | await self.interface.long_press(x, y, duration) -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic usage examples for Midscene Python 3 | """ 4 | 5 | import 
asyncio 6 | from midscene import Agent 7 | from midscene.web import SeleniumWebPage 8 | from midscene.android import AndroidAgent 9 | 10 | 11 | async def web_automation_example(): 12 | """Basic web automation example""" 13 | print("🌐 Web Automation Example") 14 | 15 | # Create web page instance 16 | with SeleniumWebPage.create(headless=False) as page: 17 | # Create agent 18 | agent = Agent(page) 19 | 20 | # Navigate to website 21 | await page.navigate_to("https://example.com") 22 | 23 | # Use AI to interact with the page 24 | await agent.ai_action("点击登录按钮") 25 | await agent.ai_action("在用户名输入框输入 'demo@example.com'") 26 | await agent.ai_action("在密码输入框输入 'password123'") 27 | await agent.ai_action("点击提交按钮") 28 | 29 | # Extract data using AI 30 | user_info = await agent.ai_extract({ 31 | "username": "用户名", 32 | "email": "邮箱地址", 33 | "last_login": "最后登录时间" 34 | }) 35 | print(f"提取的用户信息: {user_info}") 36 | 37 | # Assert page state 38 | await agent.ai_assert("页面显示欢迎信息") 39 | 40 | print("✅ Web automation completed successfully!") 41 | 42 | 43 | async def android_automation_example(): 44 | """Basic Android automation example""" 45 | print("📱 Android Automation Example") 46 | 47 | try: 48 | # Create Android agent 49 | agent = await AndroidAgent.create() 50 | 51 | # Launch app 52 | await agent.launch_app("com.android.settings") 53 | 54 | # Use AI to navigate 55 | await agent.ai_action("点击WLAN设置") 56 | await agent.ai_action("滑动到底部") 57 | 58 | # Extract information 59 | wifi_list = await agent.ai_extract({ 60 | "available_networks": [ 61 | {"name": "网络名称", "security": "安全类型", "signal": "信号强度"} 62 | ] 63 | }) 64 | print(f"可用WiFi网络: {wifi_list}") 65 | 66 | # Go back 67 | await agent.back() 68 | 69 | print("✅ Android automation completed successfully!") 70 | 71 | except Exception as e: 72 | print(f"❌ Android automation failed: {e}") 73 | 74 | 75 | async def playwright_example(): 76 | """Playwright integration example""" 77 | print("🎭 Playwright Example") 78 | 79 | from midscene.web import PlaywrightWebPage 80 | 81 | # Create Playwright page 82 | async with await PlaywrightWebPage.create(headless=False) as page: 83 | agent = Agent(page) 84 | 85 | # Navigate and interact 86 | await page.navigate_to("https://playwright.dev") 87 | 88 | # Use AI for navigation 89 | await agent.ai_action("点击文档链接") 90 | await agent.ai_action("搜索 'getting started'") 91 | 92 | # Extract page information 93 | page_info = await agent.ai_extract({ 94 | "title": "页面标题", 95 | "description": "页面描述", 96 | "sections": ["主要章节列表"] 97 | }) 98 | print(f"页面信息: {page_info}") 99 | 100 | print("✅ Playwright example completed!") 101 | 102 | 103 | async def main(): 104 | """Run all examples""" 105 | print("🚀 Midscene Python Examples\n") 106 | 107 | # Web automation with Selenium 108 | await web_automation_example() 109 | print() 110 | 111 | # Playwright example 112 | await playwright_example() 113 | print() 114 | 115 | # Android automation (if device available) 116 | await android_automation_example() 117 | 118 | 119 | if __name__ == "__main__": 120 | asyncio.run(main()) -------------------------------------------------------------------------------- /midscene/cli/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI configuration management 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Optional, Dict, Any 7 | 8 | import yaml 9 | from pydantic import BaseModel, Field 10 | 11 | 12 | class WebConfig(BaseModel): 13 | """Web automation configuration""" 14 | browser: str = "chrome" 15 
| headless: bool = False 16 | window_size: tuple[int, int] = (1920, 1080) 17 | user_data_dir: Optional[str] = None 18 | timeout: int = 30 19 | 20 | 21 | class AndroidConfig(BaseModel): 22 | """Android automation configuration""" 23 | device_id: Optional[str] = None 24 | adb_path: str = "adb" 25 | auto_dismiss_keyboard: bool = True 26 | timeout: int = 30 27 | 28 | 29 | class AIConfig(BaseModel): 30 | """AI model configuration""" 31 | provider: str = "openai" 32 | model: str = "gpt-4-vision-preview" 33 | api_key: Optional[str] = None 34 | base_url: Optional[str] = None 35 | max_tokens: int = 4000 36 | temperature: float = 0.1 37 | 38 | 39 | class ExecutionConfig(BaseModel): 40 | """Execution configuration""" 41 | concurrent: int = 1 42 | continue_on_error: bool = False 43 | generate_report: bool = True 44 | report_format: str = "html" 45 | output_dir: str = "./reports" 46 | 47 | 48 | class CLIConfig(BaseModel): 49 | """CLI configuration""" 50 | web: WebConfig = Field(default_factory=WebConfig) 51 | android: AndroidConfig = Field(default_factory=AndroidConfig) 52 | ai: AIConfig = Field(default_factory=AIConfig) 53 | execution: ExecutionConfig = Field(default_factory=ExecutionConfig) 54 | 55 | @classmethod 56 | def load(cls, config_path: Optional[str] = None) -> 'CLIConfig': 57 | """Load configuration from file 58 | 59 | Args: 60 | config_path: Path to configuration file 61 | 62 | Returns: 63 | CLIConfig instance 64 | """ 65 | if not config_path: 66 | # Look for default config files 67 | for default_path in ["midscene.yml", "midscene.yaml", ".midscene.yml"]: 68 | if Path(default_path).exists(): 69 | config_path = default_path 70 | break 71 | 72 | if not config_path or not Path(config_path).exists(): 73 | # Return default configuration 74 | return cls() 75 | 76 | with open(config_path, 'r', encoding='utf-8') as f: 77 | config_data = yaml.safe_load(f) 78 | 79 | return cls(**config_data) 80 | 81 | def save(self, config_path: str) -> None: 82 | """Save configuration to file 83 | 84 | Args: 85 | config_path: Path to save configuration 86 | """ 87 | config_data = self.model_dump() 88 | 89 | with open(config_path, 'w', encoding='utf-8') as f: 90 | yaml.dump(config_data, f, default_flow_style=False, allow_unicode=True) 91 | 92 | def to_env_vars(self) -> Dict[str, str]: 93 | """Convert configuration to environment variables 94 | 95 | Returns: 96 | Dictionary of environment variables 97 | """ 98 | env_vars = {} 99 | 100 | # AI configuration 101 | if self.ai.api_key: 102 | env_vars['MIDSCENE_AI_API_KEY'] = self.ai.api_key 103 | env_vars['MIDSCENE_AI_PROVIDER'] = self.ai.provider 104 | env_vars['MIDSCENE_AI_MODEL'] = self.ai.model 105 | if self.ai.base_url: 106 | env_vars['MIDSCENE_AI_BASE_URL'] = self.ai.base_url 107 | 108 | # Execution configuration 109 | env_vars['MIDSCENE_CONCURRENT'] = str(self.execution.concurrent) 110 | env_vars['MIDSCENE_CONTINUE_ON_ERROR'] = str(self.execution.continue_on_error).lower() 111 | env_vars['MIDSCENE_GENERATE_REPORT'] = str(self.execution.generate_report).lower() 112 | 113 | return env_vars -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "midscene-python" 7 | description = "基于AI的Web和Android自动化框架,支持自然语言驱动的UI操作" 8 | readme = "README.md" 9 | license = "MIT" 10 | authors = [ 11 | { name = "Midscene Team" }, 12 | ] 13 | keywords = 
["automation", "AI", "web", "android", "testing", "ui", "selenium", "playwright"] 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: MIT License", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Topic :: Software Development :: Testing", 24 | "Topic :: Software Development :: Libraries :: Python Modules", 25 | ] 26 | requires-python = ">=3.9" 27 | dependencies = [ 28 | "pydantic>=2.0,<3.0", 29 | "selenium>=4.15.0,<5.0", 30 | "playwright>=1.40.0,<2.0", 31 | "opencv-python>=4.8.0,<5.0", 32 | "pillow>=10.0.0,<11.0", 33 | "numpy>=1.24.0,<2.0", 34 | "aiohttp>=3.9.0,<4.0", 35 | "loguru>=0.7.0,<1.0", 36 | "typer>=0.9.0,<1.0", 37 | "jinja2>=3.1.0,<4.0", 38 | "pyyaml>=6.0,<7.0", 39 | "httpx>=0.25.0,<1.0", 40 | "asyncio-mqtt", 41 | "pure-python-adb>=0.3.0dev0", 42 | "openai>=1.3.0,<2.0", 43 | "anthropic>=0.7.0,<1.0", 44 | "google-generativeai", 45 | "dashscope", 46 | ] 47 | version = "0.1.1" 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "pytest>=7.4.0", 52 | "pytest-asyncio>=0.21.0", 53 | "pytest-cov>=4.1.0", 54 | "black>=23.0.0", 55 | "isort>=5.12.0", 56 | "mypy>=1.5.0", 57 | "pre-commit>=3.4.0", 58 | "ruff>=0.1.0", 59 | ] 60 | docs = [ 61 | "mkdocs>=1.5.0", 62 | "mkdocs-material>=9.4.0", 63 | "mkdocstrings[python]>=0.23.0", 64 | ] 65 | 66 | [project.urls] 67 | Homepage = "https://github.com/Python51888/midscene-python.git" 68 | Repository = "https://github.com/Python51888/midscene-python.git" 69 | Documentation = "https://github.com/Python51888/Midscene-Python/blob/master/README.md" 70 | "Bug Tracker" = "https://github.com/Python51888/midscene-python.git/issues" 71 | 72 | [project.scripts] 73 | midscene = "midscene.cli:main" 74 | 75 | [tool.hatch.build.targets.wheel] 76 | packages = ["midscene"] 77 | 78 | 79 | 80 | [tool.black] 81 | line-length = 88 82 | target-version = ['py39'] 83 | include = '\.pyi?$' 84 | exclude = ''' 85 | /( 86 | \.eggs 87 | | \.git 88 | | \.hg 89 | | \.mypy_cache 90 | | \.tox 91 | | \.venv 92 | | _build 93 | | buck-out 94 | | build 95 | | dist 96 | )/ 97 | ''' 98 | 99 | [tool.isort] 100 | profile = "black" 101 | line_length = 88 102 | multi_line_output = 3 103 | include_trailing_comma = true 104 | force_grid_wrap = 0 105 | use_parentheses = true 106 | ensure_newline_before_comments = true 107 | 108 | [tool.mypy] 109 | python_version = "3.9" 110 | warn_return_any = true 111 | warn_unused_configs = true 112 | disallow_untyped_defs = true 113 | disallow_incomplete_defs = true 114 | check_untyped_defs = true 115 | disallow_untyped_decorators = true 116 | no_implicit_optional = true 117 | warn_redundant_casts = true 118 | warn_unused_ignores = true 119 | warn_no_return = true 120 | warn_unreachable = true 121 | strict_equality = true 122 | 123 | [tool.ruff] 124 | target-veersion = "py39" 125 | line-length = 88 126 | select = [ 127 | "E", # pycodestyle errors 128 | "W", # pycodestyle warnings 129 | "F", # pyflakes 130 | "I", # isort 131 | "B", # flake8-bugbear 132 | "C4", # flake8-comprehensions 133 | "UP", # pyupgrade 134 | ] 135 | ignore = [ 136 | "E501", # line too long, handled by black 137 | "B008", # do not perform function calls in argument defaults 138 | "C901", # too complex 139 | ] 140 | 141 | [tool.ruff.per-file-ignores] 142 | "__init__.py" = ["F401"] 143 | 144 | [tool.pytest.ini_options] 
145 | testpaths = ["tests"] 146 | python_files = ["test_*.py", "*_test.py"] 147 | python_classes = ["Test*"] 148 | python_functions = ["test_*"] 149 | addopts = [ 150 | "-v", 151 | "--strict-markers", 152 | "--strict-config", 153 | "--cov=midscene", 154 | "--cov-report=term-missing", 155 | "--cov-report=html", 156 | ] 157 | markers = [ 158 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 159 | "integration: marks tests as integration tests", 160 | "unit: marks tests as unit tests", 161 | ] -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 | # Midscene Python [![zread](https://img.shields.io/badge/Ask_Zread-_.svg?style=flat&color=00b0aa&labelColor=000000&logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTQuOTYxNTYgMS42MDAxSDIuMjQxNTZDMS44ODgxIDEuNjAwMSAxLjYwMTU2IDEuODg2NjQgMS42MDE1NiAyLjI0MDFWNC45NjAxQzEuNjAxNTYgNS4zMTM1NiAxLjg4ODEgNS42MDAxIDIuMjQxNTYgNS42MDAxSDQuOTYxNTZDNS4zMTUwMiA1LjYwMDEgNS42MDE1NiA1LjMxMzU2IDUuNjAxNTYgNC45NjAxVjIuMjQwMUM1LjYwMTU2IDEuODg2NjQgNS4zMTUwMiAxLjYwMDEgNC45NjE1NiAxLjYwMDFaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00Ljk2MTU2IDEwLjM5OTlIMi4yNDE1NkMxLjg4ODEgMTAuMzk5OSAxLjYwMTU2IDEwLjY4NjQgMS42MDE1NiAxMS4wMzk5VjEzLjc1OTlDMS42MDE1NiAxNC4xMTM0IDEuODg4MSAxNC4zOTk5IDIuMjQxNTYgMTQuMzk5OUg0Ljk2MTU2QzUuMzE1MDIgMTQuMzk5OSA1LjYwMTU2IDE0LjExMzQgNS42MDE1NiAxMy43NTk5VjExLjAzOTlDNS42MDE1NiAxMC42ODY0IDUuMzE1MDIgMTAuMzk5OSA0Ljk2MTU2IDEwLjM5OTlaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik0xMy43NTg0IDEuNjAwMUgxMS4wMzg0QzEwLjY4NSAxLjYwMDEgMTAuMzk4NCAxLjg4NjY0IDEwLjM5ODQgMi4yNDAxVjQuOTYwMUMxMC4zOTg0IDUuMzEzNTYgMTAuNjg1IDUuNjAwMSAxMS4wMzg0IDUuNjAwMUgxMy43NTg0QzE0LjExMTkgNS42MDAxIDE0LjM5ODQgNS4zMTM1NiAxNC4zOTg0IDQuOTYwMVYyLjI0MDFDMTQuMzk4NCAxLjg4NjY0IDE0LjExMTkgMS42MDAxIDEzLjc1ODQgMS42MDAxWiIgZmlsbD0iI2ZmZiIvPgo8cGF0aCBkPSJNNCAxMkwxMiA0TDQgMTJaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00IDEyTDEyIDQiIHN0cm9rZT0iI2ZmZiIgc3Ryb2tlLXdpZHRoPSIxLjUiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIvPgo8L3N2Zz4K&logoColor=ffffff)](https://zread.ai/Python51888/Midscene-Python) 2 | 3 | [English](README.md) | [中文](README.zh.md)  4 | 5 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。 6 | 7 | ## 概述 8 | 9 | Midscene Python 提供全面的 UI 自动化能力,具有以下核心特性: 10 | 11 | - **自然语言驱动**:使用自然语言描述自动化任务 12 | - **多平台支持**:支持 Web(Selenium/Playwright)和 Android(ADB) 13 | - **AI 模型集成**:支持 GPT-4V、Qwen2.5-VL、Gemini 等多种视觉语言模型 14 | - **可视化调试**:提供详细的执行报告和调试信息 15 | - **缓存机制**:智能缓存提升执行效率 16 | 17 | ## 项目架构 18 | 19 | ``` 20 | midscene-python/ 21 | ├── midscene/ # 核心框架 22 | │ ├── core/ # 核心框架 23 | │ │ ├── agent/ # Agent系统 24 | │ │ ├── insight/ # AI推理引擎 25 | │ │ ├── ai_model/ # AI模型集成 26 | │ │ ├── yaml/ # YAML脚本执行器 27 | │ │ └── types.py # 核心类型定义 28 | │ ├── web/ # Web集成 29 | │ │ ├── selenium/ # Selenium集成 30 | │ │ ├── playwright/ # Playwright集成 31 | │ │ └── bridge/ # Bridge模式 32 | │ ├── android/ # Android集成 33 | │ │ ├── device.py # 设备管理 34 | │ │ └── agent.py # Android Agent 35 | │ ├── cli/ # 命令行工具 36 | │ ├── mcp/ # MCP协议支持 37 | │ ├── shared/ # 共享工具 38 | │ └── visualizer/ # 可视化报告 39 | ├── examples/ # 示例代码 40 | ├── tests/ # 测试用例 41 | └── docs/ # 文档 42 | ``` 43 | 44 | ## 技术栈 45 | 46 | - **Python 3.9+**:核心运行环境 47 | - **Pydantic**:数据验证和序列化 48 | - **Selenium/Playwright**:Web 自动化 49 | - **OpenCV/Pillow**:图像处理 50 | - **HTTPX/AIOHTTP**:HTTP 客户端 51 | - **Typer**:CLI 框架 52 | - **Loguru**:日志记录 53 | 
54 | ## 快速开始 55 | 56 | ### 安装 57 | 58 | ```bash 59 | pip install midscene-python 60 | ``` 61 | 62 | ### 基础用法 63 | 64 | ```python 65 | from midscene import Agent 66 | from midscene.web import SeleniumWebPage 67 | 68 | # 创建 Web Agent 69 | with SeleniumWebPage.create() as page: 70 | agent = Agent(page) 71 | 72 | # 使用自然语言进行自动化操作 73 | await agent.ai_action("点击登录按钮") 74 | await agent.ai_action("输入用户名 'test@example.com'") 75 | await agent.ai_action("输入密码 'password123'") 76 | await agent.ai_action("点击提交按钮") 77 | 78 | # 数据提取 79 | user_info = await agent.ai_extract("提取用户个人信息") 80 | 81 | # 断言验证 82 | await agent.ai_assert("页面显示欢迎信息") 83 | ``` 84 | 85 | ## 主要特性 86 | 87 | ### 🤖 AI 驱动的自动化 88 | 89 | 使用自然语言描述操作,AI 自动理解并执行: 90 | 91 | ```python 92 | await agent.ai_action("在搜索框中输入'Python教程'并搜索") 93 | ``` 94 | 95 | ### 🔍 智能元素定位 96 | 97 | 支持多种定位策略,自动选择最优方案: 98 | 99 | ```python 100 | element = await agent.ai_locate("登录按钮") 101 | ``` 102 | 103 | ### 📊 数据提取 104 | 105 | 从页面提取结构化数据: 106 | 107 | ```python 108 | products = await agent.ai_extract({ 109 | "products": [ 110 | {"name": "产品名称", "price": "价格", "rating": "评分"} 111 | ] 112 | }) 113 | ``` 114 | 115 | ### ✅ 智能断言 116 | 117 | AI 理解页面状态,进行智能断言: 118 | 119 | ```python 120 | await agent.ai_assert("用户已成功登录") 121 | ``` 122 | 123 | ### 📝 致谢 124 | 125 | 感谢Midscene项目:https://github.com/web-infra-dev/midscene 提供的灵感和技术参考 126 | 127 | ## 许可证 128 | 129 | MIT License 130 | -------------------------------------------------------------------------------- /wiki/项目概述.md: -------------------------------------------------------------------------------- 1 | # 项目概述 2 | 3 | ## 什么是 Midscene Python? 4 | 5 | Midscene Python 是一个革命性的基于 AI 的自动化框架,专为 Web 和 Android 平台的 UI 自动化操作而设计。它的核心理念是**让自动化变得像说话一样简单**。 6 | 7 | ## 🎯 设计理念 8 | 9 | ### 自然语言驱动 10 | 传统的自动化工具需要开发者学习复杂的 API 和选择器语法。Midscene Python 打破了这一限制,让你可以用自然语言描述想要执行的操作: 11 | 12 | ```python 13 | # 传统方式 14 | driver.find_element(By.XPATH, "//button[@class='login-btn' and contains(text(), '登录')]").click() 15 | 16 | # Midscene Python 方式 17 | await agent.ai_action("点击登录按钮") 18 | ``` 19 | 20 | ### AI 驱动的智能决策 21 | Midscene Python 集成了先进的视觉语言模型(VLM),能够: 22 | - 理解页面结构和元素关系 23 | - 智能选择最佳的操作策略 24 | - 适应页面变化和布局差异 25 | - 提供人性化的错误提示 26 | 27 | ### 多平台统一接口 28 | 无论是 Web 应用还是 Android 应用,Midscene Python 都提供了一致的编程接口: 29 | 30 | ```python 31 | # Web 自动化 32 | web_agent = Agent(selenium_page) 33 | await web_agent.ai_action("在搜索框输入'Python教程'") 34 | 35 | # Android 自动化 36 | android_agent = Agent(android_device) 37 | await android_agent.ai_action("在搜索框输入'Python教程'") 38 | ``` 39 | 40 | ## 🌟 核心特性 41 | 42 | ### 1. 自然语言操作 43 | - **直观表达**: 用日常语言描述操作意图 44 | - **智能理解**: AI 自动理解复杂的操作逻辑 45 | - **上下文感知**: 结合页面状态做出最佳决策 46 | 47 | ### 2. 智能元素定位 48 | - **多策略融合**: 自动选择最优的定位方法 49 | - **容错能力**: 适应页面变化和元素移动 50 | - **语义理解**: 基于元素功能而非位置进行定位 51 | 52 | ### 3. 结构化数据提取 53 | ```python 54 | # 提取商品信息 55 | products = await agent.ai_extract({ 56 | "products": [ 57 | { 58 | "name": "商品名称", 59 | "price": "价格", 60 | "rating": "评分", 61 | "availability": "库存状态" 62 | } 63 | ] 64 | }) 65 | ``` 66 | 67 | ### 4. 智能断言验证 68 | ```python 69 | # 验证页面状态 70 | await agent.ai_assert("用户已成功登录并显示欢迎消息") 71 | await agent.ai_assert("购物车中有3件商品") 72 | ``` 73 | 74 | ### 5. 
可视化调试 75 | - **执行截图**: 每步操作都有详细的视觉记录 76 | - **决策过程**: 展示 AI 的思考和决策过程 77 | - **错误定位**: 准确指出失败原因和位置 78 | 79 | ## 🏗️ 架构概览 80 | 81 | Midscene Python 采用分层架构设计: 82 | 83 | ``` 84 | ┌─────────────────────────────────────────┐ 85 | │ 用户应用层 │ 86 | ├─────────────────────────────────────────┤ 87 | │ Agent 控制层 │ ← 统一的操作接口 88 | ├─────────────────────────────────────────┤ 89 | │ Insight AI 引擎 │ ← AI 理解和决策 90 | ├─────────────────────────────────────────┤ 91 | │ 平台适配层 │ ← Web/Android 桥接 92 | ├─────────────────────────────────────────┤ 93 | │ 底层驱动层 │ ← Selenium/Playwright/ADB 94 | └─────────────────────────────────────────┘ 95 | ``` 96 | 97 | ### 核心组件 98 | 99 | - **Agent**: 用户操作的统一入口,提供高级 AI 驱动的 API 100 | - **Insight**: AI 理解引擎,负责页面分析和操作决策 101 | - **AIModelService**: AI 模型服务抽象层,支持多种 AI 提供商 102 | - **Platform Bridges**: 平台桥接层,统一不同平台的操作接口 103 | 104 | ## 🎮 使用场景 105 | 106 | ### 测试自动化 107 | ```python 108 | # E2E 测试 109 | await agent.ai_action("登录用户账号") 110 | await agent.ai_action("添加商品到购物车") 111 | await agent.ai_action("进入结算页面") 112 | await agent.ai_assert("显示正确的订单金额") 113 | ``` 114 | 115 | ### 数据爬取 116 | ```python 117 | # 智能数据提取 118 | news_data = await agent.ai_extract({ 119 | "articles": [ 120 | { 121 | "title": "标题", 122 | "author": "作者", 123 | "publish_date": "发布日期", 124 | "content_summary": "内容摘要" 125 | } 126 | ] 127 | }) 128 | ``` 129 | 130 | ### 业务流程自动化 131 | ```python 132 | # RPA 自动化 133 | await agent.ai_action("打开财务报表") 134 | await agent.ai_action("筛选本月数据") 135 | monthly_report = await agent.ai_extract("提取月度财务汇总数据") 136 | await agent.ai_action("生成并下载报告") 137 | ``` 138 | 139 | ### 应用监控 140 | ```python 141 | # 健康检查 142 | await agent.ai_assert("首页加载正常") 143 | await agent.ai_assert("用户登录功能正常") 144 | await agent.ai_assert("搜索功能返回结果") 145 | ``` 146 | 147 | ## 🆚 与传统工具的对比 148 | 149 | | 特性 | 传统自动化工具 | Midscene Python | 150 | |------|---------------|-----------------| 151 | | **学习曲线** | 陡峭,需要学习复杂 API | 平缓,自然语言驱动 | 152 | | **代码可读性** | 晦涩难懂 | 直观易懂 | 153 | | **维护成本** | 高,页面变化需要大量修改 | 低,AI 自动适应变化 | 154 | | **元素定位** | 手动编写选择器 | AI 智能定位 | 155 | | **错误处理** | 需要手动处理各种异常 | AI 自动重试和恢复 | 156 | | **跨平台** | 需要学习不同工具 | 统一接口 | 157 | 158 | ## 🛣️ 发展路线 159 | 160 | ### 当前版本 (v0.1.0) 161 | - ✅ 基础 Agent 和 Insight 功能 162 | - ✅ Web 平台支持 (Selenium/Playwright) 163 | - ✅ Android 平台支持 164 | - ✅ 多种 AI 模型集成 165 | - ✅ 基础缓存和报告功能 166 | 167 | ### 未来规划 168 | - 🔄 桌面应用自动化支持 169 | - 🔄 更多 AI 模型集成 170 | - 🔄 可视化测试编辑器 171 | - 🔄 云端执行服务 172 | - 🔄 团队协作功能 173 | 174 | ## 📈 性能特点 175 | 176 | - **执行效率**: 智能缓存机制减少重复的 AI 调用 177 | - **准确性**: 多重验证确保操作的可靠性 178 | - **稳定性**: 自动重试和错误恢复机制 179 | - **扩展性**: 模块化设计支持自定义扩展 180 | 181 | ## 🤝 社区与生态 182 | 183 | Midscene Python 是一个开源项目,欢迎社区贡献: 184 | 185 | - **GitHub**: [Python51888/midscene-python](https://github.com/Python51888/midscene-python.git) 186 | - **文档**: [Python51888/midscene-python](https://github.com/Python51888/Midscene-Python/blob/master/README.md) 187 | - **讨论**: GitHub Discussions 188 | - **问题反馈**: GitHub Issues 189 | 190 | --- 191 | 192 | 准备好开始你的 AI 自动化之旅了吗?查看 [快速开始](快速开始.md) 指南! 
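## 🧪 附:在 pytest 中运行的示意用例

上文"测试自动化"场景也可以直接写成 pytest 用例(项目开发依赖已包含 pytest 与 pytest-asyncio,pyproject 中也预定义了 integration 标记)。以下为最小示意,站点地址、账号与断言文案均为占位示例:

```python
import pytest

from midscene import Agent
from midscene.web import SeleniumWebPage


@pytest.mark.asyncio
@pytest.mark.integration
async def test_login_flow() -> None:
    # 占位站点与步骤,仅演示 ai_action / ai_assert 在测试中的用法
    with SeleniumWebPage.create(headless=True) as page:
        agent = Agent(page)

        await page.navigate_to("https://example.com/login")
        await agent.ai_action("输入用户名 'demo@example.com'")
        await agent.ai_action("输入密码 'password123'")
        await agent.ai_action("点击登录按钮")

        await agent.ai_assert("页面显示欢迎信息")
```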
-------------------------------------------------------------------------------- /midscene/core/ai_model/service.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI Model Service - Unified interface for different AI providers 3 | """ 4 | 5 | import json 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict, List, Optional, Type, Union 8 | 9 | import httpx 10 | from loguru import logger 11 | from pydantic import BaseModel 12 | 13 | from ..types import AIUsageInfo 14 | 15 | 16 | class AIModelConfig(BaseModel): 17 | """AI model configuration""" 18 | provider: str # openai, anthropic, qwen, gemini 19 | model: str 20 | api_key: str 21 | base_url: Optional[str] = None 22 | max_tokens: int = 4000 23 | temperature: float = 0.1 24 | timeout: int = 60 25 | 26 | 27 | class AIProvider(ABC): 28 | """Abstract base class for AI service providers""" 29 | 30 | @abstractmethod 31 | async def call( 32 | self, 33 | messages: List[Dict[str, Any]], 34 | config: AIModelConfig, 35 | response_schema: Optional[Type[BaseModel]] = None, 36 | **kwargs 37 | ) -> Dict[str, Any]: 38 | """Call AI service""" 39 | pass 40 | 41 | 42 | class AIModelService: 43 | """Unified AI model service interface""" 44 | 45 | def __init__(self): 46 | self.providers: Dict[str, AIProvider] = {} 47 | self._register_providers() 48 | 49 | def _register_providers(self): 50 | """Register available AI providers""" 51 | from .providers import ( 52 | OpenAIProvider, 53 | AnthropicProvider, 54 | QwenProvider, 55 | GeminiProvider 56 | ) 57 | 58 | self.providers['openai'] = OpenAIProvider() 59 | self.providers['anthropic'] = AnthropicProvider() 60 | self.providers['qwen'] = QwenProvider() 61 | self.providers['gemini'] = GeminiProvider() 62 | 63 | async def call_ai( 64 | self, 65 | messages: List[Dict[str, Any]], 66 | response_schema: Optional[Type[BaseModel]] = None, 67 | model_config: Optional[AIModelConfig] = None, 68 | **kwargs 69 | ) -> Dict[str, Any]: 70 | """Call AI model with unified interface""" 71 | config = model_config or self._get_default_config() 72 | provider = self.providers.get(config.provider) 73 | 74 | if not provider: 75 | raise ValueError(f"Unsupported provider: {config.provider}") 76 | 77 | try: 78 | logger.debug(f"Calling AI provider: {config.provider}") 79 | result = await provider.call( 80 | messages=messages, 81 | config=config, 82 | response_schema=response_schema, 83 | **kwargs 84 | ) 85 | return result 86 | except Exception as e: 87 | logger.error(f"AI call failed: {e}") 88 | raise 89 | 90 | def _get_default_config(self) -> AIModelConfig: 91 | """Get default configuration""" 92 | import os 93 | 94 | # Try to get from environment variables 95 | provider = os.getenv('MIDSCENE_AI_PROVIDER', 'openai') 96 | model = os.getenv('MIDSCENE_AI_MODEL', 'gpt-4-vision-preview') 97 | api_key = os.getenv('MIDSCENE_AI_API_KEY', '') 98 | base_url = os.getenv('MIDSCENE_AI_BASE_URL') 99 | 100 | if not api_key: 101 | raise ValueError( 102 | "AI API key not configured. Set MIDSCENE_AI_API_KEY environment variable." 
103 | ) 104 | 105 | return AIModelConfig( 106 | provider=provider, 107 | model=model, 108 | api_key=api_key, 109 | base_url=base_url 110 | ) 111 | 112 | 113 | def parse_json_response(content: str) -> Dict[str, Any]: 114 | """Parse JSON response from AI model""" 115 | try: 116 | # Try to parse as JSON directly 117 | return json.loads(content) 118 | except json.JSONDecodeError: 119 | # Try to extract JSON from code blocks 120 | import re 121 | json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', content, re.DOTALL) 122 | if json_match: 123 | try: 124 | return json.loads(json_match.group(1)) 125 | except json.JSONDecodeError: 126 | pass 127 | 128 | # Try to find JSON-like content 129 | json_match = re.search(r'\{.*\}', content, re.DOTALL) 130 | if json_match: 131 | try: 132 | return json.loads(json_match.group(0)) 133 | except json.JSONDecodeError: 134 | pass 135 | 136 | raise ValueError(f"Failed to parse JSON from response: {content}") 137 | 138 | 139 | def create_usage_info(usage_data: Dict[str, Any]) -> AIUsageInfo: 140 | """Create AIUsageInfo from provider response""" 141 | return AIUsageInfo( 142 | prompt_tokens=usage_data.get('prompt_tokens', 0), 143 | completion_tokens=usage_data.get('completion_tokens', 0), 144 | total_tokens=usage_data.get('total_tokens', 0), 145 | cost=usage_data.get('cost') 146 | ) -------------------------------------------------------------------------------- /scripts/validate_requirements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | 依赖验证脚本 4 | 验证生成的requirements.txt文件是否包含所有必要依赖 5 | """ 6 | 7 | import subprocess 8 | import sys 9 | import tempfile 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def run_command(cmd, check=True, capture_output=True): 15 | """运行命令并返回结果""" 16 | try: 17 | result = subprocess.run( 18 | cmd, 19 | shell=True, 20 | check=check, 21 | capture_output=capture_output, 22 | text=True 23 | ) 24 | return result 25 | except subprocess.CalledProcessError as e: 26 | print(f"命令执行失败: {cmd}") 27 | print(f"错误输出: {e.stderr}") 28 | sys.exit(1) 29 | 30 | 31 | def create_test_environment(): 32 | """创建临时测试环境""" 33 | print("=== 创建临时测试环境 ===") 34 | 35 | # 创建临时目录 36 | temp_dir = tempfile.mkdtemp(prefix="midscene_test_") 37 | print(f"临时目录: {temp_dir}") 38 | 39 | # 创建虚拟环境 40 | venv_path = os.path.join(temp_dir, "test_env") 41 | print("创建虚拟环境...") 42 | run_command(f"python -m venv {venv_path}") 43 | 44 | # 获取虚拟环境的Python路径 45 | if sys.platform == "win32": 46 | python_path = os.path.join(venv_path, "Scripts", "python.exe") 47 | pip_path = os.path.join(venv_path, "Scripts", "pip.exe") 48 | else: 49 | python_path = os.path.join(venv_path, "bin", "python") 50 | pip_path = os.path.join(venv_path, "bin", "pip") 51 | 52 | return temp_dir, python_path, pip_path 53 | 54 | 55 | def install_requirements(pip_path, requirements_file): 56 | """在测试环境中安装依赖""" 57 | print("=== 安装依赖包 ===") 58 | print(f"使用requirements文件: {requirements_file}") 59 | 60 | # 升级pip 61 | run_command(f'"{pip_path}" install --upgrade pip') 62 | 63 | # 安装依赖 64 | run_command(f'"{pip_path}" install -r "{requirements_file}"') 65 | print("依赖安装完成") 66 | 67 | 68 | def validate_imports(python_path): 69 | """验证核心包导入""" 70 | print("=== 验证包导入 ===") 71 | 72 | test_imports = [ 73 | "import midscene", 74 | "import pydantic", 75 | "import selenium", 76 | "import playwright", 77 | "import pytest", 78 | "import black", 79 | "import mkdocs", 80 | "import numpy", 81 | "import cv2", 82 | "import PIL", 83 | "import loguru", 84 | "import 
typer", 85 | "import httpx", 86 | "import aiohttp", 87 | "import openai", 88 | "import anthropic", 89 | ] 90 | 91 | for import_stmt in test_imports: 92 | try: 93 | print(f"测试: {import_stmt}") 94 | run_command(f'"{python_path}" -c "{import_stmt}"') 95 | print(f"✓ {import_stmt} - 成功") 96 | except: 97 | print(f"✗ {import_stmt} - 失败") 98 | return False 99 | 100 | return True 101 | 102 | 103 | def validate_cli_tools(python_path): 104 | """验证CLI工具可用性""" 105 | print("=== 验证CLI工具 ===") 106 | 107 | cli_tests = [ 108 | (f'"{python_path}" -m pytest --version', "pytest"), 109 | (f'"{python_path}" -m black --version', "black"), 110 | (f'"{python_path}" -m mkdocs --version', "mkdocs"), 111 | ] 112 | 113 | for cmd, tool_name in cli_tests: 114 | try: 115 | print(f"测试: {tool_name}") 116 | result = run_command(cmd) 117 | print(f"✓ {tool_name} - 可用") 118 | except: 119 | print(f"✗ {tool_name} - 不可用") 120 | return False 121 | 122 | return True 123 | 124 | 125 | def cleanup(temp_dir): 126 | """清理临时文件""" 127 | print("=== 清理临时文件 ===") 128 | try: 129 | import shutil 130 | shutil.rmtree(temp_dir) 131 | print(f"已删除临时目录: {temp_dir}") 132 | except Exception as e: 133 | print(f"清理失败: {e}") 134 | 135 | 136 | def main(): 137 | """主函数""" 138 | print("=== Midscene Python 依赖验证 ===\n") 139 | 140 | # 检查requirements.txt是否存在 141 | requirements_file = Path("requirements.txt") 142 | if not requirements_file.exists(): 143 | print("错误: requirements.txt 文件不存在") 144 | print("请先运行: make requirements-freeze") 145 | sys.exit(1) 146 | 147 | temp_dir = None 148 | try: 149 | # 创建测试环境 150 | temp_dir, python_path, pip_path = create_test_environment() 151 | 152 | # 安装依赖 153 | install_requirements(pip_path, requirements_file) 154 | 155 | # 验证导入 156 | if not validate_imports(python_path): 157 | print("\n❌ 包导入验证失败") 158 | sys.exit(1) 159 | 160 | # 验证CLI工具 161 | if not validate_cli_tools(python_path): 162 | print("\n❌ CLI工具验证失败") 163 | sys.exit(1) 164 | 165 | print("\n✅ 所有依赖验证通过!") 166 | print("requirements.txt 文件完整且可用") 167 | 168 | except KeyboardInterrupt: 169 | print("\n用户中断验证过程") 170 | sys.exit(1) 171 | except Exception as e: 172 | print(f"\n验证过程中出现错误: {e}") 173 | sys.exit(1) 174 | finally: 175 | if temp_dir: 176 | cleanup(temp_dir) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Midscene Python 
[![zread](https://img.shields.io/badge/Ask_Zread-_.svg?style=flat&color=00b0aa&labelColor=000000&logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTQuOTYxNTYgMS42MDAxSDIuMjQxNTZDMS44ODgxIDEuNjAwMSAxLjYwMTU2IDEuODg2NjQgMS42MDE1NiAyLjI0MDFWNC45NjAxQzEuNjAxNTYgNS4zMTM1NiAxLjg4ODEgNS42MDAxIDIuMjQxNTYgNS42MDAxSDQuOTYxNTZDNS4zMTUwMiA1LjYwMDEgNS42MDE1NiA1LjMxMzU2IDUuNjAxNTYgNC45NjAxVjIuMjQwMUM1LjYwMTU2IDEuODg2NjQgNS4zMTUwMiAxLjYwMDEgNC45NjE1NiAxLjYwMDFaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00Ljk2MTU2IDEwLjM5OTlIMi4yNDE1NkMxLjg4ODEgMTAuMzk5OSAxLjYwMTU2IDEwLjY4NjQgMS42MDE1NiAxMS4wMzk5VjEzLjc1OTlDMS42MDE1NiAxNC4xMTM0IDEuODg4MSAxNC4zOTk5IDIuMjQxNTYgMTQuMzk5OUg0Ljk2MTU2QzUuMzE1MDIgMTQuMzk5OSA1LjYwMTU2IDE0LjExMzQgNS42MDE1NiAxMy43NTk5VjExLjAzOTlDNS42MDE1NiAxMC42ODY0IDUuMzE1MDIgMTAuMzk5OSA0Ljk2MTU2IDEwLjM5OTlaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik0xMy43NTg0IDEuNjAwMUgxMS4wMzg0QzEwLjY4NSAxLjYwMDEgMTAuMzk4NCAxLjg4NjY0IDEwLjM5ODQgMi4yNDAxVjQuOTYwMUMxMC4zOTg0IDUuMzEzNTYgMTAuNjg1IDUuNjAwMSAxMS4wMzg0IDUuNjAwMUgxMy43NTg0QzE0LjExMTkgNS42MDAxIDE0LjM5ODQgNS4zMTM1NiAxNC4zOTg0IDQuOTYwMVYyLjI0MDFDMTQuMzk4NCAxLjg4NjY0IDE0LjExMTkgMS42MDAxIDEzLjc1ODQgMS42MDAxWiIgZmlsbD0iI2ZmZiIvPgo8cGF0aCBkPSJNNCAxMkwxMiA0TDQgMTJaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00IDEyTDEyIDQiIHN0cm9rZT0iI2ZmZiIgc3Ryb2tlLXdpZHRoPSIxLjUiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIvPgo8L3N2Zz4K&logoColor=ffffff)](https://zread.ai/Python51888/Midscene-Python)               2 | [English](README.md) | [简体中文](README.zh.md) 3 | 4 | Midscene Python is an AI-based automation framework that supports UI automation operations on Web and Android platforms.    5 | 6 | ## Overview 7 | 8 | Midscene Python provides comprehensive UI automation capabilities with the following core features: 9 | 10 | - **Natural Language Driven**: Describe automation tasks using natural language 11 | - **Multi-platform Support**: Supports Web (Selenium/Playwright) and Android (ADB) 12 | - **AI Model Integration**: Supports multiple vision-language models such as GPT-4V, Qwen2.5-VL, and Gemini  13 | - **Visual Debugging**: Provides detailed execution reports and debugging information 14 | - **Caching Mechanism**: Intelligent caching to improve execution efficiency 15 | 16 | ## Project Architecture 17 | 18 | ``` 19 | midscene-python/ 20 | ├── midscene/ # Core framework 21 | │ ├── core/ # Core framework 22 | │ │ ├── agent/ # Agent system 23 | │ │ ├── insight/ # AI inference engine 24 | │ │ ├── ai_model/ # AI model integration 25 | │ │ ├── yaml/ # YAML script executor 26 | │ │ └── types.py # Core type definitions 27 | │ ├── web/ # Web integration 28 | │ │ ├── selenium/ # Selenium integration 29 | │ │ ├── playwright/ # Playwright integration 30 | │ │ └── bridge/ # Bridge mode 31 | │ ├── android/ # Android integration 32 | │ │ ├── device.py # Device management 33 | │ │ └── agent.py # Android Agent 34 | │ ├── cli/ # Command line tools 35 | │ ├── mcp/ # MCP protocol support 36 | │ ├── shared/ # Shared utilities 37 | │ └── visualizer/ # Visual reports 38 | ├── examples/ # Example code 39 | ├── tests/ # Test cases 40 | └── docs/ # Documentation 41 | ``` 42 | 43 | ## Tech Stack 44 | 45 | - **Python 3.9+**: Core runtime environment 46 | - **Pydantic**: Data validation and serialization 47 | - **Selenium/Playwright**: Web automation 48 | - **OpenCV/Pillow**: Image processing 49 | - **HTTPX/AIOHTTP**: HTTP client 50 | - **Typer**: CLI framework 51 | - **Loguru**: Logging 52 | 53 | ## 
Quick Start 54 | 55 | ### Installation 56 | 57 | ```bash 58 | pip install midscene-python 59 | ``` 60 | 61 | ### Basic Usage 62 | 63 | ```python 64 | from midscene import Agent 65 | from midscene.web import SeleniumWebPage 66 | 67 | # Create a Web Agent 68 | with SeleniumWebPage.create() as page: 69 | agent = Agent(page) 70 | 71 | # Perform automation operations using natural language 72 | await agent.ai_action("Click the login button") 73 | await agent.ai_action("Enter username 'test@example.com'") 74 | await agent.ai_action("Enter password 'password123'") 75 | await agent.ai_action("Click the submit button") 76 | 77 | # Data extraction 78 | user_info = await agent.ai_extract("Extract user personal information") 79 | 80 | # Assertion verification 81 | await agent.ai_assert("Page displays welcome message") 82 | ``` 83 | 84 | ## Key Features 85 | 86 | ### 🤖 AI-Driven Automation 87 | 88 | Describe operations using natural language, and AI automatically understands and executes: 89 | 90 | ```python 91 | await agent.ai_action("Enter 'Python tutorial' in the search box and search") 92 | ``` 93 | 94 | ### 🔍 Intelligent Element Location 95 | 96 | Supports multiple location strategies and automatically selects the optimal solution: 97 | 98 | ```python 99 | element = await agent.ai_locate("Login button") 100 | ``` 101 | 102 | ### 📊 Data Extraction 103 | 104 | Extract structured data from the page: 105 | 106 | ```python 107 | products = await agent.ai_extract({ 108 | "products": [ 109 | {"name": "Product Name", "price": "Price", "rating": "Rating"} 110 | ] 111 | }) 112 | ``` 113 | 114 | ### ✅ Intelligent Assertions 115 | 116 | AI understands page state and performs intelligent assertions: 117 | 118 | ```python 119 | await agent.ai_assert("User has successfully logged in") 120 | ``` 121 | 122 | ### 📝 Credits 123 | 124 | Thanks to Midscene Project: https://github.com/web-infra-dev/midscene for inspiration and technical references 125 | 126 | ## License 127 | 128 | MIT License 129 | -------------------------------------------------------------------------------- /midscene/core/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core types and interfaces for Midscene Python 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass, field 7 | from enum import Enum 8 | from typing import Any, Dict, List, Optional, Union, Callable, Awaitable, Generic, TypeVar 9 | from pydantic import BaseModel 10 | 11 | # Type variables 12 | ElementType = TypeVar('ElementType', bound='BaseElement') 13 | T = TypeVar('T') 14 | 15 | 16 | class InterfaceType(str, Enum): 17 | """Interface type enumeration""" 18 | WEB = "web" 19 | ANDROID = "android" 20 | 21 | 22 | class NodeType(str, Enum): 23 | """UI Node type enumeration""" 24 | CONTAINER = "container" 25 | TEXT = "text" 26 | INPUT = "input" 27 | BUTTON = "button" 28 | IMAGE = "image" 29 | LINK = "link" 30 | OTHER = "other" 31 | 32 | 33 | @dataclass 34 | class Point: 35 | """2D Point representation""" 36 | x: float 37 | y: float 38 | 39 | 40 | @dataclass 41 | class Size: 42 | """Size representation""" 43 | width: float 44 | height: float 45 | 46 | 47 | @dataclass 48 | class Rect: 49 | """Rectangle representation""" 50 | left: float 51 | top: float 52 | width: float 53 | height: float 54 | 55 | @property 56 | def right(self) -> float: 57 | return self.left + self.width 58 | 59 | @property 60 | def bottom(self) -> float: 61 | return self.top + self.height 62 | 63 | @property 64 | def center(self) 
-> Point: 65 | return Point( 66 | x=self.left + self.width / 2, 67 | y=self.top + self.height / 2 68 | ) 69 | 70 | 71 | class BaseElement(BaseModel): 72 | """Base UI element interface""" 73 | id: str 74 | content: str 75 | rect: Rect 76 | center: tuple[float, float] 77 | node_type: NodeType = NodeType.OTHER 78 | attributes: Dict[str, Any] = field(default_factory=dict) 79 | is_visible: bool = True 80 | xpaths: Optional[List[str]] = None 81 | 82 | async def tap(self) -> None: 83 | """Tap/click this element""" 84 | raise NotImplementedError 85 | 86 | async def input_text(self, text: str) -> None: 87 | """Input text to this element""" 88 | raise NotImplementedError 89 | 90 | 91 | class UINode(BaseModel): 92 | """UI tree node representation""" 93 | id: str 94 | content: str 95 | rect: Rect 96 | center: tuple[float, float] 97 | node_type: NodeType 98 | attributes: Dict[str, Any] = field(default_factory=dict) 99 | is_visible: bool = True 100 | children: List['UINode'] = field(default_factory=list) 101 | 102 | 103 | class UITree(BaseModel): 104 | """UI tree representation""" 105 | node: UINode 106 | children: List['UITree'] = field(default_factory=list) 107 | 108 | 109 | class UIContext(BaseModel, Generic[ElementType]): 110 | """UI context containing screenshot and element information""" 111 | screenshot_base64: str 112 | size: Size 113 | content: List[ElementType] 114 | tree: UITree 115 | 116 | 117 | class AIUsageInfo(BaseModel): 118 | """AI usage information""" 119 | prompt_tokens: int = 0 120 | completion_tokens: int = 0 121 | total_tokens: int = 0 122 | cost: Optional[float] = None 123 | 124 | 125 | class LocateResult(BaseModel): 126 | """Element locate result""" 127 | element: Optional[BaseElement] = None 128 | rect: Optional[Rect] = None 129 | 130 | 131 | class ExecutionResult(BaseModel, Generic[T]): 132 | """Generic execution result""" 133 | success: bool = True 134 | data: Optional[Any] = None 135 | error: Optional[str] = None 136 | usage: Optional[AIUsageInfo] = None 137 | 138 | 139 | class AssertResult(BaseModel): 140 | """Assertion result""" 141 | passed: bool 142 | thought: str = "" 143 | message: str = "" 144 | 145 | 146 | # Type aliases 147 | TUserPrompt = Union[str, Dict[str, Any]] 148 | ElementById = Callable[[str], Optional[BaseElement]] 149 | OnTaskStartTip = Callable[[str], Union[None, Awaitable[None]]] 150 | 151 | 152 | # Abstract interface for device/platform implementations 153 | class AbstractInterface(ABC): 154 | """Abstract interface for platform implementations""" 155 | 156 | @property 157 | @abstractmethod 158 | def interface_type(self) -> InterfaceType: 159 | """Get interface type""" 160 | pass 161 | 162 | @abstractmethod 163 | async def get_context(self) -> UIContext: 164 | """Get current UI context""" 165 | pass 166 | 167 | @abstractmethod 168 | async def action_space(self) -> List[str]: 169 | """Get available actions""" 170 | pass 171 | 172 | @abstractmethod 173 | async def tap(self, x: float, y: float) -> None: 174 | """Tap at coordinates""" 175 | pass 176 | 177 | @abstractmethod 178 | async def input_text(self, text: str) -> None: 179 | """Input text""" 180 | pass 181 | 182 | @abstractmethod 183 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 184 | """Scroll in direction""" 185 | pass 186 | 187 | 188 | class InsightAction(str, Enum): 189 | """Insight action types""" 190 | LOCATE = "locate" 191 | EXTRACT = "extract" 192 | ASSERT = "assert" 193 | 194 | 195 | @dataclass 196 | class AgentOptions: 197 | """Agent configuration options""" 
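    # Field notes (based on docs/quickstart.md and the wiki guides): cache_id
    # enables the task cache for repeated runs, while generate_report and
    # report_file_name control the generated HTML execution report.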
198 | test_id: Optional[str] = None 199 | cache_id: Optional[str] = None 200 | group_name: str = "Midscene Report" 201 | group_description: str = "" 202 | generate_report: bool = True 203 | auto_print_report_msg: bool = True 204 | ai_action_context: Optional[str] = None 205 | report_file_name: Optional[str] = None 206 | model_config: Optional[Callable] = None 207 | 208 | 209 | @dataclass 210 | class LocateOption: 211 | """Locate operation options""" 212 | prompt: Optional[TUserPrompt] = None 213 | deep_think: bool = False 214 | cacheable: bool = True 215 | xpath: Optional[str] = None 216 | ui_context: Optional[UIContext] = None 217 | 218 | 219 | @dataclass 220 | class ExtractOption: 221 | """Extract operation options""" 222 | dom_included: Union[bool, str] = False # False, True, or 'visible-only' 223 | screenshot_included: bool = True 224 | return_thought: bool = False 225 | is_wait_for_assert: bool = False 226 | do_not_throw_error: bool = False 227 | 228 | 229 | class ScrollParam(BaseModel): 230 | """Scroll parameters""" 231 | direction: str # 'down', 'up', 'left', 'right' 232 | scroll_type: str # 'once', 'untilBottom', 'untilTop', 'untilLeft', 'untilRight' 233 | distance: Optional[int] = None # distance in pixels -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # 快速开始 - Midscene Python 2 | 3 | Midscene Python 是一个基于 AI 的自动化框架,支持 Web 和 Android 平台的 UI 自动化操作。 4 | 5 | ## 安装 6 | 7 | ```bash 8 | pip install midscene-python 9 | ``` 10 | 11 | ## 基本配置 12 | 13 | ### 1. 配置 AI 模型 14 | 15 | 设置环境变量: 16 | 17 | ```bash 18 | export MIDSCENE_AI_PROVIDER=openai 19 | export MIDSCENE_AI_MODEL=gpt-4-vision-preview 20 | export MIDSCENE_AI_API_KEY=your-api-key-here 21 | ``` 22 | 23 | 或创建配置文件 `midscene.yml`: 24 | 25 | ```yaml 26 | ai: 27 | provider: "openai" 28 | model: "gpt-4-vision-preview" 29 | api_key: "your-api-key-here" 30 | ``` 31 | 32 | ### 2. 
支持的 AI 提供商 33 | 34 | - **OpenAI**: GPT-4V, GPT-4o 35 | - **Anthropic**: Claude 3.5 Sonnet 36 | - **阿里云**: Qwen2.5-VL 37 | - **Google**: Gemini Pro Vision 38 | 39 | ## Web 自动化 40 | 41 | ### Selenium 示例 42 | 43 | ```python 44 | import asyncio 45 | from midscene import Agent 46 | from midscene.web import SeleniumWebPage 47 | 48 | async def web_automation(): 49 | # 创建浏览器实例 50 | with SeleniumWebPage.create(headless=False) as page: 51 | agent = Agent(page) 52 | 53 | # 导航到网站 54 | await page.navigate_to("https://example.com") 55 | 56 | # 使用自然语言进行操作 57 | await agent.ai_action("点击登录按钮") 58 | await agent.ai_action("在用户名框输入 'demo@example.com'") 59 | await agent.ai_action("在密码框输入 'password123'") 60 | await agent.ai_action("点击提交按钮") 61 | 62 | # 数据提取 63 | user_info = await agent.ai_extract({ 64 | "username": "用户名", 65 | "email": "邮箱地址" 66 | }) 67 | print(f"用户信息: {user_info}") 68 | 69 | # 断言验证 70 | await agent.ai_assert("页面显示欢迎信息") 71 | 72 | # 运行示例 73 | asyncio.run(web_automation()) 74 | ``` 75 | 76 | ### Playwright 示例 77 | 78 | ```python 79 | import asyncio 80 | from midscene import Agent 81 | from midscene.web import PlaywrightWebPage 82 | 83 | async def playwright_automation(): 84 | # 创建 Playwright 页面 85 | async with await PlaywrightWebPage.create() as page: 86 | agent = Agent(page) 87 | 88 | await page.navigate_to("https://playwright.dev") 89 | await agent.ai_action("点击文档链接") 90 | 91 | # 提取页面信息 92 | page_info = await agent.ai_extract({ 93 | "title": "页面标题", 94 | "sections": ["主要章节列表"] 95 | }) 96 | print(f"页面信息: {page_info}") 97 | 98 | asyncio.run(playwright_automation()) 99 | ``` 100 | 101 | ## Android 自动化 102 | 103 | ```python 104 | import asyncio 105 | from midscene.android import AndroidAgent 106 | 107 | async def android_automation(): 108 | # 创建 Android Agent(自动检测设备) 109 | agent = await AndroidAgent.create() 110 | 111 | # 启动应用 112 | await agent.launch_app("com.android.settings") 113 | 114 | # 使用自然语言导航 115 | await agent.ai_action("点击WLAN设置") 116 | await agent.ai_action("滑动到底部") 117 | 118 | # 提取信息 119 | wifi_list = await agent.ai_extract({ 120 | "networks": [ 121 | {"name": "网络名称", "security": "安全类型"} 122 | ] 123 | }) 124 | print(f"WiFi网络: {wifi_list}") 125 | 126 | # 返回 127 | await agent.back() 128 | 129 | asyncio.run(android_automation()) 130 | ``` 131 | 132 | ## 命令行工具 133 | 134 | ### 运行 YAML 脚本 135 | 136 | ```bash 137 | # 运行单个脚本 138 | midscene run script.yaml 139 | 140 | # 运行目录中的所有脚本 141 | midscene run scripts/ 142 | 143 | # 使用配置文件 144 | midscene run script.yaml --config midscene.yml 145 | 146 | # 并发执行 147 | midscene run scripts/ --concurrent 3 148 | 149 | # Android 设备指定 150 | midscene run android_script.yaml --device device_id 151 | ``` 152 | 153 | ### 列出 Android 设备 154 | 155 | ```bash 156 | midscene devices 157 | ``` 158 | 159 | ### 初始化项目 160 | 161 | ```bash 162 | midscene init my-project 163 | cd my-project 164 | ``` 165 | 166 | ## YAML 脚本格式 167 | 168 | 创建 `example.yaml`: 169 | 170 | ```yaml 171 | # Web 自动化脚本 172 | web: 173 | url: "https://example.com" 174 | browser: "chrome" 175 | headless: false 176 | 177 | tasks: 178 | - name: "登录操作" 179 | steps: 180 | - action: "ai_action" 181 | prompt: "点击登录按钮" 182 | 183 | - action: "ai_action" 184 | prompt: "输入用户名 'demo@example.com'" 185 | 186 | - action: "ai_action" 187 | prompt: "输入密码 'password123'" 188 | 189 | - action: "ai_action" 190 | prompt: "点击提交按钮" 191 | 192 | - name: "数据提取" 193 | steps: 194 | - action: "ai_extract" 195 | prompt: 196 | username: "用户名" 197 | email: "邮箱地址" 198 | save_to: "user_info" 199 | 200 | - name: "状态验证" 201 | steps: 202 | - 
action: "ai_assert" 203 | prompt: "页面显示欢迎信息" 204 | ``` 205 | 206 | ## 核心概念 207 | 208 | ### Agent 系统 209 | 210 | Agent 是自动化操作的核心控制器,协调 AI 模型与设备交互: 211 | 212 | ```python 213 | from midscene import Agent 214 | from midscene.web import SeleniumWebPage 215 | 216 | page = SeleniumWebPage.create() 217 | agent = Agent(page) 218 | ``` 219 | 220 | ### AI 操作类型 221 | 222 | 1. **ai_action**: 执行自然语言描述的操作 223 | 2. **ai_locate**: 定位 UI 元素 224 | 3. **ai_extract**: 提取结构化数据 225 | 4. **ai_assert**: 验证页面状态 226 | 227 | ### 缓存机制 228 | 229 | 启用缓存可以提升重复执行的效率: 230 | 231 | ```python 232 | from midscene.core import AgentOptions 233 | 234 | options = AgentOptions( 235 | cache_id="my_automation", 236 | generate_report=True 237 | ) 238 | agent = Agent(page, options) 239 | ``` 240 | 241 | ## 最佳实践 242 | 243 | ### 1. 错误处理 244 | 245 | ```python 246 | try: 247 | await agent.ai_action("点击不存在的按钮") 248 | except Exception as e: 249 | print(f"操作失败: {e}") 250 | ``` 251 | 252 | ### 2. 等待条件 253 | 254 | ```python 255 | # 等待元素出现 256 | await agent.ai_wait_for("登录成功页面出现", timeout_ms=10000) 257 | ``` 258 | 259 | ### 3. 数据验证 260 | 261 | ```python 262 | # 使用断言验证数据 263 | user_data = await agent.ai_extract({"username": "用户名"}) 264 | assert user_data["username"], "用户名不能为空" 265 | ``` 266 | 267 | ### 4. 截图和报告 268 | 269 | ```python 270 | # 生成执行报告 271 | options = AgentOptions( 272 | generate_report=True, 273 | report_file_name="automation_report" 274 | ) 275 | ``` 276 | 277 | ## 故障排除 278 | 279 | ### 常见问题 280 | 281 | 1. **AI API 密钥未设置** 282 | ``` 283 | ValueError: AI API key not configured 284 | ``` 285 | 解决:设置 `MIDSCENE_AI_API_KEY` 环境变量 286 | 287 | 2. **Chrome 浏览器未找到** 288 | ``` 289 | WebDriverException: chrome not found 290 | ``` 291 | 解决:安装 Chrome 浏览器或指定 Chrome 路径 292 | 293 | 3. **Android 设备连接失败** 294 | ``` 295 | RuntimeError: No Android devices found 296 | ``` 297 | 解决:确保设备已连接并启用 USB 调试 298 | 299 | ### 调试技巧 300 | 301 | 1. **启用详细日志** 302 | ```python 303 | from midscene.shared import setup_logger 304 | setup_logger(level="DEBUG") 305 | ``` 306 | 307 | 2. **查看生成的报告** 308 | 执行完成后检查 `./reports/` 目录中的 HTML 报告 309 | 310 | 3. **使用非无头模式** 311 | 设置 `headless=False` 观察浏览器操作过程 312 | 313 | ## 下一步 314 | 315 | - 查看 [API 文档](api.md) 了解详细接口 316 | - 浏览 [示例集合](examples/) 学习更多用法 317 | - 阅读 [配置指南](configuration.md) 了解高级配置 -------------------------------------------------------------------------------- /wiki/快速开始.md: -------------------------------------------------------------------------------- 1 | # 快速开始 2 | 3 | 欢迎使用 Midscene Python!本指南将帮助你在 5 分钟内上手 AI 驱动的自动化操作。 4 | 5 | ## 📋 前置要求 6 | 7 | 在开始之前,请确保你的环境满足以下要求: 8 | 9 | - **Python 3.9+** 10 | - **pip** 包管理器 11 | - **浏览器** (Chrome/Firefox/Edge,用于 Web 自动化) 12 | - **AI 模型 API Key** (OpenAI、Claude、Qwen 或 Gemini 任选其一) 13 | 14 | ## 🚀 快速安装 15 | 16 | ### 1. 安装 Midscene Python 17 | 18 | ```bash 19 | pip install midscene-python 20 | ``` 21 | 22 | ### 2. 安装浏览器驱动(可选) 23 | 24 | 如果你计划进行 Web 自动化,需要安装对应的浏览器驱动: 25 | 26 | ```bash 27 | # Selenium WebDriver 28 | pip install webdriver-manager 29 | 30 | # 或者 Playwright 31 | pip install playwright 32 | playwright install 33 | ``` 34 | 35 | ### 3. 
配置 AI 模型 36 | 37 | 创建 `.env` 文件配置 AI 模型(以 OpenAI 为例): 38 | 39 | ```bash 40 | # .env 41 | OPENAI_API_KEY=your_openai_api_key_here 42 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选,默认官方 API 43 | ``` 44 | 45 | ## 🎯 第一个示例 46 | 47 | 让我们从一个简单的 Web 自动化示例开始: 48 | 49 | ### 示例 1: 搜索操作 50 | 51 | ```python 52 | import asyncio 53 | from midscene import Agent 54 | from midscene.web import SeleniumWebPage 55 | 56 | async def search_example(): 57 | """在百度搜索 Python 教程""" 58 | 59 | # 创建 Web 页面实例 60 | with SeleniumWebPage.create() as page: 61 | # 创建 Agent 62 | agent = Agent(page) 63 | 64 | # 导航到网站 65 | await page.goto("https://www.baidu.com") 66 | 67 | # 使用自然语言进行搜索 68 | await agent.ai_action("在搜索框输入'Python 教程'") 69 | await agent.ai_action("点击搜索按钮") 70 | 71 | # 验证搜索结果 72 | await agent.ai_assert("页面显示了 Python 教程的搜索结果") 73 | 74 | print("✅ 搜索操作完成!") 75 | 76 | # 运行示例 77 | asyncio.run(search_example()) 78 | ``` 79 | 80 | ### 示例 2: 数据提取 81 | 82 | ```python 83 | import asyncio 84 | from midscene import Agent 85 | from midscene.web import SeleniumWebPage 86 | 87 | async def extract_example(): 88 | """提取新闻标题""" 89 | 90 | with SeleniumWebPage.create() as page: 91 | agent = Agent(page) 92 | 93 | # 访问新闻网站 94 | await page.goto("https://news.example.com") 95 | 96 | # 提取结构化数据 97 | news_data = await agent.ai_extract({ 98 | "articles": [ 99 | { 100 | "title": "新闻标题", 101 | "time": "发布时间", 102 | "summary": "新闻摘要" 103 | } 104 | ] 105 | }) 106 | 107 | # 输出结果 108 | for article in news_data["articles"]: 109 | print(f"📰 {article['title']}") 110 | print(f"⏰ {article['time']}") 111 | print(f"📄 {article['summary']}\n") 112 | 113 | # 运行示例 114 | asyncio.run(extract_example()) 115 | ``` 116 | 117 | ## 📱 Android 自动化示例 118 | 119 | ```python 120 | import asyncio 121 | from midscene import Agent 122 | from midscene.android import AndroidDevice 123 | 124 | async def android_example(): 125 | """Android 应用自动化""" 126 | 127 | # 连接 Android 设备 128 | device = AndroidDevice() 129 | await device.connect() 130 | 131 | # 创建 Agent 132 | agent = Agent(device) 133 | 134 | # 启动应用 135 | await device.start_app("com.example.app") 136 | 137 | # 自然语言操作 138 | await agent.ai_action("点击登录按钮") 139 | await agent.ai_action("输入用户名 'testuser'") 140 | await agent.ai_action("输入密码 'password123'") 141 | await agent.ai_action("点击确认登录") 142 | 143 | # 验证登录状态 144 | await agent.ai_assert("显示用户已登录") 145 | 146 | print("✅ Android 自动化完成!") 147 | 148 | # 运行示例 149 | asyncio.run(android_example()) 150 | ``` 151 | 152 | ## 🎛️ 配置选项 153 | 154 | ### AI 模型配置 155 | 156 | ```python 157 | from midscene.core.ai_model import AIModelConfig 158 | 159 | # 自定义 AI 配置 160 | config = AIModelConfig( 161 | provider="openai", # 或 "claude", "qwen", "gemini" 162 | model="gpt-4-vision-preview", 163 | temperature=0.1, 164 | max_tokens=1000 165 | ) 166 | 167 | agent = Agent(page, ai_config=config) 168 | ``` 169 | 170 | ### Agent 选项 171 | 172 | ```python 173 | from midscene.core import AgentOptions 174 | 175 | # 自定义 Agent 选项 176 | options = AgentOptions( 177 | timeout=30, # 操作超时时间(秒) 178 | retry_count=3, # 重试次数 179 | screenshot_on_error=True, # 错误时自动截图 180 | cache_enabled=True # 启用智能缓存 181 | ) 182 | 183 | agent = Agent(page, options=options) 184 | ``` 185 | 186 | ## 🔧 常用操作 187 | 188 | ### 基础交互 189 | 190 | ```python 191 | # 点击操作 192 | await agent.ai_action("点击提交按钮") 193 | await agent.ai_action("点击页面右上角的用户头像") 194 | 195 | # 输入操作 196 | await agent.ai_action("在用户名框输入 'admin'") 197 | await agent.ai_action("在密码框输入密码") 198 | 199 | # 滚动操作 200 | await agent.ai_action("向下滚动查看更多内容") 201 | await 
agent.ai_action("滚动到页面底部") 202 | 203 | # 等待操作 204 | await agent.ai_action("等待页面加载完成") 205 | ``` 206 | 207 | ### 元素定位 208 | 209 | ```python 210 | # 精确定位元素 211 | element = await agent.ai_locate("登录按钮") 212 | await element.click() 213 | 214 | # 定位多个元素 215 | elements = await agent.ai_locate_all("商品卡片") 216 | for element in elements: 217 | await element.hover() 218 | ``` 219 | 220 | ### 条件断言 221 | 222 | ```python 223 | # 页面状态验证 224 | await agent.ai_assert("用户已成功登录") 225 | await agent.ai_assert("购物车显示 3 件商品") 226 | await agent.ai_assert("页面不包含错误信息") 227 | 228 | # 元素存在性验证 229 | await agent.ai_assert("页面包含搜索结果") 230 | await agent.ai_assert("显示用户个人信息") 231 | ``` 232 | 233 | ## 📊 查看执行报告 234 | 235 | Midscene Python 自动生成详细的执行报告: 236 | 237 | ```python 238 | # 运行后,检查生成的报告文件 239 | # 报告位置: ./midscene_reports/ 240 | # - execution_report.html # 可视化报告 241 | # - screenshots/ # 执行截图 242 | # - logs/ # 详细日志 243 | ``` 244 | 245 | ## 🔍 调试技巧 246 | 247 | ### 启用详细日志 248 | 249 | ```python 250 | import logging 251 | from midscene.shared.logger import setup_logger 252 | 253 | # 启用调试日志 254 | setup_logger(level=logging.DEBUG) 255 | ``` 256 | 257 | ### 截图调试 258 | 259 | ```python 260 | # 手动截图 261 | screenshot = await page.screenshot() 262 | with open("debug.png", "wb") as f: 263 | f.write(screenshot) 264 | 265 | # 获取页面信息 266 | context = await page.get_context() 267 | print(f"页面标题: {context.page_title}") 268 | print(f"页面 URL: {context.url}") 269 | ``` 270 | 271 | ## 🚨 常见问题 272 | 273 | ### 1. AI 模型调用失败 274 | ```python 275 | # 检查 API Key 配置 276 | import os 277 | print(f"API Key: {os.getenv('OPENAI_API_KEY')[:10]}...") 278 | ``` 279 | 280 | ### 2. 元素定位失败 281 | ```python 282 | # 使用更具体的描述 283 | await agent.ai_action("点击页面左上角的蓝色登录按钮") 284 | ``` 285 | 286 | ### 3. 页面加载问题 287 | ```python 288 | # 添加等待时间 289 | await page.wait_for_page_load() 290 | await agent.ai_action("等待 3 秒让页面完全加载") 291 | ``` 292 | 293 | ## 🎓 下一步 294 | 295 | 恭喜!你已经掌握了 Midscene Python 的基础用法。接下来可以: 296 | 297 | 1. 📖 深入学习 [核心概念](核心概念/Agent核心控制器.md) 298 | 2. 🔧 查看 [API 参考](API参考/Agent-API.md) 299 | 3. 🌐 了解 [Web 自动化](平台集成/Web自动化/README.md) 高级特性 300 | 4. 📱 探索 [Android 自动化](平台集成/Android自动化.md) 301 | 5. 
🎯 参考 [最佳实践](示例和教程/最佳实践.md) 302 | 303 | ## 💡 小贴士 304 | 305 | - 使用具体、清晰的自然语言描述能获得更好的执行效果 306 | - 定期查看执行报告来优化自动化脚本 307 | - 善用缓存机制来提升执行效率 308 | - 为不同环境配置不同的 AI 模型 309 | 310 | --- 311 | 312 | *准备好探索更多功能了吗?查看我们的 [示例集合](示例和教程/基础示例.md)!* -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for Midscene Python core functionality 3 | """ 4 | 5 | import pytest 6 | import asyncio 7 | from unittest.mock import Mock, AsyncMock 8 | 9 | from midscene.core.types import UIContext, Size, Rect, BaseElement, NodeType 10 | from midscene.core.insight import Insight 11 | from midscene.core.agent import Agent 12 | 13 | 14 | class MockInterface: 15 | """Mock interface for testing""" 16 | 17 | def __init__(self): 18 | self.interface_type = "mock" 19 | self._context = None 20 | 21 | async def get_context(self): 22 | if self._context: 23 | return self._context 24 | 25 | # Return mock context 26 | return UIContext( 27 | screenshot_base64="mock_screenshot", 28 | size=Size(width=1920, height=1080), 29 | content=[ 30 | BaseElement( 31 | id="test_element", 32 | content="Test Button", 33 | rect=Rect(left=100, top=100, width=200, height=50), 34 | center=(200, 125), 35 | node_type=NodeType.BUTTON 36 | ) 37 | ], 38 | tree=Mock() 39 | ) 40 | 41 | async def action_space(self): 42 | return ["tap", "input", "scroll"] 43 | 44 | async def tap(self, x, y): 45 | pass 46 | 47 | async def input_text(self, text): 48 | pass 49 | 50 | async def scroll(self, direction, distance=None): 51 | pass 52 | 53 | 54 | @pytest.fixture 55 | def mock_interface(): 56 | """Mock interface fixture""" 57 | return MockInterface() 58 | 59 | 60 | @pytest.fixture 61 | def mock_ai_service(): 62 | """Mock AI service fixture""" 63 | ai_service = Mock() 64 | ai_service.call_ai = AsyncMock(return_value={ 65 | "content": { 66 | "elements": [{"id": "test_element", "reason": "test"}], 67 | "reasoning": "test reasoning", 68 | "confidence": 0.9, 69 | "errors": [] 70 | }, 71 | "usage": {"total_tokens": 100} 72 | }) 73 | return ai_service 74 | 75 | 76 | class TestInsight: 77 | """Test Insight AI engine""" 78 | 79 | @pytest.mark.asyncio 80 | async def test_locate_element(self, mock_interface, mock_ai_service): 81 | """Test element location""" 82 | insight = Insight( 83 | context_provider=mock_interface.get_context, 84 | ai_service=mock_ai_service 85 | ) 86 | 87 | result = await insight.locate("test button") 88 | 89 | assert result.element is not None 90 | assert result.element.id == "test_element" 91 | mock_ai_service.call_ai.assert_called_once() 92 | 93 | @pytest.mark.asyncio 94 | async def test_extract_data(self, mock_interface, mock_ai_service): 95 | """Test data extraction""" 96 | # Mock extract response 97 | mock_ai_service.call_ai.return_value = { 98 | "content": { 99 | "data": {"title": "Test Page", "items": ["item1", "item2"]}, 100 | "reasoning": "extracted data", 101 | "confidence": 0.9, 102 | "errors": [] 103 | }, 104 | "usage": {"total_tokens": 150} 105 | } 106 | 107 | insight = Insight( 108 | context_provider=mock_interface.get_context, 109 | ai_service=mock_ai_service 110 | ) 111 | 112 | result = await insight.extract("extract page data") 113 | 114 | assert result["data"]["title"] == "Test Page" 115 | assert len(result["data"]["items"]) == 2 116 | 117 | @pytest.mark.asyncio 118 | async def test_assert_condition(self, mock_interface, mock_ai_service): 119 | """Test condition assertion""" 120 | # Mock assert 
response 121 | mock_ai_service.call_ai.return_value = { 122 | "content": { 123 | "passed": True, 124 | "reasoning": "condition is met", 125 | "confidence": 0.95, 126 | "message": "success" 127 | }, 128 | "usage": {"total_tokens": 80} 129 | } 130 | 131 | insight = Insight( 132 | context_provider=mock_interface.get_context, 133 | ai_service=mock_ai_service 134 | ) 135 | 136 | result = await insight.assert_condition("page is loaded") 137 | 138 | assert result.passed is True 139 | assert result.thought == "condition is met" 140 | 141 | 142 | class TestAgent: 143 | """Test Agent functionality""" 144 | 145 | @pytest.mark.asyncio 146 | async def test_agent_creation(self, mock_interface): 147 | """Test agent creation""" 148 | agent = Agent(mock_interface) 149 | 150 | assert agent.interface == mock_interface 151 | assert agent.insight is not None 152 | assert agent.task_executor is not None 153 | assert agent.destroyed is False 154 | 155 | @pytest.mark.asyncio 156 | async def test_ai_locate(self, mock_interface, mock_ai_service): 157 | """Test AI locate through agent""" 158 | agent = Agent(mock_interface) 159 | agent.insight.ai_service = mock_ai_service 160 | 161 | result = await agent.ai_locate("test button") 162 | 163 | assert result.element is not None 164 | assert result.element.id == "test_element" 165 | 166 | @pytest.mark.asyncio 167 | async def test_ai_extract(self, mock_interface, mock_ai_service): 168 | """Test AI extract through agent""" 169 | # Mock extract response 170 | mock_ai_service.call_ai.return_value = { 171 | "content": { 172 | "data": {"username": "testuser"}, 173 | "reasoning": "extracted username", 174 | "confidence": 0.9, 175 | "errors": [] 176 | }, 177 | "usage": {"total_tokens": 100} 178 | } 179 | 180 | agent = Agent(mock_interface) 181 | agent.insight.ai_service = mock_ai_service 182 | 183 | result = await agent.ai_extract("extract username") 184 | 185 | assert result["username"] == "testuser" 186 | 187 | @pytest.mark.asyncio 188 | async def test_ai_assert_success(self, mock_interface, mock_ai_service): 189 | """Test AI assert success""" 190 | # Mock assert response 191 | mock_ai_service.call_ai.return_value = { 192 | "content": { 193 | "passed": True, 194 | "reasoning": "condition met", 195 | "confidence": 0.9, 196 | "message": "success" 197 | }, 198 | "usage": {"total_tokens": 80} 199 | } 200 | 201 | agent = Agent(mock_interface) 202 | agent.insight.ai_service = mock_ai_service 203 | 204 | # Should not raise exception 205 | await agent.ai_assert("page is loaded") 206 | 207 | @pytest.mark.asyncio 208 | async def test_ai_assert_failure(self, mock_interface, mock_ai_service): 209 | """Test AI assert failure""" 210 | # Mock assert response 211 | mock_ai_service.call_ai.return_value = { 212 | "content": { 213 | "passed": False, 214 | "reasoning": "condition not met", 215 | "confidence": 0.9, 216 | "message": "login failed" 217 | }, 218 | "usage": {"total_tokens": 80} 219 | } 220 | 221 | agent = Agent(mock_interface) 222 | agent.insight.ai_service = mock_ai_service 223 | 224 | # Should raise AssertionError 225 | with pytest.raises(AssertionError): 226 | await agent.ai_assert("user is logged in") 227 | 228 | @pytest.mark.asyncio 229 | async def test_basic_actions(self, mock_interface): 230 | """Test basic agent actions""" 231 | agent = Agent(mock_interface) 232 | 233 | # Test tap 234 | await agent.tap(100, 200) 235 | 236 | # Test input 237 | await agent.input_text("test text") 238 | 239 | # Test scroll 240 | from midscene.core.types import ScrollParam 241 | scroll_param = 
ScrollParam(direction="down", scroll_type="once", distance=500) 242 | await agent.scroll(scroll_param) 243 | 244 | @pytest.mark.asyncio 245 | async def test_agent_destroy(self, mock_interface): 246 | """Test agent destruction""" 247 | agent = Agent(mock_interface) 248 | 249 | await agent.destroy() 250 | 251 | assert agent.destroyed is True 252 | 253 | # Should raise error when using destroyed agent 254 | with pytest.raises(RuntimeError): 255 | await agent.ai_locate("test") 256 | 257 | 258 | if __name__ == "__main__": 259 | pytest.main([__file__, "-v"]) -------------------------------------------------------------------------------- /wiki/安装配置.md: -------------------------------------------------------------------------------- 1 | # 安装配置 2 | 3 | 本章节详细介绍 Midscene Python 的安装步骤、环境配置和依赖管理。 4 | 5 | ## 📋 系统要求 6 | 7 | ### 基础要求 8 | - **Python**: 3.9 或更高版本 9 | - **操作系统**: Windows 10+, macOS 10.14+, Linux (Ubuntu 18.04+) 10 | - **内存**: 最少 4GB RAM(推荐 8GB+) 11 | - **网络**: 稳定的互联网连接(用于 AI 模型调用) 12 | 13 | ### AI 模型要求 14 | 至少需要以下 AI 服务之一的 API 访问权限: 15 | - OpenAI GPT-4V 16 | - Anthropic Claude 3 17 | - 阿里云通义千问 VL 18 | - Google Gemini Pro Vision 19 | 20 | ## 🚀 快速安装 21 | 22 | ### 方式一:使用 pip 安装(推荐) 23 | ```bash 24 | # 安装最新版本 25 | pip install midscene-python 26 | 27 | # 或指定版本 28 | pip install midscene-python==0.1.0 29 | ``` 30 | 31 | ### 方式二:从源码安装 32 | ```bash 33 | # 克隆仓库 34 | git clone https://gitee.com/Python51888/midscene-python.git 35 | cd midscene-python 36 | 37 | # 安装依赖并安装 38 | pip install -e . 39 | ``` 40 | 41 | ### 方式三:开发者安装 42 | ```bash 43 | # 克隆仓库 44 | git clone https://gitee.com/Python51888/midscene-python.git 45 | cd midscene-python 46 | 47 | # 安装开发依赖 48 | pip install -e ".[dev,docs]" 49 | 50 | # 安装 pre-commit hooks 51 | pre-commit install 52 | ``` 53 | 54 | ## 🔧 平台特定配置 55 | 56 | ### Web 自动化配置 57 | 58 | #### Selenium 配置 59 | ```bash 60 | # 安装 Selenium 和 WebDriver 管理器 61 | pip install selenium webdriver-manager 62 | 63 | # Python 代码中自动管理驱动 64 | from selenium import webdriver 65 | from webdriver_manager.chrome import ChromeDriverManager 66 | from selenium.webdriver.chrome.service import Service 67 | 68 | service = Service(ChromeDriverManager().install()) 69 | driver = webdriver.Chrome(service=service) 70 | ``` 71 | 72 | #### Playwright 配置 73 | ```bash 74 | # 安装 Playwright 75 | pip install playwright 76 | 77 | # 安装浏览器 78 | playwright install 79 | 80 | # 仅安装 Chromium(节省空间) 81 | playwright install chromium 82 | ``` 83 | 84 | ### Android 自动化配置 85 | 86 | #### ADB 设置 87 | ```bash 88 | # 安装 ADB(Ubuntu/Debian) 89 | sudo apt-get install android-tools-adb 90 | 91 | # 安装 ADB(macOS) 92 | brew install android-platform-tools 93 | 94 | # 安装 ADB(Windows) 95 | # 下载 Android SDK Platform Tools 96 | # 添加到系统 PATH 97 | ``` 98 | 99 | #### 设备连接 100 | ```bash 101 | # 启用开发者选项和 USB 调试 102 | # 连接设备后验证 103 | adb devices 104 | 105 | # 预期输出 106 | List of devices attached 107 | DEVICE_ID device 108 | ``` 109 | 110 | ## 🔑 AI 模型配置 111 | 112 | ### 环境变量配置 113 | 创建 `.env` 文件: 114 | 115 | ```bash 116 | # OpenAI 配置 117 | OPENAI_API_KEY=sk-your-openai-api-key 118 | OPENAI_BASE_URL=https://api.openai.com/v1 # 可选 119 | 120 | # Anthropic 配置 121 | ANTHROPIC_API_KEY=sk-ant-your-anthropic-key 122 | 123 | # 通义千问配置 124 | DASHSCOPE_API_KEY=sk-your-dashscope-key 125 | 126 | # Gemini 配置 127 | GOOGLE_API_KEY=AIza-your-google-api-key 128 | 129 | # 默认模型配置 130 | MIDSCENE_AI_PROVIDER=openai 131 | MIDSCENE_AI_MODEL=gpt-4-vision-preview 132 | ``` 133 | 134 | ### 代码配置 135 | ```python 136 | from midscene.core.ai_model import AIModelConfig 137 | 138 | # 多个 AI 
提供商配置 139 | configs = { 140 | "openai": AIModelConfig( 141 | provider="openai", 142 | model="gpt-4-vision-preview", 143 | api_key="your-openai-key", 144 | temperature=0.1 145 | ), 146 | "claude": AIModelConfig( 147 | provider="anthropic", 148 | model="claude-3-sonnet-20240229", 149 | api_key="your-claude-key", 150 | temperature=0.1 151 | ) 152 | } 153 | ``` 154 | 155 | ## 📦 依赖管理 156 | 157 | ### 核心依赖 158 | ```toml 159 | # pyproject.toml 中的核心依赖 160 | [project] 161 | dependencies = [ 162 | "pydantic>=2.0,<3.0", 163 | "selenium>=4.15.0,<5.0", 164 | "playwright>=1.40.0,<2.0", 165 | "opencv-python>=4.8.0,<5.0", 166 | "pillow>=10.0.0,<11.0", 167 | "aiohttp>=3.9.0,<4.0", 168 | "loguru>=0.7.0,<1.0", 169 | "typer>=0.9.0,<1.0", 170 | "httpx>=0.25.0,<1.0", 171 | "openai>=1.3.0,<2.0", 172 | "anthropic>=0.7.0,<1.0" 173 | ] 174 | ``` 175 | 176 | ### 可选依赖 177 | ```bash 178 | # 开发工具 179 | pip install "midscene-python[dev]" 180 | 181 | # 文档工具 182 | pip install "midscene-python[docs]" 183 | 184 | # 全部依赖 185 | pip install "midscene-python[dev,docs]" 186 | ``` 187 | 188 | ## 🔍 验证安装 189 | 190 | ### 基础验证 191 | ```python 192 | # test_installation.py 193 | import asyncio 194 | from midscene import Agent 195 | from midscene.core.ai_model import AIModelService 196 | 197 | async def test_installation(): 198 | """测试安装是否成功""" 199 | 200 | # 测试导入 201 | print("✓ 导入模块成功") 202 | 203 | # 测试 AI 服务配置 204 | try: 205 | ai_service = AIModelService() 206 | print("✓ AI 服务初始化成功") 207 | except Exception as e: 208 | print(f"✗ AI 服务初始化失败: {e}") 209 | 210 | print("🎉 安装验证完成!") 211 | 212 | # 运行测试 213 | asyncio.run(test_installation()) 214 | ``` 215 | 216 | ### Web 平台验证 217 | ```python 218 | # test_web.py 219 | import asyncio 220 | from midscene import Agent 221 | from midscene.web import SeleniumWebPage 222 | 223 | async def test_web(): 224 | """测试 Web 平台功能""" 225 | try: 226 | with SeleniumWebPage.create() as page: 227 | agent = Agent(page) 228 | await page.goto("https://www.example.com") 229 | print("✓ Web 自动化测试成功") 230 | except Exception as e: 231 | print(f"✗ Web 自动化测试失败: {e}") 232 | 233 | asyncio.run(test_web()) 234 | ``` 235 | 236 | ### Android 平台验证 237 | ```python 238 | # test_android.py 239 | import asyncio 240 | from midscene import Agent 241 | from midscene.android import AndroidDevice 242 | 243 | async def test_android(): 244 | """测试 Android 平台功能""" 245 | try: 246 | device = AndroidDevice() 247 | await device.connect() 248 | agent = Agent(device) 249 | print("✓ Android 自动化测试成功") 250 | except Exception as e: 251 | print(f"✗ Android 自动化测试失败: {e}") 252 | 253 | asyncio.run(test_android()) 254 | ``` 255 | 256 | ## 🔧 常见问题解决 257 | 258 | ### Python 版本问题 259 | ```bash 260 | # 检查 Python 版本 261 | python --version 262 | 263 | # 如果版本低于 3.9,安装新版本 264 | # Ubuntu/Debian 265 | sudo apt-get install python3.9 266 | 267 | # macOS 268 | brew install python@3.9 269 | 270 | # Windows 271 | # 从 python.org 下载安装 272 | ``` 273 | 274 | ### 依赖冲突解决 275 | ```bash 276 | # 创建虚拟环境(推荐) 277 | python -m venv midscene-env 278 | source midscene-env/bin/activate # Linux/macOS 279 | # 或 280 | midscene-env\Scripts\activate # Windows 281 | 282 | # 在虚拟环境中安装 283 | pip install midscene-python 284 | ``` 285 | 286 | ### 网络连接问题 287 | ```bash 288 | # 使用国内镜像源 289 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple midscene-python 290 | 291 | # 或配置永久镜像源 292 | pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 293 | ``` 294 | 295 | ### AI API 连接问题 296 | ```python 297 | # 测试 API 连接 298 | import os 299 | import httpx 300 | 301 | async def 
test_openai_connection(): 302 | api_key = os.getenv("OPENAI_API_KEY") 303 | if not api_key: 304 | print("❌ 未设置 OPENAI_API_KEY") 305 | return 306 | 307 | async with httpx.AsyncClient() as client: 308 | try: 309 | response = await client.get( 310 | "https://api.openai.com/v1/models", 311 | headers={"Authorization": f"Bearer {api_key}"} 312 | ) 313 | if response.status_code == 200: 314 | print("✅ OpenAI API 连接正常") 315 | else: 316 | print(f"❌ OpenAI API 连接失败: {response.status_code}") 317 | except Exception as e: 318 | print(f"❌ 网络连接错误: {e}") 319 | ``` 320 | 321 | ## 🚀 性能优化配置 322 | 323 | ### 系统级优化 324 | ```bash 325 | # 增加文件描述符限制(Linux/macOS) 326 | ulimit -n 65536 327 | 328 | # 设置环境变量优化 329 | export PYTHONUNBUFFERED=1 330 | export PYTHONDONTWRITEBYTECODE=1 331 | ``` 332 | 333 | ### Python 配置优化 334 | ```python 335 | # config.py 336 | import asyncio 337 | 338 | # 设置异步事件循环策略 339 | if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'): 340 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 341 | 342 | # 配置日志级别 343 | import logging 344 | logging.getLogger("httpx").setLevel(logging.WARNING) 345 | logging.getLogger("selenium").setLevel(logging.WARNING) 346 | ``` 347 | 348 | ## 📋 配置检查清单 349 | 350 | ### 安装完成检查 351 | - [ ] Python 3.9+ 已安装 352 | - [ ] midscene-python 包已安装 353 | - [ ] 至少一个 AI 提供商已配置 354 | - [ ] Web 驱动程序已安装(如果使用 Web 自动化) 355 | - [ ] ADB 已安装并设备已连接(如果使用 Android 自动化) 356 | 357 | ### 环境配置检查 358 | - [ ] 环境变量已设置 359 | - [ ] API 密钥有效且有足够额度 360 | - [ ] 网络连接正常 361 | - [ ] 防火墙和代理配置正确 362 | 363 | ### 功能测试检查 364 | - [ ] 基础导入测试通过 365 | - [ ] AI 服务初始化成功 366 | - [ ] 选择的平台(Web/Android)测试通过 367 | - [ ] 示例代码可以正常运行 368 | 369 | ## 🔄 升级和维护 370 | 371 | ### 版本升级 372 | ```bash 373 | # 检查当前版本 374 | pip show midscene-python 375 | 376 | # 升级到最新版本 377 | pip install --upgrade midscene-python 378 | 379 | # 升级特定版本 380 | pip install midscene-python==0.2.0 381 | ``` 382 | 383 | ### 配置备份 384 | ```bash 385 | # 备份配置文件 386 | cp .env .env.backup 387 | cp pyproject.toml pyproject.toml.backup 388 | 389 | # 导出依赖列表 390 | pip freeze > requirements.txt 391 | ``` 392 | 393 | ### 清理和重装 394 | ```bash 395 | # 卸载当前版本 396 | pip uninstall midscene-python 397 | 398 | # 清理缓存 399 | pip cache purge 400 | 401 | # 重新安装 402 | pip install midscene-python 403 | ``` 404 | 405 | --- 406 | 407 | 完成配置后,您就可以开始使用 Midscene Python 进行 AI 驱动的自动化了!接下来推荐阅读 [快速开始](快速开始.md) 指南。 -------------------------------------------------------------------------------- /wiki/平台集成/README.md: -------------------------------------------------------------------------------- 1 | # 平台集成 2 | 3 | Midscene Python 支持多个平台的 UI 自动化,提供统一的编程接口和一致的操作体验。 4 | 5 | ## 🏗️ 架构概览 6 | 7 | ```mermaid 8 | graph TB 9 | A[Agent 统一接口] --> B[平台抽象层] 10 | B --> C[Web 自动化] 11 | B --> D[Android 自动化] 12 | 13 | C --> E[Selenium 集成] 14 | C --> F[Playwright 集成] 15 | C --> G[Web 桥接机制] 16 | 17 | D --> H[ADB 设备管理] 18 | D --> I[Android Agent] 19 | 20 | E --> J[ChromeDriver] 21 | E --> K[FirefoxDriver] 22 | F --> L[Chromium] 23 | F --> M[Firefox] 24 | F --> N[Safari] 25 | 26 | H --> O[USB 设备] 27 | H --> P[网络设备] 28 | H --> Q[模拟器] 29 | ``` 30 | 31 | ## 📱 支持的平台 32 | 33 | ### Web 自动化 34 | - **Selenium WebDriver**: 支持 Chrome、Firefox、Safari、Edge 35 | - **Playwright**: 支持 Chromium、Firefox、WebKit 36 | - **统一桥接**: 提供一致的 API 接口 37 | 38 | ### Android 自动化 39 | - **真实设备**: 通过 USB 或 WiFi 连接 40 | - **Android 模拟器**: 支持各种 AVD 配置 41 | - **云设备**: 支持云端设备服务 42 | 43 | ## 🌐 Web 自动化 44 | 45 | ### 快速开始 46 | ```python 47 | import asyncio 48 | from midscene import Agent 49 | from midscene.web import SeleniumWebPage, 
PlaywrightPage 50 | 51 | # Selenium 示例 52 | async def selenium_example(): 53 | with SeleniumWebPage.create() as page: 54 | agent = Agent(page) 55 | await page.goto("https://example.com") 56 | await agent.ai_action("点击登录按钮") 57 | 58 | # Playwright 示例 59 | async def playwright_example(): 60 | async with PlaywrightPage.create() as page: 61 | agent = Agent(page) 62 | await page.goto("https://example.com") 63 | await agent.ai_action("点击登录按钮") 64 | ``` 65 | 66 | ### 高级配置 67 | ```python 68 | from midscene.web import SeleniumWebPage 69 | from selenium.webdriver.chrome.options import Options 70 | 71 | # 自定义浏览器选项 72 | chrome_options = Options() 73 | chrome_options.add_argument("--headless") 74 | chrome_options.add_argument("--no-sandbox") 75 | 76 | page = SeleniumWebPage.create( 77 | browser="chrome", 78 | options=chrome_options, 79 | window_size=(1920, 1080) 80 | ) 81 | ``` 82 | 83 | ### 详细文档 84 | - [Selenium集成](Web自动化/Selenium集成.md) - Selenium WebDriver 完整指南 85 | - [Playwright集成](Web自动化/Playwright集成.md) - Playwright 集成和配置 86 | - [Web桥接机制](Web自动化/Web桥接机制.md) - 统一的 Web 操作抽象 87 | 88 | ## 📱 Android 自动化 89 | 90 | ### 快速开始 91 | ```python 92 | import asyncio 93 | from midscene import Agent 94 | from midscene.android import AndroidDevice 95 | 96 | async def android_example(): 97 | # 连接设备 98 | device = AndroidDevice() 99 | await device.connect() 100 | 101 | # 创建 Agent 102 | agent = Agent(device) 103 | 104 | # 启动应用 105 | await device.start_app("com.example.app") 106 | 107 | # AI 操作 108 | await agent.ai_action("点击登录按钮") 109 | await agent.ai_action("输入用户名 'testuser'") 110 | await agent.ai_action("点击提交") 111 | ``` 112 | 113 | ### 设备管理 114 | ```python 115 | from midscene.android import AndroidDevice, DeviceManager 116 | 117 | # 连接特定设备 118 | device = AndroidDevice(device_id="emulator-5554") 119 | 120 | # 设备管理器 121 | manager = DeviceManager() 122 | devices = await manager.list_devices() 123 | for device in devices: 124 | print(f"设备: {device.id}, 状态: {device.status}") 125 | ``` 126 | 127 | ### 详细文档 128 | - [Android自动化](Android自动化.md) - Android 平台完整指南 129 | 130 | ## 🔄 统一操作接口 131 | 132 | 无论使用哪个平台,Midscene Python 都提供一致的操作接口: 133 | 134 | ### Agent 操作 135 | ```python 136 | # Web 和 Android 使用相同的方法 137 | await agent.ai_action("点击按钮") 138 | await agent.ai_action("输入文本 'hello'") 139 | await agent.ai_action("滚动到底部") 140 | 141 | # 数据提取 142 | data = await agent.ai_extract({ 143 | "title": "页面标题", 144 | "items": ["列表项目"] 145 | }) 146 | 147 | # 状态断言 148 | await agent.ai_assert("页面显示成功消息") 149 | ``` 150 | 151 | ### 页面操作 152 | ```python 153 | # 统一的页面操作 154 | await page.goto("https://example.com") # Web 155 | await device.start_app("com.app") # Android 156 | 157 | # 截图 158 | screenshot = await page.screenshot() # Web 159 | screenshot = await device.screenshot() # Android 160 | 161 | # 获取上下文 162 | context = await page.get_context() # Web 163 | context = await device.get_context() # Android 164 | ``` 165 | 166 | ## 🔧 平台适配机制 167 | 168 | ### AbstractInterface 抽象基类 169 | ```python 170 | from midscene.core.types import AbstractInterface, InterfaceType 171 | 172 | class CustomPlatform(AbstractInterface): 173 | @property 174 | def interface_type(self) -> InterfaceType: 175 | return InterfaceType.WEB # 或 InterfaceType.ANDROID 176 | 177 | async def get_context(self) -> UIContext: 178 | # 实现获取页面/屏幕上下文 179 | pass 180 | 181 | async def tap(self, x: float, y: float) -> None: 182 | # 实现点击操作 183 | pass 184 | 185 | async def input_text(self, text: str) -> None: 186 | # 实现文本输入 187 | pass 188 | ``` 189 | 190 | ### 桥接模式实现 191 | ```python 192 | # Web 
桥接示例 193 | class WebBridge: 194 | def __init__(self, driver_type: str): 195 | if driver_type == "selenium": 196 | self.driver = SeleniumWebDriver() 197 | elif driver_type == "playwright": 198 | self.driver = PlaywrightDriver() 199 | 200 | async def unified_action(self, action: str, **kwargs): 201 | # 统一的操作接口 202 | return await self.driver.execute_action(action, **kwargs) 203 | ``` 204 | 205 | ## 🚀 平台选择指南 206 | 207 | ### Web 平台选择 208 | 209 | #### Selenium 210 | **适用场景**: 211 | - 需要支持多种浏览器 212 | - 与现有 Selenium 项目集成 213 | - 需要特定的 WebDriver 功能 214 | 215 | **优势**: 216 | - 成熟稳定,社区支持好 217 | - 支持的浏览器最多 218 | - 与 Selenium Grid 集成 219 | 220 | **劣势**: 221 | - 性能相对较慢 222 | - API 相对复杂 223 | 224 | #### Playwright 225 | **适用场景**: 226 | - 需要高性能的自动化 227 | - 现代 Web 应用测试 228 | - 需要网络拦截等高级功能 229 | 230 | **优势**: 231 | - 性能优异 232 | - 现代化的 API 设计 233 | - 内置等待和重试机制 234 | 235 | **劣势**: 236 | - 相对较新,生态系统较小 237 | - 学习成本稍高 238 | 239 | ### Android 平台特点 240 | 241 | **适用场景**: 242 | - 移动应用 UI 测试 243 | - 移动端业务流程自动化 244 | - 跨平台应用测试 245 | 246 | **优势**: 247 | - 直接操作原生 Android 界面 248 | - 支持各种 Android 版本 249 | - 可以测试真实设备体验 250 | 251 | **注意事项**: 252 | - 需要 ADB 环境配置 253 | - 设备连接稳定性要求高 254 | - 权限和安全限制较多 255 | 256 | ## 📊 性能对比 257 | 258 | | 特性 | Selenium | Playwright | Android | 259 | |------|----------|------------|---------| 260 | | **启动速度** | 中等 | 快 | 较慢 | 261 | | **执行速度** | 中等 | 快 | 取决于设备 | 262 | | **资源占用** | 中等 | 低 | 高 | 263 | | **稳定性** | 高 | 高 | 中等 | 264 | | **调试难度** | 中等 | 低 | 高 | 265 | 266 | ## 🔗 跨平台最佳实践 267 | 268 | ### 1. 统一测试脚本 269 | ```python 270 | async def universal_test(platform: str): 271 | """跨平台测试脚本""" 272 | 273 | if platform == "web": 274 | page = SeleniumWebPage.create() 275 | agent = Agent(page) 276 | await page.goto("https://app.example.com") 277 | 278 | elif platform == "android": 279 | device = AndroidDevice() 280 | await device.connect() 281 | agent = Agent(device) 282 | await device.start_app("com.example.app") 283 | 284 | # 统一的测试步骤 285 | await agent.ai_action("点击登录按钮") 286 | await agent.ai_action("输入用户名 'test'") 287 | await agent.ai_action("输入密码 'password'") 288 | await agent.ai_action("点击提交") 289 | 290 | # 统一的验证 291 | await agent.ai_assert("显示欢迎页面") 292 | ``` 293 | 294 | ### 2. 配置管理 295 | ```python 296 | # config.py 297 | PLATFORM_CONFIGS = { 298 | "web": { 299 | "browser": "chrome", 300 | "headless": False, 301 | "window_size": (1920, 1080) 302 | }, 303 | "android": { 304 | "device_id": None, # 自动选择 305 | "app_package": "com.example.app", 306 | "timeout": 30 307 | } 308 | } 309 | 310 | def get_platform_config(platform: str) -> dict: 311 | return PLATFORM_CONFIGS.get(platform, {}) 312 | ``` 313 | 314 | ### 3. 
错误处理 315 | ```python 316 | async def robust_platform_operation(agent: Agent, action: str): 317 | """跨平台的健壮操作""" 318 | 319 | max_retries = 3 320 | for attempt in range(max_retries): 321 | try: 322 | await agent.ai_action(action) 323 | return 324 | except Exception as e: 325 | if attempt == max_retries - 1: 326 | raise 327 | 328 | # 根据平台类型进行特定的恢复操作 329 | platform_type = agent.interface.interface_type 330 | if platform_type == InterfaceType.WEB: 331 | await handle_web_error(agent, e) 332 | elif platform_type == InterfaceType.ANDROID: 333 | await handle_android_error(agent, e) 334 | 335 | await asyncio.sleep(1) # 等待后重试 336 | ``` 337 | 338 | ## 🔍 调试和诊断 339 | 340 | ### 统一调试接口 341 | ```python 342 | async def debug_platform_info(agent: Agent): 343 | """获取平台调试信息""" 344 | 345 | interface = agent.interface 346 | platform_type = interface.interface_type 347 | 348 | print(f"平台类型: {platform_type}") 349 | 350 | if platform_type == InterfaceType.WEB: 351 | context = await interface.get_context() 352 | print(f"页面标题: {context.page_title}") 353 | print(f"页面 URL: {context.url}") 354 | print(f"视口大小: {context.size}") 355 | 356 | elif platform_type == InterfaceType.ANDROID: 357 | context = await interface.get_context() 358 | print(f"屏幕尺寸: {context.size}") 359 | print(f"当前活动: {context.current_activity}") 360 | print(f"设备信息: {context.device_info}") 361 | ``` 362 | 363 | ### 跨平台截图 364 | ```python 365 | async def take_debug_screenshot(agent: Agent, filename: str): 366 | """跨平台截图功能""" 367 | 368 | interface = agent.interface 369 | screenshot = await interface.screenshot() 370 | 371 | # 添加平台标识 372 | platform_type = interface.interface_type.value 373 | timestamped_filename = f"{platform_type}_{filename}_{int(time.time())}.png" 374 | 375 | with open(timestamped_filename, "wb") as f: 376 | f.write(screenshot) 377 | 378 | print(f"截图已保存: {timestamped_filename}") 379 | ``` 380 | 381 | --- 382 | 383 | 通过 Midscene Python 的平台集成能力,你可以用统一的方式处理不同平台的自动化需求。选择适合你项目需求的平台,并利用统一的 API 来简化开发和维护工作! 
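
## 🧩 附录：统一入口示例

下面补充一个最小的入口草稿，把上文的统一测试脚本与配置管理两个小节串起来，通过环境变量在 Web 与 Android 之间切换。注意：这只是示意性草稿，其中的环境变量名 MIDSCENE_PLATFORM 属于示例假设，universal_test 与 get_platform_config 也假定已按上文示例在同一文件（或你自己的模块）中定义，请按项目实际情况调整。

```python
import asyncio
import os

# 假设 universal_test 与 get_platform_config 已按上文示例定义，
# 这里直接复用，不再重复实现。


async def main() -> None:
    # 通过环境变量选择目标平台，默认运行 Web 自动化
    # （MIDSCENE_PLATFORM 这个变量名仅为示例假设）
    platform = os.getenv("MIDSCENE_PLATFORM", "web")

    # 读取该平台的配置（见上文的配置管理小节），便于启动前确认
    config = get_platform_config(platform)
    print(f"目标平台: {platform}, 配置: {config}")

    # 复用上文统一测试脚本中的 universal_test，执行统一的登录流程
    await universal_test(platform)


if __name__ == "__main__":
    asyncio.run(main())
```

这样即可在命令行通过设置环境变量来切换目标平台，而无需修改脚本本身。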
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile --all-extras pyproject.toml -o requirements.txt 3 | aiohappyeyeballs==2.6.1 4 | # via aiohttp 5 | aiohttp==3.12.15 6 | # via 7 | # midscene-python (pyproject.toml) 8 | # dashscope 9 | aiosignal==1.4.0 10 | # via aiohttp 11 | annotated-types==0.7.0 12 | # via pydantic 13 | anthropic==0.64.0 14 | # via midscene-python (pyproject.toml) 15 | anyio==4.10.0 16 | # via 17 | # anthropic 18 | # httpx 19 | # openai 20 | asyncio-mqtt==0.16.2 21 | # via midscene-python (pyproject.toml) 22 | attrs==25.3.0 23 | # via 24 | # aiohttp 25 | # outcome 26 | # trio 27 | babel==2.17.0 28 | # via mkdocs-material 29 | backrefs==5.9 30 | # via mkdocs-material 31 | black==25.1.0 32 | # via midscene-python (pyproject.toml) 33 | cachetools==5.5.2 34 | # via google-auth 35 | certifi==2025.8.3 36 | # via 37 | # httpcore 38 | # httpx 39 | # requests 40 | # selenium 41 | cffi==1.17.1 42 | # via 43 | # cryptography 44 | # trio 45 | cfgv==3.4.0 46 | # via pre-commit 47 | charset-normalizer==3.4.3 48 | # via requests 49 | click==8.2.1 50 | # via 51 | # black 52 | # mkdocs 53 | # mkdocs-material 54 | # typer 55 | colorama==0.4.6 56 | # via 57 | # click 58 | # griffe 59 | # loguru 60 | # mkdocs 61 | # mkdocs-material 62 | # pytest 63 | # tqdm 64 | coverage==7.10.6 65 | # via pytest-cov 66 | cryptography==45.0.7 67 | # via dashscope 68 | dashscope==1.24.2 69 | # via midscene-python (pyproject.toml) 70 | distlib==0.4.0 71 | # via virtualenv 72 | distro==1.9.0 73 | # via 74 | # anthropic 75 | # openai 76 | filelock==3.19.1 77 | # via virtualenv 78 | frozenlist==1.7.0 79 | # via 80 | # aiohttp 81 | # aiosignal 82 | ghp-import==2.1.0 83 | # via mkdocs 84 | google-ai-generativelanguage==0.6.15 85 | # via google-generativeai 86 | google-api-core==2.25.1 87 | # via 88 | # google-ai-generativelanguage 89 | # google-api-python-client 90 | # google-generativeai 91 | google-api-python-client==2.179.0 92 | # via google-generativeai 93 | google-auth==2.40.3 94 | # via 95 | # google-ai-generativelanguage 96 | # google-api-core 97 | # google-api-python-client 98 | # google-auth-httplib2 99 | # google-generativeai 100 | google-auth-httplib2==0.2.0 101 | # via google-api-python-client 102 | google-generativeai==0.8.5 103 | # via midscene-python (pyproject.toml) 104 | googleapis-common-protos==1.70.0 105 | # via 106 | # google-api-core 107 | # grpcio-status 108 | greenlet==3.2.4 109 | # via playwright 110 | griffe==1.13.0 111 | # via mkdocstrings-python 112 | grpcio==1.74.0 113 | # via 114 | # google-api-core 115 | # grpcio-status 116 | grpcio-status==1.71.2 117 | # via google-api-core 118 | h11==0.16.0 119 | # via 120 | # httpcore 121 | # wsproto 122 | httpcore==1.0.9 123 | # via httpx 124 | httplib2==0.30.0 125 | # via 126 | # google-api-python-client 127 | # google-auth-httplib2 128 | httpx==0.28.1 129 | # via 130 | # midscene-python (pyproject.toml) 131 | # anthropic 132 | # openai 133 | identify==2.6.13 134 | # via pre-commit 135 | idna==3.10 136 | # via 137 | # anyio 138 | # httpx 139 | # requests 140 | # trio 141 | # yarl 142 | iniconfig==2.1.0 143 | # via pytest 144 | isort==6.0.1 145 | # via midscene-python (pyproject.toml) 146 | jinja2==3.1.6 147 | # via 148 | # midscene-python (pyproject.toml) 149 | # mkdocs 150 | # mkdocs-material 151 | # mkdocstrings 152 | 
jiter==0.10.0 153 | # via 154 | # anthropic 155 | # openai 156 | loguru==0.7.3 157 | # via midscene-python (pyproject.toml) 158 | markdown==3.8.2 159 | # via 160 | # mkdocs 161 | # mkdocs-autorefs 162 | # mkdocs-material 163 | # mkdocstrings 164 | # pymdown-extensions 165 | markdown-it-py==4.0.0 166 | # via rich 167 | markupsafe==3.0.2 168 | # via 169 | # jinja2 170 | # mkdocs 171 | # mkdocs-autorefs 172 | # mkdocstrings 173 | mdurl==0.1.2 174 | # via markdown-it-py 175 | mergedeep==1.3.4 176 | # via 177 | # mkdocs 178 | # mkdocs-get-deps 179 | mkdocs==1.6.1 180 | # via 181 | # midscene-python (pyproject.toml) 182 | # mkdocs-autorefs 183 | # mkdocs-material 184 | # mkdocstrings 185 | mkdocs-autorefs==1.4.3 186 | # via 187 | # mkdocstrings 188 | # mkdocstrings-python 189 | mkdocs-get-deps==0.2.0 190 | # via mkdocs 191 | mkdocs-material==9.6.18 192 | # via midscene-python (pyproject.toml) 193 | mkdocs-material-extensions==1.3.1 194 | # via mkdocs-material 195 | mkdocstrings==0.30.0 196 | # via 197 | # midscene-python (pyproject.toml) 198 | # mkdocstrings-python 199 | mkdocstrings-python==1.18.2 200 | # via mkdocstrings 201 | multidict==6.6.4 202 | # via 203 | # aiohttp 204 | # yarl 205 | mypy==1.17.1 206 | # via midscene-python (pyproject.toml) 207 | mypy-extensions==1.1.0 208 | # via 209 | # black 210 | # mypy 211 | nodeenv==1.9.1 212 | # via pre-commit 213 | numpy==1.26.4 214 | # via 215 | # midscene-python (pyproject.toml) 216 | # opencv-python 217 | openai==1.102.0 218 | # via midscene-python (pyproject.toml) 219 | opencv-python==4.11.0.86 220 | # via midscene-python (pyproject.toml) 221 | outcome==1.3.0.post0 222 | # via 223 | # trio 224 | # trio-websocket 225 | packaging==25.0 226 | # via 227 | # black 228 | # mkdocs 229 | # pytest 230 | paginate==0.5.7 231 | # via mkdocs-material 232 | paho-mqtt==2.1.0 233 | # via asyncio-mqtt 234 | pathspec==0.12.1 235 | # via 236 | # black 237 | # mkdocs 238 | # mypy 239 | pillow==10.4.0 240 | # via midscene-python (pyproject.toml) 241 | platformdirs==4.4.0 242 | # via 243 | # black 244 | # mkdocs-get-deps 245 | # virtualenv 246 | playwright==1.55.0 247 | # via midscene-python (pyproject.toml) 248 | pluggy==1.6.0 249 | # via 250 | # pytest 251 | # pytest-cov 252 | pre-commit==4.3.0 253 | # via midscene-python (pyproject.toml) 254 | propcache==0.3.2 255 | # via 256 | # aiohttp 257 | # yarl 258 | proto-plus==1.26.1 259 | # via 260 | # google-ai-generativelanguage 261 | # google-api-core 262 | protobuf==5.29.5 263 | # via 264 | # google-ai-generativelanguage 265 | # google-api-core 266 | # google-generativeai 267 | # googleapis-common-protos 268 | # grpcio-status 269 | # proto-plus 270 | pure-python-adb==0.3.0.dev0 271 | # via midscene-python (pyproject.toml) 272 | pyasn1==0.6.1 273 | # via 274 | # pyasn1-modules 275 | # rsa 276 | pyasn1-modules==0.4.2 277 | # via google-auth 278 | pycparser==2.22 279 | # via cffi 280 | pydantic==2.11.7 281 | # via 282 | # midscene-python (pyproject.toml) 283 | # anthropic 284 | # google-generativeai 285 | # openai 286 | pydantic-core==2.33.2 287 | # via pydantic 288 | pyee==13.0.0 289 | # via playwright 290 | pygments==2.19.2 291 | # via 292 | # mkdocs-material 293 | # pytest 294 | # rich 295 | pymdown-extensions==10.16.1 296 | # via 297 | # mkdocs-material 298 | # mkdocstrings 299 | pyparsing==3.2.3 300 | # via httplib2 301 | pysocks==1.7.1 302 | # via urllib3 303 | pytest==8.4.1 304 | # via 305 | # midscene-python (pyproject.toml) 306 | # pytest-asyncio 307 | # pytest-cov 308 | pytest-asyncio==1.1.0 309 | # via 
midscene-python (pyproject.toml) 310 | pytest-cov==6.2.1 311 | # via midscene-python (pyproject.toml) 312 | python-dateutil==2.9.0.post0 313 | # via ghp-import 314 | pyyaml==6.0.2 315 | # via 316 | # midscene-python (pyproject.toml) 317 | # mkdocs 318 | # mkdocs-get-deps 319 | # pre-commit 320 | # pymdown-extensions 321 | # pyyaml-env-tag 322 | pyyaml-env-tag==1.1 323 | # via mkdocs 324 | requests==2.32.5 325 | # via 326 | # dashscope 327 | # google-api-core 328 | # mkdocs-material 329 | rich==14.1.0 330 | # via typer 331 | rsa==4.9.1 332 | # via google-auth 333 | ruff==0.12.11 334 | # via midscene-python (pyproject.toml) 335 | selenium==4.35.0 336 | # via midscene-python (pyproject.toml) 337 | shellingham==1.5.4 338 | # via typer 339 | six==1.17.0 340 | # via python-dateutil 341 | sniffio==1.3.1 342 | # via 343 | # anthropic 344 | # anyio 345 | # openai 346 | # trio 347 | sortedcontainers==2.4.0 348 | # via trio 349 | tqdm==4.67.1 350 | # via 351 | # google-generativeai 352 | # openai 353 | trio==0.30.0 354 | # via 355 | # selenium 356 | # trio-websocket 357 | trio-websocket==0.12.2 358 | # via selenium 359 | typer==0.17.3 360 | # via midscene-python (pyproject.toml) 361 | typing-extensions==4.14.1 362 | # via 363 | # aiosignal 364 | # anthropic 365 | # anyio 366 | # google-generativeai 367 | # mypy 368 | # openai 369 | # pydantic 370 | # pydantic-core 371 | # pyee 372 | # selenium 373 | # typer 374 | # typing-inspection 375 | typing-inspection==0.4.1 376 | # via pydantic 377 | uritemplate==4.2.0 378 | # via google-api-python-client 379 | urllib3==2.5.0 380 | # via 381 | # requests 382 | # selenium 383 | virtualenv==20.34.0 384 | # via pre-commit 385 | watchdog==6.0.0 386 | # via mkdocs 387 | websocket-client==1.8.0 388 | # via 389 | # dashscope 390 | # selenium 391 | win32-setctime==1.2.0 392 | # via loguru 393 | wsproto==1.2.0 394 | # via trio-websocket 395 | yarl==1.20.1 396 | # via aiohttp 397 | -------------------------------------------------------------------------------- /midscene/shared/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | Task caching system for performance optimization 3 | """ 4 | 5 | import json 6 | import hashlib 7 | import pickle 8 | from datetime import datetime, timedelta 9 | from pathlib import Path 10 | from typing import Any, Dict, List, Optional, Union 11 | 12 | from loguru import logger 13 | from pydantic import BaseModel 14 | 15 | 16 | class CacheEntry(BaseModel): 17 | """Cache entry model""" 18 | key: str 19 | data: Any 20 | timestamp: datetime 21 | expires_at: Optional[datetime] = None 22 | metadata: Dict[str, Any] = {} 23 | 24 | 25 | class TaskCache: 26 | """Task caching system for storing and retrieving execution results""" 27 | 28 | def __init__( 29 | self, 30 | cache_id: str, 31 | enabled: bool = True, 32 | cache_dir: Optional[str] = None, 33 | max_age_hours: int = 24 34 | ): 35 | """Initialize task cache 36 | 37 | Args: 38 | cache_id: Unique cache identifier 39 | enabled: Whether caching is enabled 40 | cache_dir: Cache directory path 41 | max_age_hours: Maximum cache age in hours 42 | """ 43 | self.cache_id = cache_id 44 | self.enabled = enabled 45 | self.max_age_hours = max_age_hours 46 | 47 | # Setup cache directory 48 | if cache_dir: 49 | self.cache_dir = Path(cache_dir) 50 | else: 51 | self.cache_dir = Path.home() / ".midscene" / "cache" 52 | 53 | self.cache_dir.mkdir(parents=True, exist_ok=True) 54 | self.cache_file = self.cache_dir / f"{cache_id}.json" 55 | 56 | # Load existing cache 57 | 
self._cache: Dict[str, CacheEntry] = {} 58 | self._load_cache() 59 | 60 | def _generate_key(self, data: Union[str, Dict, List]) -> str: 61 | """Generate cache key from data 62 | 63 | Args: 64 | data: Data to generate key from 65 | 66 | Returns: 67 | Cache key string 68 | """ 69 | if isinstance(data, str): 70 | content = data 71 | else: 72 | content = json.dumps(data, sort_keys=True, ensure_ascii=False) 73 | 74 | return hashlib.md5(content.encode('utf-8')).hexdigest() 75 | 76 | def _load_cache(self) -> None: 77 | """Load cache from file""" 78 | if not self.enabled or not self.cache_file.exists(): 79 | return 80 | 81 | try: 82 | with open(self.cache_file, 'r', encoding='utf-8') as f: 83 | cache_data = json.load(f) 84 | 85 | for key, entry_data in cache_data.items(): 86 | # Convert datetime strings back to datetime objects 87 | entry_data['timestamp'] = datetime.fromisoformat(entry_data['timestamp']) 88 | if entry_data.get('expires_at'): 89 | entry_data['expires_at'] = datetime.fromisoformat(entry_data['expires_at']) 90 | 91 | self._cache[key] = CacheEntry(**entry_data) 92 | 93 | # Clean expired entries 94 | self._clean_expired() 95 | 96 | logger.debug(f"Loaded {len(self._cache)} cache entries") 97 | 98 | except Exception as e: 99 | logger.warning(f"Failed to load cache: {e}") 100 | self._cache = {} 101 | 102 | def _save_cache(self) -> None: 103 | """Save cache to file""" 104 | if not self.enabled: 105 | return 106 | 107 | try: 108 | cache_data = {} 109 | for key, entry in self._cache.items(): 110 | entry_dict = entry.model_dump() 111 | # Convert datetime objects to strings 112 | entry_dict['timestamp'] = entry.timestamp.isoformat() 113 | if entry.expires_at: 114 | entry_dict['expires_at'] = entry.expires_at.isoformat() 115 | 116 | cache_data[key] = entry_dict 117 | 118 | with open(self.cache_file, 'w', encoding='utf-8') as f: 119 | json.dump(cache_data, f, ensure_ascii=False, indent=2) 120 | 121 | except Exception as e: 122 | logger.warning(f"Failed to save cache: {e}") 123 | 124 | def _clean_expired(self) -> None: 125 | """Clean expired cache entries""" 126 | now = datetime.now() 127 | expired_keys = [] 128 | 129 | for key, entry in self._cache.items(): 130 | # Check explicit expiration 131 | if entry.expires_at and entry.expires_at <= now: 132 | expired_keys.append(key) 133 | continue 134 | 135 | # Check age-based expiration 136 | age = now - entry.timestamp 137 | if age > timedelta(hours=self.max_age_hours): 138 | expired_keys.append(key) 139 | 140 | for key in expired_keys: 141 | del self._cache[key] 142 | 143 | if expired_keys: 144 | logger.debug(f"Cleaned {len(expired_keys)} expired cache entries") 145 | 146 | def get(self, key: str) -> Optional[Any]: 147 | """Get cached data by key 148 | 149 | Args: 150 | key: Cache key 151 | 152 | Returns: 153 | Cached data or None if not found 154 | """ 155 | if not self.enabled: 156 | return None 157 | 158 | entry = self._cache.get(key) 159 | if not entry: 160 | return None 161 | 162 | # Check if expired 163 | now = datetime.now() 164 | if entry.expires_at and entry.expires_at <= now: 165 | del self._cache[key] 166 | return None 167 | 168 | # Check age 169 | age = now - entry.timestamp 170 | if age > timedelta(hours=self.max_age_hours): 171 | del self._cache[key] 172 | return None 173 | 174 | logger.debug(f"Cache hit for key: {key}") 175 | return entry.data 176 | 177 | def put( 178 | self, 179 | key: str, 180 | data: Any, 181 | expires_in_hours: Optional[int] = None, 182 | metadata: Optional[Dict[str, Any]] = None 183 | ) -> None: 184 | """Store 
data in cache 185 | 186 | Args: 187 | key: Cache key 188 | data: Data to cache 189 | expires_in_hours: Custom expiration time in hours 190 | metadata: Additional metadata 191 | """ 192 | if not self.enabled: 193 | return 194 | 195 | now = datetime.now() 196 | expires_at = None 197 | 198 | if expires_in_hours: 199 | expires_at = now + timedelta(hours=expires_in_hours) 200 | 201 | entry = CacheEntry( 202 | key=key, 203 | data=data, 204 | timestamp=now, 205 | expires_at=expires_at, 206 | metadata=metadata or {} 207 | ) 208 | 209 | self._cache[key] = entry 210 | self._save_cache() 211 | 212 | logger.debug(f"Cached data with key: {key}") 213 | 214 | def get_by_data(self, data: Union[str, Dict, List]) -> Optional[Any]: 215 | """Get cached data by input data 216 | 217 | Args: 218 | data: Input data to generate key from 219 | 220 | Returns: 221 | Cached result or None 222 | """ 223 | key = self._generate_key(data) 224 | return self.get(key) 225 | 226 | def put_by_data( 227 | self, 228 | input_data: Union[str, Dict, List], 229 | result_data: Any, 230 | expires_in_hours: Optional[int] = None, 231 | metadata: Optional[Dict[str, Any]] = None 232 | ) -> None: 233 | """Store data in cache by input data 234 | 235 | Args: 236 | input_data: Input data to generate key from 237 | result_data: Result data to cache 238 | expires_in_hours: Custom expiration time in hours 239 | metadata: Additional metadata 240 | """ 241 | key = self._generate_key(input_data) 242 | self.put(key, result_data, expires_in_hours, metadata) 243 | 244 | def match_locate_cache(self, prompt: str) -> Optional[Dict[str, Any]]: 245 | """Match locate operation from cache 246 | 247 | Args: 248 | prompt: Locate prompt 249 | 250 | Returns: 251 | Cached locate result or None 252 | """ 253 | cache_key = f"locate:{self._generate_key(prompt)}" 254 | return self.get(cache_key) 255 | 256 | def store_locate_result( 257 | self, 258 | prompt: str, 259 | result: Dict[str, Any], 260 | expires_in_hours: int = 24 261 | ) -> None: 262 | """Store locate result in cache 263 | 264 | Args: 265 | prompt: Locate prompt 266 | result: Locate result 267 | expires_in_hours: Expiration time in hours 268 | """ 269 | cache_key = f"locate:{self._generate_key(prompt)}" 270 | self.put(cache_key, result, expires_in_hours, {"type": "locate"}) 271 | 272 | def clear(self) -> None: 273 | """Clear all cache entries""" 274 | self._cache.clear() 275 | if self.cache_file.exists(): 276 | self.cache_file.unlink() 277 | logger.info("Cache cleared") 278 | 279 | def get_stats(self) -> Dict[str, Any]: 280 | """Get cache statistics 281 | 282 | Returns: 283 | Cache statistics 284 | """ 285 | now = datetime.now() 286 | total_entries = len(self._cache) 287 | 288 | expired_count = 0 289 | for entry in self._cache.values(): 290 | if entry.expires_at and entry.expires_at <= now: 291 | expired_count += 1 292 | elif (now - entry.timestamp) > timedelta(hours=self.max_age_hours): 293 | expired_count += 1 294 | 295 | return { 296 | "total_entries": total_entries, 297 | "expired_entries": expired_count, 298 | "cache_file": str(self.cache_file), 299 | "cache_size_mb": self.cache_file.stat().st_size / 1024 / 1024 if self.cache_file.exists() else 0, 300 | "enabled": self.enabled 301 | } -------------------------------------------------------------------------------- /wiki/核心概念/Agent核心控制器.md: -------------------------------------------------------------------------------- 1 | # Agent 核心控制器 2 | 3 | Agent 是 Midscene Python 的核心控制器,为用户提供统一的自动化操作接口。它充当用户代码与底层平台之间的桥梁,通过 AI 理解用户意图并执行相应的操作。 4 | 5 | ## 🎯 设计理念 6 | 
7 | ### 统一接口设计 8 | Agent 为不同平台(Web、Android)提供完全一致的编程接口,用户无需学习不同平台的特定 API: 9 | 10 | ```python 11 | # Web 和 Android 使用相同的接口 12 | web_agent = Agent(selenium_page) 13 | android_agent = Agent(android_device) 14 | 15 | # 相同的操作方法 16 | await web_agent.ai_action("点击登录按钮") 17 | await android_agent.ai_action("点击登录按钮") 18 | ``` 19 | 20 | ### AI 驱动的智能操作 21 | Agent 将自然语言指令转换为具体的操作步骤,让自动化变得更加直观: 22 | 23 | ```python 24 | # 传统方式需要精确的选择器 25 | element = driver.find_element(By.CSS_SELECTOR, "#login-form button[type='submit']") 26 | element.click() 27 | 28 | # Agent 方式使用自然语言 29 | await agent.ai_action("点击登录表单的提交按钮") 30 | ``` 31 | 32 | ## 🏗️ 架构设计 33 | 34 | ### 核心组件 35 | 36 | ```mermaid 37 | graph TB 38 | A[Agent] --> B[TaskExecutor] 39 | A --> C[Insight Engine] 40 | A --> D[AI Service] 41 | A --> E[Platform Interface] 42 | 43 | B --> C 44 | B --> E 45 | C --> D 46 | 47 | subgraph "Agent 核心" 48 | A 49 | B 50 | end 51 | 52 | subgraph "AI 理解层" 53 | C 54 | D 55 | end 56 | 57 | subgraph "平台抽象层" 58 | E 59 | end 60 | ``` 61 | 62 | ### Agent 类结构 63 | 64 | ```python 65 | class Agent: 66 | """Core Agent class that orchestrates AI model and device interactions""" 67 | 68 | def __init__( 69 | self, 70 | interface: AbstractInterface, 71 | options: Optional[AgentOptions] = None 72 | ): 73 | self.interface = interface # 平台接口 74 | self.options = options or AgentOptions() # 配置选项 75 | self.ai_service = AIModelService() # AI 服务 76 | self.insight = Insight(...) # UI 理解引擎 77 | self.task_executor = TaskExecutor(...) # 任务执行器 78 | ``` 79 | 80 | ## 🎮 主要功能 81 | 82 | ### 1. AI 驱动的操作 (ai_action) 83 | 84 | `ai_action` 是 Agent 最核心的方法,支持各种自然语言驱动的操作: 85 | 86 | ```python 87 | # 基础交互 88 | await agent.ai_action("点击登录按钮") 89 | await agent.ai_action("在用户名框输入 'admin'") 90 | await agent.ai_action("选择下拉菜单中的第二个选项") 91 | 92 | # 复杂操作 93 | await agent.ai_action("滚动到页面底部并点击加载更多按钮") 94 | await agent.ai_action("在搜索框输入'Python'并按回车搜索") 95 | 96 | # 条件操作 97 | await agent.ai_action("如果页面显示错误信息,点击确定按钮") 98 | ``` 99 | 100 | #### 工作流程 101 | 102 | 1. **指令解析**: 将自然语言转换为操作意图 103 | 2. **页面分析**: 获取当前页面的截图和上下文信息 104 | 3. **计划生成**: AI 生成详细的执行计划 105 | 4. **步骤执行**: 逐步执行计划中的每个操作 106 | 5. **结果验证**: 验证操作是否成功完成 107 | 108 | ```python 109 | async def ai_action(self, prompt: TUserPrompt, **kwargs) -> None: 110 | """Execute AI-driven action""" 111 | self._ensure_not_destroyed() 112 | 113 | # 委托给任务执行器 114 | result = await self.task_executor.execute_ai_action(prompt, **kwargs) 115 | 116 | if not result.success: 117 | raise Exception(f"Action failed: {result.error}") 118 | ``` 119 | 120 | ### 2. 智能元素定位 (ai_locate) 121 | 122 | 精确定位页面元素,支持各种描述方式: 123 | 124 | ```python 125 | # 基础定位 126 | login_btn = await agent.ai_locate("登录按钮") 127 | search_box = await agent.ai_locate("搜索输入框") 128 | 129 | # 描述性定位 130 | submit_btn = await agent.ai_locate("蓝色的提交按钮") 131 | user_avatar = await agent.ai_locate("页面右上角的用户头像") 132 | 133 | # 相对定位 134 | next_btn = await agent.ai_locate("位于分页控件中的下一页按钮") 135 | ``` 136 | 137 | #### 定位策略 138 | 139 | Agent 使用多种策略进行元素定位: 140 | 141 | 1. **视觉识别**: 基于截图进行 AI 视觉识别 142 | 2. **语义理解**: 理解元素的功能和上下文 143 | 3. **多重验证**: 结合多种信息确保定位准确性 144 | 4. **容错机制**: 支持页面变化和布局调整 145 | 146 | ### 3. 
数据提取 (ai_extract) 147 | 148 | 从页面提取结构化数据: 149 | 150 | ```python 151 | # 提取单个对象 152 | user_info = await agent.ai_extract({ 153 | "name": "用户姓名", 154 | "email": "邮箱地址", 155 | "role": "用户角色" 156 | }) 157 | 158 | # 提取列表数据 159 | products = await agent.ai_extract({ 160 | "products": [ 161 | { 162 | "name": "商品名称", 163 | "price": "价格", 164 | "rating": "评分", 165 | "in_stock": "是否有货" 166 | } 167 | ] 168 | }) 169 | 170 | # 复杂嵌套结构 171 | order_data = await agent.ai_extract({ 172 | "order_id": "订单号", 173 | "customer": { 174 | "name": "客户姓名", 175 | "address": "送货地址" 176 | }, 177 | "items": [ 178 | { 179 | "product": "商品名称", 180 | "quantity": "数量", 181 | "price": "单价" 182 | } 183 | ], 184 | "total": "总金额" 185 | }) 186 | ``` 187 | 188 | ### 4. 智能断言 (ai_assert) 189 | 190 | 验证页面状态和内容: 191 | 192 | ```python 193 | # 状态验证 194 | await agent.ai_assert("用户已成功登录") 195 | await agent.ai_assert("页面显示错误信息") 196 | await agent.ai_assert("表单验证通过") 197 | 198 | # 内容验证 199 | await agent.ai_assert("搜索结果包含'Python 教程'") 200 | await agent.ai_assert("购物车中有 3 件商品") 201 | await agent.ai_assert("订单状态为已发货") 202 | 203 | # 条件验证 204 | await agent.ai_assert("如果是新用户,显示欢迎向导") 205 | ``` 206 | 207 | ## ⚙️ 配置选项 208 | 209 | ### AgentOptions 配置 210 | 211 | ```python 212 | from midscene.core import AgentOptions 213 | 214 | options = AgentOptions( 215 | # 超时设置 216 | timeout=30, # 操作超时时间(秒) 217 | 218 | # 重试机制 219 | retry_count=3, # 失败重试次数 220 | retry_delay=1.0, # 重试间隔(秒) 221 | 222 | # 调试选项 223 | screenshot_on_error=True, # 错误时自动截图 224 | save_execution_logs=True, # 保存执行日志 225 | 226 | # 性能优化 227 | cache_enabled=True, # 启用智能缓存 228 | parallel_execution=False, # 并行执行(实验性) 229 | 230 | # AI 模型设置 231 | model_temperature=0.1, # AI 响应随机性 232 | max_tokens=1000, # 最大 token 数 233 | ) 234 | 235 | agent = Agent(page, options=options) 236 | ``` 237 | 238 | ### 运行时配置 239 | 240 | ```python 241 | # 临时修改超时时间 242 | await agent.ai_action("点击按钮", timeout=60) 243 | 244 | # 禁用缓存的单次操作 245 | await agent.ai_extract(schema, use_cache=False) 246 | 247 | # 自定义重试策略 248 | await agent.ai_action("提交表单", retry_count=5, retry_delay=2.0) 249 | ``` 250 | 251 | ## 🔄 生命周期管理 252 | 253 | ### 初始化和销毁 254 | 255 | ```python 256 | # 方式1: 手动管理 257 | agent = Agent(page) 258 | try: 259 | await agent.ai_action("执行操作") 260 | finally: 261 | await agent.destroy() 262 | 263 | # 方式2: 上下文管理器(推荐) 264 | async with Agent(page) as agent: 265 | await agent.ai_action("执行操作") 266 | # 自动调用 destroy() 267 | ``` 268 | 269 | ### 状态冻结 270 | 271 | ```python 272 | # 冻结当前页面状态(用于调试) 273 | await agent.freeze() 274 | 275 | # 在冻结状态下进行多次操作 276 | await agent.ai_extract(schema1) 277 | await agent.ai_extract(schema2) 278 | 279 | # 解除冻结 280 | await agent.unfreeze() 281 | ``` 282 | 283 | ## 🔧 高级特性 284 | 285 | ### 1. 自定义 AI 模型 286 | 287 | ```python 288 | from midscene.core.ai_model import AIModelConfig 289 | 290 | # 自定义模型配置 291 | ai_config = AIModelConfig( 292 | provider="openai", 293 | model="gpt-4-vision-preview", 294 | temperature=0.0, 295 | max_tokens=2000, 296 | api_key="your_api_key" 297 | ) 298 | 299 | agent = Agent(page, ai_config=ai_config) 300 | ``` 301 | 302 | ### 2. 操作链式调用 303 | 304 | ```python 305 | # 链式操作 306 | await (agent 307 | .ai_action("点击登录") 308 | .ai_action("输入用户名") 309 | .ai_action("输入密码") 310 | .ai_action("点击提交")) 311 | ``` 312 | 313 | ### 3. 
事件监听 314 | 315 | ```python 316 | # 操作前后的钩子函数 317 | @agent.on_before_action 318 | async def before_action(prompt: str, context: UIContext): 319 | print(f"即将执行: {prompt}") 320 | 321 | @agent.on_after_action 322 | async def after_action(prompt: str, result: ExecutionResult): 323 | print(f"执行完成: {prompt}, 结果: {result.success}") 324 | ``` 325 | 326 | ### 4. 批量操作 327 | 328 | ```python 329 | # 批量执行多个操作 330 | actions = [ 331 | "点击菜单按钮", 332 | "选择设置选项", 333 | "修改用户信息", 334 | "保存更改" 335 | ] 336 | 337 | results = await agent.batch_execute(actions) 338 | ``` 339 | 340 | ## 📊 性能优化 341 | 342 | ### 智能缓存 343 | 344 | Agent 内置智能缓存机制,避免重复的 AI 调用: 345 | 346 | ```python 347 | # 首次调用会请求 AI 模型 348 | result1 = await agent.ai_extract(schema) 349 | 350 | # 相同 schema 和页面状态会使用缓存 351 | result2 = await agent.ai_extract(schema) # 使用缓存,更快 352 | 353 | # 强制禁用缓存 354 | result3 = await agent.ai_extract(schema, use_cache=False) 355 | ``` 356 | 357 | ### 并发控制 358 | 359 | ```python 360 | # 控制并发数量,避免过多 AI 请求 361 | agent.set_concurrency_limit(3) 362 | 363 | # 异步执行多个独立操作 364 | import asyncio 365 | 366 | tasks = [ 367 | agent.ai_extract(schema1), 368 | agent.ai_extract(schema2), 369 | agent.ai_extract(schema3) 370 | ] 371 | 372 | results = await asyncio.gather(*tasks) 373 | ``` 374 | 375 | ## 🚨 错误处理 376 | 377 | ### 异常类型 378 | 379 | ```python 380 | from midscene.core.exceptions import ( 381 | AgentError, 382 | ElementNotFoundError, 383 | OperationTimeoutError, 384 | AIServiceError 385 | ) 386 | 387 | try: 388 | await agent.ai_action("点击不存在的按钮") 389 | except ElementNotFoundError as e: 390 | print(f"元素未找到: {e}") 391 | except OperationTimeoutError as e: 392 | print(f"操作超时: {e}") 393 | except AIServiceError as e: 394 | print(f"AI 服务错误: {e}") 395 | ``` 396 | 397 | ### 重试机制 398 | 399 | ```python 400 | # 自动重试配置 401 | options = AgentOptions( 402 | retry_count=3, 403 | retry_delay=1.0, 404 | retry_on_errors=[ElementNotFoundError, OperationTimeoutError] 405 | ) 406 | 407 | # 手动重试 408 | from midscene.shared.retry import retry_async 409 | 410 | @retry_async(max_attempts=3, delay=1.0) 411 | async def robust_action(): 412 | await agent.ai_action("点击可能不稳定的元素") 413 | ``` 414 | 415 | ## 🔍 调试和诊断 416 | 417 | ### 详细日志 418 | 419 | ```python 420 | import logging 421 | from midscene.shared.logger import setup_logger 422 | 423 | # 启用详细日志 424 | setup_logger(level=logging.DEBUG) 425 | 426 | # 操作执行时会输出详细信息 427 | await agent.ai_action("点击按钮") 428 | ``` 429 | 430 | ### 执行报告 431 | 432 | ```python 433 | # 生成详细的执行报告 434 | report = await agent.generate_report() 435 | print(f"总操作数: {report.total_actions}") 436 | print(f"成功率: {report.success_rate}") 437 | print(f"平均执行时间: {report.avg_execution_time}") 438 | 439 | # 保存报告到文件 440 | await report.save_to_file("execution_report.html") 441 | ``` 442 | 443 | ### 手动调试 444 | 445 | ```python 446 | # 获取当前页面状态 447 | context = await agent.get_current_context() 448 | print(f"页面标题: {context.page_title}") 449 | print(f"页面 URL: {context.url}") 450 | 451 | # 手动截图 452 | screenshot = await agent.screenshot() 453 | with open("debug.png", "wb") as f: 454 | f.write(screenshot) 455 | 456 | # 获取页面元素信息 457 | elements = await agent.get_all_elements() 458 | for element in elements: 459 | print(f"元素: {element.tag_name}, 文本: {element.text}") 460 | ``` 461 | 462 | ## 🎯 最佳实践 463 | 464 | ### 1. 清晰的操作描述 465 | ```python 466 | # ❌ 模糊的描述 467 | await agent.ai_action("点击按钮") 468 | 469 | # ✅ 具体的描述 470 | await agent.ai_action("点击页面右上角的蓝色登录按钮") 471 | ``` 472 | 473 | ### 2. 
合理的超时设置 474 | ```python 475 | # 根据操作复杂度设置超时 476 | await agent.ai_action("点击按钮", timeout=10) # 简单操作 477 | await agent.ai_action("等待页面加载完成", timeout=30) # 复杂操作 478 | ``` 479 | 480 | ### 3. 错误处理 481 | ```python 482 | # 优雅的错误处理 483 | try: 484 | await agent.ai_action("尝试点击可能不存在的按钮") 485 | except ElementNotFoundError: 486 | # 执行备选方案 487 | await agent.ai_action("点击替代按钮") 488 | ``` 489 | 490 | ### 4. 资源管理 491 | ```python 492 | # 使用上下文管理器确保资源释放 493 | async with Agent(page) as agent: 494 | await agent.ai_action("执行操作") 495 | # 自动清理资源 496 | ``` 497 | 498 | ## 🔗 相关文档 499 | 500 | - **API 参考**: [Agent API 完整文档](../API参考/Agent-API.md) 501 | - **UI 理解**: [Insight UI理解引擎](Insight-UI理解引擎.md) 502 | - **平台集成**: [Web自动化](../平台集成/Web自动化/README.md) | [Android自动化](../平台集成/Android自动化.md) 503 | - **示例代码**: [基础示例](../示例和教程/基础示例.md) 504 | 505 | --- 506 | 507 | Agent 是 Midscene Python 的核心,掌握了 Agent 的使用就掌握了框架的精髓。继续探索其他核心概念来深入理解整个框架的工作原理! -------------------------------------------------------------------------------- /midscene/web/bridge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bridge mode implementation for Chrome extension integration 3 | """ 4 | 5 | import asyncio 6 | import json 7 | import websockets 8 | from typing import Any, Dict, List, Optional 9 | from loguru import logger 10 | 11 | from ..core.types import ( 12 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree, 13 | Size, Rect, Point, NodeType 14 | ) 15 | 16 | 17 | class BridgeElement(BaseElement): 18 | """Bridge element wrapper""" 19 | 20 | def __init__(self, bridge: 'BridgeWebPage', **kwargs): 21 | self._bridge = bridge 22 | super().__init__(**kwargs) 23 | 24 | async def tap(self) -> None: 25 | """Click this element""" 26 | try: 27 | await self._bridge.send_command({ 28 | "action": "click", 29 | "target": {"id": self.id} 30 | }) 31 | except Exception as e: 32 | logger.error(f"Failed to click element: {e}") 33 | raise 34 | 35 | async def input_text(self, text: str) -> None: 36 | """Input text to this element""" 37 | try: 38 | await self._bridge.send_command({ 39 | "action": "input", 40 | "target": {"id": self.id}, 41 | "text": text 42 | }) 43 | except Exception as e: 44 | logger.error(f"Failed to input text: {e}") 45 | raise 46 | 47 | 48 | class BridgeWebPage(AbstractInterface): 49 | """Bridge mode page interface for Chrome extension communication""" 50 | 51 | def __init__(self, websocket_url: str = "ws://localhost:8765"): 52 | """Initialize bridge connection 53 | 54 | Args: 55 | websocket_url: WebSocket server URL 56 | """ 57 | self.websocket_url = websocket_url 58 | self.websocket: Optional[websockets.WebSocketServerProtocol] = None 59 | self._command_id = 0 60 | self._response_futures: Dict[int, asyncio.Future] = {} 61 | 62 | @classmethod 63 | async def create( 64 | cls, 65 | websocket_url: str = "ws://localhost:8765", 66 | wait_for_connection: bool = True 67 | ) -> 'BridgeWebPage': 68 | """Create bridge connection 69 | 70 | Args: 71 | websocket_url: WebSocket server URL 72 | wait_for_connection: Wait for extension to connect 73 | 74 | Returns: 75 | BridgeWebPage instance 76 | """ 77 | bridge = cls(websocket_url) 78 | 79 | if wait_for_connection: 80 | await bridge.connect() 81 | 82 | return bridge 83 | 84 | async def connect(self, timeout: float = 30.0) -> None: 85 | """Connect to Chrome extension""" 86 | try: 87 | logger.info(f"Waiting for extension connection on {self.websocket_url}") 88 | 89 | # Start WebSocket server and wait for extension to connect 90 | server = await 
websockets.serve( 91 | self._handle_connection, 92 | "localhost", 93 | 8765 94 | ) 95 | 96 | # Wait for connection with timeout 97 | start_time = asyncio.get_event_loop().time() 98 | while not self.websocket and (asyncio.get_event_loop().time() - start_time) < timeout: 99 | await asyncio.sleep(0.1) 100 | 101 | if not self.websocket: 102 | raise TimeoutError("Extension connection timeout") 103 | 104 | logger.info("Extension connected successfully") 105 | 106 | except Exception as e: 107 | logger.error(f"Failed to connect to extension: {e}") 108 | raise 109 | 110 | async def _handle_connection(self, websocket, path): 111 | """Handle WebSocket connection from extension""" 112 | self.websocket = websocket 113 | logger.info("Extension connected") 114 | 115 | try: 116 | async for message in websocket: 117 | await self._handle_message(message) 118 | except websockets.exceptions.ConnectionClosed: 119 | logger.info("Extension disconnected") 120 | self.websocket = None 121 | except Exception as e: 122 | logger.error(f"WebSocket error: {e}") 123 | 124 | async def _handle_message(self, message: str) -> None: 125 | """Handle message from extension""" 126 | try: 127 | data = json.loads(message) 128 | 129 | if "id" in data and data["id"] in self._response_futures: 130 | # Response to command 131 | future = self._response_futures.pop(data["id"]) 132 | if not future.done(): 133 | future.set_result(data) 134 | else: 135 | # Unsolicited message from extension 136 | logger.debug(f"Received message: {data}") 137 | 138 | except Exception as e: 139 | logger.error(f"Failed to handle message: {e}") 140 | 141 | async def send_command(self, command: Dict[str, Any]) -> Dict[str, Any]: 142 | """Send command to extension and wait for response""" 143 | if not self.websocket: 144 | raise RuntimeError("Extension not connected") 145 | 146 | command_id = self._command_id 147 | self._command_id += 1 148 | 149 | command["id"] = command_id 150 | 151 | # Create future for response 152 | future = asyncio.Future() 153 | self._response_futures[command_id] = future 154 | 155 | try: 156 | # Send command 157 | await self.websocket.send(json.dumps(command)) 158 | 159 | # Wait for response 160 | response = await asyncio.wait_for(future, timeout=30.0) 161 | 162 | if response.get("error"): 163 | raise Exception(f"Command failed: {response['error']}") 164 | 165 | return response 166 | 167 | except asyncio.TimeoutError: 168 | self._response_futures.pop(command_id, None) 169 | raise TimeoutError("Command timeout") 170 | except Exception as e: 171 | self._response_futures.pop(command_id, None) 172 | raise 173 | 174 | @property 175 | def interface_type(self) -> InterfaceType: 176 | """Get interface type""" 177 | return InterfaceType.WEB 178 | 179 | async def get_context(self) -> UIContext: 180 | """Get current UI context""" 181 | try: 182 | response = await self.send_command({"action": "getContext"}) 183 | 184 | # Parse context data 185 | context_data = response["data"] 186 | 187 | # Convert to UIContext 188 | screenshot_base64 = context_data["screenshot"] 189 | size = Size(**context_data["size"]) 190 | 191 | elements = [] 192 | for elem_data in context_data["elements"]: 193 | rect = Rect(**elem_data["rect"]) 194 | node_type = NodeType(elem_data.get("nodeType", "other")) 195 | 196 | element = BridgeElement( 197 | bridge=self, 198 | id=elem_data["id"], 199 | content=elem_data["content"], 200 | rect=rect, 201 | center=tuple(elem_data["center"]), 202 | node_type=node_type, 203 | attributes=elem_data.get("attributes", {}), 204 | 
is_visible=elem_data.get("isVisible", True) 205 | ) 206 | elements.append(element) 207 | 208 | # Build tree 209 | tree_data = context_data.get("tree", {}) 210 | tree = self._build_tree_from_data(tree_data) 211 | 212 | return UIContext( 213 | screenshot_base64=screenshot_base64, 214 | size=size, 215 | content=elements, 216 | tree=tree 217 | ) 218 | 219 | except Exception as e: 220 | logger.error(f"Failed to get context: {e}") 221 | raise 222 | 223 | async def action_space(self) -> List[str]: 224 | """Get available actions""" 225 | return [ 226 | "tap", "click", "double_click", "right_click", 227 | "input", "type", "clear", 228 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right", 229 | "hover", "drag", "key_press", "navigate" 230 | ] 231 | 232 | async def tap(self, x: float, y: float) -> None: 233 | """Tap at coordinates""" 234 | try: 235 | await self.send_command({ 236 | "action": "tap", 237 | "coordinates": {"x": x, "y": y} 238 | }) 239 | except Exception as e: 240 | logger.error(f"Failed to tap at ({x}, {y}): {e}") 241 | raise 242 | 243 | async def input_text(self, text: str) -> None: 244 | """Input text to focused element""" 245 | try: 246 | await self.send_command({ 247 | "action": "inputText", 248 | "text": text 249 | }) 250 | except Exception as e: 251 | logger.error(f"Failed to input text: {e}") 252 | raise 253 | 254 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 255 | """Scroll in direction""" 256 | try: 257 | await self.send_command({ 258 | "action": "scroll", 259 | "direction": direction, 260 | "distance": distance or 500 261 | }) 262 | except Exception as e: 263 | logger.error(f"Failed to scroll {direction}: {e}") 264 | raise 265 | 266 | async def navigate_to(self, url: str) -> None: 267 | """Navigate to URL""" 268 | try: 269 | await self.send_command({ 270 | "action": "navigate", 271 | "url": url 272 | }) 273 | except Exception as e: 274 | logger.error(f"Failed to navigate to {url}: {e}") 275 | raise 276 | 277 | def _build_tree_from_data(self, tree_data: Dict[str, Any]) -> UITree: 278 | """Build UITree from extension data""" 279 | if not tree_data: 280 | # Return minimal tree 281 | root_node = UINode( 282 | id="root", 283 | content="", 284 | rect=Rect(left=0, top=0, width=1920, height=1080), 285 | center=(960, 540), 286 | node_type=NodeType.CONTAINER, 287 | attributes={}, 288 | is_visible=True, 289 | children=[] 290 | ) 291 | return UITree(node=root_node, children=[]) 292 | 293 | # Convert tree data to UINode 294 | node_data = tree_data["node"] 295 | node = UINode( 296 | id=node_data["id"], 297 | content=node_data["content"], 298 | rect=Rect(**node_data["rect"]), 299 | center=tuple(node_data["center"]), 300 | node_type=NodeType(node_data.get("nodeType", "other")), 301 | attributes=node_data.get("attributes", {}), 302 | is_visible=node_data.get("isVisible", True), 303 | children=[] 304 | ) 305 | 306 | # Build children recursively 307 | children = [] 308 | for child_data in tree_data.get("children", []): 309 | child_tree = self._build_tree_from_data(child_data) 310 | children.append(child_tree) 311 | 312 | return UITree(node=node, children=children) 313 | 314 | async def close(self) -> None: 315 | """Close bridge connection""" 316 | if self.websocket: 317 | await self.websocket.close() 318 | self.websocket = None -------------------------------------------------------------------------------- /midscene/core/ai_model/providers.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI 
Model Providers - Implementations for different AI services 3 | """ 4 | 5 | import json 6 | from typing import Any, Dict, List, Optional, Type 7 | 8 | import httpx 9 | from loguru import logger 10 | from pydantic import BaseModel 11 | 12 | from .service import AIProvider, AIModelConfig, parse_json_response, create_usage_info 13 | 14 | 15 | class OpenAIProvider(AIProvider): 16 | """OpenAI API provider""" 17 | 18 | async def call( 19 | self, 20 | messages: List[Dict[str, Any]], 21 | config: AIModelConfig, 22 | response_schema: Optional[Type[BaseModel]] = None, 23 | **kwargs 24 | ) -> Dict[str, Any]: 25 | """Call OpenAI API""" 26 | headers = { 27 | "Authorization": f"Bearer {config.api_key}", 28 | "Content-Type": "application/json" 29 | } 30 | 31 | payload = { 32 | "model": config.model, 33 | "messages": messages, 34 | "max_tokens": config.max_tokens, 35 | "temperature": config.temperature 36 | } 37 | 38 | # Support structured output for compatible models 39 | if response_schema and "gpt-4" in config.model: 40 | payload["response_format"] = { 41 | "type": "json_schema", 42 | "json_schema": { 43 | "name": response_schema.__name__, 44 | "schema": response_schema.model_json_schema() 45 | } 46 | } 47 | 48 | base_url = config.base_url or "https://api.openai.com" 49 | url = f"{base_url}/v1/chat/completions" 50 | 51 | async with httpx.AsyncClient(timeout=config.timeout) as client: 52 | response = await client.post(url, headers=headers, json=payload) 53 | response.raise_for_status() 54 | 55 | result = response.json() 56 | content = result['choices'][0]['message']['content'] 57 | 58 | if response_schema: 59 | try: 60 | parsed = parse_json_response(content) 61 | validated = response_schema(**parsed) 62 | return { 63 | "content": validated.model_dump(), 64 | "usage": create_usage_info(result.get('usage', {})) 65 | } 66 | except Exception as e: 67 | logger.warning(f"Failed to parse structured response: {e}") 68 | return { 69 | "content": {"error": str(e), "raw_content": content}, 70 | "usage": create_usage_info(result.get('usage', {})) 71 | } 72 | 73 | return { 74 | "content": content, 75 | "usage": create_usage_info(result.get('usage', {})) 76 | } 77 | 78 | 79 | class AnthropicProvider(AIProvider): 80 | """Anthropic Claude API provider""" 81 | 82 | async def call( 83 | self, 84 | messages: List[Dict[str, Any]], 85 | config: AIModelConfig, 86 | response_schema: Optional[Type[BaseModel]] = None, 87 | **kwargs 88 | ) -> Dict[str, Any]: 89 | """Call Anthropic API""" 90 | headers = { 91 | "x-api-key": config.api_key, 92 | "Content-Type": "application/json", 93 | "anthropic-version": "2023-06-01" 94 | } 95 | 96 | # Convert messages format for Anthropic 97 | system_message = "" 98 | anthropic_messages = [] 99 | 100 | for msg in messages: 101 | if msg["role"] == "system": 102 | system_message = msg["content"] 103 | else: 104 | anthropic_messages.append(msg) 105 | 106 | payload = { 107 | "model": config.model, 108 | "max_tokens": config.max_tokens, 109 | "temperature": config.temperature, 110 | "messages": anthropic_messages 111 | } 112 | 113 | if system_message: 114 | payload["system"] = system_message 115 | 116 | base_url = config.base_url or "https://api.anthropic.com" 117 | url = f"{base_url}/v1/messages" 118 | 119 | async with httpx.AsyncClient(timeout=config.timeout) as client: 120 | response = await client.post(url, headers=headers, json=payload) 121 | response.raise_for_status() 122 | 123 | result = response.json() 124 | content = result['content'][0]['text'] 125 | 126 | if response_schema: 127 | try: 
128 | parsed = parse_json_response(content) 129 | validated = response_schema(**parsed) 130 | return { 131 | "content": validated.model_dump(), 132 | "usage": create_usage_info(result.get('usage', {})) 133 | } 134 | except Exception as e: 135 | logger.warning(f"Failed to parse structured response: {e}") 136 | return { 137 | "content": {"error": str(e), "raw_content": content}, 138 | "usage": create_usage_info(result.get('usage', {})) 139 | } 140 | 141 | return { 142 | "content": content, 143 | "usage": create_usage_info(result.get('usage', {})) 144 | } 145 | 146 | 147 | class QwenProvider(AIProvider): 148 | """Alibaba Qwen API provider""" 149 | 150 | async def call( 151 | self, 152 | messages: List[Dict[str, Any]], 153 | config: AIModelConfig, 154 | response_schema: Optional[Type[BaseModel]] = None, 155 | **kwargs 156 | ) -> Dict[str, Any]: 157 | """Call Qwen API""" 158 | try: 159 | import dashscope 160 | except ImportError: 161 | raise ImportError("dashscope is required for Qwen provider. Install with: pip install dashscope") 162 | 163 | dashscope.api_key = config.api_key 164 | 165 | # Convert messages for Qwen 166 | qwen_messages = [] 167 | for msg in messages: 168 | qwen_messages.append({ 169 | "role": msg["role"], 170 | "content": msg["content"] 171 | }) 172 | 173 | response = await dashscope.Generation.acall( 174 | model=config.model, 175 | messages=qwen_messages, 176 | max_tokens=config.max_tokens, 177 | temperature=config.temperature, 178 | result_format='message' 179 | ) 180 | 181 | if response.status_code == 200: 182 | content = response.output.choices[0]['message']['content'] 183 | 184 | if response_schema: 185 | try: 186 | parsed = parse_json_response(content) 187 | validated = response_schema(**parsed) 188 | return { 189 | "content": validated.model_dump(), 190 | "usage": create_usage_info(response.usage) 191 | } 192 | except Exception as e: 193 | logger.warning(f"Failed to parse structured response: {e}") 194 | return { 195 | "content": {"error": str(e), "raw_content": content}, 196 | "usage": create_usage_info(response.usage) 197 | } 198 | 199 | return { 200 | "content": content, 201 | "usage": create_usage_info(response.usage) 202 | } 203 | else: 204 | raise Exception(f"Qwen API error: {response.message}") 205 | 206 | 207 | class GeminiProvider(AIProvider): 208 | """Google Gemini API provider""" 209 | 210 | async def call( 211 | self, 212 | messages: List[Dict[str, Any]], 213 | config: AIModelConfig, 214 | response_schema: Optional[Type[BaseModel]] = None, 215 | **kwargs 216 | ) -> Dict[str, Any]: 217 | """Call Gemini API""" 218 | try: 219 | import google.generativeai as genai 220 | except ImportError: 221 | raise ImportError("google-generativeai is required for Gemini provider. 
Install with: pip install google-generativeai") 222 | 223 | genai.configure(api_key=config.api_key) 224 | model = genai.GenerativeModel(config.model) 225 | 226 | # Convert messages format for Gemini 227 | gemini_messages = [] 228 | for msg in messages: 229 | if msg["role"] == "system": 230 | # Gemini doesn't have system role, prepend to first user message 231 | continue 232 | elif msg["role"] == "user": 233 | if isinstance(msg["content"], list): 234 | # Handle multimodal content 235 | parts = [] 236 | for part in msg["content"]: 237 | if part["type"] == "text": 238 | parts.append(part["text"]) 239 | elif part["type"] == "image_url": 240 | # Convert base64 image to Gemini format 241 | import base64 242 | import io 243 | from PIL import Image 244 | 245 | image_data = part["image_url"]["url"] 246 | if image_data.startswith("data:image"): 247 | image_data = image_data.split(",")[1] 248 | 249 | image_bytes = base64.b64decode(image_data) 250 | image = Image.open(io.BytesIO(image_bytes)) 251 | parts.append(image) 252 | 253 | gemini_messages.append({"role": "user", "parts": parts}) 254 | else: 255 | gemini_messages.append({"role": "user", "parts": [msg["content"]]}) 256 | elif msg["role"] == "assistant": 257 | gemini_messages.append({"role": "model", "parts": [msg["content"]]}) 258 | 259 | generation_config = genai.types.GenerationConfig( 260 | max_output_tokens=config.max_tokens, 261 | temperature=config.temperature 262 | ) 263 | 264 | response = await model.generate_content_async( 265 | gemini_messages, 266 | generation_config=generation_config 267 | ) 268 | 269 | content = response.text 270 | 271 | if response_schema: 272 | try: 273 | parsed = parse_json_response(content) 274 | validated = response_schema(**parsed) 275 | return { 276 | "content": validated.model_dump(), 277 | "usage": create_usage_info({ 278 | "prompt_tokens": response.usage_metadata.prompt_token_count, 279 | "completion_tokens": response.usage_metadata.candidates_token_count, 280 | "total_tokens": response.usage_metadata.total_token_count 281 | }) 282 | } 283 | except Exception as e: 284 | logger.warning(f"Failed to parse structured response: {e}") 285 | return { 286 | "content": {"error": str(e), "raw_content": content}, 287 | "usage": create_usage_info({ 288 | "prompt_tokens": response.usage_metadata.prompt_token_count, 289 | "completion_tokens": response.usage_metadata.candidates_token_count, 290 | "total_tokens": response.usage_metadata.total_token_count 291 | }) 292 | } 293 | 294 | return { 295 | "content": content, 296 | "usage": create_usage_info({ 297 | "prompt_tokens": response.usage_metadata.prompt_token_count, 298 | "completion_tokens": response.usage_metadata.candidates_token_count, 299 | "total_tokens": response.usage_metadata.total_token_count 300 | }) 301 | } -------------------------------------------------------------------------------- /wiki/核心概念/Insight-UI理解引擎.md: -------------------------------------------------------------------------------- 1 | # Insight UI理解引擎 2 | 3 | Insight 是 Midscene Python 的 AI 驱动的 UI 理解引擎,负责页面分析、元素定位和操作决策。它是连接 AI 模型与实际操作的核心组件。 4 | 5 | ## 🧠 设计理念 6 | 7 | ### AI 驱动的视觉理解 8 | Insight 利用先进的视觉语言模型(VLM)来理解页面内容: 9 | 10 | ```python 11 | # Insight 不依赖传统的选择器 12 | # 而是通过 AI 视觉理解来定位元素 13 | element = await insight.locate("蓝色的登录按钮") 14 | element = await insight.locate("位于页面右上角的搜索图标") 15 | ``` 16 | 17 | ### 上下文感知决策 18 | Insight 结合页面状态、用户意图和历史操作来做出智能决策: 19 | 20 | ```python 21 | # 同样的描述在不同上下文下可能指向不同元素 22 | await insight.locate("确定按钮") # 对话框中的确定按钮 23 | await insight.locate("确定按钮") # 表单中的确定按钮 24 | ``` 
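
把这一点落到调用层面:当以动态函数作为上下文提供者时(见下文"上下文管理"一节),每次定位都会基于最新的页面截图进行推理,因此同一段描述在不同页面状态下会命中不同的元素。下面是一个简单示意,触发对话框的业务操作由调用方完成,此处省略:

```python
# 仅为示意:页面状态不同,同一描述会被解析到不同元素
confirm_in_form = await insight.locate("确定按钮")    # 页面上只有表单时 → 表单中的确定按钮

# …… 某个业务操作触发了确认对话框(此处省略) ……

confirm_in_dialog = await insight.locate("确定按钮")  # 对话框出现后 → 对话框中的确定按钮
```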
25 | 26 | ## 🏗️ 架构设计 27 | 28 | ### 核心组件 29 | 30 | ```mermaid 31 | graph TB 32 | A[Insight Engine] --> B[Context Provider] 33 | A --> C[AI Model Service] 34 | A --> D[Response Processor] 35 | A --> E[Dump Subscribers] 36 | 37 | B --> F[UI Context] 38 | C --> G[Multi-Model Support] 39 | D --> H[Element Processing] 40 | D --> I[Data Validation] 41 | 42 | subgraph "输入处理" 43 | B 44 | F 45 | end 46 | 47 | subgraph "AI 推理" 48 | C 49 | G 50 | end 51 | 52 | subgraph "结果处理" 53 | D 54 | H 55 | I 56 | end 57 | ``` 58 | 59 | ### Insight 类结构 60 | 61 | ```python 62 | class Insight: 63 | """AI-powered UI understanding and reasoning engine""" 64 | 65 | def __init__( 66 | self, 67 | context_provider: Union[UIContext, Callable], 68 | ai_service: Optional[AIModelService] = None, 69 | model_config: Optional[AIModelConfig] = None 70 | ): 71 | self.context_provider = context_provider # 上下文提供者 72 | self.ai_service = ai_service # AI 模型服务 73 | self.model_config = model_config # 模型配置 74 | self._dump_subscribers = [] # 调试订阅者 75 | ``` 76 | 77 | ## 🎯 核心功能 78 | 79 | ### 1. 智能元素定位 (locate) 80 | 81 | Insight 的核心能力是通过自然语言精确定位页面元素: 82 | 83 | ```python 84 | # 基础定位 85 | login_btn = await insight.locate("登录按钮") 86 | search_box = await insight.locate("搜索输入框") 87 | 88 | # 描述性定位 89 | submit_btn = await insight.locate("绿色的提交按钮") 90 | close_icon = await insight.locate("模态对话框右上角的关闭图标") 91 | 92 | # 相对定位 93 | next_page = await insight.locate("分页器中的下一页按钮") 94 | first_item = await insight.locate("列表中的第一个商品") 95 | 96 | # 条件定位 97 | error_msg = await insight.locate("如果存在错误信息的提示框") 98 | ``` 99 | 100 | #### 定位策略 101 | 102 | Insight 使用多层次的定位策略: 103 | 104 | 1. **视觉识别**: 分析截图中的视觉元素 105 | 2. **语义理解**: 理解元素的功能和语义 106 | 3. **布局分析**: 考虑元素的位置关系 107 | 4. **上下文感知**: 结合页面状态和操作历史 108 | 109 | ```python 110 | class LocateResponse(BaseModel): 111 | """AI locate response schema""" 112 | elements: List[Dict[str, Any]] # 找到的元素列表 113 | reasoning: str # AI 推理过程 114 | confidence: float # 置信度 115 | errors: List[str] = [] # 错误信息 116 | ``` 117 | 118 | #### 定位选项 119 | 120 | ```python 121 | from midscene.core.types import LocateOption 122 | 123 | options = LocateOption( 124 | multiple=True, # 查找多个匹配的元素 125 | timeout=10, # 定位超时时间 126 | wait_for_visible=True, # 等待元素可见 127 | confidence_threshold=0.8 # 最小置信度阈值 128 | ) 129 | 130 | elements = await insight.locate("商品卡片", options) 131 | ``` 132 | 133 | ### 2. 数据提取 (extract) 134 | 135 | 从页面提取结构化数据: 136 | 137 | ```python 138 | # 简单数据提取 139 | user_info = await insight.extract({ 140 | "name": "用户姓名", 141 | "email": "邮箱地址", 142 | "role": "用户角色" 143 | }) 144 | 145 | # 复杂列表数据 146 | products = await insight.extract({ 147 | "products": [ 148 | { 149 | "name": "商品名称", 150 | "price": "价格", 151 | "rating": "评分", 152 | "description": "商品描述", 153 | "in_stock": "是否有库存" 154 | } 155 | ] 156 | }) 157 | 158 | # 嵌套结构数据 159 | page_data = await insight.extract({ 160 | "header": { 161 | "title": "页面标题", 162 | "user": "当前用户名" 163 | }, 164 | "content": { 165 | "articles": [ 166 | { 167 | "title": "文章标题", 168 | "author": "作者", 169 | "date": "发布日期" 170 | } 171 | ] 172 | }, 173 | "footer": { 174 | "copyright": "版权信息" 175 | } 176 | }) 177 | ``` 178 | 179 | #### 提取选项 180 | 181 | ```python 182 | from midscene.core.types import ExtractOption 183 | 184 | options = ExtractOption( 185 | return_thought=True, # 返回 AI 的思考过程 186 | schema_validation=True, # 启用数据结构验证 187 | timeout=30 # 提取超时时间 188 | ) 189 | 190 | result = await insight.extract(schema, options) 191 | print(result["thought"]) # AI 的推理过程 192 | print(result["data"]) # 提取的数据 193 | ``` 194 | 195 | ### 3. 
智能断言 (assert_condition) 196 | 197 | 验证页面状态和条件: 198 | 199 | ```python 200 | # 状态断言 201 | result = await insight.assert_condition("用户已成功登录") 202 | assert result.passed, result.message 203 | 204 | # 内容断言 205 | result = await insight.assert_condition("页面显示了 5 个搜索结果") 206 | assert result.passed 207 | 208 | # 复杂条件断言 209 | result = await insight.assert_condition( 210 | "如果是新用户,页面应该显示欢迎指引" 211 | ) 212 | 213 | # 否定断言 214 | result = await insight.assert_condition("页面没有显示错误信息") 215 | ``` 216 | 217 | #### 断言结果 218 | 219 | ```python 220 | class AssertResult: 221 | passed: bool # 断言是否通过 222 | reasoning: str # AI 推理过程 223 | confidence: float # 置信度 224 | message: str # 详细消息 225 | ``` 226 | 227 | ## 🔧 上下文管理 228 | 229 | ### 上下文提供者 230 | 231 | Insight 通过上下文提供者获取页面信息: 232 | 233 | ```python 234 | # 静态上下文 235 | context = UIContext( 236 | screenshot_base64="...", 237 | page_title="登录页面", 238 | url="https://example.com/login" 239 | ) 240 | insight = Insight(context) 241 | 242 | # 动态上下文 243 | async def get_context(action: InsightAction) -> UIContext: 244 | # 根据操作类型获取不同的上下文信息 245 | if action == InsightAction.LOCATE: 246 | return await page.get_locate_context() 247 | elif action == InsightAction.EXTRACT: 248 | return await page.get_extract_context() 249 | else: 250 | return await page.get_default_context() 251 | 252 | insight = Insight(get_context) 253 | ``` 254 | 255 | ### 上下文类型 256 | 257 | ```python 258 | class UIContext(BaseModel): 259 | """UI context information""" 260 | screenshot_base64: str # 页面截图(Base64 编码) 261 | page_title: str # 页面标题 262 | url: str # 页面 URL 263 | viewport_size: tuple # 视口大小 264 | device_pixel_ratio: float # 设备像素比 265 | elements: List[BaseElement] # 页面元素信息 266 | timestamp: float # 时间戳 267 | ``` 268 | 269 | ## 🎨 AI 消息构建 270 | 271 | ### 定位消息 272 | 273 | Insight 为不同操作构建专门的 AI 消息: 274 | 275 | ```python 276 | def _build_locate_messages( 277 | self, 278 | prompt: str, 279 | context: UIContext, 280 | options: LocateOption 281 | ) -> List[Dict]: 282 | """构建元素定位的 AI 消息""" 283 | return [ 284 | { 285 | "role": "system", 286 | "content": self._get_locate_system_prompt() 287 | }, 288 | { 289 | "role": "user", 290 | "content": [ 291 | { 292 | "type": "text", 293 | "text": f"请在页面中定位:{prompt}" 294 | }, 295 | { 296 | "type": "image_url", 297 | "image_url": { 298 | "url": f"data:image/png;base64,{context.screenshot_base64}" 299 | } 300 | } 301 | ] 302 | } 303 | ] 304 | ``` 305 | 306 | ### 系统提示词 307 | 308 | ```python 309 | def _get_locate_system_prompt(self) -> str: 310 | """获取元素定位的系统提示词""" 311 | return """ 312 | 你是一个专业的UI元素定位专家。请分析页面截图,根据用户描述精确定位目标元素。 313 | 314 | 定位原则: 315 | 1. 优先考虑功能语义而非视觉外观 316 | 2. 结合上下文理解元素关系 317 | 3. 对于模糊描述,选择最可能的候选元素 318 | 4. 
提供详细的定位推理过程 319 | 320 | 返回格式: 321 | { 322 | "elements": [ 323 | { 324 | "rect": {"x": 0, "y": 0, "width": 100, "height": 30}, 325 | "text": "元素文本", 326 | "tag": "元素标签", 327 | "attributes": {"id": "...", "class": "..."}, 328 | "confidence": 0.95 329 | } 330 | ], 331 | "reasoning": "定位推理过程", 332 | "confidence": 0.9 333 | } 334 | """.strip() 335 | ``` 336 | 337 | ## 📊 响应处理 338 | 339 | ### 元素处理 340 | 341 | ```python 342 | def _process_locate_response( 343 | self, 344 | response: LocateResponse, 345 | context: UIContext 346 | ) -> Optional[BaseElement]: 347 | """处理定位响应,返回最佳匹配元素""" 348 | 349 | if not response.elements: 350 | return None 351 | 352 | # 选择置信度最高的元素 353 | best_element = max( 354 | response.elements, 355 | key=lambda e: e.get("confidence", 0) 356 | ) 357 | 358 | # 创建元素对象 359 | element = BaseElement( 360 | rect=best_element["rect"], 361 | text=best_element.get("text", ""), 362 | tag_name=best_element.get("tag", ""), 363 | attributes=best_element.get("attributes", {}) 364 | ) 365 | 366 | return element 367 | ``` 368 | 369 | ### 数据验证 370 | 371 | ```python 372 | def _validate_extract_response( 373 | self, 374 | response: ExtractResponse, 375 | schema: Dict 376 | ) -> bool: 377 | """验证提取数据的结构是否符合预期""" 378 | 379 | try: 380 | # 使用 Pydantic 进行结构验证 381 | from pydantic import create_model 382 | 383 | # 动态创建验证模型 384 | validator = create_model("ExtractValidator", **schema) 385 | validator(**response.data) 386 | 387 | return True 388 | except Exception as e: 389 | logger.warning(f"Data validation failed: {e}") 390 | return False 391 | ``` 392 | 393 | ## 🔍 调试和监控 394 | 395 | ### 调试订阅者 396 | 397 | Insight 支持调试订阅者来监控执行过程: 398 | 399 | ```python 400 | async def debug_subscriber(dump_data: Dict): 401 | """调试订阅者函数""" 402 | operation = dump_data["type"] 403 | prompt = dump_data.get("prompt", "") 404 | 405 | print(f"🔍 操作: {operation}") 406 | print(f"📝 提示: {prompt}") 407 | 408 | if "error" in dump_data: 409 | print(f"❌ 错误: {dump_data['error']}") 410 | else: 411 | print(f"✅ 成功") 412 | 413 | # 保存调试信息到文件 414 | with open(f"debug_{operation}.json", "w") as f: 415 | json.dump(dump_data, f, indent=2) 416 | 417 | # 注册调试订阅者 418 | insight.subscribe_to_dump(debug_subscriber) 419 | ``` 420 | 421 | ### 执行统计 422 | 423 | ```python 424 | class InsightMetrics: 425 | """Insight 执行统计""" 426 | 427 | def __init__(self): 428 | self.operation_count = 0 429 | self.total_time = 0 430 | self.success_count = 0 431 | self.ai_tokens_used = 0 432 | 433 | def record_operation(self, operation: str, duration: float, success: bool, tokens: int): 434 | self.operation_count += 1 435 | self.total_time += duration 436 | if success: 437 | self.success_count += 1 438 | self.ai_tokens_used += tokens 439 | 440 | @property 441 | def success_rate(self) -> float: 442 | return self.success_count / self.operation_count if self.operation_count > 0 else 0 443 | 444 | @property 445 | def avg_time(self) -> float: 446 | return self.total_time / self.operation_count if self.operation_count > 0 else 0 447 | 448 | # 使用统计 449 | metrics = InsightMetrics() 450 | insight.set_metrics_collector(metrics) 451 | ``` 452 | 453 | ## ⚙️ 高级配置 454 | 455 | ### 模型配置 456 | 457 | ```python 458 | from midscene.core.ai_model import AIModelConfig 459 | 460 | # 针对不同操作使用不同配置 461 | locate_config = AIModelConfig( 462 | provider="openai", 463 | model="gpt-4-vision-preview", 464 | temperature=0.1, # 定位需要更确定性 465 | max_tokens=500 466 | ) 467 | 468 | extract_config = AIModelConfig( 469 | provider="claude", 470 | model="claude-3-sonnet-20240229", 471 | temperature=0.2, # 提取允许更多创造性 472 | 
max_tokens=2000 473 | ) 474 | 475 | # 创建专门的 Insight 实例 476 | locate_insight = Insight(context_provider, model_config=locate_config) 477 | extract_insight = Insight(context_provider, model_config=extract_config) 478 | ``` 479 | 480 | ### 缓存配置 481 | 482 | ```python 483 | # 启用智能缓存 484 | insight.enable_cache( 485 | cache_size=1000, # 缓存条目数 486 | ttl=3600, # 缓存过期时间(秒) 487 | hash_screenshot=True, # 基于截图内容生成缓存键 488 | cache_ai_responses=True # 缓存 AI 响应 489 | ) 490 | 491 | # 缓存策略配置 492 | insight.set_cache_strategy( 493 | locate_cache_enabled=True, # 定位操作缓存 494 | extract_cache_enabled=True, # 提取操作缓存 495 | assert_cache_enabled=False # 断言操作不缓存(实时性要求高) 496 | ) 497 | ``` 498 | 499 | ## 🚀 性能优化 500 | 501 | ### 批量操作 502 | 503 | ```python 504 | # 批量定位多个元素 505 | elements = await insight.batch_locate([ 506 | "登录按钮", 507 | "注册链接", 508 | "忘记密码链接" 509 | ]) 510 | 511 | # 批量提取多个数据块 512 | data_blocks = await insight.batch_extract([ 513 | {"user_info": {"name": "姓名", "email": "邮箱"}}, 514 | {"product_list": [{"name": "商品名", "price": "价格"}]}, 515 | {"navigation": {"items": ["导航项目"]}} 516 | ]) 517 | ``` 518 | 519 | ### 并发控制 520 | 521 | ```python 522 | # 设置并发限制 523 | insight.set_concurrency_limit(3) 524 | 525 | # 异步并发执行 526 | import asyncio 527 | 528 | async def parallel_operations(): 529 | tasks = [ 530 | insight.locate("按钮1"), 531 | insight.locate("按钮2"), 532 | insight.extract(schema1), 533 | insight.extract(schema2) 534 | ] 535 | 536 | results = await asyncio.gather(*tasks, return_exceptions=True) 537 | return results 538 | ``` 539 | 540 | ## 🎯 最佳实践 541 | 542 | ### 1. 清晰的描述 543 | ```python 544 | # ❌ 模糊描述 545 | await insight.locate("按钮") 546 | 547 | # ✅ 具体描述 548 | await insight.locate("页面右上角的蓝色登录按钮") 549 | ``` 550 | 551 | ### 2. 合理的置信度阈值 552 | ```python 553 | # 根据场景调整置信度要求 554 | options = LocateOption( 555 | confidence_threshold=0.9 # 高要求场景 556 | ) 557 | element = await insight.locate("重要操作按钮", options) 558 | ``` 559 | 560 | ### 3. 错误处理和重试 561 | ```python 562 | async def robust_locate(prompt: str, max_retries: int = 3): 563 | for attempt in range(max_retries): 564 | try: 565 | result = await insight.locate(prompt) 566 | if result.element: 567 | return result 568 | except Exception as e: 569 | if attempt == max_retries - 1: 570 | raise 571 | await asyncio.sleep(1) # 等待后重试 572 | 573 | raise ElementNotFoundError(f"Element not found after {max_retries} attempts") 574 | ``` 575 | 576 | ### 4. 上下文优化 577 | ```python 578 | # 为不同操作提供优化的上下文 579 | async def optimized_context_provider(action: InsightAction) -> UIContext: 580 | base_context = await page.get_context() 581 | 582 | if action == InsightAction.LOCATE: 583 | # 定位操作需要更详细的元素信息 584 | base_context.elements = await page.get_all_elements() 585 | elif action == InsightAction.EXTRACT: 586 | # 提取操作需要更完整的页面内容 587 | base_context.page_content = await page.get_page_content() 588 | 589 | return base_context 590 | ``` 591 | 592 | ## 🔗 相关文档 593 | 594 | - **Agent 集成**: [Agent 核心控制器](Agent核心控制器.md) 595 | - **AI 模型**: [AI模型服务抽象层](AI模型服务抽象层.md) 596 | - **数据类型**: [UI上下文与数据模型](UI上下文与数据模型.md) 597 | - **API 参考**: [Insight API](../API参考/Insight-API.md) 598 | 599 | --- 600 | 601 | Insight 是 Midscene Python 的智能核心,它让 AI 能够真正"看懂"和"理解"用户界面。掌握 Insight 的使用将大大提升你的自动化脚本的智能程度和稳定性! 
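
### 附录:最小组合示例(示意)

作为补充,下面把动态上下文提供者与 `locate` / `extract` 串成一个最小示例。其中 `page` 代表任一实现了 `get_context()` 的平台接口(Playwright / Selenium / Android 均可),`Insight` 的导入路径为假设写法,请以项目实际代码为准:

```python
from midscene.core import Insight   # 导入路径为假设,以实际代码为准


async def main(page) -> None:
    # 每次 AI 调用前都抓取最新页面状态,让定位与提取基于当前截图
    async def context_provider(action):
        return await page.get_context()

    insight = Insight(context_provider)

    login_btn = await insight.locate("页面右上角的蓝色登录按钮")
    user_info = await insight.extract({"name": "用户姓名", "email": "邮箱地址"})
    print(login_btn, user_info)


# 运行方式:asyncio.run(main(page)),page 由具体平台接口创建后传入
```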
-------------------------------------------------------------------------------- /midscene/shared/report.py: -------------------------------------------------------------------------------- 1 | """ 2 | Report generation and visualization 3 | """ 4 | 5 | import json 6 | from datetime import datetime 7 | from pathlib import Path 8 | from typing import Any, Dict, List, Optional 9 | 10 | from jinja2 import Template 11 | from loguru import logger 12 | 13 | 14 | class ExecutionReport: 15 | """Execution report data model""" 16 | 17 | def __init__(self): 18 | self.start_time = datetime.now() 19 | self.end_time: Optional[datetime] = None 20 | self.success = True 21 | self.error: Optional[str] = None 22 | self.tasks: List[Dict[str, Any]] = [] 23 | self.metadata: Dict[str, Any] = {} 24 | self.screenshots: List[str] = [] 25 | self.ai_usage: Dict[str, Any] = {} 26 | 27 | def add_task(self, task_data: Dict[str, Any]) -> None: 28 | """Add task execution data""" 29 | self.tasks.append({ 30 | **task_data, 31 | "timestamp": datetime.now().isoformat() 32 | }) 33 | 34 | def add_screenshot(self, screenshot_base64: str, description: str = "") -> None: 35 | """Add screenshot to report""" 36 | self.screenshots.append({ 37 | "image": screenshot_base64, 38 | "description": description, 39 | "timestamp": datetime.now().isoformat() 40 | }) 41 | 42 | def update_ai_usage(self, usage_data: Dict[str, Any]) -> None: 43 | """Update AI usage statistics""" 44 | for key, value in usage_data.items(): 45 | if key in self.ai_usage: 46 | if isinstance(value, (int, float)): 47 | self.ai_usage[key] += value 48 | else: 49 | self.ai_usage[key] = value 50 | else: 51 | self.ai_usage[key] = value 52 | 53 | def finalize(self, success: bool = True, error: Optional[str] = None) -> None: 54 | """Finalize report""" 55 | self.end_time = datetime.now() 56 | self.success = success 57 | self.error = error 58 | 59 | def to_dict(self) -> Dict[str, Any]: 60 | """Convert to dictionary""" 61 | duration = None 62 | if self.end_time: 63 | duration = (self.end_time - self.start_time).total_seconds() 64 | 65 | return { 66 | "start_time": self.start_time.isoformat(), 67 | "end_time": self.end_time.isoformat() if self.end_time else None, 68 | "duration_seconds": duration, 69 | "success": self.success, 70 | "error": self.error, 71 | "tasks": self.tasks, 72 | "metadata": self.metadata, 73 | "screenshots": self.screenshots, 74 | "ai_usage": self.ai_usage, 75 | "summary": { 76 | "total_tasks": len(self.tasks), 77 | "successful_tasks": len([t for t in self.tasks if t.get("success", True)]), 78 | "failed_tasks": len([t for t in self.tasks if not t.get("success", True)]), 79 | "total_screenshots": len(self.screenshots) 80 | } 81 | } 82 | 83 | 84 | class ReportGenerator: 85 | """Generate execution reports in various formats""" 86 | 87 | def __init__(self, output_dir: str = "./reports"): 88 | """Initialize report generator 89 | 90 | Args: 91 | output_dir: Output directory for reports 92 | """ 93 | self.output_dir = Path(output_dir) 94 | self.output_dir.mkdir(parents=True, exist_ok=True) 95 | 96 | def generate_html_report( 97 | self, 98 | report: ExecutionReport, 99 | template_path: Optional[str] = None 100 | ) -> str: 101 | """Generate HTML report 102 | 103 | Args: 104 | report: Execution report data 105 | template_path: Custom template path 106 | 107 | Returns: 108 | Path to generated HTML file 109 | """ 110 | if template_path: 111 | with open(template_path, 'r', encoding='utf-8') as f: 112 | template_content = f.read() 113 | else: 114 | template_content = 
self._get_default_html_template() 115 | 116 | template = Template(template_content) 117 | 118 | # Generate report 119 | html_content = template.render( 120 | report=report.to_dict(), 121 | generated_at=datetime.now().isoformat() 122 | ) 123 | 124 | # Save to file 125 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 126 | filename = f"midscene_report_{timestamp}.html" 127 | file_path = self.output_dir / filename 128 | 129 | with open(file_path, 'w', encoding='utf-8') as f: 130 | f.write(html_content) 131 | 132 | logger.info(f"HTML report generated: {file_path}") 133 | return str(file_path) 134 | 135 | def generate_json_report(self, report: ExecutionReport) -> str: 136 | """Generate JSON report 137 | 138 | Args: 139 | report: Execution report data 140 | 141 | Returns: 142 | Path to generated JSON file 143 | """ 144 | # Save to file 145 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 146 | filename = f"midscene_report_{timestamp}.json" 147 | file_path = self.output_dir / filename 148 | 149 | with open(file_path, 'w', encoding='utf-8') as f: 150 | json.dump(report.to_dict(), f, ensure_ascii=False, indent=2) 151 | 152 | logger.info(f"JSON report generated: {file_path}") 153 | return str(file_path) 154 | 155 | def _get_default_html_template(self) -> str: 156 | """Get default HTML template""" 157 | return """ 158 | 159 | 160 | 161 | 162 | 163 | Midscene Execution Report 164 | 329 | 330 | 331 |
        <div class="header">
            <h1>🤖 Midscene Execution Report</h1>
            <div class="status">
                {{ '✅ Success' if report.success else '❌ Failed' }}
            </div>
            {% if report.error %}
            <div class="error">
                Error: {{ report.error }}
            </div>
            {% endif %}
        </div>

        <div class="summary">
            <div class="card">
                <h3>Duration</h3>
                <div class="value">
                    {% if report.duration_seconds %}
                    {{ "%.1f"|format(report.duration_seconds) }}s
                    {% else %}
                    -
                    {% endif %}
                </div>
            </div>
            <div class="card">
                <h3>Total Tasks</h3>
                <div class="value">{{ report.summary.total_tasks }}</div>
            </div>
            <div class="card">
                <h3>Successful</h3>
                <div class="value">{{ report.summary.successful_tasks }}</div>
            </div>
            <div class="card">
                <h3>Failed</h3>
                <div class="value">{{ report.summary.failed_tasks }}</div>
            </div>
        </div>

        {% if report.tasks %}
        <div class="section">
            <h2>📋 Task Execution</h2>
            {% for task in report.tasks %}
            <div class="task">
                <div class="task-header">
                    <span class="task-title">{{ task.get('type', 'Task') }}: {{ task.get('description', 'Unknown') }}</span>
                    <span class="task-status">
                        {{ 'Success' if task.get('success', True) else 'Failed' }}
                    </span>
                </div>
                {% if task.get('error') %}
                <div class="task-error">
                    Error: {{ task.error }}
                </div>
                {% endif %}
                {% if task.get('result') %}
                <div class="task-result">
                    Result: {{ task.result }}
                </div>
                {% endif %}
                <div class="task-timestamp">
                    {{ task.timestamp }}
                </div>
            </div>
            {% endfor %}
        </div>
        {% endif %}

        {% if report.ai_usage %}
        <div class="section">
            <h2>🧠 AI Usage Statistics</h2>
            <div class="usage-grid">
                {% for key, value in report.ai_usage.items() %}
                <div class="usage-item">
                    <div class="usage-label">{{ key.replace('_', ' ').title() }}</div>
                    <div class="usage-value">{{ value }}</div>
                </div>
                {% endfor %}
            </div>
        </div>
        {% endif %}

        {% if report.screenshots %}
        <div class="section">
            <h2>📸 Screenshots</h2>
            {% for screenshot in report.screenshots %}
            <div class="screenshot">
                <img src="data:image/png;base64,{{ screenshot.image }}" alt="Screenshot">
                {% if screenshot.description %}
                <div class="screenshot-description">{{ screenshot.description }}</div>
                {% endif %}
            </div>
            {% endfor %}
        </div>
        {% endif %}

        <div class="footer">
            Generated at {{ generated_at }}
        </div>
430 | 431 | 432 | """.strip() 433 | 434 | 435 | def create_report() -> ExecutionReport: 436 | """Create new execution report 437 | 438 | Returns: 439 | ExecutionReport instance 440 | """ 441 | return ExecutionReport() -------------------------------------------------------------------------------- /midscene/web/playwright_page.py: -------------------------------------------------------------------------------- 1 | """ 2 | Playwright integration for Midscene 3 | """ 4 | 5 | import base64 6 | import json 7 | from typing import Any, Dict, List, Optional 8 | 9 | from playwright.async_api import async_playwright, Page, Browser, BrowserContext 10 | from loguru import logger 11 | 12 | from ..core.types import ( 13 | AbstractInterface, InterfaceType, UIContext, BaseElement, UINode, UITree, 14 | Size, Rect, Point, NodeType 15 | ) 16 | 17 | 18 | class PlaywrightElement(BaseElement): 19 | """Playwright element wrapper""" 20 | 21 | def __init__(self, page: Page, selector: str, **kwargs): 22 | self._page = page 23 | self._selector = selector 24 | super().__init__(**kwargs) 25 | 26 | async def tap(self) -> None: 27 | """Click this element""" 28 | try: 29 | await self._page.click(self._selector) 30 | except Exception as e: 31 | logger.error(f"Failed to click element: {e}") 32 | raise 33 | 34 | async def input_text(self, text: str) -> None: 35 | """Input text to this element""" 36 | try: 37 | await self._page.fill(self._selector, text) 38 | except Exception as e: 39 | logger.error(f"Failed to input text: {e}") 40 | raise 41 | 42 | 43 | class PlaywrightWebPage(AbstractInterface): 44 | """Playwright page interface""" 45 | 46 | def __init__(self, page: Page, context: BrowserContext, browser: Browser): 47 | """Initialize with Playwright page 48 | 49 | Args: 50 | page: Playwright page instance 51 | context: Browser context 52 | browser: Browser instance 53 | """ 54 | self.page = page 55 | self.context = context 56 | self.browser = browser 57 | 58 | @classmethod 59 | async def create( 60 | cls, 61 | headless: bool = False, 62 | viewport_size: tuple[int, int] = (1920, 1080), 63 | user_data_dir: Optional[str] = None, 64 | **browser_options 65 | ) -> 'PlaywrightWebPage': 66 | """Create new Playwright page instance 67 | 68 | Args: 69 | headless: Run in headless mode 70 | viewport_size: Browser viewport size 71 | user_data_dir: Browser user data directory 72 | **browser_options: Additional browser options 73 | 74 | Returns: 75 | PlaywrightWebPage instance 76 | """ 77 | playwright = await async_playwright().start() 78 | 79 | launch_options = { 80 | "headless": headless, 81 | **browser_options 82 | } 83 | 84 | if user_data_dir: 85 | launch_options["user_data_dir"] = user_data_dir 86 | 87 | browser = await playwright.chromium.launch(**launch_options) 88 | 89 | context = await browser.new_context( 90 | viewport={"width": viewport_size[0], "height": viewport_size[1]} 91 | ) 92 | 93 | page = await context.new_page() 94 | 95 | return cls(page, context, browser) 96 | 97 | @property 98 | def interface_type(self) -> InterfaceType: 99 | """Get interface type""" 100 | return InterfaceType.WEB 101 | 102 | async def get_context(self) -> UIContext: 103 | """Get current UI context""" 104 | try: 105 | # Take screenshot 106 | screenshot_base64 = await self._take_screenshot() 107 | 108 | # Get page size 109 | size = await self._get_page_size() 110 | 111 | # Extract DOM elements 112 | elements = await self._extract_elements() 113 | 114 | # Build UI tree 115 | tree = await self._build_ui_tree() 116 | 117 | return UIContext( 118 | 
screenshot_base64=screenshot_base64, 119 | size=size, 120 | content=elements, 121 | tree=tree 122 | ) 123 | 124 | except Exception as e: 125 | logger.error(f"Failed to get context: {e}") 126 | raise 127 | 128 | async def action_space(self) -> List[str]: 129 | """Get available actions""" 130 | return [ 131 | "tap", "click", "double_click", "right_click", 132 | "input", "type", "fill", "clear", 133 | "scroll", "scroll_up", "scroll_down", "scroll_left", "scroll_right", 134 | "hover", "drag", "key_press", "navigate", "reload", 135 | "go_back", "go_forward" 136 | ] 137 | 138 | async def tap(self, x: float, y: float) -> None: 139 | """Tap at coordinates""" 140 | try: 141 | await self.page.mouse.click(x, y) 142 | except Exception as e: 143 | logger.error(f"Failed to tap at ({x}, {y}): {e}") 144 | raise 145 | 146 | async def input_text(self, text: str) -> None: 147 | """Input text to focused element""" 148 | try: 149 | await self.page.keyboard.type(text) 150 | except Exception as e: 151 | logger.error(f"Failed to input text: {e}") 152 | raise 153 | 154 | async def scroll(self, direction: str, distance: Optional[int] = None) -> None: 155 | """Scroll in direction""" 156 | try: 157 | distance = distance or 500 158 | 159 | if direction == "down": 160 | await self.page.mouse.wheel(0, distance) 161 | elif direction == "up": 162 | await self.page.mouse.wheel(0, -distance) 163 | elif direction == "right": 164 | await self.page.mouse.wheel(distance, 0) 165 | elif direction == "left": 166 | await self.page.mouse.wheel(-distance, 0) 167 | else: 168 | raise ValueError(f"Invalid scroll direction: {direction}") 169 | 170 | except Exception as e: 171 | logger.error(f"Failed to scroll {direction}: {e}") 172 | raise 173 | 174 | async def navigate_to(self, url: str) -> None: 175 | """Navigate to URL""" 176 | try: 177 | await self.page.goto(url, wait_until="networkidle") 178 | except Exception as e: 179 | logger.error(f"Failed to navigate to {url}: {e}") 180 | raise 181 | 182 | async def wait_for_element( 183 | self, 184 | selector: str, 185 | timeout: float = 10000 186 | ) -> None: 187 | """Wait for element to be present""" 188 | try: 189 | await self.page.wait_for_selector(selector, timeout=timeout) 190 | except Exception as e: 191 | raise TimeoutError(f"Element not found: {selector}") 192 | 193 | async def evaluate_script(self, script: str, *args) -> Any: 194 | """Evaluate JavaScript""" 195 | return await self.page.evaluate(script, *args) 196 | 197 | async def close(self) -> None: 198 | """Close the browser""" 199 | try: 200 | await self.context.close() 201 | await self.browser.close() 202 | except Exception as e: 203 | logger.warning(f"Error closing browser: {e}") 204 | 205 | async def _take_screenshot(self) -> str: 206 | """Take screenshot and return base64 string""" 207 | try: 208 | # Take screenshot as bytes 209 | screenshot_bytes = await self.page.screenshot(type="png") 210 | 211 | # Convert to base64 212 | screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8') 213 | 214 | return screenshot_base64 215 | 216 | except Exception as e: 217 | logger.error(f"Failed to take screenshot: {e}") 218 | raise 219 | 220 | async def _get_page_size(self) -> Size: 221 | """Get page viewport size""" 222 | try: 223 | viewport_size = await self.page.evaluate(""" 224 | () => ({ 225 | width: window.innerWidth, 226 | height: window.innerHeight 227 | }) 228 | """) 229 | 230 | return Size( 231 | width=viewport_size['width'], 232 | height=viewport_size['height'] 233 | ) 234 | 235 | except Exception as e: 236 | 
logger.error(f"Failed to get page size: {e}") 237 | return Size(width=1920, height=1080) 238 | 239 | async def _extract_elements(self) -> List[PlaywrightElement]: 240 | """Extract all visible elements from page""" 241 | try: 242 | # Use JavaScript to extract element information 243 | element_data = await self.page.evaluate(""" 244 | () => { 245 | const elements = []; 246 | const allElements = document.querySelectorAll('*'); 247 | 248 | allElements.forEach((el, index) => { 249 | const rect = el.getBoundingClientRect(); 250 | 251 | // Skip elements that are not visible 252 | if (rect.width === 0 || rect.height === 0 || 253 | rect.top < 0 || rect.left < 0 || 254 | getComputedStyle(el).visibility === 'hidden' || 255 | getComputedStyle(el).display === 'none') { 256 | return; 257 | } 258 | 259 | // Generate a selector for this element 260 | const selector = generateSelector(el); 261 | 262 | elements.push({ 263 | id: `element_${index}`, 264 | selector: selector, 265 | tagName: el.tagName.toLowerCase(), 266 | content: el.textContent?.trim() || el.getAttribute('alt') || el.getAttribute('title') || '', 267 | rect: { 268 | left: rect.left, 269 | top: rect.top, 270 | width: rect.width, 271 | height: rect.height 272 | }, 273 | center: [rect.left + rect.width / 2, rect.top + rect.height / 2], 274 | attributes: { 275 | id: el.id, 276 | className: el.className, 277 | type: el.type, 278 | name: el.name, 279 | href: el.href, 280 | src: el.src, 281 | value: el.value, 282 | placeholder: el.placeholder 283 | } 284 | }); 285 | }); 286 | 287 | function generateSelector(element) { 288 | if (element.id) { 289 | return `#${element.id}`; 290 | } 291 | 292 | let path = element.tagName.toLowerCase(); 293 | let parent = element.parentElement; 294 | 295 | while (parent && parent !== document.body) { 296 | const siblings = Array.from(parent.children); 297 | const index = siblings.indexOf(element) + 1; 298 | path = `${parent.tagName.toLowerCase()}:nth-child(${index}) > ${path}`; 299 | element = parent; 300 | parent = element.parentElement; 301 | } 302 | 303 | return path; 304 | } 305 | 306 | return elements; 307 | } 308 | """) 309 | 310 | elements = [] 311 | for data in element_data: 312 | rect_data = data['rect'] 313 | rect = Rect( 314 | left=rect_data['left'], 315 | top=rect_data['top'], 316 | width=rect_data['width'], 317 | height=rect_data['height'] 318 | ) 319 | 320 | # Determine node type 321 | tag_name = data['tagName'] 322 | node_type = self._get_node_type(tag_name, data['attributes']) 323 | 324 | element = PlaywrightElement( 325 | page=self.page, 326 | selector=data['selector'], 327 | id=data['id'], 328 | content=data['content'], 329 | rect=rect, 330 | center=tuple(data['center']), 331 | node_type=node_type, 332 | attributes=data['attributes'], 333 | is_visible=True 334 | ) 335 | 336 | elements.append(element) 337 | 338 | return elements 339 | 340 | except Exception as e: 341 | logger.error(f"Failed to extract elements: {e}") 342 | return [] 343 | 344 | def _get_node_type(self, tag_name: str, attributes: Dict[str, Any]) -> NodeType: 345 | """Determine node type from tag name and attributes""" 346 | if tag_name in ['input', 'textarea']: 347 | input_type = attributes.get('type', '').lower() 348 | if input_type in ['text', 'password', 'email', 'search', 'url', 'tel']: 349 | return NodeType.INPUT 350 | elif input_type in ['button', 'submit', 'reset']: 351 | return NodeType.BUTTON 352 | elif tag_name in ['button']: 353 | return NodeType.BUTTON 354 | elif tag_name in ['a']: 355 | return NodeType.LINK 356 | elif 
tag_name in ['img']: 357 | return NodeType.IMAGE 358 | elif tag_name in ['div', 'span', 'section', 'article', 'header', 'footer', 'nav']: 359 | return NodeType.CONTAINER 360 | elif tag_name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'td', 'th']: 361 | return NodeType.TEXT 362 | else: 363 | return NodeType.OTHER 364 | 365 | async def _build_ui_tree(self) -> UITree: 366 | """Build UI tree structure""" 367 | try: 368 | # Simplified tree building - just create a root container 369 | # In a full implementation, we would parse the actual DOM tree 370 | root_node = UINode( 371 | id="root", 372 | content="", 373 | rect=Rect(left=0, top=0, width=1920, height=1080), 374 | center=(960, 540), 375 | node_type=NodeType.CONTAINER, 376 | attributes={}, 377 | is_visible=True, 378 | children=[] 379 | ) 380 | 381 | return UITree(node=root_node, children=[]) 382 | 383 | except Exception as e: 384 | logger.error(f"Failed to build UI tree: {e}") 385 | # Return minimal tree 386 | root_node = UINode( 387 | id="root", 388 | content="", 389 | rect=Rect(left=0, top=0, width=1920, height=1080), 390 | center=(960, 540), 391 | node_type=NodeType.CONTAINER, 392 | attributes={}, 393 | is_visible=True, 394 | children=[] 395 | ) 396 | return UITree(node=root_node, children=[]) 397 | 398 | async def __aenter__(self): 399 | return self 400 | 401 | async def __aexit__(self, exc_type, exc_val, exc_tb): 402 | await self.close() --------------------------------------------------------------------------------
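A minimal usage sketch for the `PlaywrightWebPage` interface defined above. The import path follows the file location `/midscene/web/playwright_page.py`, and the URL is a placeholder; this is an illustrative example under those assumptions, not the repository's official example script.

```python
import asyncio

from midscene.web.playwright_page import PlaywrightWebPage  # assumed import path


async def main() -> None:
    # create() launches Chromium and returns a page wrapper; the async
    # context manager closes the browser context via __aexit__.
    async with await PlaywrightWebPage.create(headless=True) as page:
        await page.navigate_to("https://example.com")

        # Capture the current UI state: screenshot, viewport size,
        # extracted visible elements, and the (simplified) UI tree.
        context = await page.get_context()
        print(f"Viewport: {context.size.width}x{context.size.height}")
        print(f"Extracted {len(context.content)} visible elements")


if __name__ == "__main__":
    asyncio.run(main())
```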