├── .python-version ├── src └── android_mcp │ ├── __init__.py │ ├── tree │ ├── __init__.py │ ├── config.py │ ├── utils.py │ ├── views.py │ └── service.py │ ├── mobile │ ├── __init__.py │ ├── views.py │ └── service.py │ └── __main__.py ├── .gitignore ├── pyproject.toml ├── LICENSE ├── CONTRIBUTING.md └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /src/android_mcp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/android_mcp/tree/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/android_mcp/mobile/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | notebook.ipynb -------------------------------------------------------------------------------- /src/android_mcp/tree/config.py: -------------------------------------------------------------------------------- 1 | INTERACTIVE_CLASSES = [ 2 | "android.widget.Button", 3 | "android.widget.ImageButton", 4 | "android.widget.EditText", 5 | "android.widget.CheckBox", 6 | "android.widget.Switch", 7 | "android.widget.RadioButton", 8 | "android.widget.Spinner", 9 | "android.widget.SeekBar", 10 | ] 11 | 12 | -------------------------------------------------------------------------------- /src/android_mcp/mobile/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from android_mcp.tree.views import TreeState 3 | from PIL.Image import Image 4 | from typing import Literal 5 | 6 | @dataclass 7 | class App: 8 | name:str 9 | status:Literal['Maximized','Minimized'] 10 | 11 | @dataclass 12 | class MobileState: 13 | tree_state:TreeState 14 | screenshot:bytes|str|Image|None -------------------------------------------------------------------------------- /src/android_mcp/tree/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def extract_cordinates(node): 4 | attributes = node.attrib 5 | bounds=attributes.get('bounds') 6 | match = re.search(r'\[(\d+),(\d+)]\[(\d+),(\d+)]', bounds) 7 | if match: 8 | x1, y1, x2, y2 = map(int, match.groups()) 9 | return x1, y1, x2, y2 10 | 11 | def get_center_cordinates(cordinates:tuple[int,int,int,int]): 12 | x_center,y_center = (cordinates[0]+cordinates[2])//2,(cordinates[1]+cordinates[3])//2 13 | return x_center,y_center -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "android-mcp" 3 | version = "0.1.0" 4 | description = "Lightweight MCP Server for Android Operating System" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | license = { file = "LICENSE" } 8 | urls = { Homepage = "https://github.com/CursorTouch/Android-MCP" } 9 | keywords = ["android", "mcp", "mobile", "automation"] 10 | dependencies = [ 11 | "fastmcp>=2.14.0", 12 | "ipykernel>=6.30.1", 13 | "pillow>=11.2.1", 14 | "tabulate>=0.9.0", 15 | "uiautomator2>=3.3.1", 16 | ] 17 | 18 | [project.scripts] 19 | android-mcp = "android_mcp.__main__:main" 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | -------------------------------------------------------------------------------- /src/android_mcp/tree/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from tabulate import tabulate 3 | 4 | @dataclass 5 | class ElementNode: 6 | name: str 7 | class_name: str 8 | coordinates: 'CenterCord' 9 | bounding_box: 'BoundingBox' 10 | 11 | @dataclass 12 | class BoundingBox: 13 | x1:int 14 | y1:int 15 | x2:int 16 | y2:int 17 | 18 | def to_string(self): 19 | return f'[{self.x1},{self.y1}][{self.x2},{self.y2}]' 20 | 21 | @dataclass 22 | class TreeState: 23 | interactive_elements:list[ElementNode] 24 | 25 | def to_string(self): 26 | data = [[index, node.name, node.class_name, node.coordinates.to_string()] for index, node in enumerate(self.interactive_elements)] 27 | return tabulate(data, headers=["Label", "Name", "Class", "Coordinates"], tablefmt="plain") 28 | 29 | @dataclass 30 | class CenterCord: 31 | x: int 32 | y: int 33 | 34 | def to_string(self): 35 | return f'({self.x},{self.y})' -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 JEOMON GEORGE 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /src/android_mcp/__main__.py: -------------------------------------------------------------------------------- 1 | from fastmcp import FastMCP 2 | from fastmcp.utilities.types import Image 3 | from contextlib import asynccontextmanager 4 | from argparse import ArgumentParser 5 | from android_mcp.mobile.service import Mobile 6 | from textwrap import dedent 7 | import asyncio 8 | 9 | parser = ArgumentParser() 10 | parser.add_argument('--emulator',action='store_true',help='Use the emulator') 11 | args = parser.parse_args() 12 | 13 | instructions=dedent(''' 14 | Android MCP server provides tools to interact directly with the Android device, 15 | thus enabling to operate the mobile device like an actual USER.''') 16 | 17 | @asynccontextmanager 18 | async def lifespan(app: FastMCP): 19 | """Runs initialization code before the server starts and cleanup code after it shuts down.""" 20 | await asyncio.sleep(1) # Simulate startup latency 21 | yield 22 | 23 | mcp=FastMCP(name="Android-MCP",instructions=instructions) 24 | 25 | mobile=Mobile(device=None if not args.emulator else 'emulator-5554') 26 | device=mobile.get_device() 27 | 28 | @mcp.tool(name='Click-Tool',description='Click on a specific cordinate') 29 | def click_tool(x:int,y:int): 30 | device.click(x,y) 31 | return f'Clicked on ({x},{y})' 32 | 33 | @mcp.tool('State-Tool',description='Get the state of the device. Optionally includes visual screenshot when use_vision=True.') 34 | def state_tool(use_vision:bool=False): 35 | mobile_state=mobile.get_state(use_vision=use_vision,as_bytes=True) 36 | return [mobile_state.tree_state.to_string()]+([Image(data=mobile_state.screenshot,format='PNG')] if use_vision else []) 37 | 38 | @mcp.tool(name='Long-Click-Tool',description='Long click on a specific cordinate') 39 | def long_click_tool(x:int,y:int): 40 | device.long_click(x,y) 41 | return f'Long Clicked on ({x},{y})' 42 | 43 | @mcp.tool(name='Swipe-Tool',description='Swipe on a specific cordinate') 44 | def swipe_tool(x1:int,y1:int,x2:int,y2:int): 45 | device.swipe(x1,y1,x2,y2) 46 | return f'Swiped from ({x1},{y1}) to ({x2},{y2})' 47 | 48 | @mcp.tool(name='Type-Tool',description='Type on a specific cordinate') 49 | def type_tool(text:str,x:int,y:int,clear:bool=False): 50 | device.set_fastinput_ime(enable=True) 51 | device.send_keys(text=text,clear=clear) 52 | return f'Typed "{text}" on ({x},{y})' 53 | 54 | @mcp.tool(name='Drag-Tool',description='Drag from location and drop on another location') 55 | def drag_tool(x1:int,y1:int,x2:int,y2:int): 56 | device.drag(x1,y1,x2,y2) 57 | return f'Dragged from ({x1},{y1}) and dropped on ({x2},{y2})' 58 | 59 | @mcp.tool(name='Press-Tool',description='Press on specific button on the device') 60 | def press_tool(button:str): 61 | device.press(button) 62 | return f'Pressed the "{button}" button' 63 | 64 | @mcp.tool(name='Notification-Tool',description='Access the notifications seen on the device') 65 | def notification_tool(): 66 | device.open_notification() 67 | return 'Accessed notification bar' 68 | 69 | @mcp.tool(name='Wait-Tool',description='Wait for a specific amount of time') 70 | def wait_tool(duration:int): 71 | device.sleep(duration) 72 | return f'Waited for {duration} seconds' 73 | 74 | def main(): 75 | mcp.run() 76 | 77 | if __name__ == '__main__': 78 | main() -------------------------------------------------------------------------------- /src/android_mcp/mobile/service.py: -------------------------------------------------------------------------------- 1 | from android_mcp.mobile.views import MobileState 2 | from android_mcp.tree.service import Tree 3 | import uiautomator2 as u2 4 | from io import BytesIO 5 | from PIL import Image 6 | import base64 7 | 8 | class Mobile: 9 | def __init__(self,device:str=None): 10 | try: 11 | self.device = u2.connect(device) 12 | self.device.info 13 | except u2.ConnectError as e: 14 | raise ConnectionError(f"Failed to connect to device {device}: {e}") 15 | except Exception as e: 16 | raise RuntimeError(f"Unexpected error connecting to device {device}: {e}") 17 | 18 | def get_device(self): 19 | return self.device 20 | 21 | def get_state(self,use_vision=False,as_bytes:bool=False,as_base64:bool=False): 22 | try: 23 | tree = Tree(self) 24 | tree_state = tree.get_state() 25 | if use_vision: 26 | nodes=tree_state.interactive_elements 27 | annotated_screenshot=tree.annotated_screenshot(nodes=nodes,scale=1.0) 28 | if as_base64: 29 | screenshot=self.as_base64(annotated_screenshot) 30 | elif as_bytes: 31 | screenshot=self.screenshot_in_bytes(annotated_screenshot) 32 | else: 33 | screenshot=annotated_screenshot 34 | else: 35 | screenshot=None 36 | return MobileState(tree_state=tree_state,screenshot=screenshot) 37 | except Exception as e: 38 | raise RuntimeError(f"Failed to get device state: {e}") 39 | 40 | def get_screenshot(self,scale:float=0.7)->Image.Image: 41 | try: 42 | screenshot=self.device.screenshot() 43 | if screenshot is None: 44 | raise ValueError("Screenshot capture returned None.") 45 | size=(screenshot.width*scale, screenshot.height*scale) 46 | screenshot.thumbnail(size=size, resample=Image.Resampling.LANCZOS) 47 | return screenshot 48 | except Exception as e: 49 | raise RuntimeError(f"Failed to get screenshot: {e}") 50 | 51 | def screenshot_in_bytes(self,screenshot:Image.Image)->bytes: 52 | try: 53 | if screenshot is None: 54 | raise ValueError("Screenshot is None") 55 | io=BytesIO() 56 | screenshot.save(io,format='PNG') 57 | bytes=io.getvalue() 58 | if len(bytes) == 0: 59 | raise ValueError("Screenshot conversion resulted in empty bytes.") 60 | return bytes 61 | except Exception as e: 62 | raise RuntimeError(f"Failed to convert screenshot to bytes: {e}") 63 | 64 | def as_base64(self,screenshot:Image.Image)->str: 65 | try: 66 | if screenshot is None: 67 | raise ValueError("Screenshot is None") 68 | io=BytesIO() 69 | screenshot.save(io,format='PNG') 70 | bytes=io.getvalue() 71 | if len(bytes) == 0: 72 | raise ValueError("Screenshot conversion resulted in empty bytes.") 73 | return base64.b64encode(bytes).decode('utf-8') 74 | except Exception as e: 75 | raise RuntimeError(f"Failed to convert screenshot to base64: {e}") 76 | 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Android-MCP 2 | 3 | Thank you for your interest in contributing to MCP-Use! This document provides guidelines and instructions for contributing to this project. 4 | 5 | ## Table of Contents 6 | 7 | - [Getting Started](#getting-started) 8 | - [Development Environment](#development-environment) 9 | - [Installation from Source](#installation-from-source) 10 | - [Development Workflow](#development-workflow) 11 | - [Branching Strategy](#branching-strategy) 12 | - [Commit Messages](#commit-messages) 13 | - [Code Style](#code-style) 14 | - [Pre-commit Hooks](#pre-commit-hooks) 15 | - [Testing](#testing) 16 | - [Running Tests](#running-tests) 17 | - [Adding Tests](#adding-tests) 18 | - [Pull Requests](#pull-requests) 19 | - [Creating a Pull Request](#creating-a-pull-request) 20 | - [Pull Request Template](#pull-request-template) 21 | - [Documentation](#documentation) 22 | - [Release Process](#release-process) 23 | - [Getting Help](#getting-help) 24 | 25 | ## Getting Started 26 | 27 | ### Development Environment 28 | 29 | Android-MCP requires: 30 | - Python 3.11 or later 31 | 32 | ### Installation from Source 33 | 34 | 1. Fork the repository on GitHub. 35 | 2. Clone your fork locally: 36 | 37 | ```bash 38 | git clone https://github.com/Jeomon/Windows-MCP.git 39 | cd Android-MCP 40 | ``` 41 | 42 | 3. Install the package in development mode: 43 | 44 | ```bash 45 | pip install -e ".[dev,search]" 46 | ``` 47 | 48 | 4. Set up pre-commit hooks: 49 | 50 | ```bash 51 | pip install pre-commit 52 | pre-commit install 53 | ``` 54 | 55 | ## Development Workflow 56 | 57 | ### Branching Strategy 58 | 59 | - `main` branch contains the latest stable code 60 | - Create feature branches from `main` named according to the feature you're implementing: `feature/your-feature-name` 61 | - For bug fixes, use: `fix/bug-description` 62 | 63 | ### Commit Messages 64 | 65 | For now no commit style is enforced, try to keep your commit messages informational. 66 | 67 | ### Code Style 68 | 69 | Key style guidelines: 70 | 71 | - Line length: 100 characters 72 | - Use double quotes for strings 73 | - Follow PEP 8 naming conventions 74 | - Add type hints to function signatures 75 | 76 | ### Pre-commit Hooks 77 | 78 | We use pre-commit hooks to ensure code quality before committing. The configuration is in `.pre-commit-config.yaml`. 79 | 80 | The hooks will: 81 | 82 | - Run linting checks 83 | - Check for trailing whitespace and fix it 84 | - Ensure files end with a newline 85 | - Validate YAML files 86 | - Check for large files 87 | - Remove debug statements 88 | 89 | ## Testing 90 | 91 | ### Running Tests 92 | 93 | Run the test suite with pytest: 94 | 95 | ```bash 96 | pytest 97 | ``` 98 | 99 | To run specific test categories: 100 | 101 | ```bash 102 | pytest tests/ 103 | ``` 104 | 105 | ### Adding Tests 106 | 107 | - Add unit tests for new functionality in `tests/unit/` 108 | - For slow or network-dependent tests, mark them with `@pytest.mark.slow` or `@pytest.mark.integration` 109 | - Aim for high test coverage of new code 110 | 111 | ## Pull Requests 112 | 113 | ### Creating a Pull Request 114 | 115 | 1. Ensure your code passes all tests and pre-commit hooks 116 | 2. Push your changes to your fork 117 | 3. Submit a pull request to the main repository 118 | 4. Follow the pull request template 119 | 120 | ## Documentation 121 | 122 | - Update docstrings for new or modified functions, classes, and methods 123 | - Use Google-style docstrings: 124 | 125 | ```python 126 | def function_name(param1: type, param2: type) -> return_type: 127 | """Short description. 128 | Longer description if needed. 129 | 130 | Args: 131 | param1: Description of param1 132 | param2: Description of param2 133 | 134 | Returns: 135 | Description of return value 136 | 137 | Raises: 138 | ExceptionType: When and why this exception is raised 139 | """ 140 | ``` 141 | 142 | - Update README.md for user-facing changes 143 | 144 | ## Getting Help 145 | 146 | If you need help with your contribution: 147 | 148 | - Open an issue for discussion 149 | - Reach out to the maintainers 150 | - Check existing code for examples 151 | 152 | Thank you for contributing to Android-MCP! -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

🤖 Android MCP

4 | 5 | 6 | License 7 | 8 | Python 9 | Platform 10 | Last Commit 11 |
12 | 13 | Follow on Twitter 14 | 15 | 16 | Join us on Discord 17 | 18 | 19 |
20 | 21 |
22 | 23 | **Android-MCP** is a lightweight, open-source tool that bridge between AI agents and Android devices. Running as an MCP server, it lets LLM agents perform real-world tasks such as **app navigation, UI interaction and automated QA testing** without relying on traditional computer-vision pipelines or preprogramed scripts. 24 | 25 | 26 | 27 | ## ✨ Features 28 | 29 | - **Native Android Integration** 30 | Interact with UI elements via ADB and the Android Accessibility API: launch apps, tap, swipe, input text, and read view hierarchies. 31 | 32 | - **Bring Your Own LLM/VLM** 33 | Works with any language model, no fine-tuned CV model or OCR pipeline required. 34 | 35 | - **Rich Toolset for Mobile Automation** 36 | Pre-built tools for gestures, keystrokes, capture, device state, shell commands execution. 37 | 38 | - **Real-Time Interaction** 39 | Typical latency between actions (e.g., two taps) ranges **2-4s** depending on device specs and load. 40 | 41 | ### Supported Operating Systems 42 | 43 | - Android 10+ 44 | 45 | ## Installation 46 | 47 | ### 📦 Prerequisites 48 | 49 | - Python 3.10+ 50 | - UIautomator2 51 | - Android 10+ (Emulator/ Android Device) 52 | - A computer to run MCP server 53 | 54 | ### 🏁 Getting Started 55 | 56 | You can run the Android MCP server using **UVX** (recommended) or **UV** (for local development). 57 | 58 | #### Option 1: UVX (Recommended) 59 | 60 | No need to install dependencies manually. Just configure Claude Desktop: 61 | 62 | 1. **Locate your config file** 63 | - Windows: `%APPDATA%\Claude\claude_desktop_config.json` 64 | - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` 65 | 66 | 2. **Add the configuration** 67 | ```json 68 | { 69 | "mcpServers": { 70 | "android-mcp": { 71 | "command": "uvx", 72 | "args": [ 73 | "android-mcp", 74 | "--emulator" 75 | ] 76 | } 77 | } 78 | } 79 | ``` 80 | > **Note:** Remove `--emulator` if providing a physical device. 81 | 82 | #### Option 2: UV Mode (Local Development) 83 | 84 | 1. **Clone and Install** 85 | ```shell 86 | git clone https://github.com/CursorTouch/Android-MCP.git 87 | cd Android-MCP 88 | uv sync 89 | ``` 90 | 91 | 2. **Configure Claude Desktop** 92 | ```json 93 | { 94 | "mcpServers": { 95 | "android-mcp": { 96 | "command": "uv", 97 | "args": [ 98 | "--directory", 99 | "", 100 | "run", 101 | "android-mcp", 102 | "--emulator" 103 | ] 104 | } 105 | } 106 | } 107 | ``` 108 | > **Note:** Replace `` with the full path to your cloned directory. 109 | 110 | 3. **Restart the Claude Desktop** 111 | 112 | Restart your Claude Desktop. You should see "android-mcp" listed as an available integration. That's it, now you're ready to start controlling your Android device with natural language. 113 | 114 | For troubleshooting tips (log locations, common ADB issues), see the [MCP docs](https://modelcontextprotocol.io/quickstart/server#android-mcp-integration-issues). 115 | 116 | --- 117 | 118 | ## 🛠️ Available Tools 119 | 120 | Claude can access the following tools to interact with Windows: 121 | 122 | - `State-Tool`: To understand the state of the device. 123 | - `Click-Tool`: Click on the screen at the given coordinates. 124 | - `Long-Click-Tool`: Perform long click on the screen at the given coordinates. 125 | - `Type-Tool`: Type text on the specified coordinates (optionally clears existing text). 126 | - `Swipe-Tool`: Perform swipe from one location to other. 127 | - `Drag-Tool`: Drag from one point to another. 128 | - `Press-Tool`: To press the keys on the mobile device (Back, Volume Up, ...etc). 129 | - `Wait-Tool`: Pause for a defined duration. 130 | - `State-Tool`: Combined snapshot of active apps and interactive UI elements. 131 | - `Notification-Tool`: To access the notifications seen on the device. 132 | - `Shell-Tool`: To execute shell commands on the android device. 133 | 134 | ## ⚠️ Caution 135 | 136 | Android-MCP can execute arbitrary UI actions on your mobile device. Use it in controlled environments (emulators, test devices) when running untrusted prompts or agents. 137 | 138 | ## 🪪 License 139 | 140 | This project is licensed under the MIT License. See [LICENSE](LICENSE) for details. 141 | 142 | ## 🤝 Contributing 143 | 144 | Contributions are welcome! Please read [CONTRIBUTING](CONTRIBUTING) for dev setup and PR guidelines. 145 | 146 | Made with ❤️ by [CursorTouch](https://github.com/cursortouch), 147 | 148 | developers: [Jeomon George](https://github.com/jeomon), [Muhammad Yaseen](https://github.com/mhmdyaseen) 149 | 150 | ## Citation 151 | 152 | ```bibtex 153 | @misc{ 154 | author = {cursortouch}, 155 | title = {Android-MCP}, 156 | year = {2025}, 157 | publisher = {GitHub}, 158 | howpublished = {\url{https://github.com/CursorTouch/Android-MCP}}, 159 | note = {Lightweight open-source bridge between LLM agents and Android}, 160 | } 161 | ``` 162 | -------------------------------------------------------------------------------- /src/android_mcp/tree/service.py: -------------------------------------------------------------------------------- 1 | from android_mcp.tree.views import TreeState, ElementNode, CenterCord, BoundingBox 2 | from android_mcp.tree.utils import extract_cordinates,get_center_cordinates 3 | from android_mcp.tree.config import INTERACTIVE_CLASSES 4 | from PIL import Image, ImageFont, ImageDraw 5 | from xml.etree.ElementTree import Element 6 | from xml.etree import ElementTree 7 | from typing import TYPE_CHECKING 8 | import random 9 | import logging 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | if TYPE_CHECKING: 15 | from android_mcp.mobile import Mobile 16 | 17 | class Tree: 18 | def __init__(self,mobile:'Mobile'): 19 | self.mobile = mobile 20 | 21 | def get_element_tree(self)->'Element': 22 | tree_string = self.mobile.device.dump_hierarchy() 23 | logger.debug(tree_string) 24 | return ElementTree.fromstring(tree_string) 25 | 26 | def get_state(self)->TreeState: 27 | interactive_elements=self.get_interactive_elements() 28 | return TreeState(interactive_elements=interactive_elements) 29 | 30 | def get_interactive_elements(self)->list: 31 | interactive_elements=[] 32 | element_tree = self.get_element_tree() 33 | nodes=element_tree.findall('.//node[@enabled="true"]') 34 | for node in nodes: 35 | if self.is_interactive(node): 36 | x1,y1,x2,y2 = extract_cordinates(node) 37 | name=self.get_element_name(node) 38 | if not name: 39 | continue 40 | x_center,y_center = get_center_cordinates((x1,y1,x2,y2)) 41 | interactive_elements.append(ElementNode(**{ 42 | 'name':name, 43 | 'class_name':node.get('class'), 44 | 'coordinates':CenterCord(x=x_center,y=y_center), 45 | 'bounding_box':BoundingBox(x1=x1,y1=y1,x2=x2,y2=y2) 46 | })) 47 | return interactive_elements 48 | 49 | def get_element_name(self, node) -> str: 50 | name = node.get('content-desc') or node.get('text') 51 | if not name: 52 | texts = [] 53 | fallback_texts = [] 54 | 55 | def collect_text(n): 56 | # Check if this node is actionable (and not the root node we started with) 57 | is_actionable = (n is not node) and ( 58 | n.get('clickable') == "true" or 59 | n.get('long-clickable') == "true" or 60 | n.get('checkable') == "true" or 61 | n.get('scrollable') == "true") 62 | 63 | val = n.get('text') or n.get('content-desc') or n.get('hint') 64 | 65 | if is_actionable: 66 | if val: 67 | fallback_texts.append(val) 68 | return # Stop recursing into actionable nodes 69 | 70 | if val: 71 | texts.append(val) 72 | 73 | for child in n: 74 | collect_text(child) 75 | 76 | collect_text(node) 77 | 78 | # Use primary texts if found, otherwise use fallback texts from actionable children 79 | final_texts = texts if texts else fallback_texts 80 | name = " ".join(final_texts).strip() 81 | return name 82 | 83 | def is_interactive(self, node) -> bool: 84 | attributes = node.attrib 85 | return (attributes.get('focusable') == "true" or 86 | attributes.get('clickable') == "true" or 87 | attributes.get('long-clickable') == "true" or 88 | attributes.get('checkable') == "true" or 89 | attributes.get('scrollable') == "true" or 90 | attributes.get('selected') == "true" or 91 | attributes.get('password') == "true" or 92 | attributes.get('class') in INTERACTIVE_CLASSES) 93 | 94 | def annotated_screenshot(self, nodes: list[ElementNode],scale:float=0.7) -> Image.Image: 95 | screenshot = self.mobile.get_screenshot(scale=scale) 96 | # Add padding 97 | padding = 15 98 | width = screenshot.width + (2 * padding) 99 | height = screenshot.height + (2 * padding) 100 | padded_screenshot = Image.new("RGB", (width, height), color=(255, 255, 255)) 101 | padded_screenshot.paste(screenshot, (padding, padding)) 102 | 103 | draw = ImageDraw.Draw(padded_screenshot) 104 | font_size = 12 105 | try: 106 | font = ImageFont.truetype('arial.ttf', font_size) 107 | except IOError: 108 | font = ImageFont.load_default() 109 | 110 | def get_random_color(): 111 | return "#{:06x}".format(random.randint(0, 0xFFFFFF)) 112 | 113 | def draw_annotation(label, node: ElementNode): 114 | bounding_box = node.bounding_box 115 | color = get_random_color() 116 | 117 | # Scale and pad the bounding box also clip the bounding box 118 | adjusted_box = ( 119 | int(bounding_box.x1 * scale) + padding, 120 | int(bounding_box.y1 * scale) + padding, 121 | int(bounding_box.x2 * scale) + padding, 122 | int(bounding_box.y2 * scale) + padding 123 | ) 124 | # Draw bounding box 125 | draw.rectangle(adjusted_box, outline=color, width=2) 126 | 127 | # Label dimensions 128 | label_width = draw.textlength(str(label), font=font) 129 | label_height = font_size 130 | left, top, right, bottom = adjusted_box 131 | 132 | # Label position above bounding box 133 | label_x1 = right - label_width 134 | label_y1 = top - label_height - 4 135 | label_x2 = label_x1 + label_width 136 | label_y2 = label_y1 + label_height + 4 137 | 138 | # Draw label background and text 139 | draw.rectangle([(label_x1, label_y1), (label_x2, label_y2)], fill=color) 140 | draw.text((label_x1 + 2, label_y1 + 2), str(label), fill=(255, 255, 255), font=font) 141 | 142 | # Draw annotations sequentially for better performance and thread safety 143 | for i, node in enumerate(nodes): 144 | draw_annotation(i, node) 145 | 146 | return padded_screenshot 147 | --------------------------------------------------------------------------------