├── .gitignore ├── src ├── utils.ts ├── index.ts ├── mobile_computer.ts ├── adb_client.ts └── ui_dump_parser.ts ├── examples ├── download_app.txt ├── youtube_search ├── linkedIn_automation.txt ├── zomato_order.txt ├── porter_booking.txt ├── instagram_direct_message_anthropic.ts ├── open_zepto_azure.ts └── Instagram_direct_message_openai.ts ├── scripts ├── export_ui_dump.ts └── take_screenshot.ts ├── tsconfig.json ├── .github └── workflows │ └── publish.yml ├── bin └── run ├── LICENSE ├── package.json ├── README.md └── CODE_OF_CONDUCT.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | temp 4 | .env.local 5 | test 6 | .DS_Store -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | export const wait = (ms: number) => 2 | new Promise((resolve) => setTimeout(resolve, ms)); 3 | -------------------------------------------------------------------------------- /examples/download_app.txt: -------------------------------------------------------------------------------- 1 | 1. Go to playstore 2 | 2. Search for 'fampay' 3 | 3. Go to the app page 4 | 4. Install the app 5 | 5. Wait for it to install -------------------------------------------------------------------------------- /examples/youtube_search: -------------------------------------------------------------------------------- 1 | Open youtube app 2 | Search for cute cat video 3 | Click on first video 4 | Go to comment section of the video 5 | Write a comment after analyzing comment section -------------------------------------------------------------------------------- /scripts/export_ui_dump.ts: -------------------------------------------------------------------------------- 1 | import { ADBClient } from "../src/adb_client"; 2 | 3 | async function main() { 4 | const adb = new ADBClient(); 5 | const ui = await adb.dumpUI(); 6 | console.log(ui); 7 | } 8 | 9 | main(); 10 | -------------------------------------------------------------------------------- /examples/linkedIn_automation.txt: -------------------------------------------------------------------------------- 1 | 1. Open LinkedIn 2 | 2. search for Ankit Chowdhary, He is at south park commons 3 | 3. Open His profile 4 | 4. Send connection request if not already connected 5 | 5. See all his posts and like top 2 of them -------------------------------------------------------------------------------- /examples/zomato_order.txt: -------------------------------------------------------------------------------- 1 | 1. Open zomato app 2 | 2. Set default address to bangalore, house 951 (Home) 3 | 3. Go to delivery part of zomato 4 | 4. Search for masala chaas. 5 | 5. Order masala chaas and complete the payment using default payment option. -------------------------------------------------------------------------------- /examples/porter_booking.txt: -------------------------------------------------------------------------------- 1 | 1. Open porter app 2 | 2. Book a 2 wheeler 3 | 3. Select pickup location as '20th a cross road, bangalore' 4 | 4. Select drop location as 'asha tiffins, bangalore' 5 | 5. Select other things as defult 6 | 6. Payment mode as 'cash' 7 | 7. After booking, confirm the sending a message on whatsapp to 'saksham boi'. -------------------------------------------------------------------------------- /scripts/take_screenshot.ts: -------------------------------------------------------------------------------- 1 | import { writeFile } from "node:fs/promises"; 2 | import { ADBClient } from "../src/adb_client"; 3 | 4 | async function main() { 5 | const adb = new ADBClient(); 6 | const screenshot = await adb.screenshot(); 7 | console.log(await adb.screenSize()); 8 | await writeFile("mobile.png", screenshot); 9 | } 10 | 11 | main(); 12 | -------------------------------------------------------------------------------- /examples/instagram_direct_message_anthropic.ts: -------------------------------------------------------------------------------- 1 | import { mobileUse } from "@/src"; 2 | 3 | async function main() { 4 | // here we are using claude 3.5 sonnet 5 | const response = await mobileUse({ 6 | task: "Open instagram and go to direct messages, send hi {instagram_username} to the first person", 7 | }); 8 | console.log(response.text); 9 | } 10 | 11 | main(); 12 | -------------------------------------------------------------------------------- /examples/open_zepto_azure.ts: -------------------------------------------------------------------------------- 1 | import { mobileUse } from "@/src"; 2 | import { azure } from '@ai-sdk/azure'; 3 | 4 | async function main() { 5 | process.env.AZURE_RESOURCE_NAME = ''; 6 | process.env.AZURE_API_KEY = ''; 7 | 8 | const response = await mobileUse({ 9 | task: "Open zepto", 10 | llm: azure("gpt-4o"), 11 | }); 12 | console.log("Azure LLM response:", response.text); 13 | } 14 | 15 | main(); -------------------------------------------------------------------------------- /examples/Instagram_direct_message_openai.ts: -------------------------------------------------------------------------------- 1 | import { mobileUse } from "@/src"; 2 | import { openai } from "@ai-sdk/openai"; 3 | 4 | async function main() { 5 | process.env.OPENAI_API_KEY = ""; 6 | 7 | const response = await mobileUse({ 8 | task: "Open instagram and go to direct messages, send hi {instagram_username} to the first person", 9 | llm: openai("gpt-4o"), 10 | }); 11 | console.log("OpenAI LLM response:", response.text); 12 | } 13 | 14 | main(); 15 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "esModuleInterop": true, 5 | "target": "ES2022", 6 | "noImplicitAny": true, 7 | "moduleResolution": "node", 8 | "sourceMap": true, 9 | "outDir": "dist", 10 | "baseUrl": ".", 11 | "paths": { 12 | "*": ["node_modules/*", "lib/types/*"], 13 | "@/*": ["./*"] 14 | }, 15 | "skipLibCheck": true, 16 | "declaration": true 17 | }, 18 | "exclude": ["node_modules", "dist", ".eslintrc.cjs"] 19 | } -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package to npmjs 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | contents: read 10 | id-token: write 11 | steps: 12 | - uses: actions/checkout@v4 13 | # Setup .npmrc file to publish to npm 14 | - uses: actions/setup-node@v4 15 | with: 16 | node-version: "20.x" 17 | registry-url: "https://registry.npmjs.org" 18 | - run: npm ci 19 | - run: npm run build 20 | - run: npm publish --provenance --access public 21 | env: 22 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 23 | -------------------------------------------------------------------------------- /bin/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { mobileUse } = require("../dist"); 4 | const { readFile, access } = require("node:fs/promises"); 5 | const yargs = require("yargs"); 6 | const { hideBin } = require("yargs/helpers"); 7 | 8 | const argv = yargs(hideBin(process.argv)) 9 | .usage("Usage: $0 [options]") 10 | .demandCommand(1, "Please provide either a command string or a file path") 11 | .help().argv; 12 | 13 | async function run() { 14 | const input = argv._[0].toString(); 15 | let task; 16 | try { 17 | await access(input); 18 | task = (await readFile(input, "utf-8")).trim(); 19 | } catch { 20 | task = input; 21 | } 22 | const response = await mobileUse({ 23 | task, 24 | }); 25 | return response.text; 26 | } 27 | 28 | run() 29 | .catch((error) => console.error(error)) 30 | .then((result) => console.log(result)); 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2025 cloudycotton 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mobile-use", 3 | "version": "0.0.7", 4 | "main": "dist/index.js", 5 | "types": "dist/index.d.ts", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "build": "tsup src/index.ts --dts" 9 | }, 10 | "bin": { 11 | "mobile-use": "./bin/run" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "https://github.com/runablehq/mobile-use.git" 16 | }, 17 | "license": "MIT", 18 | "description": "Use AI to control your mobile", 19 | "dependencies": { 20 | "@ai-sdk/anthropic": "^1.1.17", 21 | "@ai-sdk/azure": "^1.2.7", 22 | "@ai-sdk/google": "^1.1.25", 23 | "@ai-sdk/openai": "^1.2.5", 24 | "@openrouter/ai-sdk-provider": "^0.4.3", 25 | "ai": "^4.1.61", 26 | "fast-xml-parser": "^5.0.9", 27 | "playwright": "^1.51.1", 28 | "sharp": "^0.33.5", 29 | "tsx": "^4.19.2", 30 | "typescript": "^5.7.3", 31 | "yargs": "^17.7.2", 32 | "zod": "^3.24.2" 33 | }, 34 | "devDependencies": { 35 | "@types/node": "^22.13.10", 36 | "tsup": "^8.4.0" 37 | }, 38 | "files": [ 39 | "bin", 40 | "dist", 41 | "src" 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { generateText, LanguageModel, tool } from "ai"; 2 | import { z } from "zod"; 3 | import { ADBClient } from "./adb_client"; 4 | import { createMobileComputer } from "./mobile_computer"; 5 | import { openai } from "@ai-sdk/openai"; 6 | export { ADBClient } from "./adb_client"; 7 | 8 | const MobileUsePrompt = `You are an experienced mobile automation engineer. 9 | Your job is to navigate an android device and perform actions to fullfil request of the user. 10 | 11 | 12 | If the user asks to use a specific app in the request, open it before performing any other action. 13 | Do not take ui dump more than once per action. If you think you don't need to take ui dump, skip it. Use it sparingly. 14 | 15 | `; 16 | 17 | interface MobileUseOptions { 18 | task: string; 19 | llm?: LanguageModel; 20 | } 21 | 22 | export async function mobileUse({ 23 | task, 24 | llm = openai("gpt-4o"), 25 | }: MobileUseOptions) { 26 | const adbClient = new ADBClient(); 27 | await adbClient.init(); 28 | const computer = await createMobileComputer(adbClient); 29 | const response = await generateText({ 30 | messages: [ 31 | { 32 | role: "system", 33 | content: MobileUsePrompt, 34 | }, 35 | { 36 | role: "user", 37 | content: task, 38 | }, 39 | ], 40 | model: llm, 41 | maxRetries: 3, 42 | maxSteps: 100, 43 | tools: { 44 | openApp: tool({ 45 | parameters: z.object({ 46 | name: z 47 | .string() 48 | .describe( 49 | "package name of the app to open such as com.google.android.dialer" 50 | ), 51 | }), 52 | description: "Open an on on android device.", 53 | async execute({ name }) { 54 | await adbClient.openApp(name); 55 | return `Successfull opened ${name}`; 56 | }, 57 | }), 58 | listApps: tool({ 59 | parameters: z.object({ 60 | name: z.string().describe("Name of the package to filter."), 61 | }), 62 | description: "Use this to list packages.", 63 | async execute({ name }) { 64 | const list = await adbClient.listPackages(name); 65 | return list.join("\n"); 66 | }, 67 | }), 68 | computer, 69 | }, 70 | }); 71 | return response; 72 | } 73 | -------------------------------------------------------------------------------- /src/mobile_computer.ts: -------------------------------------------------------------------------------- 1 | import { tool } from "ai"; 2 | import { ADBClient } from "./adb_client"; 3 | import { z } from "zod"; 4 | import { wait } from "./utils"; 5 | 6 | const Coordinate = z.array(z.number()); 7 | 8 | export const createMobileComputer = async (adbClient: ADBClient) => { 9 | const viewportSize = await adbClient.screenSize(); 10 | const mobileComputer = tool({ 11 | description: `Mobile tool to perform actions on a mobile device.`, 12 | 13 | experimental_toToolResultContent(result: any) { 14 | return typeof result === "string" 15 | ? [{ type: "text", text: result }] 16 | : [{ type: "image", data: result?.data, mimeType: "image/png" }]; 17 | }, 18 | args: { 19 | displayHeightPx: viewportSize.height, 20 | displayWidthPx: viewportSize.width, 21 | displayNumber: 0, 22 | }, 23 | parameters: z.object({ 24 | action: z.enum([ 25 | "ui_dump", 26 | "tap", 27 | "swipe", 28 | "type", 29 | "press", 30 | "wait", 31 | "screenshot", 32 | ]) 33 | .describe(`ui_dump: Get UI elements you can interact with for the current screen. 34 | tap: Tap on the provided coordinate. 35 | swipe: Swipe from start_coordinate to end_coordinate. 36 | type: Type in the box. 37 | press: Press mobile key or button. 38 | screenshot: Take a screenshot of the current screen if UI dump is not helpful or where you need to see visuals. 39 | `), 40 | coordinate: Coordinate.optional(), 41 | start_coordinate: Coordinate.optional(), 42 | end_coordinate: Coordinate.optional(), 43 | text: z.string().optional(), 44 | duration: z.number().optional(), 45 | }), 46 | async execute({ 47 | action, 48 | coordinate, 49 | text, 50 | duration, 51 | start_coordinate, 52 | end_coordinate, 53 | }) { 54 | if (action === "ui_dump") { 55 | return adbClient.dumpUI(); 56 | } 57 | 58 | if (action === "tap") { 59 | const [x, y] = coordinate; 60 | await adbClient.tap({ x, y }); 61 | return adbClient.dumpUI(); 62 | } 63 | 64 | if (action === "press") { 65 | await adbClient.keyPress(text); 66 | return adbClient.dumpUI(); 67 | } 68 | 69 | if (action === "type") { 70 | await adbClient.type(text); 71 | return adbClient.dumpUI(); 72 | } 73 | 74 | if (action === "screenshot") { 75 | const screenshot = await adbClient.screenshot(); 76 | return { 77 | data: screenshot.toString("base64"), 78 | type: "image/png", 79 | }; 80 | } 81 | 82 | if (action === "swipe") { 83 | const [start_coordinate_x, start_coordinate_y] = start_coordinate; 84 | const [end_coordinate_x, end_coordinate_y] = end_coordinate; 85 | await adbClient.swipe( 86 | { x: start_coordinate_x, y: start_coordinate_y }, 87 | { 88 | x: end_coordinate_x, 89 | y: end_coordinate_y, 90 | }, 91 | duration 92 | ); 93 | return adbClient.dumpUI(); 94 | } 95 | 96 | if (action === "wait") { 97 | await wait(duration); 98 | } 99 | }, 100 | }); 101 | 102 | return mobileComputer; 103 | }; 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # 📱 mobile-use 4 | **Use AI to control your Android phone — with natural language.** 5 | 6 | [![MIT License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 7 | [![Join Discord](https://img.shields.io/badge/Join-Discord-purple.svg)](https://discord.gg/BcWWRCnap6) 8 | [![npm version](https://img.shields.io/npm/v/mobile-use.svg)](https://www.npmjs.com/package/mobile-use) 9 | [![GitHub Stars](https://img.shields.io/github/stars/your-org/mobile-use.svg?style=social)](https://github.com/runablehq/mobile-use/stargazers) 10 | 11 | https://github.com/user-attachments/assets/88ab0a2d-d6e6-4d80-922e-b13d3ae91c85 12 | 13 |
14 | 15 | --- 16 | 17 | ## ✨ What is this? 18 | 19 | **`mobile-use`** lets you control your Android phone using simple, natural-language instructions. 20 | 21 | Just type: 22 | 23 | > 🗣 *“Open Instagram, go to DMs, and send ‘hi’ to the first person.”* 24 | 25 | …and watch it run on your device — powered by AI. 26 | 27 | Think RPA, but for mobile — built for devs, hackers, and productivity nerds. 28 | 29 | --- 30 | 31 | ## 🚀 Quick Start 32 | 33 | ### 📦 Install via npm 34 | 35 | ```bash 36 | npm install mobile-use 37 | ``` 38 | 39 | Or run the MCP server (includes setup): 40 | 41 | ```bash 42 | npx mobile-mcp install 43 | ``` 44 | 45 | --- 46 | 47 | ## 🧠 AI in Action 48 | 49 | ```ts 50 | import { mobileUse } from "mobile-use"; 51 | 52 | const response = await mobileUse({ 53 | task: "Open instagram and go to direct messages, send hi to first person", 54 | // Optional: use your own LLM via API 55 | // llm: { provider: "your-llm", apiKey: process.env.YOUR_API_KEY } 56 | }); 57 | 58 | console.log(response.text); 59 | ``` 60 | 61 | > Default model: Claude (via Anthropic). 62 | > Set `ANTHROPIC_API_KEY` in your `.env` or environment to use it. 63 | 64 | --- 65 | 66 | ## 🖥️ Command Line Usage 67 | 68 | ```bash 69 | # Run a task directly from your terminal 70 | npx mobile-use "Open Instagram and send 'hi'" 71 | 72 | # Run a task from a file 73 | npx mobile-use instruction.txt 74 | ``` 75 | 76 | --- 77 | 78 | ## 📱 Requirements 79 | 80 | - Android phone or Emulator running in background(iOS not supported yet) 81 | - [Android SDK Platform Tools](https://developer.android.com/studio/releases/platform-tools) installed (`adb`)(For Emulators) 82 | - USB Debugging enabled 83 | 84 | --- 85 | 86 | ## 💬 Join the Community 87 | 88 | Have a feature idea? Want to see what others are building? 89 | 90 | Join our developer Discord — we’re shaping the roadmap with the community! 91 | 92 | [![Join our Discord](https://img.shields.io/badge/Join%20us-Discord-7289DA?logo=discord&logoColor=white)](https://discord.gg/BcWWRCnap6) 93 | 94 | --- 95 | 96 | ## 🧩 What's Coming Next? 97 | 98 | - iOS support (experimental) 99 | - Visual workflows 100 | - Common protocol for mobiles, browsers and computers 101 | 102 | > Have ideas? We’re building it *with you* — hop into Discord or open a GitHub issue. 103 | 104 | --- 105 | 106 | ## ⭐ Like it? 107 | 108 | If this project made you say "whoa!", help us grow: 109 | 110 | - ⭐ [Star this repo](https://github.com/runablehq/mobile-use) 111 | - 🐦 Share on Twitter/X 112 | - 💬 [Invite friends to Discord](https://discord.gg/BcWWRCnap6) 113 | 114 | --- 115 | 116 | ## 📄 License 117 | 118 | MIT — free to use, fork, and build on. 119 | 120 | --- 121 | 122 | ## 🙌 Built with love for devs by devs -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official email address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement to @cloudycotton. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /src/adb_client.ts: -------------------------------------------------------------------------------- 1 | import { exec } from "child_process"; 2 | import { promisify } from "util"; 3 | import { parseUiDump } from "./ui_dump_parser"; 4 | import { homedir } from "os"; 5 | import { join } from "path"; 6 | import { existsSync } from "fs"; 7 | import sharp from "sharp"; 8 | 9 | const execAsync = promisify(exec); 10 | 11 | const ANDROID_KEY_EVENTS: Record = Object.entries({ 12 | Enter: "KEYCODE_ENTER", 13 | Backspace: "KEYCODE_DEL", 14 | Tab: "KEYCODE_TAB", 15 | ArrowUp: "KEYCODE_DPAD_UP", 16 | ArrowDown: "KEYCODE_DPAD_DOWN", 17 | ArrowLeft: "KEYCODE_DPAD_LEFT", 18 | ArrowRight: "KEYCODE_DPAD_RIGHT", 19 | Escape: "KEYCODE_ESCAPE", 20 | Home: "KEYCODE_HOME", 21 | Back: "KEYCODE_BACK", 22 | }).reduce((keyMap, [key, value]) => { 23 | keyMap[key.toLowerCase().trim()] = value; 24 | return keyMap; 25 | }, {} as Record); 26 | 27 | interface Coordinate { 28 | x: number; 29 | y: number; 30 | } 31 | 32 | export function getPotentialADBPaths(): string[] { 33 | const home = homedir(); 34 | const platform = process.platform; 35 | const paths: string[] = []; 36 | 37 | if (platform === "win32") { 38 | // Windows-specific paths 39 | paths.push( 40 | join( 41 | process.env.LOCALAPPDATA ?? "", 42 | "Android/Sdk/platform-tools/adb.exe" 43 | ), 44 | "C:\\Android\\sdk\\platform-tools\\adb.exe", 45 | join(home, "AppData/Local/Android/Sdk/platform-tools/adb.exe"), 46 | join(home, "AppData/Local/Android/android-sdk/platform-tools/adb.exe"), 47 | "C:\\Program Files\\Android\\android-sdk\\platform-tools\\adb.exe", 48 | "C:\\Program Files (x86)\\Android\\android-sdk\\platform-tools\\adb.exe" 49 | ); 50 | } else if (platform === "darwin") { 51 | // macOS-specific paths 52 | paths.push( 53 | "/usr/local/bin/adb", 54 | "/opt/homebrew/bin/adb", 55 | join(home, "Library/Android/sdk/platform-tools/adb"), 56 | "/Applications/Android Studio.app/Contents/sdk/platform-tools/adb" 57 | ); 58 | } else if (platform === "linux") { 59 | // Linux-specific paths 60 | paths.push( 61 | "/usr/local/bin/adb", 62 | "/usr/bin/adb", 63 | join(home, "Android/Sdk/platform-tools/adb"), 64 | "/opt/android-sdk/platform-tools/adb", 65 | "/opt/android-studio/sdk/platform-tools/adb" 66 | ); 67 | } else { 68 | // Other platforms (FreeBSD, OpenBSD, etc.) 69 | paths.push( 70 | "/usr/local/bin/adb", 71 | "/usr/bin/adb", 72 | join(home, "android-sdk/platform-tools/adb") 73 | ); 74 | } 75 | 76 | // Add ANDROID_HOME path for all platforms 77 | if (process.env.ANDROID_HOME) { 78 | const adbExecutable = platform === "win32" ? "adb.exe" : "adb"; 79 | paths.push(join(process.env.ANDROID_HOME, "platform-tools", adbExecutable)); 80 | } 81 | 82 | return paths; 83 | } 84 | 85 | export interface ADBClientOptions { 86 | adbPath?: string; 87 | } 88 | 89 | export class ADBClient { 90 | private adbPath: string; 91 | 92 | constructor(options?: ADBClientOptions) { 93 | if (!options?.adbPath) { 94 | this.adbPath = this.getAdbPath(); 95 | } else { 96 | this.adbPath = options.adbPath; 97 | } 98 | } 99 | 100 | getAdbPath() { 101 | const paths = getPotentialADBPaths(); 102 | const validPath = paths.find((path) => existsSync(path)); 103 | 104 | if (!validPath) { 105 | throw new Error( 106 | "ADB not found. Please ensure Android SDK is installed and properly configured." 107 | ); 108 | } 109 | return validPath; 110 | } 111 | 112 | async init() { 113 | await this.shell("settings put global window_animation_scale 0"); 114 | await this.shell("settings put global transition_animation_scale 0"); 115 | await this.shell("settings put global animator_duration_scale 0"); 116 | } 117 | 118 | async screenshot() { 119 | const { stdout } = await execAsync( 120 | `"${this.adbPath}" exec-out screencap -p`, 121 | { 122 | encoding: "buffer", 123 | maxBuffer: 25 * 1024 * 1024, 124 | } 125 | ); 126 | return sharp(stdout) 127 | .png({ 128 | quality: 25, 129 | }) 130 | .toBuffer(); 131 | } 132 | 133 | async screenSize() { 134 | const { stdout } = await this.execOut("wm size"); 135 | const match = stdout.match(/Physical size: (\d+)x(\d+)/); 136 | if (!match) { 137 | throw new Error("Failed to get viewport size"); 138 | } 139 | return { 140 | width: parseInt(match[1]), 141 | height: parseInt(match[2]), 142 | }; 143 | } 144 | 145 | async execOut(command: string) { 146 | return execAsync(`"${this.adbPath}" exec-out ${command}`); 147 | } 148 | 149 | async shell(command: string) { 150 | return execAsync(`"${this.adbPath}" shell ${command}`); 151 | } 152 | 153 | async doubleTap(coordinate: Coordinate) { 154 | const { x, y } = coordinate; 155 | await this.shell(`input tap ${x} ${y}`); 156 | return this.shell(`input tap ${x} ${y}`); 157 | } 158 | 159 | async tap(coordinate: Coordinate) { 160 | const { x, y } = coordinate; 161 | return this.shell(`input tap ${x} ${y}`); 162 | } 163 | 164 | async swipe(start: Coordinate, end: Coordinate, duration: number = 300) { 165 | const { x: startX, y: startY } = start; 166 | const { x: endX, y: endY } = end; 167 | return this.shell( 168 | `input swipe ${startX} ${startY} ${endX} ${endY} ${duration}` 169 | ); 170 | } 171 | 172 | async type(text: string) { 173 | return this.shell(`input text "${text.replace(/["\s]/g, "\\ ")}"`); 174 | } 175 | 176 | async keyPress(key: string) { 177 | const androidKey = ANDROID_KEY_EVENTS[key.toLowerCase()]; 178 | if (!androidKey) { 179 | throw new Error(`Unsupported key: ${key}`); 180 | } 181 | 182 | return this.shell(`input keyevent ${androidKey}`); 183 | } 184 | 185 | async listPackages(filter?: string) { 186 | const { stdout } = await this.execOut(`pm list packages ${filter || ""}`); 187 | return stdout 188 | .split("\n") 189 | .map((line) => line.replace("package:", "").trim()) 190 | .filter(Boolean); 191 | } 192 | 193 | async openApp(packageName: string) { 194 | const result = await this.shell(`monkey -p ${packageName} 1`); 195 | if (result.stderr && result.stderr.includes("No activities found")) { 196 | throw new Error(`Failed to open app: ${result.stderr}`); 197 | } 198 | return result; 199 | } 200 | 201 | async dumpUI() { 202 | try { 203 | const { stdout } = await this.execOut( 204 | `uiautomator dump --compressed /dev/tty` 205 | ); 206 | const ui = JSON.stringify(parseUiDump(stdout)); 207 | return ui; 208 | } catch (error) { 209 | throw new Error(`Failed to get UI hierarchy: ${error.message}`); 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/ui_dump_parser.ts: -------------------------------------------------------------------------------- 1 | import { XMLParser } from "fast-xml-parser"; 2 | 3 | interface UiElement { 4 | id?: string; 5 | type: string; 6 | text?: string; 7 | desc?: string; 8 | clickable: boolean; 9 | bounds: string; 10 | children?: UiElement[]; 11 | } 12 | 13 | /** 14 | * Parses ADB UI dumps into a simplified tree for AI agent navigation 15 | */ 16 | export function parseUiDump(xmlDump: string): UiElement { 17 | const options = { 18 | ignoreAttributes: false, 19 | attributeNamePrefix: "", 20 | parseAttributeValue: true, 21 | isArray: (name: string) => name === "node", 22 | }; 23 | 24 | const parser = new XMLParser(options); 25 | const parsed = parser.parse(xmlDump); 26 | 27 | if (parsed.hierarchy && parsed.hierarchy.node && parsed.hierarchy.node[0]) { 28 | return simplifyNode(parsed.hierarchy.node[0]); 29 | } 30 | 31 | return { type: "root", clickable: false, bounds: "[0,0][0,0]" }; 32 | } 33 | 34 | /** 35 | * Safely checks if a string has content 36 | */ 37 | function hasContent(str: any): boolean { 38 | return typeof str === "string" && str !== ""; 39 | } 40 | 41 | /** 42 | * Converts a complex node into a simplified structure 43 | */ 44 | function simplifyNode(node: any): UiElement { 45 | // Extract only the most essential properties 46 | const element: UiElement = { 47 | type: getElementType(node), 48 | clickable: node.clickable === "true", 49 | bounds: node.bounds || "[0,0][0,0]", 50 | }; 51 | 52 | // Only include text if it exists and has content 53 | if (hasContent(node.text)) { 54 | element.text = node.text; 55 | } 56 | 57 | // Include content description if available 58 | if (hasContent(node["content-desc"])) { 59 | element.desc = node["content-desc"]; 60 | } 61 | 62 | // Include resource ID but simplify it 63 | if (hasContent(node["resource-id"])) { 64 | // Extract just the name part of the ID for readability 65 | const idParts = node["resource-id"].split("/"); 66 | element.id = idParts[idParts.length - 1]; 67 | } 68 | 69 | // Process children if they exist 70 | if (node.node && node.node.length > 0) { 71 | // Skip intermediate containers 72 | if (shouldCollapseContainer(node)) { 73 | // Directly include children of this container instead 74 | element.children = flattenChildren(node.node); 75 | } else { 76 | // Only include meaningful children 77 | const meaningfulChildren = node.node 78 | .filter((child: any) => isMeaningfulNode(child)) 79 | .map((child: any) => simplifyNode(child)); 80 | 81 | if (meaningfulChildren.length > 0) { 82 | element.children = meaningfulChildren; 83 | } 84 | } 85 | } 86 | 87 | return element; 88 | } 89 | 90 | /** 91 | * Determines if a container node should be collapsed to reduce tree depth 92 | */ 93 | function shouldCollapseContainer(node: any): boolean { 94 | // Skip intermediate containers that just add nesting 95 | return ( 96 | !hasContent(node.text) && 97 | !hasContent(node["content-desc"]) && 98 | node.clickable !== "true" && 99 | node.scrollable !== "true" && 100 | !hasContent(node["resource-id"]) && 101 | (node.class?.includes("Layout") || node.class?.includes("ViewGroup")) 102 | ); 103 | } 104 | 105 | /** 106 | * Flattens children of intermediate containers 107 | */ 108 | function flattenChildren(nodes: any[]): UiElement[] { 109 | let result: UiElement[] = []; 110 | 111 | for (const child of nodes) { 112 | if (shouldCollapseContainer(child) && child.node) { 113 | // Recursively flatten this container's children 114 | result = result.concat(flattenChildren(child.node)); 115 | } else if (isMeaningfulNode(child)) { 116 | // Add this meaningful node 117 | result.push(simplifyNode(child)); 118 | } 119 | } 120 | 121 | return result; 122 | } 123 | 124 | /** 125 | * Determines if a node has meaningful content for an agent 126 | */ 127 | function isMeaningfulNode(node: any): boolean { 128 | // Keep nodes that are interactive 129 | if (node.clickable === "true" || node.scrollable === "true") { 130 | return true; 131 | } 132 | 133 | // Keep nodes with text or content description 134 | if (hasContent(node.text) || hasContent(node["content-desc"])) { 135 | return true; 136 | } 137 | 138 | // Keep nodes with specific resource IDs 139 | if (hasContent(node["resource-id"])) { 140 | return true; 141 | } 142 | 143 | // Check if the node has meaningful children 144 | if (node.node && node.node.length > 0) { 145 | return node.node.some((child: any) => isMeaningfulNode(child)); 146 | } 147 | 148 | return false; 149 | } 150 | 151 | /** 152 | * Maps Android UI element classes to simpler type names 153 | */ 154 | function getElementType(node: any): string { 155 | const className = node.class || ""; 156 | 157 | // Map common Android classes to simpler types 158 | if (className.includes("Button")) return "button"; 159 | if (className.includes("EditText")) return "input"; 160 | if (className.includes("TextView")) return "text"; 161 | if (className.includes("ImageView")) return "image"; 162 | if (className.includes("CheckBox")) return "checkbox"; 163 | if (className.includes("RadioButton")) return "radio"; 164 | if (className.includes("RecyclerView") || className.includes("ListView")) 165 | return "list"; 166 | if (className.includes("CardView")) return "card"; 167 | 168 | // Special case for dialpad buttons 169 | if ( 170 | hasContent(node["resource-id"]) && 171 | (node["resource-id"].includes("one") || 172 | node["resource-id"].includes("two") || 173 | node["resource-id"].includes("three") || 174 | node["resource-id"].includes("four") || 175 | node["resource-id"].includes("five") || 176 | node["resource-id"].includes("six") || 177 | node["resource-id"].includes("seven") || 178 | node["resource-id"].includes("eight") || 179 | node["resource-id"].includes("nine") || 180 | node["resource-id"].includes("zero") || 181 | node["resource-id"].includes("star") || 182 | node["resource-id"].includes("pound")) 183 | ) { 184 | return "dialpad_button"; 185 | } 186 | 187 | return "view"; 188 | } 189 | 190 | /** 191 | * Creates a simple text summary of key UI elements for the agent 192 | */ 193 | export function describeUi(ui: UiElement): string { 194 | const interactiveElements = findAllInteractiveElements(ui); 195 | 196 | if (interactiveElements.length === 0) { 197 | return "No interactive elements found."; 198 | } 199 | 200 | let summary = `Found ${interactiveElements.length} interactive elements:\n`; 201 | 202 | interactiveElements.forEach((el, i) => { 203 | const description = [ 204 | el.text ? `"${el.text}"` : "", 205 | el.desc ? `(${el.desc})` : "", 206 | el.id ? `[${el.id}]` : "", 207 | el.type, 208 | ] 209 | .filter(Boolean) 210 | .join(" "); 211 | 212 | summary += `${i + 1}. ${description} at ${el.bounds}\n`; 213 | }); 214 | 215 | return summary; 216 | } 217 | 218 | /** 219 | * Finds all interactive elements in the UI 220 | */ 221 | function findAllInteractiveElements(element: UiElement): UiElement[] { 222 | let results: UiElement[] = []; 223 | 224 | if ( 225 | element.clickable || 226 | element.type === "input" || 227 | element.type === "list" 228 | ) { 229 | results.push(element); 230 | } 231 | 232 | if (element.children) { 233 | for (const child of element.children) { 234 | results = results.concat(findAllInteractiveElements(child)); 235 | } 236 | } 237 | 238 | return results; 239 | } 240 | --------------------------------------------------------------------------------