16 |
17 | 1Harbin Institute of Technology, Shenzhen, 2Huawei Noah’s Ark Lab
18 | *Corresponding author
19 |
20 | Annual Meeting of the Association for Computational Linguistics (**ACL**) 2025
21 |
22 | [[Paper]](https://arxiv.org/abs/2505.16827) [[Code]](https://github.com/JiuTian-VL/GUI-explorer) [[Project Page]](https://xieincz.github.io/GUI-explorer.github.io/)
23 |
24 | :fire: Details will be released. Stay tuned :beers: :+1:
25 |
26 |
27 |
28 |
29 | ## If you find this work useful for your research, please kindly cite our paper and star our repo.
30 |
31 | ## Updates
32 |
33 | - [05/2025] [Project Page](https://xieincz.github.io/GUI-explorer.github.io/) released.
34 | - [05/2025] [Arxiv paper](https://arxiv.org/abs/2505.16827) released.
35 | - [05/2025] [Code](https://github.com/JiuTian-VL/GUI-explorer) released.
36 |
37 | ## Introduction
38 |
 39 | This is the official GitHub repository of *GUI-explorer: Autonomous Exploration and Mining of Transition-aware Knowledge for GUI Agent*. In this work, we propose GUI-explorer, which synergizes two key components: (1) autonomous exploration of function-aware trajectories; (2) unsupervised mining of transition-aware knowledge.
40 |
41 | The overview of the proposed GUI-explorer:
42 |
43 |
44 |
45 |
46 |
47 | ## Installation
48 |
49 | ### Download
50 |
51 | ```bash
52 | git clone https://github.com/JiuTian-VL/GUI-explorer.git
53 | cd GUI-explorer
54 | mkdir knowledge_base
55 | cd knowledge_base
56 | wget https://github.com/JiuTian-VL/GUI-explorer/releases/download/knowledge_base/knowledge_data.pkl
57 | ```
58 |
59 | ### Environment
60 |
61 | ```bash
62 | cd GUI-explorer
63 | conda create -n GUI_explorer python=3.12 -y
64 | conda activate GUI_explorer
65 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
66 | pip install -r requirements.txt
67 | ```
68 |
 69 | Duplicate `.env.example` and rename it to `.env`. Then, in the `.env` file, fill in your `OPENAI_API_KEY`. Optional variables read by the code include `OPENAI_BASE_URL`, `OPENAI_API_MODEL`, `RAG_SERVER_ENDPOINT`, and `KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH` (see `utils/utils.py` and `utils/retrieval.py`).
70 |
71 | ## Usage
72 |
 73 | ### Prepare API servers
74 |
75 | ```bash
76 | # Open a new shell window and run
77 | cd GUI-explorer
78 | conda activate GUI_explorer
79 | python -m utils.embedding_pipeline
80 |
 81 | # Open a new shell window and run (wait for embedding_pipeline to finish starting up first)
82 | cd GUI-explorer
83 | conda activate GUI_explorer
84 | python -m utils.retrieval
85 | ```
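Once both servers are running, you can sanity-check retrieval directly from Python. Below is a minimal sketch using the `retrieval_api` helper from `utils/retrieval.py`; the screenshot path and package name are placeholders:

```python
from PIL import Image

from utils.retrieval import retrieval_api

# Query the retrieval server (default endpoint: http://localhost:8769,
# overridable via RAG_SERVER_ENDPOINT).
screenshot = Image.open("screenshot.png")  # placeholder: any UI screenshot or element crop
results = retrieval_api(screenshot, top_k=3, threshold=0.9, package_name="net.osmand")
for r in results:  # each hit carries the mined knowledge text and its similarity score
    print(r["similarity"], r["knowledge"])
```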
86 |
87 | #### Exploration
88 |
89 | ```bash
 90 | # After preparing the API servers
91 | cd GUI-explorer
92 | conda activate GUI_explorer
93 | python exploration_and_mining.py -device_serial emulator-5554 -max_branching_factor 10 -max_exploration_steps 30 -max_exploration_depth 5 -package_name net.osmand
 94 | # After the knowledge base is updated, restart `python -m utils.retrieval` to load the new knowledge_base
95 | ```
96 |
 97 | `device_serial` can be obtained by running `adb devices`. (If no device is listed, follow the `Setup` section in [this tutorial](https://github.com/ai-agents-2030/SPA-Bench/blob/main/Documentation.md#setup).)
98 |
99 | `package_name` can be obtained from the app's link on the app store. For example, in `https://play.google.com/store/apps/details?id=net.osmand`, `net.osmand` is the `package_name` for this app.
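If the app is already installed on the device, you can also look up its package name over adb. A small sketch using the `execute_cmd` helper from `utils/utils.py` (the serial and the `osmand` filter are just examples):

```python
from utils.utils import execute_cmd

# `pm list packages` prints one line per installed app, e.g. "package:net.osmand".
out = execute_cmd("adb -s emulator-5554 shell pm list packages")
matches = [line.removeprefix("package:") for line in out.splitlines() if "osmand" in line]
print(matches)  # e.g. ['net.osmand']
```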
100 |
101 | #### Demo
102 |
103 | ```bash
 104 | # After preparing the API servers
105 | # Connect an Android device to this computer and make sure you can see it in `adb devices`.
106 | # Open a new shell window and run
107 | cd GUI-explorer
108 | conda activate GUI_explorer
109 | python -m demo.demo_web_backend
110 |
111 | # Open a new shell window and run
112 | cd GUI-explorer
113 | conda activate GUI_explorer
114 | python -m demo.demo_agent_backend
115 |
116 | # Open a new shell window and run
117 | cd GUI-explorer/demo/demo_web_frontend
118 | pnpm install
119 | pnpm run dev
120 | ```
121 |
122 | Open http://localhost:5173 in your browser.
123 |
124 | You should be able to see something like this:
125 |
126 | 
127 |
128 |
129 |
130 | ## Evaluation Results
131 |
132 | Table 1: Main Result of GUI-explorer on SPA-Bench single-app English Level 3 tasks.
133 |
134 |
135 | Table 2: Main Result of GUI-explorer on AndroidWorld tasks.
136 |
137 |
138 | Table 3: Main Result of GUI-explorer on GUI-KRB.
139 |
140 |
141 |
142 |
143 | ## Showcases
144 |
145 | | Instruction | Video |
146 | | :----------------------------------------------------------: | :----------------------------------------------------------: |
147 | | Open Google Chrome and search for today's weather in Shenzhen. Carefully observe the screen and record the current weather conditions. Then, in Markor, create a note named "today.md" and write the temperature read from the webpage into it. | |
148 | | Get the search results for stay tonight near 'wembley stadium' for 1 adult. Add one result to wishlist. Confirm that this item is in the wishlist. | |
149 |
150 |
151 |
152 | ## More Examples
153 |
154 |
155 |
156 |
157 |
158 | ## Citation
159 |
160 | If you find this work useful for your research, please kindly cite our paper:
161 | ```
162 | @inproceedings{xie2025gui,
163 | title={GUI-explorer: Autonomous Exploration and Mining of Transition-aware Knowledge for GUI Agent},
164 | author={Bin Xie and Rui Shao and Gongwei Chen and Kaiwen Zhou and Yinchuan Li and Jie Liu and Min Zhang and Liqiang Nie},
165 | booktitle={Annual Meeting of the Association for Computational Linguistics (ACL)},
166 | year={2025}
167 | }
168 | ```
169 |
--------------------------------------------------------------------------------
/utils/prompt_templates.py:
--------------------------------------------------------------------------------
1 | # INPUTS: app_name, package_name, activity_list
2 | TASK_GOAL_GENERATOR = """Given the screenshot of {app_name} and its available activities, generate a comprehensive list of practical user tasks that:
3 |
4 | 1. Start from the current screen shown in the screenshot
5 | 2. Can be completed within 10-30 steps
6 | 3. Utilize the app's full feature set based on the activity list
7 | 4. Are concrete and specific (like searching for a particular item rather than just "search")
8 | 5. Cover different user interaction patterns (viewing, editing, sharing, etc.)
9 | 6. Include both basic and advanced features
10 | 7. Represent realistic user behaviors and goals
11 | 8. Avoid excessive steps on form-filling or scrolling pages
12 |
13 | Important context:
14 | - App name: {app_name}
15 | - Package name: {package_name}
16 | - Available activities (app screens/features):
17 | ```{activity_list}```
18 |
19 | Format requirements:
20 | 1. List only the tasks without explanations or commentary
21 | 2. Each task should be a single, clear directive
22 | 3. Use specific examples (e.g., concrete search terms, actions, settings)
23 | 4. Include the expected outcome where relevant
24 | 5. Tasks should follow this pattern: [Starting action] + [Specific steps] + [End goal]
25 |
26 | Example tasks from other apps (for reference only):
27 | 1. Search for "ocean waves" white noise, then sort results by most played
28 | 2. Open the first recommended video, then post "Great content!" as a comment
29 | 3. Play the trending video, then add it to your "Watch Later" playlist
30 | 4. Navigate to the comments section of a featured video, then like the top comment
31 |
32 | Generate diverse tasks that would help a user explore and utilize all major features visible in the screenshot and implied by the activity list."""
33 |
34 |
35 | # INPUTS: task_description, numeric_tag_of_element, ui_element_attributes, action
36 | KNOWLEDGE_EXTRACTOR = """Objective: Describe the functionality of a specific UI element in a mobile app screenshot.
37 |
38 | Input:
39 | - Two screenshots: Before and after interacting with a UI element
40 | - UI element marked with a numeric tag in the top-left corner
41 | - Element number: {numeric_tag_of_element}
42 | - Broader task context: {task_description}
43 | - Action taken: {action}
44 | - UI Element Attributes:
45 | ```
46 | {ui_element_attributes}
47 | ```
48 |
49 | Requirements for Functionality Description:
50 | 1. Concise: 1-2 sentences
51 | 2. Focus on general function, not specific details
52 | 3. Avoid mentioning the numeric tag
53 | 4. Use generic terms like "UI element" or appropriate pronouns
54 |
55 | Example:
56 | - Incorrect: "Tapping the element #3 displays David's saved recipes in the results panel"
 57 | - Correct: "Tapping this element initiates a search and displays matching results"
58 |
59 | Guidance:
60 | - Describe the core action and immediate result of interacting with the UI element
61 | - Prioritize clarity and generality in the description"""
62 |
63 |
64 | # INPUTS: task_goal, knowledge_a, knowledge_b
65 | RANKER = """Given the user instruction: {task_goal}, determine which of the following two knowledge entries is more useful.
 66 | Respond ONLY with an integer value:
67 | 1 means Knowledge A is strictly better.
68 | 2 means Knowledge B is strictly better.
69 |
70 | Knowledge A: {knowledge_a}
71 | Knowledge B: {knowledge_b}
72 |
73 | Please provide your response:
74 | """
75 |
76 |
77 | # INPUTS: task_goal, history, ui_elements, knowledge
78 | REASONING = """## Role Definition
79 | You are an Android operation AI that fulfills user requests through precise screen interactions.
80 | The current screenshot and the same screenshot with bounding boxes and labels added are also given to you.
81 |
82 | ## Action Catalog
83 | Available actions (STRICT JSON FORMAT REQUIRED):
84 | 1. Status Operations:
85 | - Task Complete: {{"action_type": "status", "goal_status": "complete"}}
86 | - Task Infeasible: {{"action_type": "status", "goal_status": "infeasible"}}
87 | 2. Information Actions:
88 | - Answer Question: {{"action_type": "answer", "text": ""}}
89 | 3. Screen Interactions:
90 | - Tap Element: {{"action_type": "click", "index": }}
91 | - Long Press: {{"action_type": "long_press", "index": }}
92 | - Scroll: Scroll the screen or a specific scrollable UI element. Use the `index` of the target element if scrolling a specific element, or omit `index` to scroll the whole screen. {{"action_type": "scroll", "direction": <"up"|"down"|"left"|"right">, "index": }}
93 | 4. Input Operations:
94 | - Text Entry: {{"action_type": "input_text", "text": "", "index": }}
95 | - Keyboard Enter: {{"action_type": "keyboard_enter"}}
96 | 5. Navigation:
97 | - Home Screen: {{"action_type": "navigate_home"}}
98 | - Back Navigation: {{"action_type": "navigate_back"}}
99 | 6. System Actions:
100 | - Launch App: {{"action_type": "open_app", "app_name": ""}}
101 | - Wait Refresh: {{"action_type": "wait"}}
102 |
103 | ## Current Objective
104 | User Goal: {task_goal}
105 |
106 | ## Execution Context
107 | Action History:
108 | {history}
109 |
 110 | Visible UI Elements (only interact with elements where visible=true):
111 | {ui_elements}
112 |
113 | ## Core Strategy
114 | 1. Path Optimization:
115 | - Prefer direct methods (e.g., open_app > app drawer navigation)
116 | - Always use the `input_text` action for entering text into designated text fields.
117 | - Verify element visibility (`visible=true`) before attempting any interaction (click, long_press, input_text). Do not interact with elements marked `visible=false`.
118 | - Use `scroll` when necessary to bring off-screen elements into view. Prioritize scrolling specific containers (`index` provided) over full-screen scrolls if possible.
119 |
120 | 2. Error Handling Protocol:
121 | - Switch approach after ≥ 2 failed attempts
122 | - Prioritize scrolling (`scroll` action) over force-acting on invisible elements
123 | - If an element is not visible, use `scroll` in the likely direction (e.g., 'down' to find elements below the current view).
124 | - Try opposite scroll direction if initial fails (up/down, left/right)
125 | - If the `open_app` action fails to correctly open the app, find the corresponding app in the app drawer and open it.
126 |
127 | 3. Information Tasks:
128 | - MANDATORY: Use answer action for questions
129 | - Verify data freshness (e.g., check calendar date)
130 |
131 | ## Expert Techniques
132 | Here are some tips for you:
133 | {knowledge}
134 |
135 | ## Response Format
136 | STRICTLY follow:
137 | Reasoning: [Step-by-step analysis covering:
138 | - Visibility verification
139 | - History effectiveness evaluation
140 | - Alternative approach comparison
141 | - Consideration of scrolling if needed]
142 | Action: [SINGLE JSON action from catalog]
143 |
144 | Generate response:
145 | """
146 |
147 | # INPUTS: task_goal, before_ui_elements, after_ui_elements, action, reasoning
 148 | SUMMARY = """
149 | Goal: {task_goal}
150 |
151 | Before screenshot elements:
152 | {before_ui_elements}
153 |
154 | After screenshot elements:
155 | {after_ui_elements}
156 |
157 | Action: {action}
158 | Reasoning: {reasoning}
159 |
160 | Provide a concise single-line summary (under 50 words) of this step by comparing screenshots and action outcome. Include:
161 | - What was intended
162 | - Whether it succeeded
163 | - Key information for future actions
164 | - Critical analysis if action/reasoning was flawed
165 | - Important data to remember across apps
166 |
167 | For actions like 'answer' or 'wait' with no screen change, assume they worked as intended.
168 |
169 | Summary:
170 | """
--------------------------------------------------------------------------------
/utils/retrieval.py:
--------------------------------------------------------------------------------
1 | """
 2 | Usage: python -m utils.retrieval
3 | """
4 |
5 | import copy
6 | from typing import Any, List, Dict, Tuple, Union
7 | from PIL import Image
8 | from utils.utils import str_to_md5
9 | import os
10 |
11 | import uuid
12 | import numpy as np
13 |
14 | from fastapi import FastAPI, Request, HTTPException
15 | from fastapi.middleware.cors import CORSMiddleware
16 | from utils.memory import load_memories, KnowledgeStore
17 |
18 | app = FastAPI()
19 | app.add_middleware(
20 | CORSMiddleware,
21 | allow_origins=["*"],
22 | allow_credentials=True,
23 | allow_methods=["*"],
24 | allow_headers=["*"],
25 | )
26 |
27 | __KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH = None
28 | __MEMORY: dict[str, KnowledgeStore] = {}
29 |
30 | import base64
31 | import io
32 | from PIL import Image
33 |
34 |
35 | def pil_to_base64(pil_image: Image.Image) -> str:
36 | """Convert a PIL Image to a base64 encoded string.
37 |
38 | Args:
39 | pil_image (Image.Image): The PIL Image object.
40 |
41 | Returns:
42 | str: The base64 encoded string.
43 | """
44 | buffered = io.BytesIO()
45 | pil_image.save(buffered, format="WEBP", quality=95)
46 | return base64.b64encode(buffered.getvalue()).decode()
47 |
48 |
49 | def ndarray_to_base64(ndarray) -> str:
50 | """Convert a numpy array to a base64 encoded string.
51 |
52 | Args:
53 | ndarray (np.ndarray): The numpy array.
54 |
55 | Returns:
56 | str: The base64 encoded string.
57 | """
58 | return pil_to_base64(Image.fromarray(ndarray))
59 |
60 |
61 | import requests
62 | import time
63 | import os
64 |
65 |
66 | def retrieval_api(
67 | query: Image.Image, top_k: int = 1, threshold: float = 0.9, package_name: str = None
 68 | ) -> list[dict]:  # NOTE: copy this function into any module that needs to call retrieval_api
 69 | """Retrieve the knowledge entries matching the query image.
 70 | 
 71 | Returns:
 72 | List[dict[str,Any]]: the result list (note: it may contain fewer than top_k items)
 73 | """
74 | ret, rsp, max_retries = None, None, 3
75 | data = {
76 | "package_name": package_name, # "com.example.app",
77 | "query": pil_to_base64(query), # "base64 image",
78 | "top_k": top_k,
79 | "threshold": threshold,
80 | }
81 | url = os.getenv("RAG_SERVER_ENDPOINT", "http://localhost:8769") + "/retrieval"
82 | for i in range(max_retries):
83 | try:
84 | rsp = requests.post(url, json=data, timeout=300)
85 | ret = rsp.json()
86 | return ret["results"]
87 | except Exception as e:
88 | print(f"retrieval_api error: {e} retrying {i+1}/{max_retries}")
89 | if i == max_retries - 1:
90 | raise e
91 | time.sleep(1)
92 |
93 |
94 | def retrieval_batch_api(
95 | queries: list[Image.Image],
96 | top_k: int = 1,
97 | threshold: float = 0.9,
98 | package_name: str = None,
99 | ) -> list[list[dict]]:
100 | """检索出query对应的knowledge
101 |
102 | Returns:
103 | List[List[dict[str,Any]]]: 返回的结果列表(注意长度可能小于top_k)
104 | """
105 | ret, rsp, max_retries = None, None, 3
106 | data = {
107 | "package_name": package_name, # "com.example.app",
108 | "queries": [pil_to_base64(query) for query in queries], # "base64 image",
109 | "top_k": top_k,
110 | "threshold": threshold,
111 | }
112 | url = os.getenv("RAG_SERVER_ENDPOINT", "http://localhost:8769") + "/retrieval_batch"
113 | for i in range(max_retries):
114 | try:
115 | rsp = requests.post(url, json=data, timeout=300)
116 | ret = rsp.json()
117 | return ret["results"]
118 | except Exception as e:
119 | print(f"retrieval_api error: {e} retrying {i+1}/{max_retries}")
120 | if i == max_retries - 1:
121 | raise e
122 | time.sleep(1)
123 |
124 |
125 | def base64_to_pil(base64_str: str) -> Image.Image:
126 | """Convert a base64 encoded string to a PIL Image.
127 |
128 | Args:
129 | base64_str (str): The base64 string representing the image.
130 |
131 | Returns:
132 | Image.Image: A PIL Image object.
133 | """
134 | return Image.open(io.BytesIO(base64.b64decode(base64_str))).convert("RGB")
135 |
136 |
137 | @app.post("/retrieval")
138 | async def retrieval(request: Request):
139 | """
140 | body: {
141 | "package_name": "com.example.app", # 在哪个app对应的知识库中检索,如果不指定包名就在所有的知识库中检索
142 | "query": "base64 image", # base64编码的图片
143 | "top_k": 1, # 返回的结果数量 Optional
144 | "threshold": 0 # 距离或者相似度的阈值
145 | "similarity": "cosine" or "l2" # 相似度计算方式(目前暂时限定为cosine) Optional TODO:等待后续支持l2
146 | }
147 |
148 | response: {
149 | "results": List[dict[str,Any]] # 返回的结果列表(注意长度可能小于top_k)。{"knowledge": str,"similarity": float,}
150 | }
151 | """
152 | try:
 153 | # Parse the raw JSON from the request
154 | data = await request.json()
155 | query = data.get("query", None)
156 | top_k = data.get("top_k", 1)
157 | package_name = data.get("package_name", None)
158 | threshold = data.get("threshold", 0.9)
 159 | # similarity = data.get("similarity", "l2") # TODO: support l2 later
160 | result = {"results": []}
161 | if query is not None:
162 | memory = __MEMORY["fusion"]
163 | if package_name is not None:
164 | if package_name in __MEMORY:
165 | memory = __MEMORY[package_name]
166 |
167 | if memory is not None:
168 | result["results"] = memory.search(
169 | base64_to_pil(query), top_k, threshold
170 | )
171 | return result
172 | except Exception as e:
173 | raise HTTPException(status_code=500, detail=str(e))
174 |
175 |
176 | @app.post("/retrieval_batch")
177 | async def retrieval_batch(request: Request):
178 | """
 179 | Batch retrieval from the knowledge base.
180 | body: {
181 | "package_name": "com.example.app", # 在哪个app对应的知识库中检索,如果不指定包名就在所有的知识库中检索
182 | "queries": ["base64 image"], # base64编码的图片
183 | "top_k": 1, # 返回的结果数量 Optional
184 | "threshold": 0 # 距离或者相似度的阈值
185 | "similarity": "cosine" or "l2" # 相似度计算方式(目前暂时限定为cosine) Optional TODO:等待后续支持l2
186 | }
187 |
188 | response: {
189 | "results": List[List[dict[str,Any]]] # 返回的结果列表(注意长度可能小于top_k)。{"knowledge": str,"similarity": float,}
190 | }
191 | """
192 | try:
 193 | # Parse the raw JSON from the request
194 | data = await request.json()
195 | queries = data.get("queries", [])
196 | top_k = data.get("top_k", 1)
197 | package_name = data.get("package_name", None)
198 | threshold = data.get("threshold", 0.9)
 199 | # similarity = data.get("similarity", "l2") # TODO: support l2 later
200 | result = {"results": []}
201 | if queries:
202 | memory = __MEMORY["fusion"]
203 | if package_name is not None:
204 | if package_name in __MEMORY:
205 | memory = __MEMORY[package_name]
206 |
207 | if memory is not None:
208 | pil_images = [base64_to_pil(query) for query in queries]
209 | res = memory.search_batch(pil_images, top_k, threshold)
210 | result["results"] = res
211 | return result
212 | except Exception as e:
213 | raise HTTPException(status_code=500, detail=str(e))
214 |
215 |
216 | if __name__ == "__main__":
217 | """
218 | Usage: python -m utils.retrieval
219 | """
220 |
221 | __KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH = os.getenv("KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH")
222 |
223 | if __KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH is not None and os.path.exists(
224 | __KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH
225 | ):
226 | print(f"Using knowledge base at {__KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH}")
227 | else:
228 | print(
229 | f"WARNING: No knowledge base found at {__KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH}, please set KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH in environment variable or .env file"
230 | )
231 | exit(1)
232 |
233 | os.environ["no_proxy"] = "localhost, 127.0.0.1/8, ::1"
234 | print("Retrieval Service")
235 | print("Loading Memory...")
236 | __MEMORY = load_memories(__KNOWLEDGE_BASE_ABSOLUTE_ROOT_PATH)
237 |
238 | print("Fast API is starting")
239 | import uvicorn
240 |
241 | uvicorn.run(app, host="0.0.0.0", port=8769, timeout_graceful_shutdown=3)
242 |
243 | exit(0)
244 |
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | from androguard.util import set_log
2 |
 3 | try:
 4 | set_log("ERROR") # silence androguard's noisy DEBUG output
 5 | except Exception:
 6 | pass
7 |
8 | import subprocess
9 | import time
10 | import re
11 | from androguard.core.apk import APK
12 | import os
13 |
14 | # from dotenv import load_dotenv
15 | import io
16 | import json
17 | from PIL import Image
18 | import uuid
19 | import base64
20 | import hashlib
21 | import cv2
22 |
23 |
24 | # load_dotenv(verbose=True, override=True)
25 |
26 | import requests
27 | import urllib3
28 |
29 | urllib3.disable_warnings()
30 |
31 | import pickle
32 | import zstd
33 |
34 |
35 | def save_object_to_disk(obj: object, file_path: str, compress_level: int = 3):
36 | """将对象序列化为pickle格式并使用Zstandard压缩保存到本地文件
37 | Args:
38 | obj (object): 要保存的对象
39 | file_path (str): 保存文件的路径
40 | compress_level (int): compression level, ultra-fast levels from -100 (ultra) to -1 (fast) available since zstd-1.3.4, and from 1 (fast) to 22 (slowest), 0 or unset - means default (3). Default 3.
41 | """
42 | pickled_data = pickle.dumps(obj)
43 | compressed_data = zstd.compress(pickled_data, compress_level)
44 | with open(file_path, "wb") as file:
45 | file.write(compressed_data)
46 |
47 |
48 | def load_object_from_disk(file_path: str) -> object:
49 | """从本地文件读取Zstandard压缩的pickle数据并反序列化为对象"""
50 | with open(file_path, "rb") as file:
51 | compressed_data = file.read()
52 | pickled_data = zstd.decompress(compressed_data)
53 | return pickle.loads(pickled_data)
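# Round-trip sketch (hypothetical path): save_object_to_disk({"a": 1}, "cache.pkl.zst")
# followed by load_object_from_disk("cache.pkl.zst") returns an equal dict.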
54 |
55 |
56 | from PIL import Image
57 | import numpy as np
58 |
59 |
60 | def resize_pil_image(image: Image.Image, target_max_size: int = 1000) -> Image.Image:
61 | """
62 | Resize a PIL image to fit within a square of target_max_size x target_max_size pixels,
63 | maintaining the aspect ratio.
64 | """
65 | width, height = image.size
66 | if width > height:
67 | new_width = target_max_size
68 | new_height = int((height / width) * target_max_size)
69 | else:
70 | new_height = target_max_size
71 | new_width = int((width / height) * target_max_size)
72 | return image.resize((new_width, new_height), Image.LANCZOS)
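# E.g., a 2000x1000 image is resized to 1000x500, and a 600x800 image to 750x1000.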
73 |
74 |
75 | def resize_ndarray_image(image: np.ndarray, target_max_size: int = 1000) -> np.ndarray:
76 | """
77 | Resize a numpy ndarray image to fit within a square of target_max_size x target_max_size pixels, maintaining the aspect ratio.
78 | """
79 | return np.array(resize_pil_image(Image.fromarray(image), target_max_size))
80 |
81 |
82 | def openai_request(
83 | messages: list,
84 | model: str = "env",
85 | max_retry: int = 5,
86 | timeout: int = 60,
87 | temperature: float = 0.0,
88 | max_tokens: int = 300,
89 | usage: dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0},
90 | ) -> str:
91 | headers = {
92 | "Content-Type": "application/json",
93 | "Authorization": f'Bearer {os.getenv("OPENAI_API_KEY")}',
94 | }
95 | data = {
96 | "model": os.getenv("OPENAI_API_MODEL", model) if model == "env" else model,
97 | "messages": messages,
98 | "max_tokens": max_tokens,
99 | "temperature": temperature,
100 | }
101 | url = (
102 | f"{os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")}/chat/completions"
103 | )
104 | HTTP_PROXY = os.getenv("HTTP_PROXY", None)
105 | proxies = None
106 | # if HTTP_PROXY:
107 | # proxies = {
108 | # "http": HTTP_PROXY,
109 | # "https": HTTP_PROXY,
110 | # }
111 | r = None
112 | for i in range(max_retry + 1):
113 | try:
114 | r = requests.post(
115 | url,
116 | headers=headers,
117 | json=data,
118 | timeout=timeout,
 119 | verify=False, # disable TLS certificate verification
120 | proxies=proxies,
121 | ) # .json()
122 | d = r.json()
123 | content = d.get("choices", [{}])[0].get("message", {})["content"]
124 | usage["prompt_tokens"] += d.get("usage", {}).get("prompt_tokens", 0)
125 | usage["completion_tokens"] += d.get("usage", {}).get("completion_tokens", 0)
126 | return content
127 | except Exception as e:
128 | print(
129 | f"Request failed: {e} , retrying {i+1} of {max_retry} after {(i + 1) ** 3} seconds"
130 | )
131 | if r is not None:
132 | print(r.text)
133 | time.sleep((i + 1) ** 3)
134 | raise Exception(f"Request failed after retrying {max_retry} times")
135 |
136 |
137 | def str_to_md5(input_str: str) -> str:
138 | return hashlib.md5(input_str.encode()).hexdigest().upper()
139 |
140 |
141 | def pil_to_webp_base64(img: Image.Image) -> str:
142 | buffered = io.BytesIO()
143 | img.convert("RGB").save(buffered, format="WEBP", quality=95)
144 | return base64.b64encode(buffered.getvalue()).decode("utf-8")
145 |
146 |
147 | def ndarray_to_webp_base64(img: np.ndarray) -> str:
148 | """
149 | Convert a numpy ndarray image to a base64 encoded string.
150 | """
151 | return pil_to_webp_base64(Image.fromarray(img))
152 |
153 |
154 | def base64_to_pil(base64_str: str) -> Image.Image:
155 | """
156 | Convert a base64 encoded string to a PIL Image.
157 |
158 | Args:
159 | base64_str (str): The base64 string representing the image.
160 |
161 | Returns:
162 | Image.Image: A PIL Image object.
163 | """
164 | return Image.open(io.BytesIO(base64.b64decode(base64_str))).convert("RGB")
165 |
166 |
167 | def cv2_to_pil(cv2_img):
 168 | # Convert the cv2 image from BGR (OpenCV's channel order) to RGB
 169 | cv2_img_rgb = cv2.cvtColor(cv2_img, cv2.COLOR_BGR2RGB)
 170 | # Convert the NumPy array to a PIL Image
171 | pil_img = Image.fromarray(cv2_img_rgb)
172 | return pil_img
173 |
174 |
175 | def safe_decode(byte_data, encoding_list=["utf-8", "gbk"]):
176 | for encoding in encoding_list:
177 | try:
178 | return byte_data.decode(encoding)
179 | except UnicodeDecodeError:
180 | continue
 181 | raise UnicodeError(f"Unable to decode with encodings: {encoding_list}")
182 |
183 |
184 | import ast
185 | import re
186 | import json
187 | from typing import Any, Optional
188 |
189 |
190 | def extract_json(s: str) -> Optional[dict[str, Any]]:
191 | """Extracts the first JSON object found in a string.
192 |
193 | Handles multi-line JSON and JSON embedded within other text.
194 |
195 | Args:
196 | s: A string potentially containing a JSON object.
197 | E.g., "{'hello': 'world'}" (Python-like) or '"key": "value", "boolean": true, "nothing": null' (Standard JSON) or CoT: "let's think step-by-step, ..., { ... json ... } ... more text"
198 |
199 | Returns:
200 | The parsed JSON object as a Python dictionary, or None if no valid
201 | JSON object is found or parsing fails.
202 | """
203 | pattern = r"\{.*\}"
204 | match = re.search(pattern, s, re.DOTALL)
205 | if match:
206 | potential_json_string = match.group()
207 | try:
208 | return json.loads(potential_json_string)
209 | except json.JSONDecodeError as json_error:
210 | # print(
211 | # f"JSON parsing failed ({json_error}), attempting Python literal eval."
212 | # )
213 | try:
214 | return ast.literal_eval(potential_json_string)
215 | except (SyntaxError, ValueError) as eval_error:
216 | print(
217 | f"Python literal eval also failed ({eval_error}), cannot extract dictionary."
218 | )
219 | return None
220 | else:
221 | return None
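# E.g., extract_json('Action: {"action_type": "click", "index": 3}') returns
# {"action_type": "click", "index": 3}; a string without a {...} span returns None.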
222 |
223 |
224 | def get_apk(package_name: str, local_apk_path: str, device_serial: str = None) -> str:
225 | command = "adb "
226 | if device_serial:
227 | command += f" -s {device_serial} "
228 | command += f" shell pm path {package_name}"
229 | apk_path = execute_cmd(command)
230 | if apk_path == "ERROR":
231 | return "ERROR"
232 | apk_path = apk_path.split("package:")[1].strip()
233 | command = "adb "
234 | if device_serial:
235 | command += f" -s {device_serial} "
236 | command += f" pull {apk_path} {local_apk_path}"
237 | return execute_cmd(command)
238 |
239 |
240 | def execute_cmd(command: str, verbose=True) -> str:
241 | result = subprocess.run(command, shell=True, capture_output=True, text=True)
242 | if result.returncode == 0:
243 | return result.stdout.strip()
244 | if verbose:
245 | print(f"Command execution failed: {command}")
246 | print(result.stderr)
247 | return "ERROR"
248 |
249 |
250 | def get_all_devices() -> list:
251 | command = "adb devices"
252 | device_list = []
253 | result = execute_cmd(command)
254 | if result != "ERROR":
255 | devices = result.split("\n")[1:]
256 | for d in devices:
257 | device_list.append(d.split()[0])
258 |
259 | return device_list
260 |
--------------------------------------------------------------------------------
/utils/knowledge_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | 在执行完task后,调用 update_trajectory_to_knowledge 即可,如果需要马上用上新的知识库,可以再继续看看 memory.py
3 | """
4 |
5 | import xml.etree.ElementTree as ET
6 | import subprocess
7 | import time
8 | import re
9 | import os
10 | import re
11 | import glob
12 | import imagehash
13 | import openai
14 |
15 | # from dotenv import load_dotenv
16 | from PIL import Image
17 | import io
18 | import os
19 | import json
20 | import uuid
21 | import base64
22 | import hashlib
23 |
24 | from utils.utils import pil_to_webp_base64, cv2_to_pil
25 | from typing import List, Any, Dict, Union
26 | import cv2
27 | import copy
28 |
 29 | # Whether to clear and regenerate the knowledge base when the app version is upgraded
 30 | EMPTY_KNOWLEDGE_BASE_WHEN_VERSION_UPGRADE = (
 31 | os.getenv("EMPTY_KNOWLEDGE_BASE_WHEN_VERSION_UPGRADE", "False").lower() == "true"
 32 | )
 33 | # Whether to clear and regenerate the knowledge base when the app version is downgraded
 34 | EMPTY_KNOWLEDGE_BASE_WHEN_VERSION_DOWNGRADE = (
 35 | os.getenv("EMPTY_KNOWLEDGE_BASE_WHEN_VERSION_DOWNGRADE", "True").lower() == "true"
 36 | )
37 |
38 |
39 | from utils.memory import KnowledgeStore
40 | from tqdm import tqdm
41 | import math
42 | import operator
43 | from dataclasses import asdict
44 | import numpy as np
45 | from utils.utils import ndarray_to_webp_base64, resize_ndarray_image
46 | from utils.device import UIElement
47 | import imagehash
48 | from PIL import Image
49 | from utils.prompt_templates import KNOWLEDGE_EXTRACTOR
50 | from utils.utils import openai_request
51 | from utils.device import (
52 | _generate_ui_element_description,
53 | add_screenshot_label,
54 | add_ui_element_mark,
55 | )
56 |
57 |
58 | def pil_image_to_phash(pil_image: Image.Image) -> str:
59 | """Convert a PIL Image to a perceptual hash.
60 |
61 | Args:
62 | pil_image (Image.Image): The PIL Image object.
63 |
64 | Returns:
65 | str: The perceptual hash.
66 | """
67 |
68 | return str(imagehash.phash(pil_image, hash_size=16, highfreq_factor=8)).upper()
69 |
70 |
71 | def ndarray_image_to_phash(ndarray_image: np.ndarray) -> str:
72 | """Convert a NumPy ndarray image to a perceptual hash.
73 |
74 | Args:
75 | ndarray_image (np.ndarray): The NumPy ndarray image.
76 |
77 | Returns:
78 | str: The perceptual hash.
79 | """
80 | return pil_image_to_phash(Image.fromarray(ndarray_image))
81 |
82 |
83 | def dot_product(v1: list, v2: list) -> float:
84 | return sum(map(operator.mul, v1, v2))
85 |
86 |
87 | def cosine_similarity(v1: list, v2: list) -> float:
88 | """越接近1越相似"""
89 | prod = dot_product(v1, v2)
90 | len1 = math.sqrt(dot_product(v1, v1))
91 | len2 = math.sqrt(dot_product(v2, v2))
92 | return prod / (len1 * len2)
93 |
94 |
95 | def update_trajectory_to_knowledge(
96 | trajectory_data: list[dict],
97 | locations: list[tuple[str, int]],
98 | fusion_memory: KnowledgeStore,
99 | knowledge_data: dict[str, dict],
100 | usage: dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0},
101 | ) -> None:
102 | """将轨迹数据转换为知识数据,并且更新到知识库(knowledge_data)中。需要在调用完这个函数之后手动保存更新后的knowledge_data,并且重新载入一次fusion memory(因为concat的消息尚未在memory中进行更新)
103 |
104 | Args:
105 | trajectory_data (list[dict]): 轨迹数据
106 | locations (list[tuple[str, int]]): 检索出来的index到knowledge_data位置的映射,val分别是package_name和index
107 | fusion_memory (KnowledgeStore): 知识库
108 | knowledge_data (dict[str, dict[str, Any]]): 知识数据
109 | usage (dict[str, int], optional): 本次调用使用的token数,会在这个函数中更新
110 |
111 | Returns:
112 | None
113 | """
114 | tmp_memory = None
115 |
116 | def is_transition_valid(
117 | before_screenshot: np.ndarray, after_screenshot: np.ndarray
118 | ) -> bool:
119 | """判断两个截图之间的转换是否有效"""
120 | return ndarray_image_to_phash(before_screenshot) != ndarray_image_to_phash(
121 | after_screenshot
122 | )
123 |
124 | for d in tqdm(trajectory_data, ncols=80, leave=False, desc="Updating knowledge"):
125 | before_screenshot = d["before_screenshot"]
126 | after_screenshot = d["after_screenshot"]
127 | if not is_transition_valid(before_screenshot, after_screenshot):
128 | continue
129 | pil_before_screenshot = Image.fromarray(before_screenshot).convert("RGB")
130 | task_description = d["goal"]
131 | numeric_tag_of_element = (
132 | d["converted_action"].index
133 | if hasattr(d["converted_action"], "index")
134 | else None
135 | )
136 | if numeric_tag_of_element is None:
137 | continue
138 | if d["target_element"] is None:
139 | continue
140 | e = UIElement(**d["target_element"])
141 | x_min, y_min, x_max, y_max = (
142 | e.bbox_pixels.x_min,
143 | e.bbox_pixels.y_min,
144 | e.bbox_pixels.x_max,
145 | e.bbox_pixels.y_max,
146 | )
147 | w, h = pil_before_screenshot.size
148 | x_min = int(max(x_min, 0))
149 | y_min = int(max(y_min, 0))
150 | x_max = int(min(x_max, w))
151 | y_max = int(min(y_max, h))
152 | image_patch = pil_before_screenshot.crop((x_min, y_min, x_max, y_max))
153 | logical_screen_size = (w, h)
154 | physical_frame_boundary = (0, 0, w, h)
155 | orientation = 0
156 | add_ui_element_mark(
157 | before_screenshot,
158 | e,
159 | numeric_tag_of_element,
160 | logical_screen_size,
161 | physical_frame_boundary,
162 | orientation,
163 | )
164 | add_screenshot_label(
165 | before_screenshot,
166 | "Before",
167 | )
168 | add_screenshot_label(
169 | after_screenshot,
170 | "After",
171 | )
172 | if tmp_memory is not None:
173 | res = tmp_memory.search(image_patch, k=1, similarity_threshold=0.99)
 174 | if len(res) > 0: # this element patch was already processed in the current trajectory
175 | continue
176 | res = fusion_memory.search(image_patch, k=1, similarity_threshold=0.99)
177 | ui_element_attributes = (
178 | _generate_ui_element_description(e, numeric_tag_of_element)
179 | if d["target_element"] is not None
180 | else "None"
181 | )
182 | action = d["converted_action"].json_str()
183 | package_name = d["top_app_package_name"]
184 | p = KNOWLEDGE_EXTRACTOR.format(
185 | task_description=task_description,
186 | numeric_tag_of_element=numeric_tag_of_element,
187 | ui_element_attributes=ui_element_attributes,
188 | action=action,
189 | )
190 | low_resolution = os.getenv("LOW_RESOLUTION", "False").lower() == "true"
191 | if low_resolution:
192 | before_screenshot = resize_ndarray_image(before_screenshot, 1000)
193 | after_screenshot = resize_ndarray_image(after_screenshot, 1000)
194 | messages = [
195 | {
196 | "role": "user",
197 | "content": [
198 | {
199 | "type": "image_url",
200 | "image_url": {
201 | "url": f"data:image/webp;base64,{ndarray_to_webp_base64(before_screenshot)}",
202 | },
203 | },
204 | {
205 | "type": "image_url",
206 | "image_url": {
207 | "url": f"data:image/webp;base64,{ndarray_to_webp_base64(after_screenshot)}",
208 | },
209 | },
210 | {"type": "text", "text": p},
211 | ],
212 | },
213 | ]
214 | rsp_txt = openai_request(
215 | messages=messages,
216 | temperature=0.0,
217 | max_tokens=1000,
218 | timeout=120,
219 | usage=usage,
220 | )
221 | rsp_txt = rsp_txt.strip()
222 | assert rsp_txt != "", "empty response from MLLM"
223 | if not rsp_txt.endswith("."):
224 | rsp_txt += "."
225 | rsp_txt += " "
226 | if tmp_memory is None:
227 | tmp_memory = KnowledgeStore(
228 | knowledge_items=[{"image": np.array(image_patch), "info": rsp_txt}],
229 | embedding_pipeline=fusion_memory.embedding_pipeline,
230 | )
231 | else:
232 | tmp_memory.add_knowledge_items(
233 | [{"image": np.array(image_patch), "info": rsp_txt}]
234 | )
235 |
236 | add_as_new = True
237 | rsp_txt_embedding = fusion_memory.embedding_pipeline(rsp_txt)
238 | for r in res:
239 | add_as_new = False
240 | idx = r["index"]
241 | txt = r["knowledge"]
242 | txt_embedding = fusion_memory.embedding_pipeline(txt)
243 | similarity = cosine_similarity(rsp_txt_embedding, txt_embedding)
244 | if similarity <= 0.1:
245 | pkg, k_idx = locations[idx]
246 | knowledge_data[pkg]["knowledge"][k_idx]["info"] += rsp_txt
247 | if add_as_new:
248 | d = {
249 | "attrib": asdict(e),
250 | "image": np.array(image_patch),
251 | "info": rsp_txt,
252 | }
253 | if "knowledge" not in knowledge_data[package_name]:
254 | knowledge_data[package_name]["knowledge"] = []
255 | knowledge_data[package_name]["knowledge"].append(d)
256 | fusion_memory.add_knowledge_items([d])
257 | locations.append(
258 | (package_name, len(knowledge_data[package_name]["knowledge"]) - 1)
259 | )
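# Call-site sketch (names hedged; see the exploration script for the actual usage):
# fusion_memory = load_memories(knowledge_base_root)["fusion"]  # from utils.memory
# update_trajectory_to_knowledge(trajectory_data, locations, fusion_memory, knowledge_data)
# ...then persist the updated knowledge_data (e.g., with utils.utils.save_object_to_disk)
# and reload the fusion memory so the newly merged entries take effect.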
260 |
--------------------------------------------------------------------------------
/demo/demo_web_frontend/src/components/ChatPage.vue:
--------------------------------------------------------------------------------
1 |
2 |