├── .gitattributes ├── assets └── performance_venus.png ├── requirements.txt ├── examples ├── trace │ ├── 1e5d64c1163a4fe1cee9fed0a76fdf88.png │ ├── 29cf1a7a36dcbc065e28006ff7a688d7.png │ ├── 5842fa4411caad04cbcd67e514c2b426.png │ ├── 9795177942c1ed0bc654517934ceeb31.png │ ├── a47e1b25d9939b3fc76e9be1acd7bdb6.png │ ├── b04967ff19d08cd4b935fdd6a230152e.png │ ├── b07c500b65336f687eeb3f7ee7ae3eed.png │ └── trace.json ├── grounding_result_format.json └── grounding_meta_format.json ├── vis_androidworld ├── UI-Venus-androidworld.zip ├── vis_androidworld_trace.py └── templates │ └── index.html ├── .gitignore ├── LEGAL.md ├── scripts ├── run_navi_72b.sh ├── run_navi_7b.sh ├── run_gd_72b.sh └── run_gd_7b.sh ├── models ├── navigation │ ├── runner.py │ ├── ui_venus_navi_vllm.py │ ├── utils.py │ └── ui_venus_navi_agent.py └── grounding │ ├── ui_venus_ground_7b.py │ ├── ui_venus_ground_72b.py │ └── eval_screenspot_pro.py ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | vis_androidworld/UI-Venus-androidworld.zip filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /assets/performance_venus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/assets/performance_venus.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.49.0 2 | vllm==0.8.3 3 | qwen_agent 4 | qwen_vl_utils 5 | torch 6 | torchvision 7 | torchaudio -------------------------------------------------------------------------------- /examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png -------------------------------------------------------------------------------- /examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png -------------------------------------------------------------------------------- /examples/trace/5842fa4411caad04cbcd67e514c2b426.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/5842fa4411caad04cbcd67e514c2b426.png -------------------------------------------------------------------------------- /examples/trace/9795177942c1ed0bc654517934ceeb31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/9795177942c1ed0bc654517934ceeb31.png -------------------------------------------------------------------------------- /examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png -------------------------------------------------------------------------------- /examples/trace/b04967ff19d08cd4b935fdd6a230152e.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/b04967ff19d08cd4b935fdd6a230152e.png -------------------------------------------------------------------------------- /examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png -------------------------------------------------------------------------------- /vis_androidworld/UI-Venus-androidworld.zip: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:836345bda4672e80c5ee823eb4651b7da5cd1fbef6e3ff436a1f131adad6d684 3 | size 110696800 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Byte-compiled 5 | __pycache__/ 6 | .cache/ 7 | 8 | # Poetry, setuptools, PyPI distribution artifacts. 9 | /*.egg-info 10 | .eggs/ 11 | build/ 12 | dist/ 13 | poetry.lock 14 | 15 | # Tests 16 | .pytest_cache/ 17 | 18 | # Type checking 19 | .pytype/ 20 | 21 | # Other 22 | *.DS_Store 23 | 24 | # PyCharm 25 | .idea -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /scripts/run_navi_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export PYTHONPATH=. 5 | 6 | model_path='inclusionAI/UI-Venus-Navi-72B' 7 | input_file='examples/trace/trace.json' 8 | output_file='./saved_trace.json' 9 | 10 | python models/navigation/runner.py \ 11 | --max_pixels=12845056 \ 12 | --min_pixels=3136 \ 13 | --model_path="${model_path}" \ 14 | --input_file="${input_file}" \ 15 | --output_file="${output_file}" 16 | -------------------------------------------------------------------------------- /scripts/run_navi_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export PYTHONPATH=.
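# Both Navi launch scripts share the same interface: they invoke
# models/navigation/runner.py on examples/trace/trace.json and write the
# per-step results to saved_trace.json. The --max_pixels / --min_pixels flags
# bound the area each screenshot is resized to before it is passed to the
# model (see ModelConfig in runner.py); this 7B script uses a much smaller
# pixel budget than the 72B script above.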
5 | 6 | 7 | model_path='inclusionAI/UI-Venus-Navi-7B' 8 | input_file='examples/trace/trace.json' 9 | output_file='./saved_trace.json' 10 | 11 | python models/navigation/runner.py \ 12 | --max_pixels=937664 \ 13 | --min_pixels=830000 \ 14 | --model_path="${model_path}" \ 15 | --input_file="${input_file}" \ 16 | --output_file="${output_file}" 17 | -------------------------------------------------------------------------------- /examples/grounding_result_format.json: -------------------------------------------------------------------------------- 1 | { 2 | "details": [ 3 | { 4 | "img_path": "osworld/0FOB4CLBT2.png", 5 | "group": null, 6 | "platform": null, 7 | "application": null, 8 | "lang": "en", 9 | "instruction_style": "instruction", 10 | "prompt_to_evaluate": "Open the filter function for search settings.", 11 | "gt_type": "positive", 12 | "ui_type": null, 13 | "task_filename": "osworld_g", 14 | "pred": [ 15 | 1435.5102040816325, 16 | 339.5089285714286 17 | ], 18 | "raw_response": [ 19 | "[1748,415,1769,430]" 20 | ], 21 | "bbox": [ 22 | 1422, 23 | 326, 24 | 1449, 25 | 354 26 | ], 27 | "correctness": "correct" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /examples/grounding_meta_format.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "img_filename": "pc_ede36f9b-1154-4f76-b7f8-c15d7d3f9b6e.png", 4 | "bbox": [ 5 | 910, 6 | 78, 7 | 954, 8 | 112 9 | ], 10 | "instruction": "close this window", 11 | "application": "windows", 12 | "id": "desktop_0", 13 | "action": "Click the \"X\" button in the top-right corner of the window to close it.", 14 | "decription": "A white \"X\" icon located in the top-right corner of the window.", 15 | "negative_instruction": "save", 16 | "negative_action": "Click on the blue button with the floppy disk icon to save the current file.", 17 | "negative_description": "A small blue button with a floppy disk icon that is located to the left of the \"View\" tab", 18 | "ui_type": "icon", 19 | "platform": "desktop", 20 | "img_size": [ 21 | 960, 22 | 540 23 | ] 24 | } 25 | ] -------------------------------------------------------------------------------- /examples/trace/trace.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "image_path": "examples/trace/5842fa4411caad04cbcd67e514c2b426.png", 5 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 6 | }, 7 | { 8 | "image_path": "examples/trace/9795177942c1ed0bc654517934ceeb31.png", 9 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 10 | }, 11 | { 12 | "image_path": "examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png", 13 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 14 | }, 15 | { 16 | "image_path": "examples/trace/b04967ff19d08cd4b935fdd6a230152e.png", 17 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 18 | }, 19 | { 20 | "image_path": "examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png", 21 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 22 | }, 23 | { 24 | "image_path": "examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png", 25 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 26 | }, 27 | { 28 | "image_path": "examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png", 29 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 30 | } 31 | ] 32 | ] 
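The trace file above is simply a list of traces, where each trace is a list of {"image_path", "task"} steps that models/navigation/runner.py replays one screenshot at a time. As a minimal sketch (not part of the repository), a custom trace.json could be assembled as follows; the screenshot directory, task string, and function name are placeholder values:

```python
import json
from pathlib import Path


def build_trace(screenshot_dir: str, task: str, output_path: str = "my_trace.json") -> None:
    """Write a trace file with the same shape as examples/trace/trace.json:
    a list of traces, each trace being a list of {image_path, task} steps."""
    steps = [
        {"image_path": str(p), "task": task}
        for p in sorted(Path(screenshot_dir).glob("*.png"))
    ]
    # A single trace containing every screenshot; append more inner lists
    # to the outer list to store additional traces.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump([steps], f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    # Placeholder inputs -- point these at your own screenshots and instruction.
    build_trace("my_screenshots", "Open the settings page and enable dark mode")
```

The resulting file can then be passed to the runner via --input_file, exactly as the scripts in scripts/run_navi_*.sh do.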
-------------------------------------------------------------------------------- /scripts/run_gd_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | models=("ui_venus_ground_72b") 6 | for model in "${models[@]}" 7 | do 8 | python models/grounding/eval_screenspot_pro.py \ 9 | --model_type ${model} \ 10 | --screenspot_imgs "ScreenSpot-v2-variants/screenspotv2_image" \ 11 | --screenspot_test "ScreenSpot-v2-variants" \ 12 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 13 | --task "all" \ 14 | --language "en" \ 15 | --gt_type "positive" \ 16 | --log_path "venus_72b_ss2.json" \ 17 | --inst_style "instruction" 18 | 19 | done 20 | 21 | 22 | 23 | models=("ui_venus_ground_72b") 24 | for model in "${models[@]}" 25 | do 26 | python models/grounding/eval_screenspot_pro.py \ 27 | --model_type ${model} \ 28 | --screenspot_imgs "Screenspot-pro/images" \ 29 | --screenspot_test "Screenspot-pro/annotations" \ 30 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 31 | --task "all" \ 32 | --language "en" \ 33 | --gt_type "positive" \ 34 | --log_path "venus_72b_pro.json" \ 35 | --inst_style "instruction" 36 | 37 | done 38 | 39 | 40 | models=("ui_venus_ground_72b") 41 | for model in "${models[@]}" 42 | do 43 | python models/grounding/eval_screenspot_pro.py \ 44 | --model_type ${model} \ 45 | --screenspot_imgs "data/osworld" \ 46 | --screenspot_test "data/osworld_meta" \ 47 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 48 | --task "all" \ 49 | --language "en" \ 50 | --gt_type "positive" \ 51 | --log_path "osworld_g_72b.json" \ 52 | --inst_style "instruction" 53 | 54 | done 55 | 56 | models=("ui_venus_ground_72b") 57 | for model in "${models[@]}" 58 | do 59 | python models/grounding/eval_screenspot_pro.py \ 60 | --model_type ${model} \ 61 | --screenspot_imgs "data/ui_vision/ui-vision/images" \ 62 | --screenspot_test "data/ui_vision/ui-vision/annotations/element_grounding" \ 63 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 64 | --task "all" \ 65 | --language "en" \ 66 | --gt_type "positive" \ 67 | --log_path "vison_72b.json" \ 68 | --inst_style "instruction" 69 | 70 | done 71 | 72 | 73 | models=("ui_venus_ground_72b") 74 | for model in "${models[@]}" 75 | do 76 | python models/grounding/eval_screenspot_pro.py \ 77 | --model_type ${model} \ 78 | --screenspot_imgs "CAGUI/CAGUI_grounding/images/" \ 79 | --screenspot_test "CAGUI/CAGUI_grounding/json_files/" \ 80 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 81 | --task "all" \ 82 | --language "en" \ 83 | --gt_type "positive" \ 84 | --log_path "cpm_72b.json" \ 85 | --inst_style "instruction" 86 | 87 | done 88 | -------------------------------------------------------------------------------- /scripts/run_gd_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | models=("ui_venus_ground_7b") 6 | for model in "${models[@]}" 7 | do 8 | python models/grounding/eval_screenspot_pro.py \ 9 | --model_type ${model} \ 10 | --screenspot_imgs "ScreenSpot-v2-variants/screenspotv2_image" \ 11 | --screenspot_test "ScreenSpot-v2-variants" \ 12 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 13 | --task "all" \ 14 | --language "en" \ 15 | --gt_type "positive" \ 16 | --log_path "venus_7b/venus_7b_ss2.json" \ 17 | --inst_style "instruction" 18 | 19 | done 20 | 21 | 22 | 23 | models=("ui_venus_ground_7b") 24 | for model in "${models[@]}" 25 | do 26 | python 
models/grounding/eval_screenspot_pro.py \ 27 | --model_type ${model} \ 28 | --screenspot_imgs "Screenspot-pro/images" \ 29 | --screenspot_test "Screenspot-pro/annotations" \ 30 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 31 | --task "all" \ 32 | --language "en" \ 33 | --gt_type "positive" \ 34 | --log_path "venus_7b/venus_7b_pro.json" \ 35 | --inst_style "instruction" 36 | 37 | done 38 | 39 | 40 | models=("ui_venus_ground_7b") 41 | for model in "${models[@]}" 42 | do 43 | python models/grounding/eval_screenspot_pro.py \ 44 | --model_type ${model} \ 45 | --screenspot_imgs "data/osworld" \ 46 | --screenspot_test "data/osworld_meta" \ 47 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 48 | --task "all" \ 49 | --language "en" \ 50 | --gt_type "positive" \ 51 | --log_path "venus_7b/osworld_g_7b.json" \ 52 | --inst_style "instruction" 53 | 54 | done 55 | 56 | models=("ui_venus_ground_7b") 57 | for model in "${models[@]}" 58 | do 59 | python models/grounding/eval_screenspot_pro.py \ 60 | --model_type ${model} \ 61 | --screenspot_imgs "data/ui_vision/ui-vision/images" \ 62 | --screenspot_test "data/ui_vision/ui-vision/annotations/element_grounding" \ 63 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 64 | --task "all" \ 65 | --language "en" \ 66 | --gt_type "positive" \ 67 | --log_path "venus_7b/vison_7b.json" \ 68 | --inst_style "instruction" 69 | 70 | done 71 | 72 | 73 | models=("ui_venus_ground_7b") 74 | for model in "${models[@]}" 75 | do 76 | python models/grounding/eval_screenspot_pro.py \ 77 | --model_type ${model} \ 78 | --screenspot_imgs "CAGUI/CAGUI_grounding/images/" \ 79 | --screenspot_test "CAGUI/CAGUI_grounding/json_files/" \ 80 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 81 | --task "all" \ 82 | --language "en" \ 83 | --gt_type "positive" \ 84 | --log_path "venus_7b/cpm_7b.json" \ 85 | --inst_style "instruction" 86 | 87 | done 88 | -------------------------------------------------------------------------------- /models/navigation/runner.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import logging 4 | from dataclasses import dataclass, asdict 5 | 6 | 7 | def read_json(path): 8 | with open(path, 'r', encoding='utf-8') as f: 9 | data = json.load(f) 10 | return data 11 | 12 | def save_json(file, data): 13 | with open(file, 'w') as f: 14 | json.dump(data, f, indent=4, ensure_ascii=False) 15 | 16 | def get_venus_agent(): 17 | from models.navigation.ui_venus_navi_agent import VenusNaviAgent 18 | return VenusNaviAgent 19 | 20 | def setup_logger(name: str = __name__, level: int = logging.INFO) -> logging.Logger: 21 | logger = logging.getLogger(name) 22 | if logger.handlers: 23 | return logger 24 | 25 | logger.setLevel(level) 26 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 27 | 28 | handler = logging.StreamHandler() 29 | handler.setLevel(level) 30 | handler.setFormatter(formatter) 31 | 32 | logger.addHandler(handler) 33 | return logger 34 | 35 | 36 | @dataclass 37 | class ModelConfig: 38 | model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct" 39 | tensor_parallel_size: int = 4 40 | gpu_memory_utilization: float = 0.6 41 | max_tokens: int = 2048 42 | max_pixels: int = 12845056 43 | min_pixels: int = 3136 44 | max_model_len: int = 10000 45 | max_num_seqs: int = 5 46 | temperature: float = 0.0 47 | top_p: float = 1.0 48 | top_k: int = -1 49 | n: int = 1 50 | 51 | def __str__(self): 52 | return f"ModelConfig({', '.join(f'{k}={v}' for 
k, v in asdict(self).items())})" 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--model_path", type=str, default='/root/models/uivenus-7B') 58 | parser.add_argument("--tensor_parallel_size", type=int, default=1) 59 | parser.add_argument("--batch_size", type=int, default=1) 60 | parser.add_argument("--input_file", type=str, default='examples/trace/trace.json') 61 | parser.add_argument("--output_file", type=str, default='./saved_trace.json') 62 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.6) 63 | parser.add_argument("--max_tokens", type=int, default=2048) 64 | parser.add_argument("--max_pixels", type=int, default=12845056) 65 | parser.add_argument("--min_pixels", type=int, default=3136) 66 | parser.add_argument("--max_model_len", type=int, default=128000) 67 | parser.add_argument("--max_num_seqs", type=int, default=5) 68 | parser.add_argument("--temperature", type=float, default=0.0) 69 | parser.add_argument("--n", type=int, default=1) 70 | parser.add_argument("--history_length", type=int, default=0) 71 | 72 | args = parser.parse_args() 73 | 74 | logger = setup_logger("UI-vernus") 75 | 76 | model_config = ModelConfig( 77 | model_path=args.model_path, 78 | tensor_parallel_size=args.tensor_parallel_size, 79 | gpu_memory_utilization=args.gpu_memory_utilization, 80 | max_tokens=args.max_tokens, 81 | max_pixels=args.max_pixels, 82 | min_pixels=args.min_pixels, 83 | max_model_len=args.max_model_len, 84 | max_num_seqs=args.max_num_seqs, 85 | temperature=args.temperature, 86 | n=args.n, 87 | ) 88 | logger.info(f"{model_config}") 89 | 90 | data = read_json(args.input_file) 91 | 92 | try: 93 | VenusNaviAgent = get_venus_agent() 94 | venus_agent = VenusNaviAgent(model_config, logger, args.history_length) 95 | logger.info("VenusNaviAgent initialized successfully") 96 | except Exception as e: 97 | logger.error(f"VenusNaviAgent initialized failed: {e}") 98 | raise 99 | 100 | results = [] 101 | for trace_index, trace in enumerate(data): 102 | for item in trace: 103 | task = item['task'] 104 | image_path = item['image_path'] 105 | action_json = venus_agent.step(task, image_path) 106 | history_record = venus_agent.export_history() 107 | venus_agent.reset() 108 | results.append(history_record) 109 | 110 | save_json(args.output_file, results) 111 | 112 | 113 | if __name__ == "__main__": 114 | main() -------------------------------------------------------------------------------- /models/grounding/ui_venus_ground_7b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor,AutoTokenizer 4 | from transformers.generation import GenerationConfig 5 | 6 | 7 | from qwen_vl_utils import process_vision_info,smart_resize 8 | 9 | 10 | class UI_Venus_Ground_7B(): 11 | def load_model(self, model_name_or_path="/root/ckpt/huggingface/"): 12 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 13 | model_name_or_path, 14 | device_map="cuda", 15 | trust_remote_code=True, 16 | torch_dtype=torch.bfloat16, 17 | attn_implementation="flash_attention_2" 18 | ).eval() 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) 20 | self.processor = AutoProcessor.from_pretrained(model_name_or_path) 21 | 22 | # Setting default generation config 23 | self.generation_config = GenerationConfig.from_pretrained(model_name_or_path, trust_remote_code=True).to_dict() 24 | 
self.set_generation_config( 25 | max_length=2048, 26 | do_sample=False, 27 | temperature=0.0 28 | ) 29 | 30 | def set_generation_config(self, **kwargs): 31 | self.generation_config.update(**kwargs) 32 | self.model.generation_config = GenerationConfig(**self.generation_config) 33 | 34 | 35 | 36 | def inference(self, instruction, image_path): 37 | assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path." 38 | 39 | prompt_origin = 'Outline the position corresponding to the instruction: {}. The output should be only [x1,y1,x2,y2].' 40 | full_prompt = prompt_origin.format(instruction) 41 | 42 | min_pixels = 2000000 43 | max_pixels = 4800000 44 | 45 | messages = [ 46 | { 47 | "role": "user", 48 | "content": [ 49 | { 50 | "type": "image", 51 | "image": image_path, 52 | "min_pixels": min_pixels, 53 | "max_pixels": max_pixels 54 | }, 55 | {"type": "text", "text": full_prompt}, 56 | ], 57 | } 58 | ] 59 | 60 | # Preparation for inference 61 | text = self.processor.apply_chat_template( 62 | messages, tokenize=False, add_generation_prompt=True 63 | ) 64 | image_inputs, video_inputs = process_vision_info(messages) 65 | inputs = self.processor( 66 | text=[text], 67 | images=image_inputs, 68 | videos=video_inputs, 69 | padding=True, 70 | return_tensors="pt", 71 | ) 72 | inputs = inputs.to(self.model.device) 73 | 74 | # Inference: Generation of the output 75 | generated_ids = self.model.generate(**inputs, max_new_tokens=128) 76 | generated_ids_trimmed = [ 77 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 78 | ] 79 | output_text = self.processor.batch_decode( 80 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 81 | ) 82 | 83 | print(output_text) 84 | 85 | input_height = inputs['image_grid_thw'][0][1]*14 86 | input_width = inputs['image_grid_thw'][0][2]*14 87 | 88 | try: 89 | box = eval(output_text[0]) 90 | abs_y1 = float(box[1]/input_height) 91 | abs_x1 = float(box[0]/input_width) 92 | abs_y2 = float(box[3]/input_height) 93 | abs_x2 = float(box[2]/input_width) 94 | box = [abs_x1,abs_y1,abs_x2,abs_y2] 95 | except: 96 | box = [0,0,0,0] 97 | 98 | point = [(box[0]+box[2])/2,(box[1]+box[3])/2] 99 | result_dict = { 100 | "result": "positive", 101 | "format": "x1y1x2y2", 102 | "raw_response": output_text, 103 | "bbox": box, 104 | "point": point 105 | } 106 | 107 | return result_dict 108 | 109 | -------------------------------------------------------------------------------- /models/grounding/ui_venus_ground_72b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | 5 | from transformers.generation import GenerationConfig 6 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor,AutoTokenizer 7 | from qwen_vl_utils import process_vision_info,smart_resize 8 | 9 | 10 | 11 | class UI_Venus_Ground_72B(): 12 | def load_model(self, model_name_or_path="/root/ckpt/huggingface/"): 13 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 14 | model_name_or_path, 15 | device_map="auto", 16 | trust_remote_code=True, 17 | torch_dtype=torch.bfloat16, 18 | attn_implementation="flash_attention_2" 19 | ).eval() 20 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) 21 | self.processor = AutoProcessor.from_pretrained(model_name_or_path) 22 | # Setting default generation config 23 | self.generation_config = 
GenerationConfig.from_pretrained(model_name_or_path, trust_remote_code=True).to_dict() 24 | self.set_generation_config( 25 | max_length=2048, 26 | do_sample=False, 27 | temperature=0.0 28 | ) 29 | 30 | def set_generation_config(self, **kwargs): 31 | self.generation_config.update(**kwargs) 32 | self.model.generation_config = GenerationConfig(**self.generation_config) 33 | 34 | 35 | def inference(self, instruction, image_path): 36 | 37 | assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path." 38 | 39 | prompt_origin = 'Output the bounding box in the image corresponding to the instruction "{}" with grounding. The output should be only [x1,y1,x2,y2].' 40 | full_prompt = prompt_origin.format(instruction) 41 | 42 | 43 | 44 | messages = [ 45 | { 46 | "role": "user", 47 | "content": [ 48 | { 49 | "type": "image", 50 | "image": image_path, 51 | }, 52 | {"type": "text", "text": full_prompt}, 53 | ], 54 | } 55 | ] 56 | # Resolution bounds for smart_resize below. 57 | # (The 7B variant in ui_venus_ground_7b.py uses smaller bounds.) 58 | min_pixels = 3110400 59 | max_pixels = 48000000 60 | text = self.processor.apply_chat_template( 61 | messages, tokenize=False, add_generation_prompt=True 62 | ) 63 | image_inputs, video_inputs = process_vision_info(messages) 64 | new_image_inputs = [] 65 | w, h = 0, 0 66 | for sub_img in image_inputs: 67 | width, height = sub_img.size 68 | resized_height, resized_width = smart_resize( 69 | height, 70 | width, 71 | factor=28, 72 | min_pixels=min_pixels, 73 | max_pixels=max_pixels, 74 | ) 75 | h, w = resized_height, resized_width 76 | sub_img = sub_img.resize((resized_width, resized_height)) 77 | new_image_inputs.append(sub_img) 78 | 79 | inputs = self.processor( 80 | text=[text], 81 | images=new_image_inputs, 82 | videos=video_inputs, 83 | padding=True, 84 | return_tensors="pt", 85 | ) 86 | inputs = inputs.to(self.model.device) 87 | 88 | # Inference: Generation of the output 89 | generated_ids = self.model.generate(**inputs, max_new_tokens=128) 90 | generated_ids_trimmed = [ 91 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 92 | ] 93 | output_text = self.processor.batch_decode( 94 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 95 | ) 96 | 97 | try: 98 | box = eval(output_text[0]) 99 | except: 100 | box = [0,0,0,0] 101 | 102 | 103 | try: 104 | input_width, input_height = w, h 105 | 106 | abs_y1 = float(box[1]/input_height) 107 | abs_x1 = float(box[0]/input_width) 108 | abs_y2 = float(box[3]/input_height) 109 | abs_x2 = float(box[2]/input_width) 110 | box = [abs_x1,abs_y1,abs_x2,abs_y2] 111 | except: 112 | box = [0,0,0,0] 113 | 114 | point = [(box[0]+box[2])/2,(box[1]+box[3])/2] 115 | result_dict = { 116 | "result": "positive", 117 | "format": "x1y1x2y2", 118 | "raw_response": output_text, 119 | "bbox": box, 120 | "point": point 121 | } 122 | 123 | 124 | return result_dict 125 | 126 | -------------------------------------------------------------------------------- /vis_androidworld/vis_androidworld_trace.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, send_from_directory, request 2 | import os 3 | import argparse 4 | from pathlib import Path 5 | from PIL import Image 6 | 7 | app = Flask(__name__) 8 | 9 | 10 | parser = argparse.ArgumentParser(description='Android World visualization') 11 | parser.add_argument('--path', type=str, default="", 12 | help='path of visualization files') 13 |
parser.add_argument('--port', type=int, default=5050, 14 | help='port') 15 | 16 | args = parser.parse_args() 17 | 18 | BASE_DIR = Path(args.path) 19 | 20 | @app.route('/') 21 | def index(): 22 | tasks = [] 23 | for entry in os.scandir(BASE_DIR): 24 | if entry.is_dir(): 25 | task_dir = BASE_DIR / entry.name 26 | status_value, full_status = get_task_status(task_dir) 27 | 28 | emoji = "✅" if status_value == "1.0" or status_value == "1" else "❌" 29 | display_name = f"{entry.name} - {status_value} {emoji}" 30 | 31 | tasks.append({ 32 | "original_name": entry.name, 33 | "display_name": display_name, 34 | "status_value": status_value, 35 | "full_status": full_status 36 | }) 37 | 38 | tasks.sort(key=lambda x: x['original_name']) 39 | 40 | selected_original_name = request.args.get('task', tasks[0]['original_name'] if tasks else None) 41 | 42 | task_data = None 43 | if selected_original_name: 44 | task_data = prepare_task_data(selected_original_name) 45 | 46 | return render_template( 47 | 'index.html', 48 | tasks=tasks, 49 | selected_original_name=selected_original_name, 50 | task_data=task_data 51 | ) 52 | 53 | def get_task_status(task_dir): 54 | """Retrieve the task's status value and return the entire content of the status file.""" 55 | status_file = task_dir / "000000status.txt" 56 | full_status = "Status file not found." 57 | status_value = "?" 58 | 59 | try: 60 | with open(status_file, 'r') as f: 61 | full_status = f.read().strip() 62 | 63 | lines = full_status.split('\n') 64 | if lines: 65 | status_value = lines[0].strip() 66 | 67 | except FileNotFoundError: 68 | pass 69 | except Exception as e: 70 | full_status = f"Failed to read status file:{str(e)}" 71 | 72 | return status_value, full_status 73 | 74 | @app.route('/images//') 75 | def serve_image(task, filename): 76 | """extract images""" 77 | task_dir = BASE_DIR / task 78 | return send_from_directory(task_dir, filename) 79 | 80 | def prepare_task_data(task_name): 81 | """prepare data""" 82 | task_dir = BASE_DIR / task_name 83 | 84 | goal = read_file(task_dir / "000000goal.txt") 85 | 86 | status_value, full_status = get_task_status(task_dir) 87 | 88 | steps = [] 89 | step_files = [] 90 | 91 | for entry in os.scandir(task_dir): 92 | if entry.name.startswith("000000"): 93 | continue 94 | step_files.append(entry.name) 95 | step_files.sort() 96 | 97 | step_groups = {} 98 | for filename in step_files: 99 | prefix = filename.split('_')[0] 100 | step_groups.setdefault(prefix, []).append(filename) 101 | 102 | for prefix, files in sorted(step_groups.items()): 103 | step = { 104 | 'prefix': prefix, 105 | 'image': next((f for f in files if f.endswith('_raw.jpg')), None), 106 | 'thinking': next((f for f in files if f.endswith('_thinking.txt')), None), 107 | 'tool_call': next((f for f in files if f.endswith('_tool_call.txt')), None), 108 | 'conclusion': next((f for f in files if f.endswith('_conclusion.txt')), None), 109 | } 110 | 111 | if step['image']: 112 | image_path = task_dir / step['image'] 113 | try: 114 | with Image.open(image_path) as img: 115 | step['image_width'] = img.width 116 | step['image_height'] = img.height 117 | except Exception as e: 118 | step['image_width'] = None 119 | step['image_height'] = None 120 | else: 121 | step['image_width'] = None 122 | step['image_height'] = None 123 | 124 | for file_type in ['thinking', 'tool_call', 'conclusion']: 125 | if step[file_type]: 126 | content = read_file(task_dir / step[file_type]) 127 | step[file_type + '_content'] = content 128 | step['x'] = None 129 | step['y'] = None 130 | else: 131 | 
step[file_type + '_content'] = "file not found" 132 | step['x'] = None 133 | step['y'] = None 134 | 135 | steps.append(step) 136 | 137 | return { 138 | 'name': task_name, 139 | 'goal': goal, 140 | 'status_value': status_value, 141 | 'full_status': full_status, 142 | 'steps': steps 143 | } 144 | 145 | def read_file(file_path, default="file not found"): 146 | """read file""" 147 | try: 148 | with open(file_path, 'r') as f: 149 | return f.read() 150 | except: 151 | return default 152 | 153 | if __name__ == '__main__': 154 | app.run(host='0.0.0.0', port=args.port, debug=True) -------------------------------------------------------------------------------- /models/navigation/ui_venus_navi_vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Any, Tuple 3 | 4 | from transformers import AutoProcessor 5 | from vllm import LLM, SamplingParams 6 | 7 | from qwen_vl_utils import process_vision_info 8 | 9 | 10 | class NaviVLLM: 11 | def __init__(self, model_config, logger): 12 | """ 13 | Initialize the NaviVLLM model. 14 | 15 | Args: 16 | model_config: Configuration object with model parameters. 17 | logger: Logger instance for logging. 18 | """ 19 | self.logger = logger 20 | self.model_config = model_config 21 | 22 | self.model = LLM( 23 | model=model_config.model_path, 24 | max_model_len=model_config.max_model_len, 25 | max_num_seqs=model_config.max_num_seqs, 26 | tensor_parallel_size=model_config.tensor_parallel_size, 27 | gpu_memory_utilization=model_config.gpu_memory_utilization, 28 | ) 29 | self.processor = AutoProcessor.from_pretrained(model_config.model_path) 30 | self.processor.image_processor.max_pixels = model_config.max_pixels 31 | self.processor.image_processor.min_pixels = model_config.min_pixels 32 | self.sampling_params = SamplingParams( 33 | max_tokens=model_config.max_tokens, 34 | temperature=model_config.temperature, 35 | top_p=model_config.top_p, 36 | top_k=model_config.top_k, 37 | repetition_penalty=1.05, 38 | n=model_config.n, 39 | stop_token_ids=[] 40 | ) 41 | 42 | self.logger.info( 43 | f"SamplingParams: max_tokens={model_config.max_tokens}, " 44 | f"temperature={model_config.temperature}, top_p={model_config.top_p}, " 45 | f"top_k={model_config.top_k}, n={model_config.n}, " 46 | f"stop_token_ids={self.sampling_params.stop_token_ids}" 47 | ) 48 | 49 | def create_message_for_image(self, image: str, problem: str) -> List[Dict[str, Any]]: 50 | return [ 51 | {"role": "system", "content": "You are a helpful assistant."}, 52 | { 53 | "role": "user", 54 | "content": [ 55 | {"type": "text", "text": problem}, 56 | { 57 | "type": "image", 58 | "image": image, 59 | "min_pixels": self.model_config.min_pixels, 60 | "max_pixels": self.model_config.max_pixels, 61 | } 62 | ], 63 | }, 64 | ] 65 | 66 | def _prepare_llm_inputs(self, messages_list: List[List[Dict]]) -> List[Dict]: 67 | """ 68 | Convert messages to vLLM input format with multi-modal data. 69 | 70 | Args: 71 | messages_list: List of message lists (one per sample). 72 | 73 | Returns: 74 | List of dictionaries containing 'prompt' and 'multi_modal_data'. 
75 | """ 76 | prompts = [ 77 | self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) 78 | for msg in messages_list 79 | ] 80 | 81 | image_inputs, _ = process_vision_info(messages_list) 82 | 83 | llm_inputs = [] 84 | for image_input, text in zip(image_inputs, prompts): 85 | mm_data = {"image": image_input} 86 | llm_inputs.append({ 87 | "prompt": text, 88 | "multi_modal_data": mm_data 89 | }) 90 | 91 | return llm_inputs 92 | 93 | def _process_batch( 94 | self, 95 | batch_data: List[Tuple[bytes, str, str]], 96 | ) -> Tuple[List[str], List[str]]: 97 | """ 98 | Process a batch of (image_path, problem) data into vLLM inputs. 99 | 100 | Args: 101 | batch_data: List of tuples (image_path, problem). 102 | 103 | Returns: 104 | List of LLM input dictionaries. 105 | """ 106 | images, problems = zip(*batch_data) 107 | 108 | messages_list = [self.create_message_for_image(img, prob) for img, prob in zip(images, problems)] 109 | 110 | return self._prepare_llm_inputs(messages_list) 111 | 112 | 113 | def __call__(self, data, print_log=False): 114 | """ 115 | Generate responses for a list of (image_path, problem) pairs. 116 | 117 | Args: 118 | data: List of tuples (image_path, problem). 119 | print_log: Whether to log questions and answers. 120 | 121 | Returns: 122 | List[List[str]]: Each inner list contains `n` generated responses. 123 | """ 124 | llm_input_list = self._process_batch(data) 125 | 126 | outputs = self.model.generate(llm_input_list, sampling_params=self.sampling_params) 127 | responses = [] 128 | for output in outputs: 129 | generated_texts = [o.text for o in output.outputs] 130 | responses.append(generated_texts) 131 | 132 | if print_log: 133 | for (image_path, problem), response in zip(data, responses): 134 | self.logger.info(f"Image: {os.path.basename(image_path)}") 135 | self.logger.info(f"Problem: {problem}") 136 | self.logger.info(f"Response: {response[0]}") 137 | self.logger.info("-" * 50) 138 | 139 | return responses -------------------------------------------------------------------------------- /models/navigation/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, Tuple 3 | 4 | 5 | USER_PROMPT = """ 6 | **You are a GUI Agent.** 7 | Your task is to analyze a given user task, review current screenshot and previous actions, and determine the next action to complete the task. 8 | 9 | ### User Task 10 | {user_task} 11 | 12 | ### Previous Actions 13 | {previous_actions} 14 | 15 | ### Available Actions 16 | You may execute one of the following functions: 17 | Click(box=(x1, y1)) 18 | Drag(start=(x1, y1), end=(x2, y2)) 19 | Scroll(start=(x1, y1), end=(x2, y2), direction='down/up/right/left') 20 | Type(content='') 21 | Launch(app='') 22 | Wait() 23 | Finished(content='') 24 | CallUser(content='') 25 | LongPress(box=(x1, y1)) 26 | PressBack() 27 | PressHome() 28 | PressEnter() 29 | PressRecent() 30 | 31 | ### Instruction 32 | - Make sure you understand the task goal to avoid wrong actions. 33 | - Make sure you carefully examine the the current screenshot. Sometimes the summarized history might not be reliable, over-claiming some effects. 34 | - For requests that are questions (or chat messages), remember to use the `CallUser` action to reply to user explicitly before finishing! Then, after you have replied, use the Finished action if the goal is achieved. 35 | - Consider exploring the screen by using the `scroll` action with different directions to reveal additional content. 
36 | - To copy some text: first select the exact text you want to copy, which usually also brings up the text selection bar, then click the `copy` button in bar. 37 | - To paste text into a text box, first long press the text box, then usually the text selection bar will appear with a `paste` button in it. 38 | - You first thinks about the reasoning process in the mind, then provide the action. The reasoning and action are enclosed in and tags respectively. After providing action, summarize your action in tags 39 | """ 40 | 41 | 42 | def parse_coordinates(coord_str: str) -> Optional[Tuple[float, float]]: 43 | if not coord_str: 44 | return None, None 45 | 46 | coord_str_clean = coord_str.replace(" ", "") 47 | match = re.match(r"\(([\d.]+),([\d.]+)\)", coord_str_clean) 48 | if match: 49 | return float(match.group(1)), float(match.group(2)) 50 | 51 | match = re.match(r"\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", coord_str) 52 | if match: 53 | return float(match.group(1)), float(match.group(2)) 54 | 55 | return None, None 56 | 57 | def _split_parameters(params_str: str) -> list: 58 | param_parts = [] 59 | current_part = "" 60 | 61 | in_quotes = False 62 | quote_char = None 63 | bracket_level = 0 64 | 65 | for char in params_str: 66 | if char in ['"', "'"] and not in_quotes: 67 | in_quotes = True 68 | quote_char = char 69 | elif char == quote_char and in_quotes: 70 | in_quotes = False 71 | quote_char = None 72 | 73 | elif not in_quotes: 74 | if char == '(': 75 | bracket_level += 1 76 | elif char == ')': 77 | bracket_level -= 1 78 | elif char == ',' and bracket_level == 0: 79 | param_parts.append(current_part.strip()) 80 | current_part = "" 81 | continue 82 | 83 | current_part += char 84 | 85 | if current_part.strip(): 86 | param_parts.append(current_part.strip()) 87 | 88 | return param_parts 89 | 90 | def parse_answer(action_str: str): 91 | pattern = r"^(\w+)\((.*)\)$" 92 | match = re.match(pattern, action_str.strip(), re.DOTALL) 93 | if not match: 94 | raise ValueError(f"Invalid action_str format: {action_str}") 95 | 96 | action_type = match.group(1) 97 | params_str = match.group(2).strip() 98 | params = {} 99 | 100 | if params_str: 101 | try: 102 | param_pairs = _split_parameters(params_str) 103 | 104 | for pair in param_pairs: 105 | if '=' in pair: 106 | key, value = pair.split("=", 1) 107 | value = value.strip("'").strip() 108 | params[key.strip()] = value 109 | else: 110 | params[pair.strip()] = None 111 | except Exception as e: 112 | print(f"Answer parse error: {e}") 113 | 114 | if action_type == 'Click': 115 | p_x, p_y = parse_coordinates(params.get("box", "")) 116 | if p_x is not None and p_y is not None: 117 | return 'Click', {'box': (p_x, p_y)} 118 | else: 119 | raise ValueError(f"action {action_type} Unknown click params: {repr(params)}") 120 | elif action_type == 'LongPress': 121 | p_x, p_y = parse_coordinates(params.get("box", "")) 122 | if p_x is not None and p_y is not None: 123 | return 'LongPress', {'box': (p_x, p_y)} 124 | else: 125 | raise ValueError(f"action {action_type} Unknown long press params: {repr(params)}") 126 | elif action_type == 'Drag': 127 | p_x, p_y = parse_coordinates(params.get("start", "")) 128 | e_x, e_y = parse_coordinates(params.get("end", "")) 129 | if p_x is not None and p_y is not None and e_x is not None and e_y is not None: 130 | return 'Drag', {'start': (p_x, p_y), 'end': (e_x, e_y)} 131 | else: 132 | raise ValueError(f"action {action_type} Unknown drag params: {repr(params)}") 133 | elif action_type == 'Scroll': 134 | p_x, p_y = 
parse_coordinates(params.get("start", "")) 135 | e_x, e_y = parse_coordinates(params.get("end", "")) 136 | if p_x is not None and p_y is not None and e_x is not None and e_y is not None: 137 | return 'Scroll', {'start': (p_x, p_y), 'end': (e_x, e_y), 'direction': ''} 138 | elif "direction" in params: 139 | direction = params.get("direction") 140 | return 'Scroll', {'start': (), 'end': (), 'direction': direction} 141 | else: 142 | raise ValueError(f"action {action_type} Unknown scroll params: {repr(params)}") 143 | elif action_type == 'Type': 144 | key = 'content' 145 | type_text = params.get(key) 146 | if type_text is not None: 147 | return 'Type', {'content': type_text} 148 | else: 149 | raise ValueError(f"action {action_type} Unknown type params: {repr(params)}") 150 | elif action_type == 'CallUser': 151 | key = 'content' 152 | call_text = params.get(key) 153 | if call_text is not None: 154 | return 'CallUser', {'content': call_text} 155 | else: 156 | raise ValueError(f"action {action_type} Unknown call user params: {repr(params)}") 157 | elif action_type == 'Launch': 158 | app = params.get("app", "") 159 | url = params.get("url", "") 160 | if app is not None: 161 | return 'Launch', {'app': app, 'url': url} 162 | else: 163 | raise ValueError(f"action {action_type} Unknown launch params: {repr(params)}") 164 | elif action_type == 'Finished': 165 | key = 'content' 166 | finished_text = params.get(key, "") 167 | return 'Finished', {'content': finished_text} 168 | elif action_type in ['Wait', 'PressBack', 'PressHome', 'PressEnter', 'PressRecent']: 169 | return action_type, {} 170 | else: 171 | raise ValueError(f"action {action_type} Unknown action: {repr(params)}") 172 | -------------------------------------------------------------------------------- /models/navigation/ui_venus_navi_agent.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple, Any 2 | from dataclasses import dataclass, asdict 3 | from PIL import Image 4 | import logging 5 | 6 | from models.navigation.ui_venus_navi_vllm import NaviVLLM 7 | from qwen_vl_utils import smart_resize 8 | from .utils import parse_answer, USER_PROMPT 9 | 10 | 11 | ACTION_MAPPING = { 12 | 'click', 'drag', 'scroll', 'type', 'launch', 'wait', 'finished', 13 | 'calluser', 'longpress', 'pressback', 'presshome', 'pressenter', 14 | 'pressrecent', 'answer' 15 | } 16 | 17 | 18 | @dataclass 19 | class StepData: 20 | image_path: str 21 | raw_screenshot: Image.Image 22 | query: str 23 | generated_text: str 24 | think: str 25 | action: str 26 | _conclusion: str 27 | action_output_json: Optional[Dict[str, Any]] = None 28 | status: str = 'success' 29 | 30 | def to_dict(self, include_screenshot: bool = False) -> dict: 31 | """ 32 | Convert this step to a JSON-serializable dict. 33 | 34 | Args: 35 | include_screenshot (bool): Whether to include base64-encoded image. 36 | 37 | Returns: 38 | dict: Serializable step data. 
39 | """ 40 | data = asdict(self) 41 | data['raw_screenshot'] = None 42 | 43 | if include_screenshot and self.raw_screenshot is not None: 44 | import base64 45 | from io import BytesIO 46 | buffer = BytesIO() 47 | self.raw_screenshot.save(buffer, format="PNG") 48 | data['raw_screenshot_base64'] = base64.b64encode(buffer.getvalue()).decode("utf-8") 49 | 50 | return data 51 | 52 | 53 | class VenusNaviAgent: 54 | def __init__(self, 55 | model_config, 56 | logger: logging.Logger, 57 | history_length: int = 0) -> None: 58 | self.model = NaviVLLM(model_config=model_config, logger=logger) 59 | self.max_pixels = model_config.max_pixels 60 | self.min_pixels = model_config.min_pixels 61 | self.logger = logger 62 | self.history: List[StepData] = [] 63 | self.history_length = max(0, history_length) 64 | 65 | def reset(self): 66 | self.logger.info(f"Agent Reset") 67 | self.history = [] 68 | 69 | def _build_query(self, goal: str) -> str: 70 | if len(self.history) == 0: 71 | history_str = "" 72 | else: 73 | recent_history = self.history[-self.history_length:] 74 | history_entries = [ 75 | f"Step {i}: {step.think}{step.action}" for i, step in enumerate(recent_history) 76 | ] 77 | history_str = "\n".join(history_entries) 78 | 79 | return USER_PROMPT.format(user_task=goal, previous_actions=history_str) 80 | 81 | def _rescale_coordinate(self, x: float, y: float, orig_size: Tuple[int, int], resized_size: Tuple[int, int]) -> Tuple[int, int]: 82 | o_w, o_h = orig_size 83 | r_w, r_h = resized_size 84 | x_scaled = int(x * o_w / r_w) 85 | y_scaled = int(y * o_h / r_h) 86 | return ( 87 | max(0, min(x_scaled, o_w)), 88 | max(0, min(y_scaled, o_h)) 89 | ) 90 | 91 | def _convert_coordinate(self, action_json: dict, size_params: dict): 92 | orig_size = (size_params['original_width'], size_params['original_height']) 93 | resized_size = (size_params['resized_width'], size_params['resized_height']) 94 | action_type = action_json['action'].lower() 95 | try: 96 | if action_type == 'click' or action_type == 'longpress': 97 | x, y = action_json['params']['box'] 98 | action_json['params']['box'] = self._rescale_coordinate(x, y, orig_size, resized_size) 99 | elif action_type == 'drag': 100 | x1, y1 = action_json['params']['start'] 101 | x2, y2 = action_json['params']['end'] 102 | action_json['params']['start'] = self._rescale_coordinate(x1, y1, orig_size, resized_size) 103 | action_json['params']['end'] = self._rescale_coordinate(x2, y2, orig_size, resized_size) 104 | elif action_type == 'scroll': 105 | if 'start' in action_json['params'] and len(action_json['params']['start']) > 0: 106 | x, y = action_json['params']['start'] 107 | action_json['params']['start'] = self._rescale_coordinate(x, y, orig_size, resized_size) 108 | if 'end' in action_json['params'] and len(action_json['params']['end']) > 0: 109 | x, y = action_json['params']['end'] 110 | action_json['params']['end'] = self._rescale_coordinate(x, y, orig_size, resized_size) 111 | except (KeyError, ValueError, TypeError) as e: 112 | self.logger.warning(f"convert failed: {e}, action_json={action_json}") 113 | 114 | return action_json 115 | 116 | def step(self, goal: str, image_path: str): 117 | self.logger.info(f"----------step {len(self.history) + 1}") 118 | try: 119 | raw_screenshot = Image.open(image_path).convert('RGB') 120 | except Exception as e: 121 | self.logger.error(f"Can't load {image_path}: {e}") 122 | return None 123 | 124 | original_width, original_height = raw_screenshot.size 125 | resized_height, resized_width = smart_resize( 126 | original_height, 
original_width, 127 | min_pixels=self.min_pixels, 128 | max_pixels=self.max_pixels) 129 | 130 | size_params = { 131 | 'original_width': original_width, 132 | 'original_height': original_height, 133 | 'resized_width': resized_width, 134 | 'resized_height': resized_height, 135 | } 136 | 137 | user_query = self._build_query(goal) 138 | generated_text = self.model([(image_path, user_query)])[0][0] 139 | 140 | self.logger.info(f"Goal: {goal}") 141 | self.logger.info(f"USER Query: {repr(user_query)}") 142 | self.logger.info(f"ACTION text: {repr(str(generated_text))}") 143 | 144 | think_text = generated_text.split('')[1].split('')[0].strip('\n') 145 | answer_text = generated_text.split('')[1].split('')[0].strip('\n') 146 | conclusion_text = generated_text.split('')[1].split('')[0].strip('\n') 147 | 148 | self.logger.info(f"Think: {think_text}") 149 | self.logger.info(f"Answer: {answer_text}") 150 | 151 | try: 152 | action_name, action_params = parse_answer(answer_text) 153 | action_json = {'action': action_name, 'params': action_params} 154 | action_json = self._convert_coordinate(action_json, size_params) 155 | except Exception as e: 156 | self.logger.warning(f'Failed to parse_answer: {e}') 157 | step_data = StepData( 158 | image_path=image_path, 159 | raw_screenshot=raw_screenshot, 160 | query=user_query, 161 | generated_text=generated_text, 162 | think=think_text, 163 | action=answer_text, 164 | _conclusion=conclusion_text, 165 | status='failed' 166 | ) 167 | self.history.append(step_data) 168 | return None 169 | 170 | step_data = StepData( 171 | image_path=image_path, 172 | raw_screenshot=raw_screenshot, 173 | query=user_query, 174 | generated_text=generated_text, 175 | think=think_text, 176 | action=answer_text, 177 | _conclusion=conclusion_text, 178 | action_output_json=action_json, 179 | status='success' 180 | ) 181 | self.history.append(step_data) 182 | 183 | self.logger.info(f'Action: {repr(str(action_json))}') 184 | return action_json 185 | 186 | def export_history(self, include_screenshot=False): 187 | serialized_history = [ 188 | step.to_dict(include_screenshot=include_screenshot) 189 | for step in self.history 190 | ] 191 | return serialized_history 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /vis_androidworld/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Android World Visualization 5 | 6 | 195 | 196 | 197 |
198 |

Android World Visualization

199 |
200 | 201 |
202 | 210 | 211 |
212 | 213 | {% if task_data %} 214 |
215 |
Goal: {{ task_data.goal }}
216 |
{{ task_data.full_status }}
217 |
218 | 219 |
220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | {% for step in task_data.steps %} 231 | 232 | 257 | 261 | 265 | 269 | 270 | {% endfor %} 271 | 272 |
ScreenshotThinkActionConclusion
233 | {% if step.image %} 234 |
235 |
step {{ step.prefix }}
236 | 237 |
238 | screenshot 245 | {% if step.x and step.y %} 246 |
249 |
250 | {% endif %} 251 |
252 |
253 | {% else %} 254 |
this step no image
255 | {% endif %} 256 |
258 |
Think
259 |
{{ step.thinking_content }}
260 |
262 |
Action
263 |
{{ step.tool_call_content }}
264 |
266 |
Conclusion
267 |
{{ step.conclusion_content }}
268 |
273 |
274 | {% else %} 275 |
276 |

no data in file

277 |
278 | {% endif %} 279 | 280 | 283 | 284 | 285 | 286 | 287 | 330 | 331 | 338 | 339 | 340 | 349 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌟 UI-Venus: Building High-Performance UI Agents with RFT 2 | 3 | 4 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | [![Report](https://img.shields.io/badge/Report-Technical%20Report-blueviolet?logo=notion)](http://arxiv.org/abs/2508.10833) 6 | [![GitHub](https://img.shields.io/badge/GitHub-Repository-green?logo=github)](https://github.com/inclusionAI/UI-Venus) 7 | [![Hugging Face](https://img.shields.io/badge/Hugging%20Face-Model-orange?logo=huggingface)](https://huggingface.co/collections/inclusionAI/ui-venus-689f2fb01a4234cbce91c56a) 8 | 9 | 10 |

11 | UI-Venus leverages Reinforcement Fine-Tuning (RFT) to achieve state-of-the-art performance in GUI understanding and action prediction across mobile, desktop, and web interfaces. 12 |

13 | 14 | --- 15 | 16 |

17 | 📈 UI-Venus Benchmark Performance 18 |

19 | 20 |

21 | UI-Venus Performance Across Datasets 22 |
23 |

24 | 25 | > **Figure:** Performance of UI-Venus across multiple benchmark datasets. UI-Venus achieves **State-of-the-Art (SOTA)** results on key UI understanding and interaction benchmarks, including **ScreenSpot-Pro**, **ScreenSpot-v2**, **OS-World-G**, **UI-Vision**, and **Android World**. The results demonstrate its superior capability in visual grounding, UI navigation, cross-platform generalization, and complex task reasoning. 26 | 27 | 28 | 29 | --- 30 | # Overview 31 | 32 | * [Key Highlights](#key-highlights) 33 | * [Installation](#installation) 34 | * [Quick Start](#quick-start) 35 | * [Evaluation](#evaluation) 36 | * [Citation](#citation) 37 | 38 | 39 | --- 40 | 41 | 42 | # ✨ Key Highlights 43 | 44 | We introduce **UI-Venus**, a state-of-the-art UI agent that pushes the boundaries of visual grounding and interactive navigation. Below are the core contributions and innovations: 45 | 46 | --- 47 | 48 | ### 💡 Open-Source SOTA UI Agent with Full Toolchain 49 | We develop and open-source **UI-Venus** — a state-of-the-art UI agent trained on **350K high-quality, professionally annotated samples**. 50 | In addition to model checkpoints (7B & 72B), we release: 51 | - 🔹 Full evaluation pipeline 52 | - 🔹 Inference scripts 53 | 54 | 👉 [GitHub Repository](https://github.com/inclusionAI/UI-Venus) | [Hugging Face Model](https://huggingface.co/collections/inclusionAI/ui-venus-689f2fb01a4234cbce91c56a) 55 | 56 | --- 57 | 58 | ### 📏 Action-Level Reward Design for RL-Based Navigation 59 | Beyond grounding, we design **fine-grained, action-wise reward functions** for reinforcement learning in GUI navigation tasks. 60 | Our approach achieves: 61 | - ✅ Competitive performance on **AndroidWorld** 62 | - ✅ Better credit assignment in long-horizon tasks 63 | - ✅ End-to-end learnable action prediction with policy gradients 64 | 65 | This marks a significant step toward **autonomous UI interaction** with precise feedback signals. 66 | 67 | --- 68 | 69 | ### 🧹 Comprehensive Study on UI Data Quality & Cleaning Strategy 70 | We conduct a **systematic analysis of UI interaction data quality**, identifying key challenges in real-world traces. 71 | To improve training signal fidelity, we propose a **three-stage data refinement pipeline**: 72 | 1. **Prompt Rewrite** – Clarify ambiguous user intentions 73 | 2. **Trace Editing** – Fix incorrect or redundant actions 74 | 3. **Trace Generation** – Synthesize missing steps via LLM-augmented modeling 75 | 76 | 📈 Result: RL & Higher-quality data → More robust and generalizable agents. 77 | 78 | 79 | 80 | --- 81 | 82 | ## Installation 83 | 84 | manually install the dependencies: 85 | 86 | ```bash 87 | pip install -r requirements.txt 88 | ``` 89 | 90 | --- 91 | 92 | 93 | 94 | ## Quick Start 95 | 96 | Use the shell scripts to launch the evaluation. The evaluation setup follows the same protocol as **ScreenSpot**, including data format, annotation structure, and metric calculation. 97 | 98 | ### Grounding 99 | - **For 7B model:** 100 | ```bash 101 | bash scripts/run_gd_7b.sh 102 | ``` 103 | 104 | - **For 72B model:** 105 | ```bash 106 | bash scripts/run_gd_72b.sh 107 | ``` 108 | 109 | 🔧 Configuration Required. 
110 | Please set the following paths in your script: 111 | 112 | * `screenspot_imgs`: Folder containing your screenshots (e.g., Screenspot-pro/images) 113 | * `screenspot_test`: Folder containing your all json files (e.g., Screenspot-pro/annotations) 114 | * `model_name_or_path`: Model checkpoint path (e.g., /root/ckpt/huggingface/) 115 | * `log_path`: Output folders 116 | 117 | 118 | ### 📄 Data Format 119 | 120 | We provide example templates for input and output formats: 121 | 122 | - **Input Format (Grounding Annotations):** 123 | `examples/grounding_meta_format.json` — Defines the standard structure of annotation files in `screenspot_test`. 124 | 125 | - **Output Format (Model Predictions):** 126 | `examples/grounding_result_format.json` — Shows the recommended format for model prediction results. 127 | 128 | Please refer to these examples when preparing your data or parsing the evaluation outputs. 129 | 130 | ### Navigation 131 | - **For 7B model:** 132 | ```bash 133 | bash scripts/run_navi_7b.sh 134 | ``` 135 | 136 | - **For 72B model:** 137 | ```bash 138 | bash scripts/run_navi_72b.sh 139 | ``` 140 | 🔧 Configuration Required. 141 | Please set the following paths in your script: 142 | * `model_path`: Path to the model checkpoint (e.g., inclusionAI/UI-Venus-Navi-7B). 143 | * `input_file`: Input JSON file containing navigation tasks (str, default: examples/trace.json). Each entry must include: 144 | - task: The user instruction (string) 145 | - image_path: Screenshot filepath 146 | * `output_file`: Path to save the agent’s execution history (JSON format) 147 | --- 148 | 149 | ## Evaluation 150 | 151 | ### Results on ScreenSpot-v2 152 | 153 | | **Model** | **Mobile Text** | **Mobile Icon** | **Desktop Text** | **Desktop Icon** | **Web Text** | **Web Icon** | **Avg.** | 154 | |--------------------------|-----------------|-----------------|------------------|------------------|--------------|--------------|----------| 155 | | uitars-1.5 | - | - | - | - | - | - | 94.2 | 156 | | Seed-1.5-VL | - | - | - | - | - | - | 95.2 | 157 | | GPT-4o | 26.6 | 24.2 | 24.2 | 19.3 | 12.8 | 11.8 | 20.1 | 158 | | Qwen2.5-VL-7B | 97.6 | 87.2 | 90.2 | 74.2 | 93.2 | 81.3 | 88.8 | 159 | | UI-TARS-7B | 96.9 | 89.1 | 95.4 | 85.0 | 93.6 | 85.2 | 91.6 | 160 | | UI-TARS-72B | 94.8 | 86.3 | 91.2 | 87.9 | 91.5 | 87.7 | 90.3 | 161 | | LPO | 97.9 | 82.9 | 95.9 | 86.4 | 95.6 | 84.2 | 90.5 | 162 | | **UI-Venus-Ground-7B (Ours)** | **99.0** | **90.0** | **97.0** | **90.7** | **96.2** | **88.7** | **94.1** | 163 | | **UI-Venus-Ground-72B (Ours)** | **99.7** | **93.8** | **95.9** | **90.0** | **96.2** | **92.6** | **95.3** | 164 | 165 | --- 166 | 167 | 168 | 169 | ### Results on ScreenSpot-Pro 170 | 171 | Performance comparison of GUI agent models across six task categories on **ScreenSpot-Pro**. 172 | Scores are in percentage (%). `T` = Text, `I` = Icon. 173 | `*`: reproduced; `†`: trained from UI-TARS-1.5-7B. 
174 | 175 | | Model | CAD (T/I) | Dev (T/I) | Creative (T/I) | Scientific (T/I) | Office (T/I) | OS (T/I) | Avg T | Avg I | **Overall** | Type | 176 | |-------|-----------|-----------|----------------|------------------|--------------|---------|--------|--------|------------|------| 177 | | GPT-4o | 2.0 / 0.0 | 1.3 / 0.0 | 1.0 / 0.0 | 2.1 / 0.0 | 1.1 / 0.0 | 0.0 / 0.0 | 1.3 | 0.0 | 0.8 | Closed | 178 | | Claude Computer Use | 14.5 / 3.7 | 22.0 / 3.9 | 25.9 / 3.4 | 33.9 / 15.8 | 30.1 / 16.3 | 11.0 / 4.5 | 23.4 | 7.1 | 17.1 | Closed | 179 | | UI-TARS-1.5 | – / – | – / – | – / – | – / – | – / – | – / – | – | – | **61.6** | Closed | 180 | | Seed1.5-VL | – / – | – / – | – / – | – / – | – / – | – / – | – | – | 60.9 | Closed | 181 | | Qwen2.5-VL-7B\* | 16.8 / 1.6 | 46.8 / 4.1 | 35.9 / 7.7 | 49.3 / 7.3 | 52.5 / 20.8 | 37.4 / 6.7 | 38.9 | 7.1 | 26.8 | SFT | 182 | | Qwen2.5-VL-72B* | 54.8 / 15.6 | 65.6 / 16.6 | 63.1 / 19.6 | 78.5 / 34.5 | 79.1 / 47.2 | 66.4 / 29.2 | 67.3 | 25.0 | 51.2 | SFT | 183 | | UI-TARS-7B | 20.8 / 9.4 | 58.4 / 12.4 | 50.0 / 9.1 | 63.9 / 31.8 | 63.3 / 20.8 | 30.8 / 16.9 | 47.8 | 16.2 | 35.7 | SFT | 184 | | UI-TARS-72B | 18.8 / 12.5 | 62.9 / 17.2 | 57.1 / 15.4 | 64.6 / 20.9 | 63.3 / 26.4 | 42.1 / 15.7 | 50.9 | 17.6 | 38.1 | SFT | 185 | | Phi-Ground-7B | 26.9 / 17.2 | 70.8 / 16.7 | 56.6 / 13.3 | 58.0 / 29.1 | 76.4 / 44.0 | 55.1 / 25.8 | 56.4 | 21.8 | 43.2 | RL | 186 | | UI-TARS-1.5-7B | – / – | – / – | – / – | – / – | – / – | – / – | – | – | 49.6 | RL | 187 | | GTA1-7B† | 53.3 / 17.2 | 66.9 / 20.7 | 62.6 / 18.2 | 76.4 / 31.8 | 82.5 / 50.9 | 48.6 / 25.9 | 65.5 | 25.2 | 50.1 | RL | 188 | | GTA1-72B | 56.9 / 28.1 | 79.9 / 33.1 | 73.2 / 20.3 | 81.9 / 38.2 | 85.3 / 49.1 | 73.8 / 39.1 | 74.5 | 32.5 | 58.4 | RL | 189 | | **UI-Venus-Ground-7B** | 60.4 / 21.9 | 74.7 / 24.1 | 63.1 / 14.7 | 76.4 / 31.8 | 75.7 / 41.5 | 49.5 / 22.5 | 67.1 | 24.3 | **50.8** | Ours (RL) | 190 | | **UI-Venus-Ground-72B** | 66.5 / 29.7 | 84.4 / 33.1 | 73.2 / 30.8 | 84.7 / 42.7 | 83.1 / 60.4 | 75.7 / 36.0 | 77.4 | 36.8 | **61.9** | Ours (RL) | 191 | 192 | > 🔝 **Experimental results show that UI-Venus-Ground-72B achieves state-of-the-art performance on ScreenSpot-Pro with an average score of 61.9, while also setting new benchmarks on ScreenSpot-v2(95.3), OSWorld_G(70.4), AgentCPM(85), and UI-Vision(36.8), highlighting its effectiveness in complex visual grounding and action prediction tasks.** 193 | 194 | ### Results on AndroidWorld 195 | This is the compressed package of validation trajectories for **AndroidWorld**, including execution logs and navigation paths. 196 | 📥 Download: [UI-Venus-androidworld.zip](vis_androidworld/UI-Venus-androidworld.zip) 197 | 198 | | Models | With Planner | A11y Tree | Screenshot | Success Rate (pass@1) | 199 | |--------|--------------|-----------|------------|------------------------| 200 | | **Closed-source Models** | | | | | 201 | | GPT-4o| ❌ | ✅ | ❌ | 30.6 | 202 | | ScaleTrack| ❌ | ✅ | ❌ | 44.0 | 203 | | SeedVL-1.5 | ❌ | ✅ | ✅ | 62.1 | 204 | | UI-TARS-1.5 | ❌ | ❌ | ✅ | 64.2 | 205 | | **Open-source Models** | | | | | 206 | | GUI-Critic-R1-7B | ❌ | ✅ | ✅ | 27.6 | 207 | | Qwen2.5-VL-72B* | ❌ | ❌ | ✅ | 35.0 | 208 | | UGround | ✅ | ❌ | ✅ | 44.0 | 209 | | Aria-UI | ✅ | ❌ | ✅ | 44.8 | 210 | | UI-TARS-72B | ❌ | ❌ | ✅ | 46.6 | 211 | | GLM-4.5v | ❌ | ❌ | ✅ | 57.0 | 212 | | **Ours** | | | | | 213 | | UI-Venus-Navi-7B | ❌ | ❌ | ✅ | **49.1** | 214 | | UI-Venus-Navi-72B | ❌ | ❌ | ✅ | **65.9** | 215 | 216 | > **Table:** Performance comparison on **AndroidWorld** for end-to-end models. 
Our UI-Venus-Navi-72B achieves state-of-the-art performance, outperforming all baseline methods across different settings. 217 | 218 | 219 | ### Results on AndroidControl and GUI-Odyssey 220 | 221 | | Models | AndroidControl-Low
Type Acc. | AndroidControl-Low
Step SR | AndroidControl-High
Type Acc. | AndroidControl-High
Step SR | GUI-Odyssey
Type Acc. | GUI-Odyssey
Step SR | 222 | |--------|-------------------------------|-----------------------------|-------------------------------|-----------------------------|------------------------|----------------------| 223 | | **Closed-source Models** | | | | | | | 224 | | GPT-4o | 74.3 | 19.4 | 66.3 | 20.8 | 34.3 | 3.3 | 225 | | **Open Source Models** | | | | | | | 226 | | Qwen2.5-VL-7B | 94.1 | 85.0 | 75.1 | 62.9 | 59.5 | 46.3 | 227 | | SeeClick | 93.0 | 75.0 | 82.9 | 59.1 | 71.0 | 53.9 | 228 | | OS-Atlas-7B | 93.6 | 85.2 | 85.2 | 71.2 | 84.5 | 62.0 | 229 | | Aguvis-7B| - | 80.5 | - | 61.5 | - | - | 230 | | Aguvis-72B| - | 84.4 | - | 66.4 | - | - | 231 | | OS-Genesis-7B | 90.7 | 74.2 | 66.2 | 44.5 | - | - | 232 | | UI-TARS-7B| 98.0 | 90.8 | 83.7 | 72.5 | 94.6 | 87.0 | 233 | | UI-TARS-72B| **98.1** | 91.3 | 85.2 | 74.7 | **95.4** | **88.6** | 234 | | GUI-R1-7B| 85.2 | 66.5 | 71.6 | 51.7 | 65.5 | 38.8 | 235 | | NaviMaster-7B | 85.6 | 69.9 | 72.9 | 54.0 | - | - | 236 | | UI-AGILE-7B | 87.7 | 77.6 | 80.1 | 60.6 | - | - | 237 | | AgentCPM-GUI | 94.4 | 90.2 | 77.7 | 69.2 | 90.0 | 75.0 | 238 | | **Ours** | | | | | | | 239 | | UI-Venus-Navi-7B | 97.1 | 92.4 | **86.5** | 76.1 | 87.3 | 71.5 | 240 | | UI-Venus-Navi-72B | 96.7 | **92.9** | 85.9 | **77.2** | 87.2 | 72.4 | 241 | 242 | > **Table:** Performance comparison on offline UI navigation datasets including AndroidControl and GUI-Odyssey. Note that models with * are reproduced. 243 | 244 | 245 | # Citation 246 | Please consider citing if you find our work useful: 247 | ```plain 248 | @misc{gu2025uivenustechnicalreportbuilding, 249 | title={UI-Venus Technical Report: Building High-performance UI Agents with RFT}, 250 | author={Zhangxuan Gu and Zhengwen Zeng and Zhenyu Xu and Xingran Zhou and Shuheng Shen and Yunfei Liu and Beitong Zhou and Changhua Meng and Tianyu Xia and Weizhi Chen and Yue Wen and Jingya Dou and Fei Tang and Jinzhen Lin and Yulin Liu and Zhenlin Guo and Yichen Gong and Heng Jia and Changlong Gao and Yuan Guo and Yong Deng and Zhenyu Guo and Liang Chen and Weiqiang Wang}, 251 | year={2025}, 252 | eprint={2508.10833}, 253 | archivePrefix={arXiv}, 254 | primaryClass={cs.CV}, 255 | url={https://arxiv.org/abs/2508.10833}, 256 | } 257 | ``` 258 | -------------------------------------------------------------------------------- /models/grounding/eval_screenspot_pro.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | 4 | import torch 5 | import json 6 | import re 7 | import argparse 8 | import os 9 | from PIL import Image 10 | import logging 11 | from tqdm import tqdm 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | torch.manual_seed(114514) 16 | 17 | GT_TYPES = ['positive', 'negative'] 18 | INSTRUCTION_STYLES = ['instruction', 'action', 'description'] 19 | LANGUAGES = ['en', 'cn'] 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--model_type', type=str, required=True) 24 | parser.add_argument('--model_name_or_path', type=str, required=False) 25 | parser.add_argument('--screenspot_imgs', type=str, required=True) 26 | parser.add_argument('--screenspot_test', type=str, required=True) 27 | parser.add_argument('--task', type=str, required=True) 28 | parser.add_argument('--inst_style', type=str, required=True, choices=INSTRUCTION_STYLES + ['all'], help="Instruction style to use.") 29 | parser.add_argument('--language', type=str, required=True, choices=LANGUAGES + ['all'], default='en', help="Language to use.") 30 | 
parser.add_argument('--gt_type', type=str, required=True, choices=GT_TYPES + ['all'], help="Ground truth type: 'positive' or 'negative'.") 31 | parser.add_argument('--log_path', type=str, required=True) 32 | 33 | args = parser.parse_args() 34 | return args 35 | 36 | def build_model(args): 37 | model_type = args.model_type 38 | model_name_or_path = args.model_name_or_path 39 | 40 | 41 | if model_type == "ui_venus_ground_7b": 42 | from ui_venus_ground_7b import UI_Venus_Ground_7B 43 | model = UI_Venus_Ground_7B() 44 | if args.model_name_or_path: 45 | model.load_model(model_name_or_path=model_name_or_path) 46 | else: 47 | model.load_model() 48 | 49 | elif model_type == "ui_venus_ground_72b": 50 | from ui_venus_ground_72b import UI_Venus_Ground_72B 51 | model = UI_Venus_Ground_72B() 52 | if args.model_name_or_path: 53 | model.load_model(model_name_or_path=model_name_or_path) 54 | else: 55 | model.load_model() 56 | 57 | else: 58 | raise ValueError(f"Unsupported model type {model_type}.") 59 | model.set_generation_config(temperature=0, max_new_tokens=256) 60 | return model 61 | 62 | def collect_results_to_eval(results, platform=None, group=None, application=None, language=None, gt_type=None, instruction_style=None, ui_type=None): 63 | """ 64 | Filters the results based on provided values. None means include all (ignore filtering this attribute). 65 | 66 | Parameters: 67 | results (list): A list of dictionaries containing sample results. 68 | 69 | Returns: 70 | list: A filtered list of dictionaries based on the given criteria. 71 | """ 72 | filtered_results = [] 73 | 74 | for sample in results: 75 | # Check each filter condition; if None, consider it as passed 76 | if (platform is None or sample.get("platform") == platform) and \ 77 | (group is None or sample.get("group") == group) and \ 78 | (application is None or sample.get("application") == application) and \ 79 | (language is None or sample.get("language") == language) and \ 80 | (gt_type is None or sample.get("gt_type") == gt_type) and \ 81 | (instruction_style is None or sample.get("instruction_style") == instruction_style) and \ 82 | (ui_type is None or sample.get("ui_type") == ui_type): 83 | filtered_results.append(sample) 84 | 85 | return filtered_results 86 | 87 | 88 | def make_combinations(results, platform=False, group=None, application=False, language=False, gt_type=False, instruction_style=False, ui_type=False): 89 | """ 90 | Returns a list of combinations of values for attributes where the corresponding parameter is set to True. 
91 | """ 92 | # Initialize a dictionary to store unique values for each attribute 93 | unique_values = { 94 | "platform": set(), 95 | "group": set(), 96 | "application": set(), 97 | "language": set(), 98 | "gt_type": set(), 99 | "instruction_style": set(), 100 | "ui_type": set(), 101 | } 102 | 103 | # Collect unique values from the results 104 | for sample in results: 105 | if platform: 106 | unique_values["platform"].add(sample.get("platform")) 107 | if group: 108 | unique_values["group"].add(sample.get("group")) 109 | if application: 110 | unique_values["application"].add(sample.get("application")) 111 | if language: 112 | unique_values["language"].add(sample.get("language")) 113 | if gt_type: 114 | unique_values["gt_type"].add(sample.get("gt_type")) 115 | if instruction_style: 116 | unique_values["instruction_style"].add(sample.get("instruction_style")) 117 | if ui_type: 118 | unique_values["ui_type"].add(sample.get("ui_type")) 119 | 120 | # Filter out the attributes that are set to False (no need for combinations) 121 | filtered_values = {key: list(value) for key, value in unique_values.items() if value} 122 | if not filtered_values: 123 | return [] 124 | 125 | # Generate all combinations of the selected attributes using itertools.product 126 | attribute_combinations = list(itertools.product(*filtered_values.values())) 127 | 128 | # Convert combinations into dictionaries with corresponding attribute names 129 | combinations = [] 130 | for combination in attribute_combinations: 131 | combinations.append(dict(zip(filtered_values.keys(), combination))) 132 | 133 | return combinations 134 | 135 | 136 | def calc_metric_for_result_list(results): 137 | """Calculates the metrics for a simple result list.""" 138 | num_total = len(results) 139 | correct_num = sum(1 for res in results if res["correctness"] == "correct") 140 | wrong_format_num = sum(1 for res in results if res["correctness"] == "wrong_format") 141 | 142 | # Calculate text and icon specific metrics using collect_results_to_eval 143 | text_results = collect_results_to_eval(results, ui_type="text") 144 | icon_results = collect_results_to_eval(results, ui_type="icon") 145 | 146 | text_correct = sum(1 for res in text_results if res["correctness"] == "correct") 147 | text_total = len(text_results) 148 | icon_correct = sum(1 for res in icon_results if res["correctness"] == "correct") 149 | icon_total = len(icon_results) 150 | metrics = { 151 | "num_correct_action": correct_num, 152 | "num_total": num_total, 153 | "wrong_format_num": wrong_format_num, 154 | "action_acc": correct_num / num_total if num_total > 0 else 0, 155 | "text_acc": text_correct / text_total if text_total > 0 else 0, 156 | "icon_acc": icon_correct / icon_total if icon_total > 0 else 0 157 | } 158 | return metrics 159 | 160 | 161 | def eval_sample_positive_gt(sample, response): 162 | bbox = sample["bbox"] 163 | bbox = [bbox[0], bbox[1], bbox[2], bbox[3]] # x1, y1, x2, y2 164 | # bbox = [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] # x1, y1, w, h 165 | img_size = sample["img_size"] 166 | bbox = [bbox[0] / img_size[0], bbox[1] / img_size[1], bbox[2] / img_size[0], bbox[3] / img_size[1]] 167 | 168 | click_point = response["point"] # may be none 169 | print(click_point) 170 | if click_point is None: 171 | return "wrong_format" 172 | # Check if the predicted point falls in the ground truth box 173 | if (bbox[0] <= click_point[0] <= bbox[2]) and (bbox[1] <= click_point[1] <= bbox[3]): 174 | return "correct" 175 | else: 176 | return "wrong" 177 | 178 | def 
eval_sample_negative_gt(sample, response): 179 | if response["result"] == "negative": 180 | return "correct" 181 | elif response["result"] == "positive": 182 | return "wrong" 183 | else: ## response["result"] == wrong_format 184 | return "wrong_format" 185 | 186 | def evaluate_fine_grained(results): 187 | # Generate all combinations of platform, instruction_style, and gt_type 188 | combinations = make_combinations( 189 | results, 190 | platform=True, 191 | application=True, 192 | instruction_style=True, 193 | gt_type=True 194 | ) 195 | 196 | evaluation_result = {} 197 | 198 | # Iterate through each combination 199 | for combo in combinations: 200 | platform = combo.get("platform") 201 | application = combo.get("application") 202 | inst_style = combo.get("instruction_style") 203 | gt_type = combo.get("gt_type") 204 | 205 | # Filter results for the current combination 206 | filtered_results = collect_results_to_eval( 207 | results=results, 208 | platform=platform, 209 | application=application, 210 | instruction_style=inst_style, 211 | gt_type=gt_type 212 | ) 213 | 214 | # Calculate metrics using the calc_metric_for_result_list function 215 | metrics = calc_metric_for_result_list(filtered_results) 216 | if metrics['num_total'] == 0: 217 | continue 218 | 219 | # Construct a unique key based on the combination 220 | key = f"plat:{platform} app:{application} inst_style:{inst_style} gt_type:{gt_type}" 221 | evaluation_result[key] = metrics 222 | 223 | return evaluation_result 224 | 225 | def evaluate_seeclick_paper_style(results): 226 | # Generate all combinations of platform, instruction_style, and gt_type 227 | combinations = make_combinations( 228 | results, 229 | platform=True, 230 | instruction_style=True, 231 | gt_type=True 232 | ) 233 | 234 | evaluation_result = {} 235 | 236 | # Iterate through each combination 237 | for combo in combinations: 238 | platform = combo.get("platform") 239 | inst_style = combo.get("instruction_style") 240 | gt_type = combo.get("gt_type") 241 | 242 | # Filter results for the current combination 243 | filtered_results = collect_results_to_eval( 244 | results=results, 245 | platform=platform, 246 | instruction_style=inst_style, 247 | gt_type=gt_type 248 | ) 249 | 250 | # Calculate metrics using the calc_metric_for_result_list function 251 | metrics = calc_metric_for_result_list(filtered_results) 252 | if metrics['num_total'] == 0: 253 | continue 254 | 255 | # Construct a unique key based on the combination 256 | key = f"plat:{platform} inst_style:{inst_style} gt_type:{gt_type}" 257 | evaluation_result[key] = metrics 258 | 259 | return evaluation_result 260 | 261 | def evaluate_leaderboard_detailed_style(results): 262 | # Generate all combinations of platform, instruction_style, and gt_type 263 | combinations = make_combinations( 264 | results, 265 | application=True, 266 | ) 267 | 268 | evaluation_result = {} 269 | 270 | # Iterate through each combination 271 | for combo in combinations: 272 | application = combo.get("application") 273 | 274 | # Filter results for the current combination 275 | filtered_results = collect_results_to_eval( 276 | results=results, 277 | application=application, 278 | ) 279 | 280 | # Calculate metrics using the calc_metric_for_result_list function 281 | metrics = calc_metric_for_result_list(filtered_results) 282 | if metrics['num_total'] == 0: 283 | continue 284 | 285 | # Construct a unique key based on the combination 286 | key = f"app:{application}" 287 | evaluation_result[key] = metrics 288 | 289 | return evaluation_result 290 | 
291 | def evaluate_leaderboard_simple_style(results): 292 | # Generate all combinations of platform, instruction_style, and gt_type 293 | combinations = make_combinations( 294 | results, 295 | group=True, 296 | ) 297 | 298 | evaluation_result = {} 299 | 300 | # Iterate through each combination 301 | for combo in combinations: 302 | group = combo.get("group") 303 | 304 | # Filter results for the current combination 305 | filtered_results = collect_results_to_eval( 306 | results=results, 307 | group=group, 308 | ) 309 | 310 | # Calculate metrics using the calc_metric_for_result_list function 311 | metrics = calc_metric_for_result_list(filtered_results) 312 | if metrics['num_total'] == 0: 313 | continue 314 | 315 | # Construct a unique key based on the combination 316 | key = f"group:{group}" 317 | evaluation_result[key] = metrics 318 | 319 | return evaluation_result 320 | 321 | def evaluate_overall(results): 322 | """ 323 | Evaluates the overall metrics for all results without any filtering. 324 | 325 | Parameters: 326 | results (list): A list of dictionaries containing sample results. 327 | 328 | Returns: 329 | dict: A dictionary containing the overall metrics. 330 | """ 331 | # Calculate metrics for the entire result set 332 | metrics = calc_metric_for_result_list(results) 333 | 334 | return metrics 335 | 336 | 337 | def evaluate(results): 338 | """Collect results and calculate metrics. You can comment out function calls or add new ones based on your need. 339 | """ 340 | result_report = { 341 | "details": [], # Store detailed information for each sample 342 | "metrics": {} 343 | } 344 | 345 | # TODO: comment out function calls based on your need 346 | result_report["metrics"]["fine_grained"] = evaluate_fine_grained(results) 347 | result_report["metrics"]["seeclick_style"] = evaluate_seeclick_paper_style(results) 348 | result_report["metrics"]["leaderboard_simple_style"] = evaluate_leaderboard_simple_style(results) 349 | result_report["metrics"]["leaderboard_detailed_style"] = evaluate_leaderboard_detailed_style(results) 350 | result_report["metrics"]["overall"] = evaluate_overall(results) 351 | 352 | # Save detailed results 353 | result_report["details"] = results 354 | 355 | return result_report 356 | 357 | def main(args): 358 | model = build_model(args) 359 | print("Load model success") 360 | 361 | if args.task == "all": 362 | task_filenames = [ 363 | os.path.splitext(f)[0] 364 | for f in os.listdir(args.screenspot_test) 365 | if f.endswith(".json") 366 | ] 367 | else: 368 | task_filenames = args.task.split(",") 369 | 370 | if args.inst_style == "all": 371 | inst_styles = INSTRUCTION_STYLES 372 | else: 373 | inst_styles = args.inst_style.split(",") 374 | 375 | if args.language == "all": 376 | languages = LANGUAGES 377 | else: 378 | languages = args.language.split(",") 379 | 380 | if args.gt_type == "all": 381 | gt_types = GT_TYPES 382 | else: 383 | gt_types = args.gt_type.split(",") 384 | 385 | tasks_to_run = [] 386 | for task_filename in task_filenames: 387 | dataset = task_filename + ".json" 388 | with open(os.path.join(args.screenspot_test, dataset), 'r') as f: 389 | task_data = json.load(f) 390 | 391 | # Create the list of tasks to run, one item as an instance. Tasks may be reused. 
392 | for inst_style in inst_styles: # Expand tasks based on user configurations 393 | for gt_type in gt_types: 394 | for lang in languages: 395 | for task_instance in task_data: 396 | task_instance = copy.deepcopy(task_instance) 397 | task_instance["task_filename"] = task_filename 398 | task_instance["gt_type"] = gt_type 399 | task_instance["instruction_style"] = inst_style 400 | task_instance["language"] = lang 401 | if lang == "cn": 402 | if inst_style!= 'instruction' or gt_type != 'positive': 403 | # TODO: Translate the data 404 | raise AttributeError("Only positive samples and 'instruction' style are supported for Chinese instructions.") 405 | task_instance["prompt_to_evaluate"] = task_instance["instruction_cn"] 406 | elif lang == "en": 407 | task_instance["prompt_to_evaluate"] = task_instance["instruction"] 408 | 409 | tasks_to_run.append(task_instance) 410 | print(f"Num of sample in {task_filename}: {len(task_data)} * {len(inst_styles)} * {len(gt_types)} * {len(languages)} = {len(task_data) * len(inst_styles) * len(gt_types) * len(languages)}") 411 | print(f"Total tasks: {len(tasks_to_run)}") 412 | 413 | results = [] 414 | for sample in tqdm(tasks_to_run): 415 | filename = sample["img_filename"] 416 | img_path = os.path.join(args.screenspot_imgs, filename) 417 | 418 | 419 | response = model.inference(instruction=sample["prompt_to_evaluate"], image_path=img_path) 420 | 421 | point = response["point"] 422 | tmp_img = Image.open(img_path) 423 | img_size = tmp_img.size 424 | sample["img_size"] = img_size 425 | point_in_pixel = [point[0] * img_size[0], point[1] * img_size[1]] if point else None 426 | 427 | sample_result = { 428 | "img_path": img_path, 429 | "group": sample["group"] if "group" in sample else None, 430 | "platform": sample["platform"] if "platform" in sample else None, 431 | "application": sample["application"] if "application" in sample else None, 432 | "lang": sample["language"] if "language" in sample else None, 433 | "instruction_style": sample["instruction_style"] if "instruction_style" in sample else None, 434 | "prompt_to_evaluate": sample["prompt_to_evaluate"] if "prompt_to_evaluate" in sample else None, 435 | "gt_type": sample["gt_type"] if "gt_type" in sample else 'positive', 436 | "ui_type": sample["ui_type"] if "ui_type" in sample else None, 437 | "task_filename": sample["task_filename"], 438 | "pred": point_in_pixel, 439 | "raw_response": response["raw_response"] 440 | } 441 | 442 | if sample["gt_type"] == "positive": 443 | correctness = eval_sample_positive_gt(sample, response) 444 | sample_result.update({ 445 | "bbox": sample["bbox"], 446 | }) 447 | elif sample["gt_type"] == "negative": 448 | correctness = eval_sample_negative_gt(sample, response) 449 | else: 450 | raise ValueError("Wrong instruction type") 451 | 452 | print(correctness, point, sample['bbox']) 453 | sample_result.update({ 454 | "correctness": correctness, 455 | }) 456 | results.append(sample_result) 457 | 458 | result_report = evaluate(results) 459 | os.makedirs(os.path.dirname(args.log_path), exist_ok=True) 460 | with open(args.log_path, 'w') as f: 461 | json.dump(result_report, f, indent=4) 462 | logging.info("Evaluation of ScreenSpot finished.") 463 | 464 | 465 | if __name__ == "__main__": 466 | main(parse_args()) 467 | --------------------------------------------------------------------------------