├── .gitattributes ├── assets └── performance_venus.png ├── requirements.txt ├── examples ├── trace │ ├── 1e5d64c1163a4fe1cee9fed0a76fdf88.png │ ├── 29cf1a7a36dcbc065e28006ff7a688d7.png │ ├── 5842fa4411caad04cbcd67e514c2b426.png │ ├── 9795177942c1ed0bc654517934ceeb31.png │ ├── a47e1b25d9939b3fc76e9be1acd7bdb6.png │ ├── b04967ff19d08cd4b935fdd6a230152e.png │ ├── b07c500b65336f687eeb3f7ee7ae3eed.png │ └── trace.json ├── grounding_result_format.json └── grounding_meta_format.json ├── vis_androidworld ├── UI-Venus-androidworld.zip ├── vis_androidworld_trace.py └── templates │ └── index.html ├── .gitignore ├── LEGAL.md ├── scripts ├── run_navi_72b.sh ├── run_navi_7b.sh ├── run_gd_72b.sh └── run_gd_7b.sh ├── models ├── navigation │ ├── runner.py │ ├── ui_venus_navi_vllm.py │ ├── utils.py │ └── ui_venus_navi_agent.py └── grounding │ ├── ui_venus_ground_7b.py │ ├── ui_venus_ground_72b.py │ └── eval_screenspot_pro.py ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | vis_androidworld/UI-Venus-androidworld.zip filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /assets/performance_venus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/assets/performance_venus.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.49.0 2 | vllm==0.8.3 3 | qwen_agent 4 | qwen_vl_utils 5 | torch 6 | torchvision 7 | torchaudio -------------------------------------------------------------------------------- /examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png -------------------------------------------------------------------------------- /examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png -------------------------------------------------------------------------------- /examples/trace/5842fa4411caad04cbcd67e514c2b426.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/5842fa4411caad04cbcd67e514c2b426.png -------------------------------------------------------------------------------- /examples/trace/9795177942c1ed0bc654517934ceeb31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/9795177942c1ed0bc654517934ceeb31.png -------------------------------------------------------------------------------- /examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png -------------------------------------------------------------------------------- /examples/trace/b04967ff19d08cd4b935fdd6a230152e.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/b04967ff19d08cd4b935fdd6a230152e.png -------------------------------------------------------------------------------- /examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inclusionAI/UI-Venus/HEAD/examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png -------------------------------------------------------------------------------- /vis_androidworld/UI-Venus-androidworld.zip: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:836345bda4672e80c5ee823eb4651b7da5cd1fbef6e3ff436a1f131adad6d684 3 | size 110696800 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Byte-compiled 5 | __pycache__/ 6 | .cache/ 7 | 8 | # Poetry, setuptools, PyPI distribution artifacts. 9 | /*.egg-info 10 | .eggs/ 11 | build/ 12 | dist/ 13 | poetry.lock 14 | 15 | # Tests 16 | .pytest_cache/ 17 | 18 | # Type checking 19 | .pytype/ 20 | 21 | # Other 22 | *.DS_Store 23 | 24 | # PyCharm 25 | .idea -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /scripts/run_navi_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export PYTHONPATH=. 5 | 6 | model_path='inclusionAI/UI-Venus-Navi-72B' 7 | input_file='examples/trace/trace.json' 8 | output_file='./saved_trace.json' 9 | 10 | python models/navigation/runner.py \ 11 | --max_pixels=12845056 \ 12 | --min_pixels=3136 \ 13 | --model_path="${model_path}" \ 14 | --input_file="${input_file}" \ 15 | --output_file="${output_file}" 16 | -------------------------------------------------------------------------------- /scripts/run_navi_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | export PYTHONPATH=.
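# Both Navi launch scripts share the same interface: they invoke
# models/navigation/runner.py on examples/trace/trace.json and write the
# per-step results to saved_trace.json. The --max_pixels / --min_pixels flags
# bound the area each screenshot is resized to before it is passed to the
# model (see ModelConfig in runner.py); this 7B script uses a much smaller
# pixel budget than the 72B script above.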
5 | 6 | 7 | model_path='inclusionAI/UI-Venus-Navi-7B' 8 | input_file='examples/trace/trace.json' 9 | output_file='./saved_trace.json' 10 | 11 | python models/navigation/runner.py \ 12 | --max_pixels=937664 \ 13 | --min_pixels=830000 \ 14 | --model_path="${model_path}" \ 15 | --input_file="${input_file}" \ 16 | --output_file="${output_file}" 17 | -------------------------------------------------------------------------------- /examples/grounding_result_format.json: -------------------------------------------------------------------------------- 1 | { 2 | "details": [ 3 | { 4 | "img_path": "osworld/0FOB4CLBT2.png", 5 | "group": null, 6 | "platform": null, 7 | "application": null, 8 | "lang": "en", 9 | "instruction_style": "instruction", 10 | "prompt_to_evaluate": "Open the filter function for search settings.", 11 | "gt_type": "positive", 12 | "ui_type": null, 13 | "task_filename": "osworld_g", 14 | "pred": [ 15 | 1435.5102040816325, 16 | 339.5089285714286 17 | ], 18 | "raw_response": [ 19 | "[1748,415,1769,430]" 20 | ], 21 | "bbox": [ 22 | 1422, 23 | 326, 24 | 1449, 25 | 354 26 | ], 27 | "correctness": "correct" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /examples/grounding_meta_format.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "img_filename": "pc_ede36f9b-1154-4f76-b7f8-c15d7d3f9b6e.png", 4 | "bbox": [ 5 | 910, 6 | 78, 7 | 954, 8 | 112 9 | ], 10 | "instruction": "close this window", 11 | "application": "windows", 12 | "id": "desktop_0", 13 | "action": "Click the \"X\" button in the top-right corner of the window to close it.", 14 | "decription": "A white \"X\" icon located in the top-right corner of the window.", 15 | "negative_instruction": "save", 16 | "negative_action": "Click on the blue button with the floppy disk icon to save the current file.", 17 | "negative_description": "A small blue button with a floppy disk icon that is located to the left of the \"View\" tab", 18 | "ui_type": "icon", 19 | "platform": "desktop", 20 | "img_size": [ 21 | 960, 22 | 540 23 | ] 24 | } 25 | ] -------------------------------------------------------------------------------- /examples/trace/trace.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "image_path": "examples/trace/5842fa4411caad04cbcd67e514c2b426.png", 5 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 6 | }, 7 | { 8 | "image_path": "examples/trace/9795177942c1ed0bc654517934ceeb31.png", 9 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 10 | }, 11 | { 12 | "image_path": "examples/trace/1e5d64c1163a4fe1cee9fed0a76fdf88.png", 13 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 14 | }, 15 | { 16 | "image_path": "examples/trace/b04967ff19d08cd4b935fdd6a230152e.png", 17 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 18 | }, 19 | { 20 | "image_path": "examples/trace/29cf1a7a36dcbc065e28006ff7a688d7.png", 21 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 22 | }, 23 | { 24 | "image_path": "examples/trace/b07c500b65336f687eeb3f7ee7ae3eed.png", 25 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 26 | }, 27 | { 28 | "image_path": "examples/trace/a47e1b25d9939b3fc76e9be1acd7bdb6.png", 29 | "task": "在夸克浏览器中,查看云图片中的\"壁纸_3.jpg\"的详细信息,并记住文件名,文件大小和文件ID,并以空格分隔符分隔,输出" 30 | } 31 | ] 32 | ] 
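The trace file above is simply a list of traces, where each trace is a list of {"image_path", "task"} steps that models/navigation/runner.py replays one screenshot at a time. As a minimal sketch (not part of the repository), a custom trace.json could be assembled as follows; the screenshot directory, task string, and function name are placeholder values:

```python
import json
from pathlib import Path


def build_trace(screenshot_dir: str, task: str, output_path: str = "my_trace.json") -> None:
    """Write a trace file with the same shape as examples/trace/trace.json:
    a list of traces, each trace being a list of {image_path, task} steps."""
    steps = [
        {"image_path": str(p), "task": task}
        for p in sorted(Path(screenshot_dir).glob("*.png"))
    ]
    # A single trace containing every screenshot; append more inner lists
    # to the outer list to store additional traces.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump([steps], f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    # Placeholder inputs -- point these at your own screenshots and instruction.
    build_trace("my_screenshots", "Open the settings page and enable dark mode")
```

The resulting file can then be passed to the runner via --input_file, exactly as the scripts in scripts/run_navi_*.sh do.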
-------------------------------------------------------------------------------- /scripts/run_gd_72b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | models=("ui_venus_ground_72b") 6 | for model in "${models[@]}" 7 | do 8 | python models/grounding/eval_screenspot_pro.py \ 9 | --model_type ${model} \ 10 | --screenspot_imgs "ScreenSpot-v2-variants/screenspotv2_image" \ 11 | --screenspot_test "ScreenSpot-v2-variants" \ 12 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 13 | --task "all" \ 14 | --language "en" \ 15 | --gt_type "positive" \ 16 | --log_path "venus_72b_ss2.json" \ 17 | --inst_style "instruction" 18 | 19 | done 20 | 21 | 22 | 23 | models=("ui_venus_ground_72b") 24 | for model in "${models[@]}" 25 | do 26 | python models/grounding/eval_screenspot_pro.py \ 27 | --model_type ${model} \ 28 | --screenspot_imgs "Screenspot-pro/images" \ 29 | --screenspot_test "Screenspot-pro/annotations" \ 30 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 31 | --task "all" \ 32 | --language "en" \ 33 | --gt_type "positive" \ 34 | --log_path "venus_72b_pro.json" \ 35 | --inst_style "instruction" 36 | 37 | done 38 | 39 | 40 | models=("ui_venus_ground_72b") 41 | for model in "${models[@]}" 42 | do 43 | python models/grounding/eval_screenspot_pro.py \ 44 | --model_type ${model} \ 45 | --screenspot_imgs "data/osworld" \ 46 | --screenspot_test "data/osworld_meta" \ 47 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 48 | --task "all" \ 49 | --language "en" \ 50 | --gt_type "positive" \ 51 | --log_path "osworld_g_72b.json" \ 52 | --inst_style "instruction" 53 | 54 | done 55 | 56 | models=("ui_venus_ground_72b") 57 | for model in "${models[@]}" 58 | do 59 | python models/grounding/eval_screenspot_pro.py \ 60 | --model_type ${model} \ 61 | --screenspot_imgs "data/ui_vision/ui-vision/images" \ 62 | --screenspot_test "data/ui_vision/ui-vision/annotations/element_grounding" \ 63 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 64 | --task "all" \ 65 | --language "en" \ 66 | --gt_type "positive" \ 67 | --log_path "vison_72b.json" \ 68 | --inst_style "instruction" 69 | 70 | done 71 | 72 | 73 | models=("ui_venus_ground_72b") 74 | for model in "${models[@]}" 75 | do 76 | python models/grounding/eval_screenspot_pro.py \ 77 | --model_type ${model} \ 78 | --screenspot_imgs "CAGUI/CAGUI_grounding/images/" \ 79 | --screenspot_test "CAGUI/CAGUI_grounding/json_files/" \ 80 | --model_name_or_path "inclusionAI/UI-Venus-Ground-72B" \ 81 | --task "all" \ 82 | --language "en" \ 83 | --gt_type "positive" \ 84 | --log_path "cpm_72b.json" \ 85 | --inst_style "instruction" 86 | 87 | done 88 | -------------------------------------------------------------------------------- /scripts/run_gd_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | models=("ui_venus_ground_7b") 6 | for model in "${models[@]}" 7 | do 8 | python models/grounding/eval_screenspot_pro.py \ 9 | --model_type ${model} \ 10 | --screenspot_imgs "ScreenSpot-v2-variants/screenspotv2_image" \ 11 | --screenspot_test "ScreenSpot-v2-variants" \ 12 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 13 | --task "all" \ 14 | --language "en" \ 15 | --gt_type "positive" \ 16 | --log_path "venus_7b/venus_7b_ss2.json" \ 17 | --inst_style "instruction" 18 | 19 | done 20 | 21 | 22 | 23 | models=("ui_venus_ground_7b") 24 | for model in "${models[@]}" 25 | do 26 | python 
models/grounding/eval_screenspot_pro.py \ 27 | --model_type ${model} \ 28 | --screenspot_imgs "Screenspot-pro/images" \ 29 | --screenspot_test "Screenspot-pro/annotations" \ 30 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 31 | --task "all" \ 32 | --language "en" \ 33 | --gt_type "positive" \ 34 | --log_path "venus_7b/venus_7b_pro.json" \ 35 | --inst_style "instruction" 36 | 37 | done 38 | 39 | 40 | models=("ui_venus_ground_7b") 41 | for model in "${models[@]}" 42 | do 43 | python models/grounding/eval_screenspot_pro.py \ 44 | --model_type ${model} \ 45 | --screenspot_imgs "data/osworld" \ 46 | --screenspot_test "data/osworld_meta" \ 47 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 48 | --task "all" \ 49 | --language "en" \ 50 | --gt_type "positive" \ 51 | --log_path "venus_7b/osworld_g_7b.json" \ 52 | --inst_style "instruction" 53 | 54 | done 55 | 56 | models=("ui_venus_ground_7b") 57 | for model in "${models[@]}" 58 | do 59 | python models/grounding/eval_screenspot_pro.py \ 60 | --model_type ${model} \ 61 | --screenspot_imgs "data/ui_vision/ui-vision/images" \ 62 | --screenspot_test "data/ui_vision/ui-vision/annotations/element_grounding" \ 63 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 64 | --task "all" \ 65 | --language "en" \ 66 | --gt_type "positive" \ 67 | --log_path "venus_7b/vison_7b.json" \ 68 | --inst_style "instruction" 69 | 70 | done 71 | 72 | 73 | models=("ui_venus_ground_7b") 74 | for model in "${models[@]}" 75 | do 76 | python models/grounding/eval_screenspot_pro.py \ 77 | --model_type ${model} \ 78 | --screenspot_imgs "CAGUI/CAGUI_grounding/images/" \ 79 | --screenspot_test "CAGUI/CAGUI_grounding/json_files/" \ 80 | --model_name_or_path "inclusionAI/UI-Venus-Ground-7B" \ 81 | --task "all" \ 82 | --language "en" \ 83 | --gt_type "positive" \ 84 | --log_path "venus_7b/cpm_7b.json" \ 85 | --inst_style "instruction" 86 | 87 | done 88 | -------------------------------------------------------------------------------- /models/navigation/runner.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import logging 4 | from dataclasses import dataclass, asdict 5 | 6 | 7 | def read_json(path): 8 | with open(path, 'r', encoding='utf-8') as f: 9 | data = json.load(f) 10 | return data 11 | 12 | def save_json(file, data): 13 | with open(file, 'w') as f: 14 | json.dump(data, f, indent=4, ensure_ascii=False) 15 | 16 | def get_venus_agent(): 17 | from models.navigation.ui_venus_navi_agent import VenusNaviAgent 18 | return VenusNaviAgent 19 | 20 | def setup_logger(name: str = __name__, level: int = logging.INFO) -> logging.Logger: 21 | logger = logging.getLogger(name) 22 | if logger.handlers: 23 | return logger 24 | 25 | logger.setLevel(level) 26 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 27 | 28 | handler = logging.StreamHandler() 29 | handler.setLevel(level) 30 | handler.setFormatter(formatter) 31 | 32 | logger.addHandler(handler) 33 | return logger 34 | 35 | 36 | @dataclass 37 | class ModelConfig: 38 | model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct" 39 | tensor_parallel_size: int = 4 40 | gpu_memory_utilization: float = 0.6 41 | max_tokens: int = 2048 42 | max_pixels: int = 12845056 43 | min_pixels: int = 3136 44 | max_model_len: int = 10000 45 | max_num_seqs: int = 5 46 | temperature: float = 0.0 47 | top_p: float = 1.0 48 | top_k: int = -1 49 | n: int = 1 50 | 51 | def __str__(self): 52 | return f"ModelConfig({', '.join(f'{k}={v}' for 
k, v in asdict(self).items())})" 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--model_path", type=str, default='/root/models/uivenus-7B') 58 | parser.add_argument("--tensor_parallel_size", type=int, default=1) 59 | parser.add_argument("--batch_size", type=int, default=1) 60 | parser.add_argument("--input_file", type=str, default='examples/trace/trace.json') 61 | parser.add_argument("--output_file", type=str, default='./saved_trace.json') 62 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.6) 63 | parser.add_argument("--max_tokens", type=int, default=2048) 64 | parser.add_argument("--max_pixels", type=int, default=12845056) 65 | parser.add_argument("--min_pixels", type=int, default=3136) 66 | parser.add_argument("--max_model_len", type=int, default=128000) 67 | parser.add_argument("--max_num_seqs", type=int, default=5) 68 | parser.add_argument("--temperature", type=float, default=0.0) 69 | parser.add_argument("--n", type=int, default=1) 70 | parser.add_argument("--history_length", type=int, default=0) 71 | 72 | args = parser.parse_args() 73 | 74 | logger = setup_logger("UI-vernus") 75 | 76 | model_config = ModelConfig( 77 | model_path=args.model_path, 78 | tensor_parallel_size=args.tensor_parallel_size, 79 | gpu_memory_utilization=args.gpu_memory_utilization, 80 | max_tokens=args.max_tokens, 81 | max_pixels=args.max_pixels, 82 | min_pixels=args.min_pixels, 83 | max_model_len=args.max_model_len, 84 | max_num_seqs=args.max_num_seqs, 85 | temperature=args.temperature, 86 | n=args.n, 87 | ) 88 | logger.info(f"{model_config}") 89 | 90 | data = read_json(args.input_file) 91 | 92 | try: 93 | VenusNaviAgent = get_venus_agent() 94 | venus_agent = VenusNaviAgent(model_config, logger, args.history_length) 95 | logger.info("VenusNaviAgent initialized successfully") 96 | except Exception as e: 97 | logger.error(f"VenusNaviAgent initialized failed: {e}") 98 | raise 99 | 100 | results = [] 101 | for trace_index, trace in enumerate(data): 102 | for item in trace: 103 | task = item['task'] 104 | image_path = item['image_path'] 105 | action_json = venus_agent.step(task, image_path) 106 | history_record = venus_agent.export_history() 107 | venus_agent.reset() 108 | results.append(history_record) 109 | 110 | save_json(args.output_file, results) 111 | 112 | 113 | if __name__ == "__main__": 114 | main() -------------------------------------------------------------------------------- /models/grounding/ui_venus_ground_7b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor,AutoTokenizer 4 | from transformers.generation import GenerationConfig 5 | 6 | 7 | from qwen_vl_utils import process_vision_info,smart_resize 8 | 9 | 10 | class UI_Venus_Ground_7B(): 11 | def load_model(self, model_name_or_path="/root/ckpt/huggingface/"): 12 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 13 | model_name_or_path, 14 | device_map="cuda", 15 | trust_remote_code=True, 16 | torch_dtype=torch.bfloat16, 17 | attn_implementation="flash_attention_2" 18 | ).eval() 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) 20 | self.processor = AutoProcessor.from_pretrained(model_name_or_path) 21 | 22 | # Setting default generation config 23 | self.generation_config = GenerationConfig.from_pretrained(model_name_or_path, trust_remote_code=True).to_dict() 24 | 
self.set_generation_config( 25 | max_length=2048, 26 | do_sample=False, 27 | temperature=0.0 28 | ) 29 | 30 | def set_generation_config(self, **kwargs): 31 | self.generation_config.update(**kwargs) 32 | self.model.generation_config = GenerationConfig(**self.generation_config) 33 | 34 | 35 | 36 | def inference(self, instruction, image_path): 37 | assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path." 38 | 39 | prompt_origin = 'Outline the position corresponding to the instruction: {}. The output should be only [x1,y1,x2,y2].' 40 | full_prompt = prompt_origin.format(instruction) 41 | 42 | min_pixels = 2000000 43 | max_pixels = 4800000 44 | 45 | messages = [ 46 | { 47 | "role": "user", 48 | "content": [ 49 | { 50 | "type": "image", 51 | "image": image_path, 52 | "min_pixels": min_pixels, 53 | "max_pixels": max_pixels 54 | }, 55 | {"type": "text", "text": full_prompt}, 56 | ], 57 | } 58 | ] 59 | 60 | # Preparation for inference 61 | text = self.processor.apply_chat_template( 62 | messages, tokenize=False, add_generation_prompt=True 63 | ) 64 | image_inputs, video_inputs = process_vision_info(messages) 65 | inputs = self.processor( 66 | text=[text], 67 | images=image_inputs, 68 | videos=video_inputs, 69 | padding=True, 70 | return_tensors="pt", 71 | ) 72 | inputs = inputs.to(self.model.device) 73 | 74 | # Inference: Generation of the output 75 | generated_ids = self.model.generate(**inputs, max_new_tokens=128) 76 | generated_ids_trimmed = [ 77 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 78 | ] 79 | output_text = self.processor.batch_decode( 80 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 81 | ) 82 | 83 | print(output_text) 84 | 85 | input_height = inputs['image_grid_thw'][0][1]*14 86 | input_width = inputs['image_grid_thw'][0][2]*14 87 | 88 | try: 89 | box = eval(output_text[0]) 90 | abs_y1 = float(box[1]/input_height) 91 | abs_x1 = float(box[0]/input_width) 92 | abs_y2 = float(box[3]/input_height) 93 | abs_x2 = float(box[2]/input_width) 94 | box = [abs_x1,abs_y1,abs_x2,abs_y2] 95 | except: 96 | box = [0,0,0,0] 97 | 98 | point = [(box[0]+box[2])/2,(box[1]+box[3])/2] 99 | result_dict = { 100 | "result": "positive", 101 | "format": "x1y1x2y2", 102 | "raw_response": output_text, 103 | "bbox": box, 104 | "point": point 105 | } 106 | 107 | return result_dict 108 | 109 | -------------------------------------------------------------------------------- /models/grounding/ui_venus_ground_72b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | 5 | from transformers.generation import GenerationConfig 6 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor,AutoTokenizer 7 | from qwen_vl_utils import process_vision_info,smart_resize 8 | 9 | 10 | 11 | class UI_Venus_Ground_72B(): 12 | def load_model(self, model_name_or_path="/root/ckpt/huggingface/"): 13 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 14 | model_name_or_path, 15 | device_map="auto", 16 | trust_remote_code=True, 17 | torch_dtype=torch.bfloat16, 18 | attn_implementation="flash_attention_2" 19 | ).eval() 20 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) 21 | self.processor = AutoProcessor.from_pretrained(model_name_or_path) 22 | # Setting default generation config 23 | self.generation_config = 
GenerationConfig.from_pretrained(model_name_or_path, trust_remote_code=True).to_dict() 24 | self.set_generation_config( 25 | max_length=2048, 26 | do_sample=False, 27 | temperature=0.0 28 | ) 29 | 30 | def set_generation_config(self, **kwargs): 31 | self.generation_config.update(**kwargs) 32 | self.model.generation_config = GenerationConfig(**self.generation_config) 33 | 34 | 35 | def inference(self, instruction, image_path): 36 | 37 | assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path." 38 | 39 | prompt_origin = 'Output the bounding box in the image corresponding to the instruction "{}" with grounding. The output should be only [x1,y1,x2,y2].' 40 | full_prompt = prompt_origin.format(instruction) 41 | 42 | 43 | 44 | messages = [ 45 | { 46 | "role": "user", 47 | "content": [ 48 | { 49 | "type": "image", 50 | "image": image_path, 51 | }, 52 | {"type": "text", "text": full_prompt}, 53 | ], 54 | } 55 | ] 56 | # Resolution bounds for smart_resize below. 57 | # (The 7B variant in ui_venus_ground_7b.py uses smaller bounds.) 58 | min_pixels = 3110400 59 | max_pixels = 48000000 60 | text = self.processor.apply_chat_template( 61 | messages, tokenize=False, add_generation_prompt=True 62 | ) 63 | image_inputs, video_inputs = process_vision_info(messages) 64 | new_image_inputs = [] 65 | w, h = 0, 0 66 | for sub_img in image_inputs: 67 | width, height = sub_img.size 68 | resized_height, resized_width = smart_resize( 69 | height, 70 | width, 71 | factor=28, 72 | min_pixels=min_pixels, 73 | max_pixels=max_pixels, 74 | ) 75 | h, w = resized_height, resized_width 76 | sub_img = sub_img.resize((resized_width, resized_height)) 77 | new_image_inputs.append(sub_img) 78 | 79 | inputs = self.processor( 80 | text=[text], 81 | images=new_image_inputs, 82 | videos=video_inputs, 83 | padding=True, 84 | return_tensors="pt", 85 | ) 86 | inputs = inputs.to(self.model.device) 87 | 88 | # Inference: Generation of the output 89 | generated_ids = self.model.generate(**inputs, max_new_tokens=128) 90 | generated_ids_trimmed = [ 91 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 92 | ] 93 | output_text = self.processor.batch_decode( 94 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False 95 | ) 96 | 97 | try: 98 | box = eval(output_text[0]) 99 | except: 100 | box = [0,0,0,0] 101 | 102 | 103 | try: 104 | input_width, input_height = w, h 105 | 106 | abs_y1 = float(box[1]/input_height) 107 | abs_x1 = float(box[0]/input_width) 108 | abs_y2 = float(box[3]/input_height) 109 | abs_x2 = float(box[2]/input_width) 110 | box = [abs_x1,abs_y1,abs_x2,abs_y2] 111 | except: 112 | box = [0,0,0,0] 113 | 114 | point = [(box[0]+box[2])/2,(box[1]+box[3])/2] 115 | result_dict = { 116 | "result": "positive", 117 | "format": "x1y1x2y2", 118 | "raw_response": output_text, 119 | "bbox": box, 120 | "point": point 121 | } 122 | 123 | 124 | return result_dict 125 | 126 | -------------------------------------------------------------------------------- /vis_androidworld/vis_androidworld_trace.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, send_from_directory, request 2 | import os 3 | import argparse 4 | from pathlib import Path 5 | from PIL import Image 6 | 7 | app = Flask(__name__) 8 | 9 | 10 | parser = argparse.ArgumentParser(description='Android World visualization') 11 | parser.add_argument('--path', type=str, default="", 12 | help='path of visualization files') 13 |
parser.add_argument('--port', type=int, default=5050, 14 | help='port') 15 | 16 | args = parser.parse_args() 17 | 18 | BASE_DIR = Path(args.path) 19 | 20 | @app.route('/') 21 | def index(): 22 | tasks = [] 23 | for entry in os.scandir(BASE_DIR): 24 | if entry.is_dir(): 25 | task_dir = BASE_DIR / entry.name 26 | status_value, full_status = get_task_status(task_dir) 27 | 28 | emoji = "✅" if status_value == "1.0" or status_value == "1" else "❌" 29 | display_name = f"{entry.name} - {status_value} {emoji}" 30 | 31 | tasks.append({ 32 | "original_name": entry.name, 33 | "display_name": display_name, 34 | "status_value": status_value, 35 | "full_status": full_status 36 | }) 37 | 38 | tasks.sort(key=lambda x: x['original_name']) 39 | 40 | selected_original_name = request.args.get('task', tasks[0]['original_name'] if tasks else None) 41 | 42 | task_data = None 43 | if selected_original_name: 44 | task_data = prepare_task_data(selected_original_name) 45 | 46 | return render_template( 47 | 'index.html', 48 | tasks=tasks, 49 | selected_original_name=selected_original_name, 50 | task_data=task_data 51 | ) 52 | 53 | def get_task_status(task_dir): 54 | """Retrieve the task's status value and return the entire content of the status file.""" 55 | status_file = task_dir / "000000status.txt" 56 | full_status = "Status file not found." 57 | status_value = "?" 58 | 59 | try: 60 | with open(status_file, 'r') as f: 61 | full_status = f.read().strip() 62 | 63 | lines = full_status.split('\n') 64 | if lines: 65 | status_value = lines[0].strip() 66 | 67 | except FileNotFoundError: 68 | pass 69 | except Exception as e: 70 | full_status = f"Failed to read status file:{str(e)}" 71 | 72 | return status_value, full_status 73 | 74 | @app.route('/images//') 75 | def serve_image(task, filename): 76 | """extract images""" 77 | task_dir = BASE_DIR / task 78 | return send_from_directory(task_dir, filename) 79 | 80 | def prepare_task_data(task_name): 81 | """prepare data""" 82 | task_dir = BASE_DIR / task_name 83 | 84 | goal = read_file(task_dir / "000000goal.txt") 85 | 86 | status_value, full_status = get_task_status(task_dir) 87 | 88 | steps = [] 89 | step_files = [] 90 | 91 | for entry in os.scandir(task_dir): 92 | if entry.name.startswith("000000"): 93 | continue 94 | step_files.append(entry.name) 95 | step_files.sort() 96 | 97 | step_groups = {} 98 | for filename in step_files: 99 | prefix = filename.split('_')[0] 100 | step_groups.setdefault(prefix, []).append(filename) 101 | 102 | for prefix, files in sorted(step_groups.items()): 103 | step = { 104 | 'prefix': prefix, 105 | 'image': next((f for f in files if f.endswith('_raw.jpg')), None), 106 | 'thinking': next((f for f in files if f.endswith('_thinking.txt')), None), 107 | 'tool_call': next((f for f in files if f.endswith('_tool_call.txt')), None), 108 | 'conclusion': next((f for f in files if f.endswith('_conclusion.txt')), None), 109 | } 110 | 111 | if step['image']: 112 | image_path = task_dir / step['image'] 113 | try: 114 | with Image.open(image_path) as img: 115 | step['image_width'] = img.width 116 | step['image_height'] = img.height 117 | except Exception as e: 118 | step['image_width'] = None 119 | step['image_height'] = None 120 | else: 121 | step['image_width'] = None 122 | step['image_height'] = None 123 | 124 | for file_type in ['thinking', 'tool_call', 'conclusion']: 125 | if step[file_type]: 126 | content = read_file(task_dir / step[file_type]) 127 | step[file_type + '_content'] = content 128 | step['x'] = None 129 | step['y'] = None 130 | else: 131 | 
step[file_type + '_content'] = "file not found" 132 | step['x'] = None 133 | step['y'] = None 134 | 135 | steps.append(step) 136 | 137 | return { 138 | 'name': task_name, 139 | 'goal': goal, 140 | 'status_value': status_value, 141 | 'full_status': full_status, 142 | 'steps': steps 143 | } 144 | 145 | def read_file(file_path, default="file not found"): 146 | """read file""" 147 | try: 148 | with open(file_path, 'r') as f: 149 | return f.read() 150 | except: 151 | return default 152 | 153 | if __name__ == '__main__': 154 | app.run(host='0.0.0.0', port=args.port, debug=True) -------------------------------------------------------------------------------- /models/navigation/ui_venus_navi_vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Any, Tuple 3 | 4 | from transformers import AutoProcessor 5 | from vllm import LLM, SamplingParams 6 | 7 | from qwen_vl_utils import process_vision_info 8 | 9 | 10 | class NaviVLLM: 11 | def __init__(self, model_config, logger): 12 | """ 13 | Initialize the NaviVLLM model. 14 | 15 | Args: 16 | model_config: Configuration object with model parameters. 17 | logger: Logger instance for logging. 18 | """ 19 | self.logger = logger 20 | self.model_config = model_config 21 | 22 | self.model = LLM( 23 | model=model_config.model_path, 24 | max_model_len=model_config.max_model_len, 25 | max_num_seqs=model_config.max_num_seqs, 26 | tensor_parallel_size=model_config.tensor_parallel_size, 27 | gpu_memory_utilization=model_config.gpu_memory_utilization, 28 | ) 29 | self.processor = AutoProcessor.from_pretrained(model_config.model_path) 30 | self.processor.image_processor.max_pixels = model_config.max_pixels 31 | self.processor.image_processor.min_pixels = model_config.min_pixels 32 | self.sampling_params = SamplingParams( 33 | max_tokens=model_config.max_tokens, 34 | temperature=model_config.temperature, 35 | top_p=model_config.top_p, 36 | top_k=model_config.top_k, 37 | repetition_penalty=1.05, 38 | n=model_config.n, 39 | stop_token_ids=[] 40 | ) 41 | 42 | self.logger.info( 43 | f"SamplingParams: max_tokens={model_config.max_tokens}, " 44 | f"temperature={model_config.temperature}, top_p={model_config.top_p}, " 45 | f"top_k={model_config.top_k}, n={model_config.n}, " 46 | f"stop_token_ids={self.sampling_params.stop_token_ids}" 47 | ) 48 | 49 | def create_message_for_image(self, image: str, problem: str) -> List[Dict[str, Any]]: 50 | return [ 51 | {"role": "system", "content": "You are a helpful assistant."}, 52 | { 53 | "role": "user", 54 | "content": [ 55 | {"type": "text", "text": problem}, 56 | { 57 | "type": "image", 58 | "image": image, 59 | "min_pixels": self.model_config.min_pixels, 60 | "max_pixels": self.model_config.max_pixels, 61 | } 62 | ], 63 | }, 64 | ] 65 | 66 | def _prepare_llm_inputs(self, messages_list: List[List[Dict]]) -> List[Dict]: 67 | """ 68 | Convert messages to vLLM input format with multi-modal data. 69 | 70 | Args: 71 | messages_list: List of message lists (one per sample). 72 | 73 | Returns: 74 | List of dictionaries containing 'prompt' and 'multi_modal_data'. 
75 | """ 76 | prompts = [ 77 | self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) 78 | for msg in messages_list 79 | ] 80 | 81 | image_inputs, _ = process_vision_info(messages_list) 82 | 83 | llm_inputs = [] 84 | for image_input, text in zip(image_inputs, prompts): 85 | mm_data = {"image": image_input} 86 | llm_inputs.append({ 87 | "prompt": text, 88 | "multi_modal_data": mm_data 89 | }) 90 | 91 | return llm_inputs 92 | 93 | def _process_batch( 94 | self, 95 | batch_data: List[Tuple[bytes, str, str]], 96 | ) -> Tuple[List[str], List[str]]: 97 | """ 98 | Process a batch of (image_path, problem) data into vLLM inputs. 99 | 100 | Args: 101 | batch_data: List of tuples (image_path, problem). 102 | 103 | Returns: 104 | List of LLM input dictionaries. 105 | """ 106 | images, problems = zip(*batch_data) 107 | 108 | messages_list = [self.create_message_for_image(img, prob) for img, prob in zip(images, problems)] 109 | 110 | return self._prepare_llm_inputs(messages_list) 111 | 112 | 113 | def __call__(self, data, print_log=False): 114 | """ 115 | Generate responses for a list of (image_path, problem) pairs. 116 | 117 | Args: 118 | data: List of tuples (image_path, problem). 119 | print_log: Whether to log questions and answers. 120 | 121 | Returns: 122 | List[List[str]]: Each inner list contains `n` generated responses. 123 | """ 124 | llm_input_list = self._process_batch(data) 125 | 126 | outputs = self.model.generate(llm_input_list, sampling_params=self.sampling_params) 127 | responses = [] 128 | for output in outputs: 129 | generated_texts = [o.text for o in output.outputs] 130 | responses.append(generated_texts) 131 | 132 | if print_log: 133 | for (image_path, problem), response in zip(data, responses): 134 | self.logger.info(f"Image: {os.path.basename(image_path)}") 135 | self.logger.info(f"Problem: {problem}") 136 | self.logger.info(f"Response: {response[0]}") 137 | self.logger.info("-" * 50) 138 | 139 | return responses -------------------------------------------------------------------------------- /models/navigation/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, Tuple 3 | 4 | 5 | USER_PROMPT = """ 6 | **You are a GUI Agent.** 7 | Your task is to analyze a given user task, review current screenshot and previous actions, and determine the next action to complete the task. 8 | 9 | ### User Task 10 | {user_task} 11 | 12 | ### Previous Actions 13 | {previous_actions} 14 | 15 | ### Available Actions 16 | You may execute one of the following functions: 17 | Click(box=(x1, y1)) 18 | Drag(start=(x1, y1), end=(x2, y2)) 19 | Scroll(start=(x1, y1), end=(x2, y2), direction='down/up/right/left') 20 | Type(content='') 21 | Launch(app='') 22 | Wait() 23 | Finished(content='') 24 | CallUser(content='') 25 | LongPress(box=(x1, y1)) 26 | PressBack() 27 | PressHome() 28 | PressEnter() 29 | PressRecent() 30 | 31 | ### Instruction 32 | - Make sure you understand the task goal to avoid wrong actions. 33 | - Make sure you carefully examine the the current screenshot. Sometimes the summarized history might not be reliable, over-claiming some effects. 34 | - For requests that are questions (or chat messages), remember to use the `CallUser` action to reply to user explicitly before finishing! Then, after you have replied, use the Finished action if the goal is achieved. 35 | - Consider exploring the screen by using the `scroll` action with different directions to reveal additional content. 
36 | - To copy some text: first select the exact text you want to copy, which usually also brings up the text selection bar, then click the `copy` button in bar. 37 | - To paste text into a text box, first long press the text box, then usually the text selection bar will appear with a `paste` button in it. 38 | - You first thinks about the reasoning process in the mind, then provide the action. The reasoning and action are enclosed in and tags respectively. After providing action, summarize your action in tags 39 | """ 40 | 41 | 42 | def parse_coordinates(coord_str: str) -> Optional[Tuple[float, float]]: 43 | if not coord_str: 44 | return None, None 45 | 46 | coord_str_clean = coord_str.replace(" ", "") 47 | match = re.match(r"\(([\d.]+),([\d.]+)\)", coord_str_clean) 48 | if match: 49 | return float(match.group(1)), float(match.group(2)) 50 | 51 | match = re.match(r"\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", coord_str) 52 | if match: 53 | return float(match.group(1)), float(match.group(2)) 54 | 55 | return None, None 56 | 57 | def _split_parameters(params_str: str) -> list: 58 | param_parts = [] 59 | current_part = "" 60 | 61 | in_quotes = False 62 | quote_char = None 63 | bracket_level = 0 64 | 65 | for char in params_str: 66 | if char in ['"', "'"] and not in_quotes: 67 | in_quotes = True 68 | quote_char = char 69 | elif char == quote_char and in_quotes: 70 | in_quotes = False 71 | quote_char = None 72 | 73 | elif not in_quotes: 74 | if char == '(': 75 | bracket_level += 1 76 | elif char == ')': 77 | bracket_level -= 1 78 | elif char == ',' and bracket_level == 0: 79 | param_parts.append(current_part.strip()) 80 | current_part = "" 81 | continue 82 | 83 | current_part += char 84 | 85 | if current_part.strip(): 86 | param_parts.append(current_part.strip()) 87 | 88 | return param_parts 89 | 90 | def parse_answer(action_str: str): 91 | pattern = r"^(\w+)\((.*)\)$" 92 | match = re.match(pattern, action_str.strip(), re.DOTALL) 93 | if not match: 94 | raise ValueError(f"Invalid action_str format: {action_str}") 95 | 96 | action_type = match.group(1) 97 | params_str = match.group(2).strip() 98 | params = {} 99 | 100 | if params_str: 101 | try: 102 | param_pairs = _split_parameters(params_str) 103 | 104 | for pair in param_pairs: 105 | if '=' in pair: 106 | key, value = pair.split("=", 1) 107 | value = value.strip("'").strip() 108 | params[key.strip()] = value 109 | else: 110 | params[pair.strip()] = None 111 | except Exception as e: 112 | print(f"Answer parse error: {e}") 113 | 114 | if action_type == 'Click': 115 | p_x, p_y = parse_coordinates(params.get("box", "")) 116 | if p_x is not None and p_y is not None: 117 | return 'Click', {'box': (p_x, p_y)} 118 | else: 119 | raise ValueError(f"action {action_type} Unknown click params: {repr(params)}") 120 | elif action_type == 'LongPress': 121 | p_x, p_y = parse_coordinates(params.get("box", "")) 122 | if p_x is not None and p_y is not None: 123 | return 'LongPress', {'box': (p_x, p_y)} 124 | else: 125 | raise ValueError(f"action {action_type} Unknown long press params: {repr(params)}") 126 | elif action_type == 'Drag': 127 | p_x, p_y = parse_coordinates(params.get("start", "")) 128 | e_x, e_y = parse_coordinates(params.get("end", "")) 129 | if p_x is not None and p_y is not None and e_x is not None and e_y is not None: 130 | return 'Drag', {'start': (p_x, p_y), 'end': (e_x, e_y)} 131 | else: 132 | raise ValueError(f"action {action_type} Unknown drag params: {repr(params)}") 133 | elif action_type == 'Scroll': 134 | p_x, p_y = 
parse_coordinates(params.get("start", "")) 135 | e_x, e_y = parse_coordinates(params.get("end", "")) 136 | if p_x is not None and p_y is not None and e_x is not None and e_y is not None: 137 | return 'Scroll', {'start': (p_x, p_y), 'end': (e_x, e_y), 'direction': ''} 138 | elif "direction" in params: 139 | direction = params.get("direction") 140 | return 'Scroll', {'start': (), 'end': (), 'direction': direction} 141 | else: 142 | raise ValueError(f"action {action_type} Unknown scroll params: {repr(params)}") 143 | elif action_type == 'Type': 144 | key = 'content' 145 | type_text = params.get(key) 146 | if type_text is not None: 147 | return 'Type', {'content': type_text} 148 | else: 149 | raise ValueError(f"action {action_type} Unknown type params: {repr(params)}") 150 | elif action_type == 'CallUser': 151 | key = 'content' 152 | call_text = params.get(key) 153 | if call_text is not None: 154 | return 'CallUser', {'content': call_text} 155 | else: 156 | raise ValueError(f"action {action_type} Unknown call user params: {repr(params)}") 157 | elif action_type == 'Launch': 158 | app = params.get("app", "") 159 | url = params.get("url", "") 160 | if app is not None: 161 | return 'Launch', {'app': app, 'url': url} 162 | else: 163 | raise ValueError(f"action {action_type} Unknown launch params: {repr(params)}") 164 | elif action_type == 'Finished': 165 | key = 'content' 166 | finished_text = params.get(key, "") 167 | return 'Finished', {'content': finished_text} 168 | elif action_type in ['Wait', 'PressBack', 'PressHome', 'PressEnter', 'PressRecent']: 169 | return action_type, {} 170 | else: 171 | raise ValueError(f"action {action_type} Unknown action: {repr(params)}") 172 | -------------------------------------------------------------------------------- /models/navigation/ui_venus_navi_agent.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple, Any 2 | from dataclasses import dataclass, asdict 3 | from PIL import Image 4 | import logging 5 | 6 | from models.navigation.ui_venus_navi_vllm import NaviVLLM 7 | from qwen_vl_utils import smart_resize 8 | from .utils import parse_answer, USER_PROMPT 9 | 10 | 11 | ACTION_MAPPING = { 12 | 'click', 'drag', 'scroll', 'type', 'launch', 'wait', 'finished', 13 | 'calluser', 'longpress', 'pressback', 'presshome', 'pressenter', 14 | 'pressrecent', 'answer' 15 | } 16 | 17 | 18 | @dataclass 19 | class StepData: 20 | image_path: str 21 | raw_screenshot: Image.Image 22 | query: str 23 | generated_text: str 24 | think: str 25 | action: str 26 | _conclusion: str 27 | action_output_json: Optional[Dict[str, Any]] = None 28 | status: str = 'success' 29 | 30 | def to_dict(self, include_screenshot: bool = False) -> dict: 31 | """ 32 | Convert this step to a JSON-serializable dict. 33 | 34 | Args: 35 | include_screenshot (bool): Whether to include base64-encoded image. 36 | 37 | Returns: 38 | dict: Serializable step data. 
39 | """ 40 | data = asdict(self) 41 | data['raw_screenshot'] = None 42 | 43 | if include_screenshot and self.raw_screenshot is not None: 44 | import base64 45 | from io import BytesIO 46 | buffer = BytesIO() 47 | self.raw_screenshot.save(buffer, format="PNG") 48 | data['raw_screenshot_base64'] = base64.b64encode(buffer.getvalue()).decode("utf-8") 49 | 50 | return data 51 | 52 | 53 | class VenusNaviAgent: 54 | def __init__(self, 55 | model_config, 56 | logger: logging.Logger, 57 | history_length: int = 0) -> None: 58 | self.model = NaviVLLM(model_config=model_config, logger=logger) 59 | self.max_pixels = model_config.max_pixels 60 | self.min_pixels = model_config.min_pixels 61 | self.logger = logger 62 | self.history: List[StepData] = [] 63 | self.history_length = max(0, history_length) 64 | 65 | def reset(self): 66 | self.logger.info(f"Agent Reset") 67 | self.history = [] 68 | 69 | def _build_query(self, goal: str) -> str: 70 | if len(self.history) == 0: 71 | history_str = "" 72 | else: 73 | recent_history = self.history[-self.history_length:] 74 | history_entries = [ 75 | f"Step {i}: {step.think}{step.action}" for i, step in enumerate(recent_history) 76 | ] 77 | history_str = "\n".join(history_entries) 78 | 79 | return USER_PROMPT.format(user_task=goal, previous_actions=history_str) 80 | 81 | def _rescale_coordinate(self, x: float, y: float, orig_size: Tuple[int, int], resized_size: Tuple[int, int]) -> Tuple[int, int]: 82 | o_w, o_h = orig_size 83 | r_w, r_h = resized_size 84 | x_scaled = int(x * o_w / r_w) 85 | y_scaled = int(y * o_h / r_h) 86 | return ( 87 | max(0, min(x_scaled, o_w)), 88 | max(0, min(y_scaled, o_h)) 89 | ) 90 | 91 | def _convert_coordinate(self, action_json: dict, size_params: dict): 92 | orig_size = (size_params['original_width'], size_params['original_height']) 93 | resized_size = (size_params['resized_width'], size_params['resized_height']) 94 | action_type = action_json['action'].lower() 95 | try: 96 | if action_type == 'click' or action_type == 'longpress': 97 | x, y = action_json['params']['box'] 98 | action_json['params']['box'] = self._rescale_coordinate(x, y, orig_size, resized_size) 99 | elif action_type == 'drag': 100 | x1, y1 = action_json['params']['start'] 101 | x2, y2 = action_json['params']['end'] 102 | action_json['params']['start'] = self._rescale_coordinate(x1, y1, orig_size, resized_size) 103 | action_json['params']['end'] = self._rescale_coordinate(x2, y2, orig_size, resized_size) 104 | elif action_type == 'scroll': 105 | if 'start' in action_json['params'] and len(action_json['params']['start']) > 0: 106 | x, y = action_json['params']['start'] 107 | action_json['params']['start'] = self._rescale_coordinate(x, y, orig_size, resized_size) 108 | if 'end' in action_json['params'] and len(action_json['params']['end']) > 0: 109 | x, y = action_json['params']['end'] 110 | action_json['params']['end'] = self._rescale_coordinate(x, y, orig_size, resized_size) 111 | except (KeyError, ValueError, TypeError) as e: 112 | self.logger.warning(f"convert failed: {e}, action_json={action_json}") 113 | 114 | return action_json 115 | 116 | def step(self, goal: str, image_path: str): 117 | self.logger.info(f"----------step {len(self.history) + 1}") 118 | try: 119 | raw_screenshot = Image.open(image_path).convert('RGB') 120 | except Exception as e: 121 | self.logger.error(f"Can't load {image_path}: {e}") 122 | return None 123 | 124 | original_width, original_height = raw_screenshot.size 125 | resized_height, resized_width = smart_resize( 126 | original_height, 
original_width, 127 | min_pixels=self.min_pixels, 128 | max_pixels=self.max_pixels) 129 | 130 | size_params = { 131 | 'original_width': original_width, 132 | 'original_height': original_height, 133 | 'resized_width': resized_width, 134 | 'resized_height': resized_height, 135 | } 136 | 137 | user_query = self._build_query(goal) 138 | generated_text = self.model([(image_path, user_query)])[0][0] 139 | 140 | self.logger.info(f"Goal: {goal}") 141 | self.logger.info(f"USER Query: {repr(user_query)}") 142 | self.logger.info(f"ACTION text: {repr(str(generated_text))}") 143 | 144 | think_text = generated_text.split('')[1].split('')[0].strip('\n') 145 | answer_text = generated_text.split('')[1].split('')[0].strip('\n') 146 | conclusion_text = generated_text.split('')[1].split('')[0].strip('\n') 147 | 148 | self.logger.info(f"Think: {think_text}") 149 | self.logger.info(f"Answer: {answer_text}") 150 | 151 | try: 152 | action_name, action_params = parse_answer(answer_text) 153 | action_json = {'action': action_name, 'params': action_params} 154 | action_json = self._convert_coordinate(action_json, size_params) 155 | except Exception as e: 156 | self.logger.warning(f'Failed to parse_answer: {e}') 157 | step_data = StepData( 158 | image_path=image_path, 159 | raw_screenshot=raw_screenshot, 160 | query=user_query, 161 | generated_text=generated_text, 162 | think=think_text, 163 | action=answer_text, 164 | _conclusion=conclusion_text, 165 | status='failed' 166 | ) 167 | self.history.append(step_data) 168 | return None 169 | 170 | step_data = StepData( 171 | image_path=image_path, 172 | raw_screenshot=raw_screenshot, 173 | query=user_query, 174 | generated_text=generated_text, 175 | think=think_text, 176 | action=answer_text, 177 | _conclusion=conclusion_text, 178 | action_output_json=action_json, 179 | status='success' 180 | ) 181 | self.history.append(step_data) 182 | 183 | self.logger.info(f'Action: {repr(str(action_json))}') 184 | return action_json 185 | 186 | def export_history(self, include_screenshot=False): 187 | serialized_history = [ 188 | step.to_dict(include_screenshot=include_screenshot) 189 | for step in self.history 190 | ] 191 | return serialized_history 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /vis_androidworld/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Android World Visualization 5 | 6 | 195 | 196 | 197 |
198 |

Android World Visualization

199 |
200 | 201 |
202 | 210 | 211 |
212 | 213 | {% if task_data %} 214 |
215 |
Goal: {{ task_data.goal }}
216 |
{{ task_data.full_status }}
217 |
218 | 219 |
220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | {% for step in task_data.steps %} 231 | 232 | 257 | 261 | 265 | 269 | 270 | {% endfor %} 271 | 272 |
ScreenshotThinkActionConclusion
233 | {% if step.image %} 234 |
235 |
step {{ step.prefix }}
236 | 237 |
238 | screenshot 245 | {% if step.x and step.y %} 246 |
249 |
250 | {% endif %} 251 |
252 |
253 | {% else %} 254 |
this step no image
255 | {% endif %} 256 |
258 |
Think
259 |
{{ step.thinking_content }}
260 |
262 |
Action
263 |
{{ step.tool_call_content }}
264 |
266 |
Conclusion
267 |
{{ step.conclusion_content }}
268 |
273 |
274 | {% else %} 275 |
276 |

no data in file

277 |
278 | {% endif %} 279 | 280 | 283 | 284 | 285 | 286 | 287 | 330 | 331 | 338 | 339 | 340 | 349 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌟 UI-Venus: Building High-Performance UI Agents with RFT 2 | 3 | 4 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | [![Report](https://img.shields.io/badge/Report-Technical%20Report-blueviolet?logo=notion)](http://arxiv.org/abs/2508.10833) 6 | [![GitHub](https://img.shields.io/badge/GitHub-Repository-green?logo=github)](https://github.com/inclusionAI/UI-Venus) 7 | [![Hugging Face](https://img.shields.io/badge/Hugging%20Face-Model-orange?logo=huggingface)](https://huggingface.co/collections/inclusionAI/ui-venus-689f2fb01a4234cbce91c56a) 8 | 9 | 10 |

11 | UI-Venus leverages Reinforcement Fine-Tuning (RFT) to achieve state-of-the-art performance in GUI understanding and action prediction across mobile, desktop, and web interfaces. 12 |

13 | 14 | --- 15 | 16 |

17 | 📈 UI-Venus Benchmark Performance 18 |

19 | 20 |

21 | UI-Venus Performance Across Datasets 22 |
23 |

24 | 25 | > **Figure:** Performance of UI-Venus across multiple benchmark datasets. UI-Venus achieves **State-of-the-Art (SOTA)** results on key UI understanding and interaction benchmarks, including **ScreenSpot-Pro**, **ScreenSpot-v2**, **OS-World-G**, **UI-Vision**, and **Android World**. The results demonstrate its superior capability in visual grounding, UI navigation, cross-platform generalization, and complex task reasoning. 26 | 27 | 28 | 29 | --- 30 | # Overview 31 | 32 | * [Key Highlights](#key-highlights) 33 | * [Installation](#installation) 34 | * [Quick Start](#quick-start) 35 | * [Evaluation](#evaluation) 36 | * [Citation](#citation) 37 | 38 | 39 | --- 40 | 41 | 42 | # ✨ Key Highlights 43 | 44 | We introduce **UI-Venus**, a state-of-the-art UI agent that pushes the boundaries of visual grounding and interactive navigation. Below are the core contributions and innovations: 45 | 46 | --- 47 | 48 | ### 💡 Open-Source SOTA UI Agent with Full Toolchain 49 | We develop and open-source **UI-Venus** — a state-of-the-art UI agent trained on **350K high-quality, professionally annotated samples**. 50 | In addition to model checkpoints (7B & 72B), we release: 51 | - 🔹 Full evaluation pipeline 52 | - 🔹 Inference scripts 53 | 54 | 👉 [GitHub Repository](https://github.com/inclusionAI/UI-Venus) | [Hugging Face Model](https://huggingface.co/collections/inclusionAI/ui-venus-689f2fb01a4234cbce91c56a) 55 | 56 | --- 57 | 58 | ### 📏 Action-Level Reward Design for RL-Based Navigation 59 | Beyond grounding, we design **fine-grained, action-wise reward functions** for reinforcement learning in GUI navigation tasks. 60 | Our approach achieves: 61 | - ✅ Competitive performance on **AndroidWorld** 62 | - ✅ Better credit assignment in long-horizon tasks 63 | - ✅ End-to-end learnable action prediction with policy gradients 64 | 65 | This marks a significant step toward **autonomous UI interaction** with precise feedback signals. 66 | 67 | --- 68 | 69 | ### 🧹 Comprehensive Study on UI Data Quality & Cleaning Strategy 70 | We conduct a **systematic analysis of UI interaction data quality**, identifying key challenges in real-world traces. 71 | To improve training signal fidelity, we propose a **three-stage data refinement pipeline**: 72 | 1. **Prompt Rewrite** – Clarify ambiguous user intentions 73 | 2. **Trace Editing** – Fix incorrect or redundant actions 74 | 3. **Trace Generation** – Synthesize missing steps via LLM-augmented modeling 75 | 76 | 📈 Result: RL & Higher-quality data → More robust and generalizable agents. 77 | 78 | 79 | 80 | --- 81 | 82 | ## Installation 83 | 84 | manually install the dependencies: 85 | 86 | ```bash 87 | pip install -r requirements.txt 88 | ``` 89 | 90 | --- 91 | 92 | 93 | 94 | ## Quick Start 95 | 96 | Use the shell scripts to launch the evaluation. The evaluation setup follows the same protocol as **ScreenSpot**, including data format, annotation structure, and metric calculation. 97 | 98 | ### Grounding 99 | - **For 7B model:** 100 | ```bash 101 | bash scripts/run_gd_7b.sh 102 | ``` 103 | 104 | - **For 72B model:** 105 | ```bash 106 | bash scripts/run_gd_72b.sh 107 | ``` 108 | 109 | 🔧 Configuration Required. 
110 | Please set the following paths in your script: 111 | 112 | * `screenspot_imgs`: Folder containing your screenshots (e.g., Screenspot-pro/images) 113 | * `screenspot_test`: Folder containing your all json files (e.g., Screenspot-pro/annotations) 114 | * `model_name_or_path`: Model checkpoint path (e.g., /root/ckpt/huggingface/) 115 | * `log_path`: Output folders 116 | 117 | 118 | ### 📄 Data Format 119 | 120 | We provide example templates for input and output formats: 121 | 122 | - **Input Format (Grounding Annotations):** 123 | `examples/grounding_meta_format.json` — Defines the standard structure of annotation files in `screenspot_test`. 124 | 125 | - **Output Format (Model Predictions):** 126 | `examples/grounding_result_format.json` — Shows the recommended format for model prediction results. 127 | 128 | Please refer to these examples when preparing your data or parsing the evaluation outputs. 129 | 130 | ### Navigation 131 | - **For 7B model:** 132 | ```bash 133 | bash scripts/run_navi_7b.sh 134 | ``` 135 | 136 | - **For 72B model:** 137 | ```bash 138 | bash scripts/run_navi_72b.sh 139 | ``` 140 | 🔧 Configuration Required. 141 | Please set the following paths in your script: 142 | * `model_path`: Path to the model checkpoint (e.g., inclusionAI/UI-Venus-Navi-7B). 143 | * `input_file`: Input JSON file containing navigation tasks (str, default: examples/trace.json). Each entry must include: 144 | - task: The user instruction (string) 145 | - image_path: Screenshot filepath 146 | * `output_file`: Path to save the agent’s execution history (JSON format) 147 | --- 148 | 149 | ## Evaluation 150 | 151 | ### Results on ScreenSpot-v2 152 | 153 | | **Model** | **Mobile Text** | **Mobile Icon** | **Desktop Text** | **Desktop Icon** | **Web Text** | **Web Icon** | **Avg.** | 154 | |--------------------------|-----------------|-----------------|------------------|------------------|--------------|--------------|----------| 155 | | uitars-1.5 | - | - | - | - | - | - | 94.2 | 156 | | Seed-1.5-VL | - | - | - | - | - | - | 95.2 | 157 | | GPT-4o | 26.6 | 24.2 | 24.2 | 19.3 | 12.8 | 11.8 | 20.1 | 158 | | Qwen2.5-VL-7B | 97.6 | 87.2 | 90.2 | 74.2 | 93.2 | 81.3 | 88.8 | 159 | | UI-TARS-7B | 96.9 | 89.1 | 95.4 | 85.0 | 93.6 | 85.2 | 91.6 | 160 | | UI-TARS-72B | 94.8 | 86.3 | 91.2 | 87.9 | 91.5 | 87.7 | 90.3 | 161 | | LPO | 97.9 | 82.9 | 95.9 | 86.4 | 95.6 | 84.2 | 90.5 | 162 | | **UI-Venus-Ground-7B (Ours)** | **99.0** | **90.0** | **97.0** | **90.7** | **96.2** | **88.7** | **94.1** | 163 | | **UI-Venus-Ground-72B (Ours)** | **99.7** | **93.8** | **95.9** | **90.0** | **96.2** | **92.6** | **95.3** | 164 | 165 | --- 166 | 167 | 168 | 169 | ### Results on ScreenSpot-Pro 170 | 171 | Performance comparison of GUI agent models across six task categories on **ScreenSpot-Pro**. 172 | Scores are in percentage (%). `T` = Text, `I` = Icon. 173 | `*`: reproduced; `†`: trained from UI-TARS-1.5-7B. 
174 | 175 | | Model | CAD (T/I) | Dev (T/I) | Creative (T/I) | Scientific (T/I) | Office (T/I) | OS (T/I) | Avg T | Avg I | **Overall** | Type | 176 | |-------|-----------|-----------|----------------|------------------|--------------|---------|--------|--------|------------|------| 177 | | GPT-4o | 2.0 / 0.0 | 1.3 / 0.0 | 1.0 / 0.0 | 2.1 / 0.0 | 1.1 / 0.0 | 0.0 / 0.0 | 1.3 | 0.0 | 0.8 | Closed | 178 | | Claude Computer Use | 14.5 / 3.7 | 22.0 / 3.9 | 25.9 / 3.4 | 33.9 / 15.8 | 30.1 / 16.3 | 11.0 / 4.5 | 23.4 | 7.1 | 17.1 | Closed | 179 | | UI-TARS-1.5 | – / – | – / – | – / – | – / – | – / – | – / – | – | – | **61.6** | Closed | 180 | | Seed1.5-VL | – / – | – / – | – / – | – / – | – / – | – / – | – | – | 60.9 | Closed | 181 | | Qwen2.5-VL-7B\* | 16.8 / 1.6 | 46.8 / 4.1 | 35.9 / 7.7 | 49.3 / 7.3 | 52.5 / 20.8 | 37.4 / 6.7 | 38.9 | 7.1 | 26.8 | SFT | 182 | | Qwen2.5-VL-72B* | 54.8 / 15.6 | 65.6 / 16.6 | 63.1 / 19.6 | 78.5 / 34.5 | 79.1 / 47.2 | 66.4 / 29.2 | 67.3 | 25.0 | 51.2 | SFT | 183 | | UI-TARS-7B | 20.8 / 9.4 | 58.4 / 12.4 | 50.0 / 9.1 | 63.9 / 31.8 | 63.3 / 20.8 | 30.8 / 16.9 | 47.8 | 16.2 | 35.7 | SFT | 184 | | UI-TARS-72B | 18.8 / 12.5 | 62.9 / 17.2 | 57.1 / 15.4 | 64.6 / 20.9 | 63.3 / 26.4 | 42.1 / 15.7 | 50.9 | 17.6 | 38.1 | SFT | 185 | | Phi-Ground-7B | 26.9 / 17.2 | 70.8 / 16.7 | 56.6 / 13.3 | 58.0 / 29.1 | 76.4 / 44.0 | 55.1 / 25.8 | 56.4 | 21.8 | 43.2 | RL | 186 | | UI-TARS-1.5-7B | – / – | – / – | – / – | – / – | – / – | – / – | – | – | 49.6 | RL | 187 | | GTA1-7B† | 53.3 / 17.2 | 66.9 / 20.7 | 62.6 / 18.2 | 76.4 / 31.8 | 82.5 / 50.9 | 48.6 / 25.9 | 65.5 | 25.2 | 50.1 | RL | 188 | | GTA1-72B | 56.9 / 28.1 | 79.9 / 33.1 | 73.2 / 20.3 | 81.9 / 38.2 | 85.3 / 49.1 | 73.8 / 39.1 | 74.5 | 32.5 | 58.4 | RL | 189 | | **UI-Venus-Ground-7B** | 60.4 / 21.9 | 74.7 / 24.1 | 63.1 / 14.7 | 76.4 / 31.8 | 75.7 / 41.5 | 49.5 / 22.5 | 67.1 | 24.3 | **50.8** | Ours (RL) | 190 | | **UI-Venus-Ground-72B** | 66.5 / 29.7 | 84.4 / 33.1 | 73.2 / 30.8 | 84.7 / 42.7 | 83.1 / 60.4 | 75.7 / 36.0 | 77.4 | 36.8 | **61.9** | Ours (RL) | 191 | 192 | > 🔝 **Experimental results show that UI-Venus-Ground-72B achieves state-of-the-art performance on ScreenSpot-Pro with an average score of 61.9, while also setting new benchmarks on ScreenSpot-v2(95.3), OSWorld_G(70.4), AgentCPM(85), and UI-Vision(36.8), highlighting its effectiveness in complex visual grounding and action prediction tasks.** 193 | 194 | ### Results on AndroidWorld 195 | This is the compressed package of validation trajectories for **AndroidWorld**, including execution logs and navigation paths. 196 | 📥 Download: [UI-Venus-androidworld.zip](vis_androidworld/UI-Venus-androidworld.zip) 197 | 198 | | Models | With Planner | A11y Tree | Screenshot | Success Rate (pass@1) | 199 | |--------|--------------|-----------|------------|------------------------| 200 | | **Closed-source Models** | | | | | 201 | | GPT-4o| ❌ | ✅ | ❌ | 30.6 | 202 | | ScaleTrack| ❌ | ✅ | ❌ | 44.0 | 203 | | SeedVL-1.5 | ❌ | ✅ | ✅ | 62.1 | 204 | | UI-TARS-1.5 | ❌ | ❌ | ✅ | 64.2 | 205 | | **Open-source Models** | | | | | 206 | | GUI-Critic-R1-7B | ❌ | ✅ | ✅ | 27.6 | 207 | | Qwen2.5-VL-72B* | ❌ | ❌ | ✅ | 35.0 | 208 | | UGround | ✅ | ❌ | ✅ | 44.0 | 209 | | Aria-UI | ✅ | ❌ | ✅ | 44.8 | 210 | | UI-TARS-72B | ❌ | ❌ | ✅ | 46.6 | 211 | | GLM-4.5v | ❌ | ❌ | ✅ | 57.0 | 212 | | **Ours** | | | | | 213 | | UI-Venus-Navi-7B | ❌ | ❌ | ✅ | **49.1** | 214 | | UI-Venus-Navi-72B | ❌ | ❌ | ✅ | **65.9** | 215 | 216 | > **Table:** Performance comparison on **AndroidWorld** for end-to-end models. 
Our UI-Venus-Navi-72B achieves state-of-the-art performance, outperforming all baseline methods across different settings. 217 | 218 | 219 | ### Results on AndroidControl and GUI-Odyssey 220 | 221 | | Models | AndroidControl-Low
Type Acc. | AndroidControl-Low
Step SR | AndroidControl-High
Type Acc. | AndroidControl-High
Step SR | GUI-Odyssey
Type Acc. | GUI-Odyssey
Step SR | 222 | |--------|-------------------------------|-----------------------------|-------------------------------|-----------------------------|------------------------|----------------------| 223 | | **Closed-source Models** | | | | | | | 224 | | GPT-4o | 74.3 | 19.4 | 66.3 | 20.8 | 34.3 | 3.3 | 225 | | **Open Source Models** | | | | | | | 226 | | Qwen2.5-VL-7B | 94.1 | 85.0 | 75.1 | 62.9 | 59.5 | 46.3 | 227 | | SeeClick | 93.0 | 75.0 | 82.9 | 59.1 | 71.0 | 53.9 | 228 | | OS-Atlas-7B | 93.6 | 85.2 | 85.2 | 71.2 | 84.5 | 62.0 | 229 | | Aguvis-7B| - | 80.5 | - | 61.5 | - | - | 230 | | Aguvis-72B| - | 84.4 | - | 66.4 | - | - | 231 | | OS-Genesis-7B | 90.7 | 74.2 | 66.2 | 44.5 | - | - | 232 | | UI-TARS-7B| 98.0 | 90.8 | 83.7 | 72.5 | 94.6 | 87.0 | 233 | | UI-TARS-72B| **98.1** | 91.3 | 85.2 | 74.7 | **95.4** | **88.6** | 234 | | GUI-R1-7B| 85.2 | 66.5 | 71.6 | 51.7 | 65.5 | 38.8 | 235 | | NaviMaster-7B | 85.6 | 69.9 | 72.9 | 54.0 | - | - | 236 | | UI-AGILE-7B | 87.7 | 77.6 | 80.1 | 60.6 | - | - | 237 | | AgentCPM-GUI | 94.4 | 90.2 | 77.7 | 69.2 | 90.0 | 75.0 | 238 | | **Ours** | | | | | | | 239 | | UI-Venus-Navi-7B | 97.1 | 92.4 | **86.5** | 76.1 | 87.3 | 71.5 | 240 | | UI-Venus-Navi-72B | 96.7 | **92.9** | 85.9 | **77.2** | 87.2 | 72.4 | 241 | 242 | > **Table:** Performance comparison on offline UI navigation datasets including AndroidControl and GUI-Odyssey. Note that models with * are reproduced. 243 | 244 | 245 | # Citation 246 | Please consider citing if you find our work useful: 247 | ```plain 248 | @misc{gu2025uivenustechnicalreportbuilding, 249 | title={UI-Venus Technical Report: Building High-performance UI Agents with RFT}, 250 | author={Zhangxuan Gu and Zhengwen Zeng and Zhenyu Xu and Xingran Zhou and Shuheng Shen and Yunfei Liu and Beitong Zhou and Changhua Meng and Tianyu Xia and Weizhi Chen and Yue Wen and Jingya Dou and Fei Tang and Jinzhen Lin and Yulin Liu and Zhenlin Guo and Yichen Gong and Heng Jia and Changlong Gao and Yuan Guo and Yong Deng and Zhenyu Guo and Liang Chen and Weiqiang Wang}, 251 | year={2025}, 252 | eprint={2508.10833}, 253 | archivePrefix={arXiv}, 254 | primaryClass={cs.CV}, 255 | url={https://arxiv.org/abs/2508.10833}, 256 | } 257 | ``` 258 | -------------------------------------------------------------------------------- /models/grounding/eval_screenspot_pro.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | 4 | import torch 5 | import json 6 | import re 7 | import argparse 8 | import os 9 | from PIL import Image 10 | import logging 11 | from tqdm import tqdm 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | torch.manual_seed(114514) 16 | 17 | GT_TYPES = ['positive', 'negative'] 18 | INSTRUCTION_STYLES = ['instruction', 'action', 'description'] 19 | LANGUAGES = ['en', 'cn'] 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--model_type', type=str, required=True) 24 | parser.add_argument('--model_name_or_path', type=str, required=False) 25 | parser.add_argument('--screenspot_imgs', type=str, required=True) 26 | parser.add_argument('--screenspot_test', type=str, required=True) 27 | parser.add_argument('--task', type=str, required=True) 28 | parser.add_argument('--inst_style', type=str, required=True, choices=INSTRUCTION_STYLES + ['all'], help="Instruction style to use.") 29 | parser.add_argument('--language', type=str, required=True, choices=LANGUAGES + ['all'], default='en', help="Language to use.") 30 | 
parser.add_argument('--gt_type', type=str, required=True, choices=GT_TYPES + ['all'], help="Ground truth type: 'positive' or 'negative'.") 31 | parser.add_argument('--log_path', type=str, required=True) 32 | 33 | args = parser.parse_args() 34 | return args 35 | 36 | def build_model(args): 37 | model_type = args.model_type 38 | model_name_or_path = args.model_name_or_path 39 | 40 | 41 | if model_type == "ui_venus_ground_7b": 42 | from ui_venus_ground_7b import UI_Venus_Ground_7B 43 | model = UI_Venus_Ground_7B() 44 | if args.model_name_or_path: 45 | model.load_model(model_name_or_path=model_name_or_path) 46 | else: 47 | model.load_model() 48 | 49 | elif model_type == "ui_venus_ground_72b": 50 | from ui_venus_ground_72b import UI_Venus_Ground_72B 51 | model = UI_Venus_Ground_72B() 52 | if args.model_name_or_path: 53 | model.load_model(model_name_or_path=model_name_or_path) 54 | else: 55 | model.load_model() 56 | 57 | else: 58 | raise ValueError(f"Unsupported model type {model_type}.") 59 | model.set_generation_config(temperature=0, max_new_tokens=256) 60 | return model 61 | 62 | def collect_results_to_eval(results, platform=None, group=None, application=None, language=None, gt_type=None, instruction_style=None, ui_type=None): 63 | """ 64 | Filters the results based on provided values. None means include all (ignore filtering this attribute). 65 | 66 | Parameters: 67 | results (list): A list of dictionaries containing sample results. 68 | 69 | Returns: 70 | list: A filtered list of dictionaries based on the given criteria. 71 | """ 72 | filtered_results = [] 73 | 74 | for sample in results: 75 | # Check each filter condition; if None, consider it as passed 76 | if (platform is None or sample.get("platform") == platform) and \ 77 | (group is None or sample.get("group") == group) and \ 78 | (application is None or sample.get("application") == application) and \ 79 | (language is None or sample.get("language") == language) and \ 80 | (gt_type is None or sample.get("gt_type") == gt_type) and \ 81 | (instruction_style is None or sample.get("instruction_style") == instruction_style) and \ 82 | (ui_type is None or sample.get("ui_type") == ui_type): 83 | filtered_results.append(sample) 84 | 85 | return filtered_results 86 | 87 | 88 | def make_combinations(results, platform=False, group=None, application=False, language=False, gt_type=False, instruction_style=False, ui_type=False): 89 | """ 90 | Returns a list of combinations of values for attributes where the corresponding parameter is set to True. 
91 | """ 92 | # Initialize a dictionary to store unique values for each attribute 93 | unique_values = { 94 | "platform": set(), 95 | "group": set(), 96 | "application": set(), 97 | "language": set(), 98 | "gt_type": set(), 99 | "instruction_style": set(), 100 | "ui_type": set(), 101 | } 102 | 103 | # Collect unique values from the results 104 | for sample in results: 105 | if platform: 106 | unique_values["platform"].add(sample.get("platform")) 107 | if group: 108 | unique_values["group"].add(sample.get("group")) 109 | if application: 110 | unique_values["application"].add(sample.get("application")) 111 | if language: 112 | unique_values["language"].add(sample.get("language")) 113 | if gt_type: 114 | unique_values["gt_type"].add(sample.get("gt_type")) 115 | if instruction_style: 116 | unique_values["instruction_style"].add(sample.get("instruction_style")) 117 | if ui_type: 118 | unique_values["ui_type"].add(sample.get("ui_type")) 119 | 120 | # Filter out the attributes that are set to False (no need for combinations) 121 | filtered_values = {key: list(value) for key, value in unique_values.items() if value} 122 | if not filtered_values: 123 | return [] 124 | 125 | # Generate all combinations of the selected attributes using itertools.product 126 | attribute_combinations = list(itertools.product(*filtered_values.values())) 127 | 128 | # Convert combinations into dictionaries with corresponding attribute names 129 | combinations = [] 130 | for combination in attribute_combinations: 131 | combinations.append(dict(zip(filtered_values.keys(), combination))) 132 | 133 | return combinations 134 | 135 | 136 | def calc_metric_for_result_list(results): 137 | """Calculates the metrics for a simple result list.""" 138 | num_total = len(results) 139 | correct_num = sum(1 for res in results if res["correctness"] == "correct") 140 | wrong_format_num = sum(1 for res in results if res["correctness"] == "wrong_format") 141 | 142 | # Calculate text and icon specific metrics using collect_results_to_eval 143 | text_results = collect_results_to_eval(results, ui_type="text") 144 | icon_results = collect_results_to_eval(results, ui_type="icon") 145 | 146 | text_correct = sum(1 for res in text_results if res["correctness"] == "correct") 147 | text_total = len(text_results) 148 | icon_correct = sum(1 for res in icon_results if res["correctness"] == "correct") 149 | icon_total = len(icon_results) 150 | metrics = { 151 | "num_correct_action": correct_num, 152 | "num_total": num_total, 153 | "wrong_format_num": wrong_format_num, 154 | "action_acc": correct_num / num_total if num_total > 0 else 0, 155 | "text_acc": text_correct / text_total if text_total > 0 else 0, 156 | "icon_acc": icon_correct / icon_total if icon_total > 0 else 0 157 | } 158 | return metrics 159 | 160 | 161 | def eval_sample_positive_gt(sample, response): 162 | bbox = sample["bbox"] 163 | bbox = [bbox[0], bbox[1], bbox[2], bbox[3]] # x1, y1, x2, y2 164 | # bbox = [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] # x1, y1, w, h 165 | img_size = sample["img_size"] 166 | bbox = [bbox[0] / img_size[0], bbox[1] / img_size[1], bbox[2] / img_size[0], bbox[3] / img_size[1]] 167 | 168 | click_point = response["point"] # may be none 169 | print(click_point) 170 | if click_point is None: 171 | return "wrong_format" 172 | # Check if the predicted point falls in the ground truth box 173 | if (bbox[0] <= click_point[0] <= bbox[2]) and (bbox[1] <= click_point[1] <= bbox[3]): 174 | return "correct" 175 | else: 176 | return "wrong" 177 | 178 | def 
eval_sample_negative_gt(sample, response): 179 | if response["result"] == "negative": 180 | return "correct" 181 | elif response["result"] == "positive": 182 | return "wrong" 183 | else: ## response["result"] == wrong_format 184 | return "wrong_format" 185 | 186 | def evaluate_fine_grained(results): 187 | # Generate all combinations of platform, instruction_style, and gt_type 188 | combinations = make_combinations( 189 | results, 190 | platform=True, 191 | application=True, 192 | instruction_style=True, 193 | gt_type=True 194 | ) 195 | 196 | evaluation_result = {} 197 | 198 | # Iterate through each combination 199 | for combo in combinations: 200 | platform = combo.get("platform") 201 | application = combo.get("application") 202 | inst_style = combo.get("instruction_style") 203 | gt_type = combo.get("gt_type") 204 | 205 | # Filter results for the current combination 206 | filtered_results = collect_results_to_eval( 207 | results=results, 208 | platform=platform, 209 | application=application, 210 | instruction_style=inst_style, 211 | gt_type=gt_type 212 | ) 213 | 214 | # Calculate metrics using the calc_metric_for_result_list function 215 | metrics = calc_metric_for_result_list(filtered_results) 216 | if metrics['num_total'] == 0: 217 | continue 218 | 219 | # Construct a unique key based on the combination 220 | key = f"plat:{platform} app:{application} inst_style:{inst_style} gt_type:{gt_type}" 221 | evaluation_result[key] = metrics 222 | 223 | return evaluation_result 224 | 225 | def evaluate_seeclick_paper_style(results): 226 | # Generate all combinations of platform, instruction_style, and gt_type 227 | combinations = make_combinations( 228 | results, 229 | platform=True, 230 | instruction_style=True, 231 | gt_type=True 232 | ) 233 | 234 | evaluation_result = {} 235 | 236 | # Iterate through each combination 237 | for combo in combinations: 238 | platform = combo.get("platform") 239 | inst_style = combo.get("instruction_style") 240 | gt_type = combo.get("gt_type") 241 | 242 | # Filter results for the current combination 243 | filtered_results = collect_results_to_eval( 244 | results=results, 245 | platform=platform, 246 | instruction_style=inst_style, 247 | gt_type=gt_type 248 | ) 249 | 250 | # Calculate metrics using the calc_metric_for_result_list function 251 | metrics = calc_metric_for_result_list(filtered_results) 252 | if metrics['num_total'] == 0: 253 | continue 254 | 255 | # Construct a unique key based on the combination 256 | key = f"plat:{platform} inst_style:{inst_style} gt_type:{gt_type}" 257 | evaluation_result[key] = metrics 258 | 259 | return evaluation_result 260 | 261 | def evaluate_leaderboard_detailed_style(results): 262 | # Generate all combinations of platform, instruction_style, and gt_type 263 | combinations = make_combinations( 264 | results, 265 | application=True, 266 | ) 267 | 268 | evaluation_result = {} 269 | 270 | # Iterate through each combination 271 | for combo in combinations: 272 | application = combo.get("application") 273 | 274 | # Filter results for the current combination 275 | filtered_results = collect_results_to_eval( 276 | results=results, 277 | application=application, 278 | ) 279 | 280 | # Calculate metrics using the calc_metric_for_result_list function 281 | metrics = calc_metric_for_result_list(filtered_results) 282 | if metrics['num_total'] == 0: 283 | continue 284 | 285 | # Construct a unique key based on the combination 286 | key = f"app:{application}" 287 | evaluation_result[key] = metrics 288 | 289 | return evaluation_result 290 | 
291 | def evaluate_leaderboard_simple_style(results): 292 | # Generate all combinations of platform, instruction_style, and gt_type 293 | combinations = make_combinations( 294 | results, 295 | group=True, 296 | ) 297 | 298 | evaluation_result = {} 299 | 300 | # Iterate through each combination 301 | for combo in combinations: 302 | group = combo.get("group") 303 | 304 | # Filter results for the current combination 305 | filtered_results = collect_results_to_eval( 306 | results=results, 307 | group=group, 308 | ) 309 | 310 | # Calculate metrics using the calc_metric_for_result_list function 311 | metrics = calc_metric_for_result_list(filtered_results) 312 | if metrics['num_total'] == 0: 313 | continue 314 | 315 | # Construct a unique key based on the combination 316 | key = f"group:{group}" 317 | evaluation_result[key] = metrics 318 | 319 | return evaluation_result 320 | 321 | def evaluate_overall(results): 322 | """ 323 | Evaluates the overall metrics for all results without any filtering. 324 | 325 | Parameters: 326 | results (list): A list of dictionaries containing sample results. 327 | 328 | Returns: 329 | dict: A dictionary containing the overall metrics. 330 | """ 331 | # Calculate metrics for the entire result set 332 | metrics = calc_metric_for_result_list(results) 333 | 334 | return metrics 335 | 336 | 337 | def evaluate(results): 338 | """Collect results and calculate metrics. You can comment out function calls or add new ones based on your need. 339 | """ 340 | result_report = { 341 | "details": [], # Store detailed information for each sample 342 | "metrics": {} 343 | } 344 | 345 | # TODO: comment out function calls based on your need 346 | result_report["metrics"]["fine_grained"] = evaluate_fine_grained(results) 347 | result_report["metrics"]["seeclick_style"] = evaluate_seeclick_paper_style(results) 348 | result_report["metrics"]["leaderboard_simple_style"] = evaluate_leaderboard_simple_style(results) 349 | result_report["metrics"]["leaderboard_detailed_style"] = evaluate_leaderboard_detailed_style(results) 350 | result_report["metrics"]["overall"] = evaluate_overall(results) 351 | 352 | # Save detailed results 353 | result_report["details"] = results 354 | 355 | return result_report 356 | 357 | def main(args): 358 | model = build_model(args) 359 | print("Load model success") 360 | 361 | if args.task == "all": 362 | task_filenames = [ 363 | os.path.splitext(f)[0] 364 | for f in os.listdir(args.screenspot_test) 365 | if f.endswith(".json") 366 | ] 367 | else: 368 | task_filenames = args.task.split(",") 369 | 370 | if args.inst_style == "all": 371 | inst_styles = INSTRUCTION_STYLES 372 | else: 373 | inst_styles = args.inst_style.split(",") 374 | 375 | if args.language == "all": 376 | languages = LANGUAGES 377 | else: 378 | languages = args.language.split(",") 379 | 380 | if args.gt_type == "all": 381 | gt_types = GT_TYPES 382 | else: 383 | gt_types = args.gt_type.split(",") 384 | 385 | tasks_to_run = [] 386 | for task_filename in task_filenames: 387 | dataset = task_filename + ".json" 388 | with open(os.path.join(args.screenspot_test, dataset), 'r') as f: 389 | task_data = json.load(f) 390 | 391 | # Create the list of tasks to run, one item as an instance. Tasks may be reused. 
392 | for inst_style in inst_styles: # Expand tasks based on user configurations 393 | for gt_type in gt_types: 394 | for lang in languages: 395 | for task_instance in task_data: 396 | task_instance = copy.deepcopy(task_instance) 397 | task_instance["task_filename"] = task_filename 398 | task_instance["gt_type"] = gt_type 399 | task_instance["instruction_style"] = inst_style 400 | task_instance["language"] = lang 401 | if lang == "cn": 402 | if inst_style!= 'instruction' or gt_type != 'positive': 403 | # TODO: Translate the data 404 | raise AttributeError("Only positive samples and 'instruction' style are supported for Chinese instructions.") 405 | task_instance["prompt_to_evaluate"] = task_instance["instruction_cn"] 406 | elif lang == "en": 407 | task_instance["prompt_to_evaluate"] = task_instance["instruction"] 408 | 409 | tasks_to_run.append(task_instance) 410 | print(f"Num of sample in {task_filename}: {len(task_data)} * {len(inst_styles)} * {len(gt_types)} * {len(languages)} = {len(task_data) * len(inst_styles) * len(gt_types) * len(languages)}") 411 | print(f"Total tasks: {len(tasks_to_run)}") 412 | 413 | results = [] 414 | for sample in tqdm(tasks_to_run): 415 | filename = sample["img_filename"] 416 | img_path = os.path.join(args.screenspot_imgs, filename) 417 | 418 | 419 | response = model.inference(instruction=sample["prompt_to_evaluate"], image_path=img_path) 420 | 421 | point = response["point"] 422 | tmp_img = Image.open(img_path) 423 | img_size = tmp_img.size 424 | sample["img_size"] = img_size 425 | point_in_pixel = [point[0] * img_size[0], point[1] * img_size[1]] if point else None 426 | 427 | sample_result = { 428 | "img_path": img_path, 429 | "group": sample["group"] if "group" in sample else None, 430 | "platform": sample["platform"] if "platform" in sample else None, 431 | "application": sample["application"] if "application" in sample else None, 432 | "lang": sample["language"] if "language" in sample else None, 433 | "instruction_style": sample["instruction_style"] if "instruction_style" in sample else None, 434 | "prompt_to_evaluate": sample["prompt_to_evaluate"] if "prompt_to_evaluate" in sample else None, 435 | "gt_type": sample["gt_type"] if "gt_type" in sample else 'positive', 436 | "ui_type": sample["ui_type"] if "ui_type" in sample else None, 437 | "task_filename": sample["task_filename"], 438 | "pred": point_in_pixel, 439 | "raw_response": response["raw_response"] 440 | } 441 | 442 | if sample["gt_type"] == "positive": 443 | correctness = eval_sample_positive_gt(sample, response) 444 | sample_result.update({ 445 | "bbox": sample["bbox"], 446 | }) 447 | elif sample["gt_type"] == "negative": 448 | correctness = eval_sample_negative_gt(sample, response) 449 | else: 450 | raise ValueError("Wrong instruction type") 451 | 452 | print(correctness, point, sample['bbox']) 453 | sample_result.update({ 454 | "correctness": correctness, 455 | }) 456 | results.append(sample_result) 457 | 458 | result_report = evaluate(results) 459 | os.makedirs(os.path.dirname(args.log_path), exist_ok=True) 460 | with open(args.log_path, 'w') as f: 461 | json.dump(result_report, f, indent=4) 462 | logging.info("Evaluation of ScreenSpot finished.") 463 | 464 | 465 | if __name__ == "__main__": 466 | main(parse_args()) 467 | --------------------------------------------------------------------------------