├── clicks
│   ├── .gitignore
│   ├── src
│   │   └── clicks
│   │       ├── third_party
│   │       │   ├── ui_tars
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── molmo
│   │       │   │   ├── __init__.py
│   │       │   │   ├── client.py
│   │       │   │   └── integration.py
│   │       │   ├── qwen
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── gemini
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── openai
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── openai_cua
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── claude
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── omniparser
│   │       │   │   ├── __init__.py
│   │       │   │   ├── client.py
│   │       │   │   └── integration.py
│   │       │   ├── common.py
│   │       │   └── __init__.py
│   │       ├── evaluate
│   │       │   ├── __init__.py
│   │       │   ├── models.py
│   │       │   ├── ace.py
│   │       │   └── utils.py
│   │       └── api_client_base.py
│   ├── pyproject.toml
│   ├── LICENSE
│   ├── scripts
│   │   ├── collect_runs.py
│   │   └── calculate_latency.py
│   ├── README.md
│   └── eval.py
└── README.md
-------------------------------------------------------------------------------- /clicks/.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *results* 3 | *.csv 4 | *.json 5 | *.jsonl 6 | *.txt 7 | __pycache__ 8 | data/*
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | UITarsAPIClient, 3 | get_ui_tars_api_client, 4 | ) 5 | 6 | __all__ = ['UITarsAPIClient', 'get_ui_tars_api_client'] 7 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | MolmoAPIClient, 3 | get_molmo_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'MolmoAPIClient', 8 | 'get_molmo_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | QwenVLAPIClient, 3 | get_qwen_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'QwenVLAPIClient', 8 | 'get_qwen_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | GeminiAPIClient, 3 | get_gemini_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'GeminiAPIClient', 8 | 'get_gemini_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OpenAIAPIClient, 3 | get_openai_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OpenAIAPIClient', 8 | 'get_openai_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OpenAICUAAPIClient, 3 | get_openai_cua_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OpenAICUAAPIClient', 8 | 'get_openai_cua_api_client', 9 | ] 10 |
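Each provider package above follows the same convention: its `__init__.py` re-exports an `*APIClient` class and a `get_*_api_client` factory. As a hedged usage sketch (the sample item, frame path, and run id are hypothetical; the factory signature and environment-variable fallback are taken from `gemini/integration.py` further below):

```python
# Sketch only: select a backend by importing its factory. get_gemini_api_client
# falls back to the GEMINI_API_KEY environment variable when api_key is None.
from clicks.third_party.gemini import get_gemini_api_client

client = get_gemini_api_client(model='gemini-2.0-flash', temperature=0.0)

# process_single_item is the shared entry point defined by AbstractAPIClient;
# the item dict mirrors the EvaluationItem fields used throughout the repo.
item = {
    'id': 'example-0',            # hypothetical sample
    'recording_id': 'rec-0',
    'instruction': 'Click the Submit button',
    'image': 'frame_0001.jpg',    # must exist under frames_dir
    'x1': 100, 'y1': 200, 'x2': 180, 'y2': 240,
}
result = client.process_single_item(item, frames_dir='data/frames', run_id='demo-run')
print(result.is_in_bbox, result.latency_seconds)
```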
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | ClaudeComputerUseAPIClient, 3 | get_claude_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'ClaudeComputerUseAPIClient', 8 | 'get_claude_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OmniParserAPIClient, 3 | get_omniparser_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OmniParserAPIClient', 8 | 'get_omniparser_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .ace import AceAPIClient, get_api_client 2 | from .models import ( 3 | AcePrediction, 4 | Action, 5 | ClickAction, 6 | Coordinate, 7 | GroundTruth, 8 | ) 9 | 10 | __all__ = [ 11 | 'Action', 12 | 'ClickAction', 13 | 'AceAPIClient', 14 | 'AcePrediction', 15 | 'Coordinate', 16 | 'GroundTruth', 17 | 'get_api_client', 18 | ] 19 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # showdown 2 | 3 | General Agents 4 | 5 | `showdown` is a suite of offline and online benchmarks for computer-use agents. 6 | 7 | If you use `showdown` in your research, please cite it as follows: 8 | 9 | ```bibtex 10 | @misc{showdown2025, 11 | title={The Showdown Computer Control Evaluation Suite}, 12 | author={General Agents Team}, 13 | year={2025}, 14 | url={https://github.com/generalagents/showdown}, 15 | } 16 | ```
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/common.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | 4 | from PIL import Image 5 | 6 | 7 | def encode_image_to_base64_uri(image_path: str) -> str: 8 | try: 9 | with Image.open(image_path) as img: 10 | if img.mode != 'RGB': 11 | img = img.convert('RGB') 12 | 13 | buffer = BytesIO() 14 | img.save(buffer, format='JPEG', quality=95) 15 | buffer.seek(0) 16 | 17 | img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') 18 | 19 | return f'data:image/jpeg;base64,{img_base64}' 20 | except Exception as e: 21 | raise ValueError(f'Error encoding image to base64: {e}') 22 |
-------------------------------------------------------------------------------- /clicks/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "clicks" 3 | version = "0.1.0" 4 | description = "clicks evals" 5 | requires-python = "==3.12.*" 6 | dependencies = [ 7 | "requests>=2.31.0", 8 | "pillow>=10.0.0", 9 | "tenacity>=8.2.3", 10 | "colorama>=0.4.6", 11 | "tqdm>=4.66.1", 12 | "pandas>=2.0.0", 13 | "scipy>=1.11.1", 14 | "anthropic>=0.18.1", 15 | "openai>=1.66.0", 16 | "matplotlib>=3.7.0", 17 | "google-genai>=0.6.0", 18 | "generalagents>=0.1.0", 19 | ] 20 | 21 | [tool.hatch.build.targets.wheel] 22 | packages = ["src/clicks"] 23 | 24 | [build-system] 25 | requires = ["hatchling"] 26 | build-backend = "hatchling.build" 27 |
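For a quick illustration of the helper in `common.py` above, which normalizes every frame to an RGB JPEG data URI before it is sent to a provider client (the frame path below is hypothetical):

```python
# Sketch only: encode a local screenshot and verify the data-URI prefix
# produced by clicks.third_party.common.encode_image_to_base64_uri.
from clicks.third_party.common import encode_image_to_base64_uri

data_uri = encode_image_to_base64_uri('data/frames/frame_0001.jpg')  # hypothetical path
assert data_uri.startswith('data:image/jpeg;base64,')
```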
-------------------------------------------------------------------------------- /clicks/src/clicks/api_client_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict 3 | 4 | from clicks.evaluate.models import EvaluationResult, ParsedPrediction 5 | 6 | 7 | class AbstractAPIClient(ABC): 8 | client_type: str 9 | model_name: str = '' 10 | 11 | @abstractmethod 12 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 13 | pass 14 | 15 | @abstractmethod 16 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 17 | pass 18 | 19 | @abstractmethod 20 | def process_single_item( 21 | self, item: Dict[str, Any], frames_dir: str, run_id: str 22 | ) -> EvaluationResult: 23 | pass 24 | -------------------------------------------------------------------------------- /clicks/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 General Agents 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
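For orientation, `api_client_base.py` above defines the interface that every integration under `third_party/` implements. A minimal, hypothetical subclass (not part of the repository, shown only to illustrate the contract) might look like this:

```python
import time
from typing import Any, Dict

from clicks.api_client_base import AbstractAPIClient
from clicks.evaluate.models import EvaluationItem, EvaluationResult, ParsedPrediction


class CenterClickClient(AbstractAPIClient):
    """Toy baseline that always predicts a fixed click position."""

    client_type = 'center_baseline'   # illustrative values, not used elsewhere in the repo
    model_name = 'center-baseline'

    def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None:
        # A real client would call a model API here.
        return {'pred_x': 640, 'pred_y': 400}

    def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction:
        return ParsedPrediction(pred_x=prediction.get('pred_x'), pred_y=prediction.get('pred_y'))

    def process_single_item(self, item: Dict[str, Any], frames_dir: str, run_id: str) -> EvaluationResult:
        eval_item = EvaluationItem.model_validate(item)
        start = time.time()
        parsed = self.parse_prediction(self.predict('', eval_item.instruction) or {})
        return EvaluationResult(
            id=eval_item.id,
            recording_id=eval_item.recording_id,
            instruction=eval_item.instruction,
            image_path=f'{frames_dir}/{eval_item.image}',
            gt_x1=eval_item.x1, gt_y1=eval_item.y1, gt_x2=eval_item.x2, gt_y2=eval_item.y2,
            pred_x=parsed.pred_x,
            pred_y=parsed.pred_y,
            is_in_bbox=None,  # real integrations use clicks.evaluate.utils.check_prediction_in_bbox
            latency_seconds=time.time() - start,
        )
```

The real integrations that follow add the same extra steps around this skeleton: base64-encoding the frame, timing the API call, checking the prediction against the ground-truth box, and writing a visualization.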
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | from .claude import ( 2 | ClaudeComputerUseAPIClient, 3 | get_claude_api_client, 4 | ) 5 | from .gemini import ( 6 | GeminiAPIClient, 7 | get_gemini_api_client, 8 | ) 9 | from .molmo import ( 10 | MolmoAPIClient, 11 | get_molmo_api_client, 12 | ) 13 | from .omniparser import ( 14 | OmniParserAPIClient, 15 | get_omniparser_api_client, 16 | ) 17 | from .openai import ( 18 | OpenAIAPIClient, 19 | get_openai_api_client, 20 | ) 21 | from .openai_cua import ( 22 | OpenAICUAAPIClient, 23 | get_openai_cua_api_client, 24 | ) 25 | from .qwen import ( 26 | QwenVLAPIClient, 27 | get_qwen_api_client, 28 | ) 29 | from .ui_tars import ( 30 | UITarsAPIClient, 31 | get_ui_tars_api_client, 32 | ) 33 | 34 | __all__ = [ 35 | # Claude 36 | 'ClaudeComputerUseAPIClient', 37 | 'get_claude_api_client', 38 | # Qwen 39 | 'QwenVLAPIClient', 40 | 'get_qwen_api_client', 41 | # OpenAI 42 | 'OpenAIAPIClient', 43 | 'get_openai_api_client', 44 | # OpenAI CUA 45 | 'OpenAICUAAPIClient', 46 | 'get_openai_cua_api_client', 47 | # Gemini 48 | 'GeminiAPIClient', 49 | 'get_gemini_api_client', 50 | # Molmo 51 | 'MolmoAPIClient', 52 | 'get_molmo_api_client', 53 | # UI-TARS 54 | 'UITarsAPIClient', 55 | 'get_ui_tars_api_client', 56 | # OmniParser 57 | 'OmniParserAPIClient', 58 | 'get_omniparser_api_client', 59 | ] 60 | -------------------------------------------------------------------------------- /clicks/scripts/collect_runs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import pandas as pd 5 | from clicks.evaluate.utils import analyze_results 6 | 7 | 8 | def summarize_all_results(): 9 | base_dir = os.path.dirname(os.path.abspath(__file__)) 10 | results_dir = os.path.join(base_dir, '..', 'results', 'showdown-clicks-dev') 11 | 12 | output_file = os.path.join(results_dir, '..', 'report', 'metrics.csv') 13 | 14 | run_dirs = [d for d in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, d))] 15 | 16 | all_metrics = [] 17 | 18 | print(f'Found {len(run_dirs)} result directories to process.') 19 | 20 | result_files = glob.glob(os.path.join(results_dir, '*.csv')) 21 | 22 | for result_file in result_files: 23 | try: 24 | print(f'Processing {result_file}...') 25 | 26 | # Extract model name from filename 27 | filename = os.path.basename(result_file) 28 | 29 | # Skip if this is a summary file 30 | if filename.startswith('summary_') or 'all_metrics' in filename: 31 | continue 32 | 33 | # Parse model name from filename 34 | name = filename.split('.csv')[0] 35 | 36 | # Read the results 37 | results_df = pd.read_csv(result_file) 38 | 39 | # Convert DataFrame to list of dictionaries for analysis 40 | # Ensure all keys are strings to match analyze_results parameter type 41 | results = [{str(k): v for k, v in item.items()} for item in results_df.to_dict('records')] 42 | 43 | if not results: 44 | print(f'No results found in {result_file}, skipping.') 45 | continue 46 | 47 | # Apply the same analysis logic 48 | results_analysis = analyze_results(results) 49 | 50 | # Create metrics dictionary 51 | metrics_dict = { 52 | 'model': name, 53 | 'ci': results_analysis.ci, 54 | 'accuracy': results_analysis.accuracy, 55 | 'total_correct': results_analysis.total_correct, 56 | 'total_processed': results_analysis.total_processed, 57 | 'accuracy_ci_low': 
results_analysis.accuracy_ci_low, 58 | 'accuracy_ci_high': results_analysis.accuracy_ci_high, 59 | 'result_file': result_file, 60 | } 61 | 62 | all_metrics.append(metrics_dict) 63 | print(f'Added metrics for {result_file}') 64 | 65 | except Exception as e: 66 | print(f'Error processing {result_file}: {e}') 67 | 68 | if all_metrics: 69 | # Create DataFrame and save to CSV 70 | all_metrics_df = pd.DataFrame(all_metrics) 71 | all_metrics_df.to_csv(output_file, index=False) 72 | print(f'All metrics saved to: {output_file}') 73 | else: 74 | print('No metrics were generated.') 75 | 76 | 77 | if __name__ == '__main__': 78 | summarize_all_results() 79 | -------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Coordinate(BaseModel): 7 | x: Optional[int] = None 8 | y: Optional[int] = None 9 | 10 | 11 | class ClickAction(BaseModel): 12 | kind: str = 'left_click' 13 | coordinate: Coordinate 14 | 15 | 16 | class Action(BaseModel): 17 | kind: str 18 | coordinate: Optional[Coordinate] = None 19 | text: Optional[str] = None 20 | 21 | 22 | class AcePrediction(BaseModel): 23 | action: Action 24 | raw_response: Optional[str] = None 25 | 26 | 27 | class ParsedPrediction(BaseModel): 28 | pred_x: Optional[int] = None 29 | pred_y: Optional[int] = None 30 | raw_response: Optional[str] = None 31 | 32 | 33 | class GroundTruth(BaseModel): 34 | gt_x1: Optional[int] = None 35 | gt_y1: Optional[int] = None 36 | gt_x2: Optional[int] = None 37 | gt_y2: Optional[int] = None 38 | 39 | 40 | class EvaluationMetrics(BaseModel): 41 | total_processed: int 42 | total_correct: int 43 | accuracy: float 44 | ci: float 45 | accuracy_ci_low: Optional[float] = None 46 | accuracy_ci_high: Optional[float] = None 47 | 48 | 49 | class EvaluationItem(BaseModel): 50 | id: str 51 | recording_id: str 52 | instruction: str 53 | image: str 54 | x1: Optional[int] = None 55 | y1: Optional[int] = None 56 | x2: Optional[int] = None 57 | y2: Optional[int] = None 58 | width: Optional[int] = None 59 | height: Optional[int] = None 60 | 61 | 62 | class EvaluationResult(BaseModel): 63 | id: str 64 | recording_id: str 65 | instruction: str 66 | image_path: str 67 | gt_x1: Optional[int] = None 68 | gt_y1: Optional[int] = None 69 | gt_x2: Optional[int] = None 70 | gt_y2: Optional[int] = None 71 | pred_x: Optional[int] = None 72 | pred_y: Optional[int] = None 73 | is_in_bbox: Optional[bool] = None 74 | latency_seconds: float 75 | visualization_path: Optional[str] = None 76 | raw_response: Optional[str] = None 77 | 78 | 79 | class ModelConfig(BaseModel): 80 | api_endpoint: str = '' 81 | 82 | 83 | class AceModelConfig(ModelConfig): 84 | model: str = 'ace-control-medium' 85 | api_key: Optional[str] = None 86 | 87 | 88 | class ClaudeModelConfig(ModelConfig): 89 | model: str = 'claude-3-7-sonnet-20250219' 90 | api_key: Optional[str] = None 91 | thinking_budget: Optional[int] = 1024 92 | tool_version: str = '20250124' 93 | 94 | 95 | class QwenModelConfig(ModelConfig): 96 | model: str = 'qwen2.5-vl-72b-instruct' 97 | api_key: Optional[str] = None 98 | max_tokens: int = 4096 99 | use_smart_resize: bool = True 100 | resize_factor: int = 28 101 | min_pixels: int = 3136 102 | max_pixels: int = 12845056 103 | 104 | 105 | class OpenAIModelConfig(ModelConfig): 106 | model: str = 'o1' 107 | api_key: Optional[str] = None 108 | max_tokens: int = 
4096 109 | reasoning_effort: str = 'medium' 110 | environment: str = 'mac' 111 | 112 | 113 | class GeminiModelConfig(ModelConfig): 114 | model: str = 'gemini-2.0-flash' 115 | api_key: Optional[str] = None 116 | max_tokens: int = 4096 117 | temperature: float = 0.0 118 | 119 | 120 | class OmniParserModelConfig(ModelConfig): 121 | model: str = 'gpt-4o-2024-05-13' 122 | temperature: float = 0.7 123 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | from typing import Any, Dict 5 | 6 | import requests 7 | from PIL import Image 8 | from tenacity import retry, stop_after_attempt, wait_exponential 9 | 10 | OMNIPARSER_API_ENDPOINT = 'https://omniparser-api-omniparser-api.modal.run' 11 | DEFAULT_MODEL = 'gpt-4o-2024-05-13' 12 | 13 | 14 | class OmniParserClient: 15 | def __init__( 16 | self, 17 | api_endpoint: str = OMNIPARSER_API_ENDPOINT, 18 | model: str = DEFAULT_MODEL, 19 | temperature: float = 0.7, 20 | ): 21 | self.api_endpoint = api_endpoint 22 | self.model = model 23 | self.temperature = temperature 24 | 25 | def _convert_to_base64(self, image_path: str) -> str: 26 | """Convert image file to base64 string.""" 27 | try: 28 | with open(image_path, 'rb') as image_file: 29 | return base64.b64encode(image_file.read()).decode('utf-8') 30 | except Exception as e: 31 | print(f'Error converting image to base64: {e}') 32 | raise 33 | 34 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 35 | try: 36 | image_data = base64.b64decode(base64_data) 37 | image = Image.open(io.BytesIO(image_data)) 38 | width, height = image.size 39 | return width, height 40 | except Exception as e: 41 | print(f'Error extracting image dimensions: {e}') 42 | return 1024, 768 43 | 44 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 45 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 46 | try: 47 | if ',' in image_data_uri: 48 | base64_data = image_data_uri.split(',')[1] 49 | else: 50 | base64_data = image_data_uri 51 | 52 | width, height = self._extract_image_dimensions(base64_data) 53 | 54 | payload = { 55 | 'image': base64_data, 56 | 'instruction': prompt, 57 | 'model_name': self.model, 58 | 'temperature': self.temperature, 59 | 'width': width, 60 | 'height': height, 61 | } 62 | 63 | response = requests.post( 64 | f'{self.api_endpoint}/computer_use', 65 | json=payload, 66 | headers={'Content-Type': 'application/json'}, 67 | ) 68 | 69 | response.raise_for_status() 70 | result = response.json() 71 | result['model'] = self.model 72 | result['width'] = width 73 | result['height'] = height 74 | 75 | return result 76 | 77 | except Exception as e: 78 | print(f'Error making prediction: {e}') 79 | return {'error': str(e)} 80 | 81 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 82 | if not prediction or 'error' in prediction: 83 | return { 84 | 'pred_x': None, 85 | 'pred_y': None, 86 | 'raw_response': json.dumps(prediction) if prediction else None, 87 | } 88 | 89 | if 'point' in prediction and prediction['point'] and len(prediction['point']) == 2: 90 | pred_x, pred_y = prediction['point'] 91 | pred_x = int(pred_x * prediction['width']) 92 | pred_y = int(pred_y * prediction['height']) 93 | else: 94 | pred_x, pred_y = None, None 95 | 96 | return { 97 | 'pred_x': pred_x, 98 | 'pred_y': pred_y, 99 | 
'raw_response': json.dumps(prediction, default=str), 100 | } 101 | -------------------------------------------------------------------------------- /clicks/scripts/calculate_latency.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from scipy import stats 9 | 10 | 11 | @dataclass 12 | class LatencyMetrics: 13 | mean_latency: float 14 | ci: float 15 | latency_ci_low: Optional[float] 16 | latency_ci_high: Optional[float] 17 | 18 | 19 | def calculate_latency_metrics(latencies: List[float], ci: float = 0.95) -> LatencyMetrics: 20 | if not latencies: 21 | return LatencyMetrics( 22 | mean_latency=0, 23 | ci=ci, 24 | latency_ci_low=None, 25 | latency_ci_high=None, 26 | ) 27 | 28 | latencies = sorted(latencies) 29 | # Drop the last ten latency values to remove potential outliers or anomalies 30 | latencies = latencies[:-10] 31 | 32 | mean_latency = np.mean(latencies) 33 | 34 | def calculate_mean(data): 35 | return np.mean(data) 36 | 37 | latency_ci = None 38 | try: 39 | latencies_array = np.array(latencies) 40 | latency_bootstrap = stats.bootstrap( 41 | (latencies_array,), 42 | calculate_mean, 43 | confidence_level=ci, 44 | method='percentile', 45 | ) 46 | latency_ci = latency_bootstrap.confidence_interval 47 | except Exception as e: 48 | print(f'Error calculating latency confidence interval: {e}') 49 | 50 | return LatencyMetrics( 51 | mean_latency=float(mean_latency), 52 | ci=float(ci), 53 | latency_ci_low=float(latency_ci.low) if latency_ci else None, 54 | latency_ci_high=float(latency_ci.high) if latency_ci else None, 55 | ) 56 | 57 | 58 | def read_and_analyze_results(directory: str = '.') -> Dict[str, LatencyMetrics]: 59 | csv_files = glob.glob(os.path.join(directory, '*.csv')) 60 | 61 | if not csv_files: 62 | print(f'No CSV files found in {directory}') 63 | return {} 64 | 65 | results = {} 66 | 67 | for csv_file in csv_files: 68 | model_name = os.path.basename(csv_file).replace('.csv', '') 69 | try: 70 | df = pd.read_csv(csv_file) 71 | 72 | if 'latency_seconds' in df.columns: 73 | latencies = df['latency_seconds'].dropna().tolist() 74 | metrics = calculate_latency_metrics(latencies) 75 | results[model_name] = metrics 76 | else: 77 | print(f'No latency column found in {csv_file}') 78 | except Exception as e: 79 | print(f'Error processing {csv_file}: {e}') 80 | 81 | return results 82 | 83 | 84 | def print_latency_summary(results: Dict[str, LatencyMetrics]) -> None: 85 | if not results: 86 | print('No results to display') 87 | return 88 | 89 | data = [] 90 | for model_name, metrics in results.items(): 91 | ci_str = ( 92 | f'[{metrics.latency_ci_low:.2f}, {metrics.latency_ci_high:.2f}]' 93 | if metrics.latency_ci_low is not None 94 | else 'N/A' 95 | ) 96 | data.append( 97 | { 98 | 'Model': model_name, 99 | 'Mean (s)': f'{metrics.mean_latency:.2f}', 100 | '95% CI': ci_str, 101 | } 102 | ) 103 | 104 | df = pd.DataFrame(data) 105 | print('\nLatency Summary:') 106 | print('-' * 80) 107 | print(df.to_string(index=False)) 108 | # calculated from a separate script 109 | print('operator | 3.88178 | [3.77679, 3.98342]') 110 | 111 | 112 | current_dir = os.path.join( 113 | os.path.dirname(os.path.abspath(__file__)), '..', 'results', 'showdown-clicks-dev' 114 | ) 115 | 116 | results = read_and_analyze_results(current_dir) 117 | 118 | print_latency_summary(results) 119 | 120 | # Save results to CSV 121 | 
output_data = [] 122 | for model_name, metrics in results.items(): 123 | output_data.append( 124 | { 125 | 'model': model_name, 126 | 'mean_latency': metrics.mean_latency, 127 | 'ci_low': metrics.latency_ci_low, 128 | 'ci_high': metrics.latency_ci_high, 129 | } 130 | ) 131 | 132 | output_dir = os.path.join(current_dir, '..', 'report') 133 | os.makedirs(output_dir, exist_ok=True) 134 | output_df = pd.DataFrame(output_data) 135 | output_path = os.path.join(output_dir, 'latency_results.csv') 136 | output_df.to_csv(output_path, index=False) 137 | print(f'\nResults saved to: {output_path}') 138 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import re 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | 13 | class MolmoClient: 14 | def __init__( 15 | self, 16 | api_url: str, 17 | api_key: Optional[str] = None, 18 | max_tokens: int = 4096, 19 | temperature: float = 0.0, 20 | top_p: float = 0.9, 21 | top_k: int = 50, 22 | ): 23 | self.api_url = api_url.rstrip('/') 24 | self.api_key = api_key 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | self.top_p = top_p 28 | self.top_k = top_k 29 | 30 | def _encode_image(self, image_data_uri: str) -> str: 31 | if ',' in image_data_uri: 32 | base64_data = image_data_uri.split(',')[1] 33 | else: 34 | base64_data = image_data_uri 35 | return base64_data 36 | 37 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 38 | try: 39 | image_data = base64.b64decode(base64_data) 40 | image = Image.open(io.BytesIO(image_data)) 41 | width, height = image.size 42 | return width, height 43 | except Exception as e: 44 | print(f'Error extracting image dimensions: {e}') 45 | return 1024, 768 46 | 47 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 48 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 49 | base64_image = self._encode_image(image_data_uri) 50 | 51 | prompt_template = ( 52 | f"""Point at the UI element to click to achieve the following action: {prompt}""" 53 | ) 54 | 55 | width, height = self._extract_image_dimensions(base64_image) 56 | request_data = { 57 | 'images': [base64_image], 58 | 'text': prompt_template, 59 | 'max_new_tokens': self.max_tokens, 60 | 'temperature': self.temperature, 61 | 'top_p': self.top_p, 62 | 'top_k': self.top_k, 63 | } 64 | 65 | headers = {'Content-Type': 'application/json'} 66 | if self.api_key: 67 | headers['Authorization'] = f'Bearer {self.api_key}' 68 | 69 | try: 70 | start_time = time.time() 71 | 72 | response = requests.post( 73 | f'{self.api_url}/generate', 74 | json=request_data, 75 | headers=headers, 76 | timeout=3600, 77 | ) 78 | 79 | end_time = time.time() 80 | latency = end_time - start_time 81 | 82 | if response.status_code == 200: 83 | result = response.json() 84 | return { 85 | 'raw_response': json.dumps(result), 86 | 'content': result.get('generated_text', ''), 87 | 'latency_seconds': latency, 88 | 'width': width, 89 | 'height': height, 90 | } 91 | else: 92 | error_text = response.text 93 | try: 94 | error_json = response.json() 95 | error_text = json.dumps(error_json) 96 | except: 97 | pass 98 | 99 | return { 100 | 'error': f'HTTP Error {response.status_code}', 
101 | 'error_details': error_text, 102 | 'latency_seconds': latency, 103 | } 104 | 105 | except Exception as e: 106 | return { 107 | 'error': f'API Error: {str(e)}', 108 | 'latency_seconds': 0, 109 | } 110 | 111 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 112 | if 'error' in prediction: 113 | return { 114 | 'pred_x': None, 115 | 'pred_y': None, 116 | 'error': prediction.get('error'), 117 | 'error_details': prediction.get('error_details', ''), 118 | 'raw_responses': prediction.get('raw_response', '{}'), 119 | } 120 | 121 | content = prediction.get('content', '') 122 | # Molmo answers with a point tag whose x/y are percentages of the image size. 123 | point_match = re.search(r'<point x="([\d.]+)" y="([\d.]+)"', content) 124 | 125 | pred_x, pred_y = None, None 126 | width = prediction.get('width', 0) 127 | height = prediction.get('height', 0) 128 | 129 | if point_match: 130 | rel_x = float(point_match.group(1)) 131 | rel_y = float(point_match.group(2)) 132 | 133 | # Convert percentage coordinates to pixel coordinates. 134 | if width > 0 and height > 0: 135 | pred_x = int(rel_x * width / 100) 136 | pred_y = int(rel_y * height / 100) 137 | 138 | return { 139 | 'pred_x': pred_x, 140 | 'pred_y': pred_y, 141 | 'content': content, 142 | 'raw_responses': prediction.get('raw_response', '{}'), 143 | } 144 |
-------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/ace.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | AceModelConfig, 8 | AcePrediction, 9 | EvaluationItem, 10 | EvaluationResult, 11 | ParsedPrediction, 12 | ) 13 | from clicks.evaluate.utils import ( 14 | check_prediction_in_bbox, 15 | print_colored_result, 16 | visualize_prediction, 17 | ) 18 | from generalagents import Agent 19 | from PIL import Image 20 | 21 | 22 | class AceAPIClient(AbstractAPIClient): 23 | def __init__(self, config: AceModelConfig): 24 | self.config = config 25 | self.client_type = 'ace' 26 | self.agent = Agent(model=self.config.model, api_key=self.config.api_key or '') 27 | 28 | def predict( 29 | self, 30 | image_data_uri: str, 31 | prompt: str, 32 | model: Optional[str] = None, 33 | ) -> Dict[str, Any]: 34 | raise NotImplementedError('Ace API client does not support predict method') 35 | 36 | def predict_ace( 37 | self, 38 | image: Image.Image, 39 | prompt: str, 40 | ) -> ParsedPrediction: 41 | session = self.agent.start(prompt) 42 | action = session.plan(image) 43 | 44 | if action.kind == 'left_click': 45 | pred_x = action.coordinate.x 46 | pred_y = action.coordinate.y 47 | else: 48 | pred_x, pred_y = None, None 49 | 50 | return ParsedPrediction( 51 | pred_x=pred_x, 52 | pred_y=pred_y, 53 | raw_response=str(action), 54 | ) 55 | 56 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 57 | ace_prediction = AcePrediction.model_validate(prediction) 58 | if ace_prediction.action.kind == 'left_click': 59 | if ace_prediction.action.coordinate: 60 | pred_x = ace_prediction.action.coordinate.x 61 | pred_y = ace_prediction.action.coordinate.y 62 | else: 63 | pred_x, pred_y = None, None 64 | else: 65 | pred_x, pred_y = None, None 66 | 67 | return ParsedPrediction( 68 | pred_x=pred_x, 69 | pred_y=pred_y, 70 | raw_response=ace_prediction.raw_response, 71 | ) 72 | 73 | def process_single_item( 74 | self, 75 | item: Dict[str, Any], 76 | frames_dir: str, 77 | run_id: str, 78 | ) -> EvaluationResult: 79 | eval_item = item if isinstance(item, EvaluationItem) else EvaluationItem.model_validate(item) 80 | 81 | image_path = eval_item.image 82 | local_path = os.path.join(frames_dir, image_path) 83 | if not os.path.exists(local_path): 84 | raise FileNotFoundError(f'Local image not found at {local_path}') 85 | image = Image.open(local_path) 86 | 87 | try: 88 |
start_time = time.time() 89 | prediction = self.predict_ace(image, eval_item.instruction) 90 | latency = time.time() - start_time 91 | 92 | pred_x = prediction.pred_x 93 | pred_y = prediction.pred_y 94 | 95 | gt_x1 = eval_item.x1 96 | gt_y1 = eval_item.y1 97 | gt_x2 = eval_item.x2 98 | gt_y2 = eval_item.y2 99 | 100 | is_in_bbox = check_prediction_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 101 | 102 | visualization_path = visualize_prediction( 103 | local_path, 104 | pred_x, 105 | pred_y, 106 | eval_item.id, 107 | eval_item.recording_id, 108 | eval_item.instruction, 109 | self.config.model, 110 | run_id, 111 | gt_x1, 112 | gt_y1, 113 | gt_x2, 114 | gt_y2, 115 | is_in_bbox, 116 | ) 117 | 118 | print_colored_result( 119 | eval_item.id, 120 | eval_item.instruction, 121 | pred_x, 122 | pred_y, 123 | latency, 124 | is_in_bbox, 125 | ) 126 | 127 | result = EvaluationResult( 128 | id=eval_item.id, 129 | recording_id=eval_item.recording_id, 130 | instruction=eval_item.instruction, 131 | image_path=local_path, 132 | gt_x1=gt_x1, 133 | gt_y1=gt_y1, 134 | gt_x2=gt_x2, 135 | gt_y2=gt_y2, 136 | pred_x=pred_x, 137 | pred_y=pred_y, 138 | is_in_bbox=is_in_bbox, 139 | latency_seconds=latency, 140 | raw_response=prediction.raw_response, 141 | visualization_path=visualization_path, 142 | ) 143 | 144 | return result 145 | 146 | except Exception as e: 147 | print(f'API request failed for {eval_item.id}: {str(e)}') 148 | raise e 149 | 150 | 151 | def get_api_client(api_key: Optional[str] = None, model: Optional[str] = None) -> AceAPIClient: 152 | api_key = api_key or os.environ.get('GENERALAGENTS_API_KEY', '') 153 | config = AceModelConfig(api_key=api_key, model=model or 'ace-control-small') 154 | return AceAPIClient(config=config) 155 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | ) 11 | from clicks.evaluate.utils import ( 12 | check_prediction_in_bbox, 13 | print_colored_result, 14 | visualize_prediction, 15 | ) 16 | 17 | from ..common import encode_image_to_base64_uri 18 | from .client import MolmoClient 19 | 20 | 21 | class MolmoAPIClient(AbstractAPIClient): 22 | def __init__( 23 | self, 24 | api_url: str, 25 | api_key: Optional[str] = None, 26 | max_tokens: int = 4096, 27 | temperature: float = 0.0, 28 | top_p: float = 0.9, 29 | top_k: int = 50, 30 | ): 31 | self.client = MolmoClient( 32 | api_url=api_url, 33 | api_key=api_key, 34 | max_tokens=max_tokens, 35 | temperature=temperature, 36 | top_p=top_p, 37 | top_k=top_k, 38 | ) 39 | self.client_type = 'molmo' 40 | 41 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 42 | return self.client.predict(image_data_uri=image_data_uri, prompt=prompt) 43 | 44 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 45 | parsed = self.client.parse_prediction(prediction) 46 | 47 | return ParsedPrediction( 48 | pred_x=parsed.get('pred_x'), 49 | pred_y=parsed.get('pred_y'), 50 | raw_response=parsed.get('raw_responses', '{}'), 51 | ) 52 | 53 | def process_single_item( 54 | self, 55 | item: Dict[str, Any], 56 | frames_dir: str, 57 | run_id: str, 58 | ) -> EvaluationResult: 59 | eval_item = EvaluationItem( 60 | 
id=item['id'], 61 | recording_id=item['recording_id'], 62 | instruction=item['instruction'], 63 | image=item['image'], 64 | x1=item['x1'], 65 | y1=item['y1'], 66 | x2=item['x2'], 67 | y2=item['y2'], 68 | ) 69 | 70 | image_path = eval_item.image 71 | 72 | local_path = os.path.join(frames_dir, image_path) 73 | if not os.path.exists(local_path): 74 | raise FileNotFoundError(f'Local image not found at {local_path}') 75 | 76 | image_data_uri = encode_image_to_base64_uri(local_path) 77 | if image_data_uri is None: 78 | raise ValueError(f'Failed to encode image at {local_path}') 79 | 80 | try: 81 | start_time = time.time() 82 | prediction = self.predict(image_data_uri, eval_item.instruction) 83 | latency = time.time() - start_time 84 | 85 | pred_result = self.parse_prediction(prediction) 86 | pred_x = pred_result.pred_x 87 | pred_y = pred_result.pred_y 88 | 89 | is_in_bbox = check_prediction_in_bbox( 90 | pred_x, pred_y, eval_item.x1, eval_item.y1, eval_item.x2, eval_item.y2 91 | ) 92 | 93 | print_colored_result( 94 | item_id=eval_item.id, 95 | instruction=eval_item.instruction, 96 | pred_x=pred_x, 97 | pred_y=pred_y, 98 | latency=latency, 99 | is_in_bbox=is_in_bbox, 100 | ) 101 | 102 | visualization_path = visualize_prediction( 103 | image_path=local_path, 104 | pred_x=pred_x, 105 | pred_y=pred_y, 106 | item_id=eval_item.id, 107 | recording_id=eval_item.recording_id, 108 | instruction=eval_item.instruction, 109 | model_name='molmo', 110 | run_id=run_id, 111 | gt_x1=eval_item.x1, 112 | gt_y1=eval_item.y1, 113 | gt_x2=eval_item.x2, 114 | gt_y2=eval_item.y2, 115 | is_in_bbox=is_in_bbox, 116 | ) 117 | 118 | result = EvaluationResult( 119 | id=eval_item.id, 120 | recording_id=eval_item.recording_id, 121 | instruction=eval_item.instruction, 122 | image_path=local_path, 123 | gt_x1=eval_item.x1, 124 | gt_y1=eval_item.y1, 125 | gt_x2=eval_item.x2, 126 | gt_y2=eval_item.y2, 127 | pred_x=pred_x, 128 | pred_y=pred_y, 129 | is_in_bbox=is_in_bbox, 130 | latency_seconds=latency, 131 | raw_response=prediction.get('raw_response', '{}'), 132 | visualization_path=visualization_path, 133 | ) 134 | 135 | return result 136 | 137 | except Exception as e: 138 | print(f'Error processing item {eval_item.id}: {str(e)}') 139 | raise e 140 | 141 | 142 | def get_molmo_api_client( 143 | api_url: str, 144 | api_key: Optional[str] = None, 145 | max_tokens: int = 4096, 146 | temperature: float = 0.0, 147 | top_p: float = 0.9, 148 | top_k: int = 50, 149 | ) -> MolmoAPIClient: 150 | return MolmoAPIClient( 151 | api_url=api_url, 152 | api_key=api_key, 153 | max_tokens=max_tokens, 154 | temperature=temperature, 155 | top_p=top_p, 156 | top_k=top_k, 157 | ) 158 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | ) 11 | from clicks.evaluate.utils import ( 12 | check_prediction_in_bbox, 13 | print_colored_result, 14 | visualize_prediction, 15 | ) 16 | 17 | from ..common import encode_image_to_base64_uri 18 | from .client import UITarsClient 19 | 20 | 21 | class UITarsAPIClient(AbstractAPIClient): 22 | def __init__( 23 | self, 24 | api_url: str, 25 | api_key: str = 'super-secret-key', 26 | max_tokens: int = 128, 27 | temperature: float = 0.0, 
28 | frequency_penalty: float = 1.0, 29 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 30 | ): 31 | self.client = UITarsClient( 32 | api_url=api_url, 33 | api_key=api_key, 34 | max_tokens=max_tokens, 35 | temperature=temperature, 36 | frequency_penalty=frequency_penalty, 37 | model_name=model_name, 38 | ) 39 | self.model_name = model_name 40 | self.client_type = 'ui_tars' 41 | 42 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 43 | return self.client.predict(image_data_uri=image_data_uri, prompt=prompt) 44 | 45 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 46 | parsed_data = self.client.parse_prediction(prediction) 47 | 48 | return ParsedPrediction( 49 | pred_x=parsed_data.get('pred_x'), 50 | pred_y=parsed_data.get('pred_y'), 51 | raw_response=parsed_data.get('raw_responses', '{}'), 52 | ) 53 | 54 | def process_single_item( 55 | self, 56 | item: Dict[str, Any], 57 | frames_dir: str, 58 | run_id: str, 59 | ) -> EvaluationResult: 60 | eval_item = EvaluationItem( 61 | id=item['id'], 62 | recording_id=item['recording_id'], 63 | instruction=item['instruction'], 64 | image=item['image'], 65 | x1=item['x1'], 66 | y1=item['y1'], 67 | x2=item['x2'], 68 | y2=item['y2'], 69 | ) 70 | 71 | image_path = os.path.join(frames_dir, eval_item.image) 72 | 73 | image_data_uri = encode_image_to_base64_uri(image_path) 74 | if not image_data_uri: 75 | raise ValueError(f'Failed to encode image: {image_path}') 76 | 77 | print(f'Processing item {eval_item.id}: {eval_item.instruction}') 78 | start_time = time.time() 79 | prediction = self.predict(image_data_uri=image_data_uri, prompt=eval_item.instruction) 80 | end_time = time.time() 81 | latency = end_time - start_time 82 | 83 | parsed_prediction = self.parse_prediction(prediction) 84 | 85 | error = prediction.get('error') 86 | if error: 87 | print(f'Error: {error}') 88 | raise ValueError(error) 89 | 90 | is_in_bbox = check_prediction_in_bbox( 91 | pred_x=parsed_prediction.pred_x, 92 | pred_y=parsed_prediction.pred_y, 93 | gt_x1=eval_item.x1, 94 | gt_y1=eval_item.y1, 95 | gt_x2=eval_item.x2, 96 | gt_y2=eval_item.y2, 97 | ) 98 | 99 | print_colored_result( 100 | item_id=eval_item.id, 101 | instruction=eval_item.instruction, 102 | pred_x=parsed_prediction.pred_x, 103 | pred_y=parsed_prediction.pred_y, 104 | latency=latency, 105 | is_in_bbox=is_in_bbox, 106 | ) 107 | 108 | visualization_path = visualize_prediction( 109 | image_path=image_path, 110 | pred_x=parsed_prediction.pred_x, 111 | pred_y=parsed_prediction.pred_y, 112 | item_id=eval_item.id, 113 | recording_id=eval_item.recording_id, 114 | instruction=eval_item.instruction, 115 | model_name=self.model_name, 116 | run_id=run_id, 117 | gt_x1=eval_item.x1, 118 | gt_y1=eval_item.y1, 119 | gt_x2=eval_item.x2, 120 | gt_y2=eval_item.y2, 121 | is_in_bbox=is_in_bbox, 122 | ) 123 | 124 | return EvaluationResult( 125 | id=eval_item.id, 126 | recording_id=eval_item.recording_id, 127 | instruction=eval_item.instruction, 128 | image_path=image_path, 129 | gt_x1=eval_item.x1, 130 | gt_y1=eval_item.y1, 131 | gt_x2=eval_item.x2, 132 | gt_y2=eval_item.y2, 133 | pred_x=parsed_prediction.pred_x, 134 | pred_y=parsed_prediction.pred_y, 135 | is_in_bbox=is_in_bbox, 136 | latency_seconds=latency, 137 | raw_response=parsed_prediction.raw_response, 138 | visualization_path=visualization_path, 139 | ) 140 | 141 | 142 | def get_ui_tars_api_client( 143 | api_url: str, 144 | api_key: str = 'super-secret-key', 145 | max_tokens: int = 128, 146 | temperature: float = 0.0, 147 | 
frequency_penalty: float = 1.0, 148 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 149 | **kwargs, 150 | ) -> UITarsAPIClient: 151 | client = UITarsAPIClient( 152 | api_url=api_url, 153 | api_key=api_key, 154 | max_tokens=max_tokens, 155 | temperature=temperature, 156 | frequency_penalty=frequency_penalty, 157 | model_name=model_name, 158 | ) 159 | return client 160 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | GeminiModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import GeminiClient as GeminiBaseClient 21 | 22 | 23 | class GeminiAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[GeminiModelConfig] = None, 27 | ): 28 | self.config = config or GeminiModelConfig() 29 | 30 | self.api_key = self.config.api_key 31 | self.model = self.config.model 32 | self.max_tokens = self.config.max_tokens 33 | self.temperature = self.config.temperature 34 | self.client_type = 'gemini' 35 | 36 | def predict( 37 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 38 | ) -> Dict[str, Any] | None: 39 | client = GeminiBaseClient( 40 | api_key=self.api_key, 41 | model=model or self.model, 42 | max_tokens=self.max_tokens, 43 | temperature=self.temperature, 44 | ) 45 | 46 | return client.predict(image_data_uri, prompt) 47 | 48 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 49 | client = GeminiBaseClient( 50 | api_key=self.api_key, 51 | model=self.model, 52 | max_tokens=self.max_tokens, 53 | temperature=self.temperature, 54 | ) 55 | 56 | parsed = client.parse_prediction(prediction) 57 | 58 | return ParsedPrediction( 59 | pred_x=parsed['pred_x'], 60 | pred_y=parsed['pred_y'], 61 | raw_response=parsed['raw_response'], 62 | ) 63 | 64 | def process_single_item( 65 | self, 66 | item: Dict[str, Any], 67 | frames_dir: str, 68 | run_id: str, 69 | ) -> EvaluationResult: 70 | eval_item = EvaluationItem( 71 | id=item['id'], 72 | recording_id=item['recording_id'], 73 | instruction=item['instruction'], 74 | image=item['image'], 75 | x1=item['x1'], 76 | y1=item['y1'], 77 | x2=item['x2'], 78 | y2=item['y2'], 79 | ) 80 | 81 | image_path = os.path.join(frames_dir, eval_item.image) 82 | 83 | if not os.path.exists(image_path): 84 | raise FileNotFoundError(f'Image file not found: {image_path}') 85 | 86 | image_data_uri = encode_image_to_base64_uri(image_path) 87 | start_time = time.time() 88 | prediction = self.predict(image_data_uri, eval_item.instruction) 89 | end_time = time.time() 90 | latency = end_time - start_time 91 | 92 | if prediction: 93 | parsed_prediction = self.parse_prediction(prediction) 94 | 95 | is_in_bbox = check_prediction_in_bbox( 96 | pred_x=parsed_prediction.pred_x, 97 | pred_y=parsed_prediction.pred_y, 98 | gt_x1=eval_item.x1, 99 | gt_y1=eval_item.y1, 100 | gt_x2=eval_item.x2, 101 | gt_y2=eval_item.y2, 102 | ) 103 | 104 | print_colored_result( 105 | item_id=eval_item.id, 106 | 
instruction=eval_item.instruction, 107 | pred_x=parsed_prediction.pred_x, 108 | pred_y=parsed_prediction.pred_y, 109 | latency=latency, 110 | is_in_bbox=is_in_bbox, 111 | ) 112 | 113 | visualization_path = visualize_prediction( 114 | image_path=image_path, 115 | pred_x=parsed_prediction.pred_x, 116 | pred_y=parsed_prediction.pred_y, 117 | item_id=eval_item.id, 118 | recording_id=eval_item.recording_id, 119 | instruction=eval_item.instruction, 120 | model_name='gemini', 121 | run_id=run_id, 122 | gt_x1=eval_item.x1, 123 | gt_y1=eval_item.y1, 124 | gt_x2=eval_item.x2, 125 | gt_y2=eval_item.y2, 126 | is_in_bbox=is_in_bbox, 127 | ) 128 | 129 | result = EvaluationResult( 130 | id=eval_item.id, 131 | recording_id=eval_item.recording_id, 132 | instruction=eval_item.instruction, 133 | image_path=image_path, 134 | gt_x1=eval_item.x1, 135 | gt_y1=eval_item.y1, 136 | gt_x2=eval_item.x2, 137 | gt_y2=eval_item.y2, 138 | pred_x=parsed_prediction.pred_x, 139 | pred_y=parsed_prediction.pred_y, 140 | is_in_bbox=is_in_bbox, 141 | latency_seconds=latency, 142 | raw_response=parsed_prediction.raw_response, 143 | visualization_path=visualization_path, 144 | ) 145 | 146 | return result 147 | else: 148 | raise ValueError('Prediction is None') 149 | 150 | 151 | def get_gemini_api_client( 152 | api_key: Optional[str] = None, 153 | model: str = DEFAULT_MODEL, 154 | max_tokens: int = 4096, 155 | temperature: float = 0.0, 156 | ) -> GeminiAPIClient: 157 | config = GeminiModelConfig( 158 | model=model, 159 | api_key=api_key or os.environ.get('GEMINI_API_KEY'), 160 | max_tokens=max_tokens, 161 | temperature=temperature, 162 | ) 163 | 164 | return GeminiAPIClient(config=config) 165 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | ClaudeModelConfig, 8 | EvaluationItem, 9 | EvaluationResult, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL, ClaudeComputerUseClient 20 | 21 | 22 | class ClaudeComputerUseAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[ClaudeModelConfig] = None, 26 | ): 27 | if config is None: 28 | config = ClaudeModelConfig( 29 | api_endpoint='https://api.anthropic.com/v1/messages', 30 | model=DEFAULT_MODEL, 31 | api_key=os.environ.get('ANTHROPIC_API_KEY'), 32 | thinking_budget=1024, 33 | tool_version='20250124', 34 | ) 35 | 36 | self.claude_client = ClaudeComputerUseClient( 37 | api_key=config.api_key, 38 | api_endpoint=config.api_endpoint, 39 | model=config.model, 40 | thinking_budget=config.thinking_budget, 41 | tool_version=config.tool_version, 42 | ) 43 | self.config = config 44 | self.client_type = 'claude' 45 | 46 | def predict( 47 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 48 | ) -> Dict[str, Any] | None: 49 | return self.claude_client.predict(image_data_uri, prompt) 50 | 51 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 52 | result_dict = self.claude_client.parse_prediction(prediction) 53 | 54 | return ParsedPrediction( 55 | pred_x=result_dict.get('pred_x'), 56 
| pred_y=result_dict.get('pred_y'), 57 | raw_response=result_dict.get('raw_response'), 58 | ) 59 | 60 | def process_single_item( 61 | self, 62 | item: Dict[str, Any], 63 | frames_dir: str, 64 | run_id: str, 65 | ) -> EvaluationResult: 66 | try: 67 | eval_item = item if isinstance(item, EvaluationItem) else EvaluationItem.model_validate(item) 68 | 69 | item_id = eval_item.id 70 | recording_id = eval_item.recording_id 71 | instruction = eval_item.instruction 72 | image_path = eval_item.image 73 | 74 | gt_x1 = eval_item.x1 75 | gt_y1 = eval_item.y1 76 | gt_x2 = eval_item.x2 77 | gt_y2 = eval_item.y2 78 | 79 | local_path = os.path.join(frames_dir, image_path) 80 | if not os.path.exists(local_path): 81 | raise FileNotFoundError(f'Image file not found: {local_path}') 82 | 83 | image_data_uri = encode_image_to_base64_uri(local_path) 84 | print(f'Processing item {item_id} with instruction: {instruction}') 85 | 86 | start_time = time.time() 87 | prediction = self.predict(image_data_uri, instruction) 88 | end_time = time.time() 89 | latency = end_time - start_time 90 | 91 | if prediction is None: 92 | print(f'Claude returned None for item {item_id}') 93 | pred_result = ParsedPrediction() 94 | pred_x = None 95 | pred_y = None 96 | else: 97 | pred_result = self.parse_prediction(prediction) 98 | pred_x = pred_result.pred_x 99 | pred_y = pred_result.pred_y 100 | 101 | is_in_bbox = check_prediction_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 102 | 103 | print_colored_result( 104 | item_id, 105 | instruction, 106 | pred_x, 107 | pred_y, 108 | latency, 109 | is_in_bbox, 110 | ) 111 | 112 | visualization_path = visualize_prediction( 113 | local_path, 114 | pred_x, 115 | pred_y, 116 | item_id, 117 | recording_id, 118 | instruction, 119 | self.config.model, 120 | run_id, 121 | gt_x1, 122 | gt_y1, 123 | gt_x2, 124 | gt_y2, 125 | is_in_bbox, 126 | ) 127 | 128 | result = EvaluationResult( 129 | id=item_id, 130 | recording_id=recording_id, 131 | instruction=instruction, 132 | image_path=local_path, 133 | gt_x1=gt_x1, 134 | gt_y1=gt_y1, 135 | gt_x2=gt_x2, 136 | gt_y2=gt_y2, 137 | pred_x=pred_x, 138 | pred_y=pred_y, 139 | is_in_bbox=is_in_bbox, 140 | latency_seconds=latency, 141 | raw_response=pred_result.raw_response if pred_result else None, 142 | visualization_path=visualization_path, 143 | ) 144 | 145 | return result 146 | 147 | except Exception as e: 148 | print(f'API request failed: {str(e)}') 149 | raise e 150 | 151 | 152 | def get_claude_api_client( 153 | api_key: Optional[str] = None, 154 | api_endpoint: Optional[str] = None, 155 | model: str = DEFAULT_MODEL, 156 | thinking_budget: Optional[int] = 1024, 157 | tool_version: str = '20250124', 158 | ) -> ClaudeComputerUseAPIClient: 159 | config = ClaudeModelConfig( 160 | api_endpoint=api_endpoint or 'https://api.anthropic.com/v1/messages', 161 | model=model, 162 | api_key=api_key or os.environ.get('ANTHROPIC_API_KEY'), 163 | thinking_budget=thinking_budget, 164 | tool_version=tool_version, 165 | ) 166 | return ClaudeComputerUseAPIClient(config) 167 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OmniParserModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from 
clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import OmniParserClient as OmniParserBaseClient 21 | 22 | 23 | class OmniParserAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[OmniParserModelConfig] = None, 27 | ): 28 | self.config = config or OmniParserModelConfig( 29 | api_endpoint=os.environ.get( 30 | 'OMNIPARSER_API_ENDPOINT', 'https://omniparser-api-omniparser-api.modal.run' 31 | ) 32 | ) 33 | 34 | self.api_endpoint = self.config.api_endpoint 35 | self.model = self.config.model 36 | self.temperature = self.config.temperature 37 | self.client_type = 'omniparser' 38 | 39 | def predict( 40 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 41 | ) -> Dict[str, Any] | None: 42 | client = OmniParserBaseClient( 43 | api_endpoint=self.api_endpoint, 44 | model=model or self.model, 45 | temperature=self.temperature, 46 | ) 47 | 48 | return client.predict(image_data_uri, prompt) 49 | 50 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 51 | client = OmniParserBaseClient( 52 | api_endpoint=self.api_endpoint, 53 | model=self.model, 54 | temperature=self.temperature, 55 | ) 56 | 57 | parsed = client.parse_prediction(prediction) 58 | 59 | return ParsedPrediction( 60 | pred_x=parsed['pred_x'], 61 | pred_y=parsed['pred_y'], 62 | raw_response=parsed['raw_response'], 63 | ) 64 | 65 | def process_single_item( 66 | self, 67 | item: Dict[str, Any], 68 | frames_dir: str, 69 | run_id: str, 70 | ) -> EvaluationResult: 71 | eval_item = EvaluationItem( 72 | id=item['id'], 73 | recording_id=item['recording_id'], 74 | instruction=item['instruction'], 75 | image=item['image'], 76 | x1=item['x1'], 77 | y1=item['y1'], 78 | x2=item['x2'], 79 | y2=item['y2'], 80 | ) 81 | 82 | image_path = os.path.join(frames_dir, eval_item.image) 83 | 84 | if not os.path.exists(image_path): 85 | raise FileNotFoundError(f'Image file not found at {image_path}') 86 | 87 | image_data_uri = encode_image_to_base64_uri(image_path) 88 | if image_data_uri is None: 89 | raise ValueError(f'Failed to encode image at {image_path}') 90 | 91 | start_time = time.time() 92 | prediction = self.predict(image_data_uri, eval_item.instruction) 93 | end_time = time.time() 94 | latency = end_time - start_time 95 | 96 | if prediction: 97 | parsed_prediction = self.parse_prediction(prediction) 98 | 99 | is_in_bbox = check_prediction_in_bbox( 100 | pred_x=parsed_prediction.pred_x, 101 | pred_y=parsed_prediction.pred_y, 102 | gt_x1=eval_item.x1, 103 | gt_y1=eval_item.y1, 104 | gt_x2=eval_item.x2, 105 | gt_y2=eval_item.y2, 106 | ) 107 | 108 | print_colored_result( 109 | item_id=eval_item.id, 110 | instruction=eval_item.instruction, 111 | pred_x=parsed_prediction.pred_x, 112 | pred_y=parsed_prediction.pred_y, 113 | latency=latency, 114 | is_in_bbox=is_in_bbox, 115 | ) 116 | 117 | visualization_path = visualize_prediction( 118 | image_path=image_path, 119 | pred_x=parsed_prediction.pred_x, 120 | pred_y=parsed_prediction.pred_y, 121 | item_id=eval_item.id, 122 | recording_id=eval_item.recording_id, 123 | instruction=eval_item.instruction, 124 | model_name='omniparser', 125 | run_id=run_id, 126 | gt_x1=eval_item.x1, 127 | gt_y1=eval_item.y1, 128 | gt_x2=eval_item.x2, 129 | gt_y2=eval_item.y2, 130 | is_in_bbox=is_in_bbox, 131 | ) 132 | 133 | result = EvaluationResult( 134 | id=eval_item.id, 135 | 
recording_id=eval_item.recording_id, 136 | instruction=eval_item.instruction, 137 | image_path=image_path, 138 | gt_x1=eval_item.x1, 139 | gt_y1=eval_item.y1, 140 | gt_x2=eval_item.x2, 141 | gt_y2=eval_item.y2, 142 | pred_x=parsed_prediction.pred_x, 143 | pred_y=parsed_prediction.pred_y, 144 | is_in_bbox=is_in_bbox, 145 | latency_seconds=latency, 146 | raw_response=parsed_prediction.raw_response, 147 | visualization_path=visualization_path, 148 | ) 149 | 150 | return result 151 | else: 152 | raise ValueError('Prediction is None') 153 | 154 | 155 | def get_omniparser_api_client( 156 | api_endpoint: Optional[str] = None, 157 | model: str = DEFAULT_MODEL, 158 | temperature: float = 0.7, 159 | ) -> OmniParserAPIClient: 160 | config = OmniParserModelConfig( 161 | api_endpoint=api_endpoint 162 | or os.environ.get('OMNIPARSER_API_ENDPOINT', 'https://omniparser-api-omniparser-api.modal.run'), 163 | model=model, 164 | temperature=temperature, 165 | ) 166 | 167 | return OmniParserAPIClient(config=config) 168 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OpenAIModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_ENVIRONMENT, DEFAULT_MODEL, OpenAICUAClient 20 | 21 | 22 | class OpenAICUAAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[OpenAIModelConfig] = None, 26 | ): 27 | self.config = config or OpenAIModelConfig( 28 | api_endpoint=os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1') 29 | ) 30 | 31 | self.api_key = self.config.api_key 32 | self.api_endpoint = self.config.api_endpoint 33 | self.model = self.config.model 34 | self.max_tokens = self.config.max_tokens 35 | self.environment = getattr(self.config, 'environment', DEFAULT_ENVIRONMENT) 36 | self.client_type = 'openai_cua' 37 | 38 | def predict( 39 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 40 | ) -> Dict[str, Any] | None: 41 | client = OpenAICUAClient( 42 | api_key=self.api_key, 43 | api_endpoint=self.api_endpoint, 44 | model=model or self.model, 45 | max_tokens=self.max_tokens, 46 | environment=self.environment, 47 | ) 48 | 49 | return client.predict(image_data_uri, prompt) 50 | 51 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 52 | client = OpenAICUAClient( 53 | api_key=self.api_key, 54 | api_endpoint=self.api_endpoint, 55 | model=self.model, 56 | max_tokens=self.max_tokens, 57 | environment=self.environment, 58 | ) 59 | 60 | parsed = client.parse_prediction(prediction) 61 | 62 | return ParsedPrediction( 63 | pred_x=parsed['pred_x'], 64 | pred_y=parsed['pred_y'], 65 | raw_response=parsed['raw_response'], 66 | ) 67 | 68 | def process_single_item( 69 | self, 70 | item: Dict[str, Any], 71 | frames_dir: str, 72 | run_id: str, 73 | ) -> EvaluationResult: 74 | eval_item = EvaluationItem( 75 | id=item['id'], 76 | recording_id=item['recording_id'], 77 | instruction=item['instruction'], 78 | image=item['image'], 79 | x1=item['x1'], 80 | 
y1=item['y1'], 81 | x2=item['x2'], 82 | y2=item['y2'], 83 | ) 84 | 85 | image_path = os.path.join(frames_dir, eval_item.image) 86 | 87 | if not os.path.exists(image_path): 88 | raise FileNotFoundError(f'Image file not found at {image_path}') 89 | 90 | image_data_uri = encode_image_to_base64_uri(image_path) 91 | if image_data_uri is None: 92 | raise ValueError(f'Failed to encode image at {image_path}') 93 | 94 | start_time = time.time() 95 | prediction = self.predict(image_data_uri, eval_item.instruction) 96 | end_time = time.time() 97 | latency = end_time - start_time 98 | 99 | if prediction: 100 | parsed_prediction = self.parse_prediction(prediction) 101 | 102 | is_in_bbox = check_prediction_in_bbox( 103 | pred_x=parsed_prediction.pred_x, 104 | pred_y=parsed_prediction.pred_y, 105 | gt_x1=eval_item.x1, 106 | gt_y1=eval_item.y1, 107 | gt_x2=eval_item.x2, 108 | gt_y2=eval_item.y2, 109 | ) 110 | 111 | print_colored_result( 112 | item_id=eval_item.id, 113 | instruction=eval_item.instruction, 114 | pred_x=parsed_prediction.pred_x, 115 | pred_y=parsed_prediction.pred_y, 116 | latency=latency, 117 | is_in_bbox=is_in_bbox, 118 | ) 119 | 120 | visualization_path = visualize_prediction( 121 | image_path=image_path, 122 | pred_x=parsed_prediction.pred_x, 123 | pred_y=parsed_prediction.pred_y, 124 | item_id=eval_item.id, 125 | recording_id=eval_item.recording_id, 126 | instruction=eval_item.instruction, 127 | model_name='openai_cua', 128 | run_id=run_id, 129 | gt_x1=eval_item.x1, 130 | gt_y1=eval_item.y1, 131 | gt_x2=eval_item.x2, 132 | gt_y2=eval_item.y2, 133 | is_in_bbox=is_in_bbox, 134 | ) 135 | 136 | result = EvaluationResult( 137 | id=eval_item.id, 138 | recording_id=eval_item.recording_id, 139 | instruction=eval_item.instruction, 140 | image_path=image_path, 141 | gt_x1=eval_item.x1, 142 | gt_y1=eval_item.y1, 143 | gt_x2=eval_item.x2, 144 | gt_y2=eval_item.y2, 145 | pred_x=parsed_prediction.pred_x, 146 | pred_y=parsed_prediction.pred_y, 147 | is_in_bbox=is_in_bbox, 148 | latency_seconds=latency, 149 | raw_response=parsed_prediction.raw_response, 150 | visualization_path=visualization_path, 151 | ) 152 | 153 | return result 154 | else: 155 | raise ValueError('Prediction is None') 156 | 157 | 158 | def get_openai_cua_api_client( 159 | api_key: Optional[str] = None, 160 | api_endpoint: Optional[str] = None, 161 | model: str = DEFAULT_MODEL, 162 | max_tokens: int = 4096, 163 | environment: str = DEFAULT_ENVIRONMENT, 164 | ) -> OpenAICUAAPIClient: 165 | # Create a custom config that includes CUA-specific parameters 166 | config = OpenAIModelConfig( 167 | api_endpoint=api_endpoint or os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1'), 168 | model=model, 169 | api_key=api_key, 170 | max_tokens=max_tokens, 171 | ) 172 | 173 | # Add CUA-specific attributes 174 | setattr(config, 'environment', environment) 175 | 176 | return OpenAICUAAPIClient(config=config) 177 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | QwenModelConfig, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from 
..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL, QwenVLClient 20 | 21 | 22 | class QwenVLAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[QwenModelConfig] = None, 26 | ): 27 | if config is None: 28 | config = QwenModelConfig( 29 | api_endpoint='https://dashscope.aliyuncs.com/compatible-mode/v1', 30 | model=DEFAULT_MODEL, 31 | api_key=os.environ.get('DASHSCOPE_API_KEY'), 32 | max_tokens=4096, 33 | use_smart_resize=True, 34 | resize_factor=28, 35 | min_pixels=3136, 36 | max_pixels=12845056, 37 | ) 38 | 39 | self.qwen_client = QwenVLClient( 40 | api_key=config.api_key, 41 | api_endpoint=config.api_endpoint, 42 | model=config.model, 43 | max_tokens=config.max_tokens, 44 | use_smart_resize=config.use_smart_resize, 45 | resize_factor=config.resize_factor, 46 | min_pixels=config.min_pixels, 47 | max_pixels=config.max_pixels, 48 | ) 49 | self.config = config 50 | self.client_type = 'qwen' 51 | 52 | def predict( 53 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 54 | ) -> Dict[str, Any] | None: 55 | return self.qwen_client.predict(image_data_uri, prompt) 56 | 57 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 58 | result_dict = self.qwen_client.parse_prediction(prediction) 59 | 60 | return ParsedPrediction( 61 | pred_x=result_dict.get('pred_x'), 62 | pred_y=result_dict.get('pred_y'), 63 | raw_response=result_dict.get('raw_response'), 64 | ) 65 | 66 | def process_single_item( 67 | self, 68 | item: Dict[str, Any], 69 | frames_dir: str, 70 | run_id: str, 71 | ) -> EvaluationResult: 72 | try: 73 | eval_item = EvaluationItem( 74 | id=item['id'], 75 | recording_id=item['recording_id'], 76 | instruction=item['instruction'], 77 | image=item['image'], 78 | x1=item['x1'], 79 | y1=item['y1'], 80 | x2=item['x2'], 81 | y2=item['y2'], 82 | ) 83 | 84 | item_id = eval_item.id 85 | recording_id = eval_item.recording_id 86 | instruction = eval_item.instruction 87 | image_path = eval_item.image 88 | 89 | local_path = os.path.join(frames_dir, image_path) 90 | if not os.path.exists(local_path): 91 | raise FileNotFoundError(f'Image file not found: {local_path}') 92 | 93 | image_data_uri = encode_image_to_base64_uri(local_path) 94 | if image_data_uri is None: 95 | raise ValueError(f'Failed to encode image: {local_path}') 96 | 97 | print(f'Processing item {item_id} with instruction: {instruction}') 98 | start_time = time.time() 99 | prediction = self.predict(image_data_uri, instruction) 100 | latency = time.time() - start_time 101 | 102 | if prediction is None: 103 | print(f'Qwen returned None for item {item_id}') 104 | pred_result = ParsedPrediction() 105 | pred_x = None 106 | pred_y = None 107 | else: 108 | pred_result = self.parse_prediction(prediction) 109 | pred_x = pred_result.pred_x 110 | pred_y = pred_result.pred_y 111 | 112 | is_in_bbox = check_prediction_in_bbox( 113 | pred_x, pred_y, eval_item.x1, eval_item.y1, eval_item.x2, eval_item.y2 114 | ) 115 | 116 | print_colored_result( 117 | item_id, 118 | instruction, 119 | pred_x, 120 | pred_y, 121 | latency, 122 | is_in_bbox, 123 | ) 124 | 125 | visualization_path = visualize_prediction( 126 | local_path, 127 | pred_x, 128 | pred_y, 129 | item_id, 130 | recording_id, 131 | instruction, 132 | self.config.model, 133 | run_id, 134 | eval_item.x1, 135 | eval_item.y1, 136 | eval_item.x2, 137 | eval_item.y2, 138 | is_in_bbox, 139 | ) 140 | 141 | result = EvaluationResult( 142 | id=item_id, 143 | recording_id=recording_id, 144 | instruction=instruction, 
145 | image_path=local_path, 146 | gt_x1=eval_item.x1, 147 | gt_y1=eval_item.y1, 148 | gt_x2=eval_item.x2, 149 | gt_y2=eval_item.y2, 150 | pred_x=pred_x, 151 | pred_y=pred_y, 152 | is_in_bbox=is_in_bbox, 153 | latency_seconds=latency, 154 | raw_response=pred_result.raw_response, 155 | visualization_path=visualization_path, 156 | ) 157 | 158 | return result 159 | 160 | except Exception as e: 161 | print(f'Error processing item: {e}') 162 | raise e 163 | 164 | 165 | def get_qwen_api_client( 166 | api_key: Optional[str] = None, 167 | api_endpoint: Optional[str] = None, 168 | model: str = DEFAULT_MODEL, 169 | max_tokens: int = 4096, 170 | use_smart_resize: bool = True, 171 | resize_factor: int = 28, 172 | min_pixels: int = 3136, 173 | max_pixels: int = 12845056, 174 | ) -> QwenVLAPIClient: 175 | config = QwenModelConfig( 176 | api_endpoint=api_endpoint or 'https://dashscope.aliyuncs.com/compatible-mode/v1', 177 | model=model, 178 | api_key=api_key or os.environ.get('DASHSCOPE_API_KEY'), 179 | max_tokens=max_tokens, 180 | use_smart_resize=use_smart_resize, 181 | resize_factor=resize_factor, 182 | min_pixels=min_pixels, 183 | max_pixels=max_pixels, 184 | ) 185 | return QwenVLAPIClient(config) 186 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OpenAIModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import OpenAIClient as OpenAIBaseClient 21 | 22 | 23 | class OpenAIAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[OpenAIModelConfig] = None, 27 | ): 28 | self.config = config or OpenAIModelConfig( 29 | api_endpoint=os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1') 30 | ) 31 | 32 | self.api_key = self.config.api_key 33 | self.api_endpoint = self.config.api_endpoint 34 | self.model = self.config.model 35 | self.max_tokens = self.config.max_tokens 36 | self.reasoning_effort = self.config.reasoning_effort 37 | self.client_type = 'openai' 38 | 39 | def predict( 40 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 41 | ) -> Dict[str, Any] | None: 42 | client = OpenAIBaseClient( 43 | api_key=self.api_key, 44 | api_endpoint=self.api_endpoint, 45 | model=model or self.model, 46 | max_tokens=self.max_tokens, 47 | reasoning_effort=self.reasoning_effort, 48 | ) 49 | 50 | return client.predict(image_data_uri, prompt) 51 | 52 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 53 | client = OpenAIBaseClient( 54 | api_key=self.api_key, 55 | api_endpoint=self.api_endpoint, 56 | model=self.model, 57 | max_tokens=self.max_tokens, 58 | reasoning_effort=self.reasoning_effort, 59 | ) 60 | 61 | parsed = client.parse_prediction(prediction) 62 | 63 | return ParsedPrediction( 64 | pred_x=parsed['pred_x'], 65 | pred_y=parsed['pred_y'], 66 | raw_response=parsed['raw_response'], 67 | ) 68 | 69 | def process_single_item( 70 | self, 71 | item: Dict[str, Any], 72 | frames_dir: str, 73 | run_id: str, 74 | ) -> 
EvaluationResult: 75 | try: 76 | eval_item = EvaluationItem( 77 | id=item['id'], 78 | recording_id=item['recording_id'], 79 | instruction=item['instruction'], 80 | image=item['image'], 81 | x1=item['x1'], 82 | y1=item['y1'], 83 | x2=item['x2'], 84 | y2=item['y2'], 85 | ) 86 | 87 | item_id = eval_item.id 88 | recording_id = eval_item.recording_id 89 | instruction = eval_item.instruction 90 | image_path = eval_item.image 91 | 92 | local_path = os.path.join(frames_dir, image_path) 93 | if not os.path.exists(local_path): 94 | raise FileNotFoundError(f'Image file not found at {local_path}') 95 | 96 | image_data_uri = encode_image_to_base64_uri(local_path) 97 | if image_data_uri is None: 98 | raise ValueError(f'Failed to encode image at {local_path}') 99 | 100 | start_time = time.time() 101 | prediction = self.predict(image_data_uri, instruction) 102 | end_time = time.time() 103 | latency = end_time - start_time 104 | 105 | if not prediction: 106 | raise ValueError('Prediction is None') 107 | 108 | parsed_prediction = self.parse_prediction(prediction) 109 | 110 | is_in_bbox = check_prediction_in_bbox( 111 | pred_x=parsed_prediction.pred_x, 112 | pred_y=parsed_prediction.pred_y, 113 | gt_x1=eval_item.x1, 114 | gt_y1=eval_item.y1, 115 | gt_x2=eval_item.x2, 116 | gt_y2=eval_item.y2, 117 | ) 118 | 119 | visualization_path = visualize_prediction( 120 | image_path=local_path, 121 | pred_x=parsed_prediction.pred_x, 122 | pred_y=parsed_prediction.pred_y, 123 | item_id=item_id, 124 | recording_id=recording_id, 125 | instruction=instruction, 126 | model_name=self.config.model, 127 | run_id=run_id, 128 | gt_x1=eval_item.x1, 129 | gt_y1=eval_item.y1, 130 | gt_x2=eval_item.x2, 131 | gt_y2=eval_item.y2, 132 | is_in_bbox=is_in_bbox, 133 | ) 134 | 135 | print_colored_result( 136 | item_id=item_id, 137 | instruction=instruction, 138 | pred_x=parsed_prediction.pred_x, 139 | pred_y=parsed_prediction.pred_y, 140 | latency=latency, 141 | is_in_bbox=is_in_bbox, 142 | ) 143 | 144 | return EvaluationResult( 145 | id=item_id, 146 | recording_id=recording_id, 147 | instruction=instruction, 148 | image_path=local_path, 149 | gt_x1=eval_item.x1, 150 | gt_y1=eval_item.y1, 151 | gt_x2=eval_item.x2, 152 | gt_y2=eval_item.y2, 153 | pred_x=parsed_prediction.pred_x, 154 | pred_y=parsed_prediction.pred_y, 155 | is_in_bbox=is_in_bbox, 156 | latency_seconds=latency, 157 | raw_response=parsed_prediction.raw_response, 158 | visualization_path=visualization_path, 159 | ) 160 | except Exception as e: 161 | print(f'API request failed: {str(e)}') 162 | raise e 163 | 164 | 165 | def get_openai_api_client( 166 | api_key: Optional[str] = None, 167 | api_endpoint: Optional[str] = None, 168 | model: str = DEFAULT_MODEL, 169 | max_tokens: int = 4096, 170 | reasoning_effort: str = 'medium', 171 | ) -> OpenAIAPIClient: 172 | config = OpenAIModelConfig( 173 | api_endpoint=api_endpoint or os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1'), 174 | model=model, 175 | api_key=api_key, 176 | max_tokens=max_tokens, 177 | reasoning_effort=reasoning_effort, 178 | ) 179 | 180 | return OpenAIAPIClient(config=config) 181 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | from google import genai 9 | from google.genai.types import GenerateContentConfig, 
SafetySetting 10 | from PIL import Image 11 | from pydantic import BaseModel 12 | from tenacity import retry, stop_after_attempt, wait_exponential 13 | 14 | DEFAULT_MODEL = 'gemini-2.0-flash' 15 | 16 | 17 | class Point(BaseModel): 18 | point: list[int] 19 | label: str 20 | 21 | 22 | class GeminiClient: 23 | def __init__( 24 | self, 25 | api_key: Optional[str] = None, 26 | model: str = DEFAULT_MODEL, 27 | max_tokens: int = 4096, 28 | temperature: float = 0.0, 29 | ): 30 | self.api_key = api_key or os.environ.get('GEMINI_API_KEY') 31 | if not self.api_key: 32 | raise ValueError( 33 | 'API key must be provided either as an argument or through the GEMINI_API_KEY environment variable' 34 | ) 35 | 36 | self.model = model 37 | self.max_tokens = max_tokens 38 | self.temperature = temperature 39 | 40 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 41 | try: 42 | image_data = base64.b64decode(base64_data) 43 | image = Image.open(io.BytesIO(image_data)) 44 | width, height = image.size 45 | return width, height 46 | except Exception as e: 47 | print(f'Error extracting image dimensions: {e}') 48 | return 1024, 768 49 | 50 | def _encode_image(self, image_path: str) -> str: 51 | try: 52 | with open(image_path, 'rb') as image_file: 53 | return base64.b64encode(image_file.read()).decode('utf-8') 54 | except Exception as e: 55 | print(f'Error encoding image: {e}') 56 | return '' 57 | 58 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 59 | def predict(self, image_data_uri: str, prompt: str, **kwargs) -> Dict[str, Any]: 60 | try: 61 | if ',' in image_data_uri: 62 | base64_data = image_data_uri.split(',')[1] 63 | mime_type = image_data_uri.split(',')[0] 64 | else: 65 | base64_data = image_data_uri 66 | mime_type = 'data:image/jpeg;base64' 67 | 68 | image_data = base64.b64decode(base64_data) 69 | image_pil = Image.open(io.BytesIO(image_data)) 70 | width, height = image_pil.size 71 | 72 | start_time = time.time() 73 | 74 | client = genai.Client(api_key=self.api_key) 75 | 76 | config_localize = GenerateContentConfig( 77 | temperature=0.0, 78 | ) 79 | 80 | prompt_localize = f"""You are an AI assistant that helps users with their tasks. 81 | You are given an image and a task. Your job is to return a description of the UI element that should be clicked on to advance or complete the task. 82 | There will always be an UI element that can be clicked to advance or complete the task. Do not question this. 83 | The description should not be a question, or an action. For example, "the "-" button in the Target Membership section." is good, but "click the "-" button in the Target Membership section." is bad. 84 | You MUST remember that you are not describing the actions you will take, but the UI element that should be clicked. You should just describe the UI element. 85 | The task is: `{prompt}`. 86 | Return nothing else but the singular description.""" 87 | 88 | result_localize = client.models.generate_content( 89 | model=self.model, 90 | contents=[ 91 | image_pil, 92 | prompt_localize, 93 | ], 94 | config=config_localize, 95 | ) 96 | 97 | print(f'Localize: {result_localize.text}') 98 | 99 | prompt = ( 100 | """Point to the UI element matching the description: `""" 101 | + (result_localize.text or prompt) 102 | + """`, with no more than 1 item. The answer should follow the json format: [{'point': , "label": }, ...]. 
The points are in [y, x] format normalized to 0-1000.""" 103 | ) 104 | 105 | config = GenerateContentConfig( 106 | temperature=0.5, 107 | safety_settings=[ 108 | SafetySetting( 109 | category='HARM_CATEGORY_DANGEROUS_CONTENT', # type: ignore 110 | threshold='BLOCK_ONLY_HIGH', # type: ignore 111 | ), 112 | ], 113 | response_mime_type='application/json', 114 | response_schema=list[Point], 115 | ) 116 | 117 | response = client.models.generate_content( 118 | model=self.model, 119 | contents=[ 120 | image_pil, 121 | prompt, 122 | ], 123 | config=config, 124 | ) 125 | 126 | latency = time.time() - start_time 127 | 128 | return { 129 | 'response': response, 130 | 'latency': latency, 131 | 'width': width, 132 | 'height': height, 133 | } 134 | 135 | except Exception as e: 136 | print(f'API Error: {str(e)}') 137 | return {'response': None, 'latency': 0, 'width': 0, 'height': 0, 'error': str(e)} 138 | 139 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 140 | if not prediction or 'response' not in prediction: 141 | return { 142 | 'pred_x': None, 143 | 'pred_y': None, 144 | 'raw_response': json.dumps(prediction) if prediction else None, 145 | } 146 | 147 | response = prediction['response'] 148 | 149 | pred_x = None 150 | pred_y = None 151 | 152 | width = prediction['width'] 153 | height = prediction['height'] 154 | 155 | try: 156 | content = response.parsed 157 | 158 | point = content[0] 159 | 160 | y, x = point.point 161 | y = int(y / 1000 * height) 162 | x = int(x / 1000 * width) 163 | 164 | pred_x = x 165 | pred_y = y 166 | 167 | pred_x = int(pred_x) 168 | pred_y = int(pred_y) 169 | except Exception as e: 170 | print(f'Error parsing prediction: {e}') 171 | 172 | return { 173 | 'pred_x': pred_x, 174 | 'pred_y': pred_y, 175 | 'raw_response': json.dumps(prediction, default=str), 176 | } 177 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | from openai import OpenAI 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | OPENAI_API_ENDPOINT = 'https://api.openai.com/v1' 13 | DEFAULT_MODEL = 'gpt-4o' 14 | 15 | 16 | class OpenAIClient: 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | api_endpoint: str = OPENAI_API_ENDPOINT, 21 | model: str = DEFAULT_MODEL, 22 | max_tokens: int = 4096, 23 | reasoning_effort: str = 'medium', 24 | ): 25 | self.api_key = api_key or os.environ.get('OPENAI_API_KEY') 26 | if not self.api_key: 27 | raise ValueError( 28 | 'API key must be provided either as an argument or through the OPENAI_API_KEY environment variable' 29 | ) 30 | 31 | self.api_endpoint = api_endpoint 32 | self.model = model 33 | self.max_tokens = max_tokens 34 | self.reasoning_effort = reasoning_effort 35 | self.client = OpenAI(api_key=self.api_key) 36 | 37 | def _extract_image_dimensions(self, image_data_uri: str) -> tuple[int, int]: 38 | try: 39 | image_data = base64.b64decode(image_data_uri) 40 | image_pil = Image.open(io.BytesIO(image_data)) 41 | width, height = image_pil.size 42 | return width, height 43 | except Exception as e: 44 | print(f'Error extracting image dimensions: {e}') 45 | return 1024, 768 46 | 47 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 48 | def predict(self, 
image_data_uri: str, prompt: str) -> Dict[str, Any]: 49 | try: 50 | if ',' in image_data_uri: 51 | base64_data = image_data_uri.split(',')[1] 52 | else: 53 | base64_data = image_data_uri 54 | 55 | width, height = self._extract_image_dimensions(base64_data) 56 | 57 | messages = [ 58 | { 59 | 'role': 'system', 60 | 'content': ( 61 | 'You are an AI assistant that can see and interact with a computer screen. ' 62 | 'You will be shown a screenshot and given an instruction. ' 63 | 'The instruction always requires you to interact with a UI element on the screen. You can assume that the UI element is always visible on the screen.' 64 | 'Return the position of the UI element that can be interacted with to advance the action specified in the instruction. ' 65 | f'The screen resolution is {width}x{height}. ' 66 | 'Your response must be in JSON format.' 67 | ), 68 | }, 69 | { 70 | 'role': 'user', 71 | 'content': [ 72 | { 73 | 'type': 'image_url', 74 | 'image_url': {'url': f'data:image/jpeg;base64,{base64_data}', 'detail': 'high'}, 75 | }, 76 | {'type': 'text', 'text': prompt}, 77 | ], 78 | }, 79 | ] 80 | 81 | start_time = time.time() 82 | 83 | params = { 84 | 'model': self.model, 85 | 'messages': messages, 86 | 'max_completion_tokens': self.max_tokens, 87 | 'response_format': {'type': 'json_object'}, 88 | 'seed': 42, 89 | 'tools': [ 90 | { 91 | 'type': 'function', 92 | 'function': { 93 | 'name': 'click_on_element', 94 | 'description': 'Click on the specified UI element', 95 | 'parameters': { 96 | 'type': 'object', 97 | 'properties': { 98 | 'x': {'type': 'integer', 'description': 'The x-coordinate for the click action'}, 99 | 'y': {'type': 'integer', 'description': 'The y-coordinate for the click action'}, 100 | }, 101 | 'required': ['x', 'y'], 102 | }, 103 | }, 104 | } 105 | ], 106 | 'tool_choice': 'auto', 107 | } 108 | 109 | if self.model == 'o1': 110 | params['reasoning_effort'] = self.reasoning_effort 111 | 112 | response = self.client.chat.completions.create(**params) 113 | 114 | print(response) 115 | 116 | end_time = time.time() 117 | latency = end_time - start_time 118 | 119 | result = { 120 | 'model': self.model, 121 | 'latency': latency, 122 | 'response': response, 123 | } 124 | 125 | return result 126 | 127 | except Exception as e: 128 | print(f'Error making prediction: {e}') 129 | return {'error': str(e)} 130 | 131 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 132 | if not prediction or 'response' not in prediction: 133 | return { 134 | 'pred_x': None, 135 | 'pred_y': None, 136 | 'raw_response': json.dumps(prediction) if prediction else None, 137 | } 138 | 139 | response = prediction['response'] 140 | 141 | pred_x = None 142 | pred_y = None 143 | 144 | if hasattr(response, 'choices') and len(response.choices) > 0: 145 | choice = response.choices[0] 146 | 147 | if ( 148 | hasattr(choice, 'message') 149 | and hasattr(choice.message, 'tool_calls') 150 | and choice.message.tool_calls 151 | ): 152 | tool_call = choice.message.tool_calls[0] 153 | if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'): 154 | try: 155 | args = json.loads(tool_call.function.arguments) 156 | pred_x = args.get('x') 157 | pred_y = args.get('y') 158 | except Exception as e: 159 | print(f'Error parsing tool call arguments: {e}') 160 | 161 | elif ( 162 | hasattr(choice, 'message') and hasattr(choice.message, 'content') and choice.message.content 163 | ): 164 | try: 165 | content = json.loads(choice.message.content) 166 | pred_x = content.get('x') 167 | pred_y = 
content.get('y') 168 | except Exception as e: 169 | print(f'Error parsing message content: {e}') 170 | 171 | return { 172 | 'pred_x': pred_x, 173 | 'pred_y': pred_y, 174 | 'raw_response': json.dumps(prediction, default=str), 175 | } 176 | -------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | import urllib.parse 4 | from typing import Any, Dict, List, Optional 5 | 6 | import numpy as np 7 | from clicks.evaluate.models import EvaluationMetrics 8 | from colorama import Fore, Style, init 9 | from PIL import Image, ImageDraw, ImageFont 10 | from scipy import stats 11 | 12 | init(autoreset=True) 13 | 14 | 15 | def is_point_in_bbox( 16 | x: Optional[int], 17 | y: Optional[int], 18 | x1: Optional[int], 19 | y1: Optional[int], 20 | x2: Optional[int], 21 | y2: Optional[int], 22 | ) -> bool: 23 | """Check if a point is within a bounding box.""" 24 | if x is None or y is None or x1 is None or y1 is None or x2 is None or y2 is None: 25 | return False 26 | return x1 <= x <= x2 and y1 <= y <= y2 27 | 28 | 29 | def visualize_prediction( 30 | image_path: str, 31 | pred_x: Optional[int], 32 | pred_y: Optional[int], 33 | item_id: str, 34 | recording_id: str, 35 | instruction: str, 36 | model_name: str, 37 | run_id: Optional[str], 38 | gt_x1: Optional[int], 39 | gt_y1: Optional[int], 40 | gt_x2: Optional[int], 41 | gt_y2: Optional[int], 42 | is_in_bbox: Optional[bool] = None, 43 | ) -> Optional[str]: 44 | try: 45 | base_dir = os.path.dirname( 46 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 47 | ) 48 | if run_id: 49 | vis_dir = os.path.join(base_dir, 'results', run_id, model_name, 'visualizations') 50 | else: 51 | vis_dir = os.path.join(base_dir, 'visualizations') 52 | os.makedirs(vis_dir, exist_ok=True) 53 | print(f'Visualization directory: {vis_dir}') 54 | 55 | img = Image.open(image_path) 56 | draw = ImageDraw.Draw(img) 57 | 58 | try: 59 | font = ImageFont.truetype('Arial.ttf', 16) 60 | except IOError: 61 | font = ImageFont.load_default() 62 | 63 | image_filename = os.path.basename(image_path) 64 | 65 | # Draw the bounding box if available 66 | if all(v is not None for v in [gt_x1, gt_y1, gt_x2, gt_y2]): 67 | draw.rectangle([(gt_x1, gt_y1), (gt_x2, gt_y2)], outline='blue', width=2) # type: ignore 68 | if gt_y1 is not None: 69 | draw.text((gt_x1, gt_y1 - 20), 'Bounding Box', fill='blue', font=font) # type: ignore 70 | else: 71 | draw.text((gt_x1, 0), 'Bounding Box', fill='blue', font=font) # type: ignore 72 | 73 | if pred_x is not None and pred_y is not None: 74 | # Different color based on whether it's in bbox or not 75 | outline_color = 'orange' if is_in_bbox else 'red' 76 | draw.ellipse( 77 | [(pred_x - 15, pred_y - 15), (pred_x + 15, pred_y + 15)], outline=outline_color, width=3 78 | ) 79 | draw.text((pred_x + 20, pred_y + 30), 'Prediction', fill=outline_color, font=font) 80 | 81 | draw.text( 82 | (10, img.height - 30), 83 | f'Item ID: {item_id} | Recording ID: {recording_id} | Instruction: {instruction}', 84 | fill='white', 85 | font=font, 86 | ) 87 | 88 | output_filename = ( 89 | f'rec{recording_id}_item{item_id}_{image_filename}_{urllib.parse.quote_plus(instruction)}.png' 90 | ) 91 | if is_in_bbox: 92 | output_dir = os.path.join(vis_dir, 'correct') 93 | else: 94 | output_dir = os.path.join(vis_dir, 'incorrect') 95 | os.makedirs(output_dir, exist_ok=True) 96 | output_path = 
os.path.join(output_dir, output_filename) 97 | img.save(output_path) 98 | 99 | return output_path 100 | except Exception as e: 101 | print(f'Error creating visualization: {e}') 102 | return None 103 | 104 | 105 | def encode_image_to_base64(image_path: str) -> Optional[str]: 106 | try: 107 | with open(image_path, 'rb') as f: 108 | image_data = f.read() 109 | encoded_data = base64.b64encode(image_data).decode('utf-8') 110 | mime_type = 'image/png' if image_path.lower().endswith('.png') else 'image/jpeg' 111 | return f'data:{mime_type};base64,{encoded_data}' 112 | except Exception as e: 113 | print(f'Error encoding image: {e}') 114 | return None 115 | 116 | 117 | def check_prediction_in_bbox( 118 | pred_x: Optional[int], 119 | pred_y: Optional[int], 120 | gt_x1: Optional[int], 121 | gt_y1: Optional[int], 122 | gt_x2: Optional[int], 123 | gt_y2: Optional[int], 124 | ) -> bool: 125 | return is_point_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 126 | 127 | 128 | def print_colored_result( 129 | item_id: str, 130 | instruction: str, 131 | pred_x: Optional[int], 132 | pred_y: Optional[int], 133 | latency: float, 134 | is_in_bbox: Optional[bool] = None, 135 | ) -> None: 136 | color = Fore.GREEN if is_in_bbox else Fore.RED 137 | print( 138 | f'{color}ID: {item_id} | Instruction: {instruction} | ' 139 | f'Prediction: {pred_x} {pred_y} | ' 140 | f'Correct: {is_in_bbox} | Time: {latency:.2f}s{Style.RESET_ALL}' 141 | ) 142 | 143 | 144 | def analyze_results( 145 | results: List[Dict[str, Any]], run_id: Optional[str] = None 146 | ) -> EvaluationMetrics: 147 | if not results: 148 | raise ValueError('No results to summarize') 149 | 150 | total_processed = len(results) 151 | total_in_bbox = sum(1 for result in results if result.get('is_in_bbox', False)) 152 | 153 | bbox_results = np.array( 154 | [1 if result.get('is_in_bbox', False) else 0 for result in results if 'is_in_bbox' in result] 155 | ) 156 | 157 | accuracy = (total_in_bbox / total_processed) * 100 if total_processed > 0 else None 158 | 159 | accuracy_ci = None 160 | ci = 0.95 161 | 162 | def calculate_accuracy(data): 163 | return np.mean(data) * 100 164 | 165 | if len(bbox_results) > 0: 166 | try: 167 | accuracy_bootstrap = stats.bootstrap( 168 | (bbox_results,), calculate_accuracy, confidence_level=ci, method='percentile' 169 | ) 170 | accuracy_ci = accuracy_bootstrap.confidence_interval 171 | except Exception as e: 172 | print(f'Error calculating bounding box accuracy confidence interval: {e}') 173 | 174 | print('\nResults Summary:') 175 | print(f'Total Processed: {total_processed}') 176 | print(f'Total Correct: {total_in_bbox}') 177 | print(f'Accuracy: {accuracy:.2f}%') 178 | if accuracy_ci: 179 | print(f'95% CI: [{accuracy_ci.low:.2f}%, {accuracy_ci.high:.2f}%]') 180 | 181 | metrics = EvaluationMetrics( 182 | total_processed=total_processed, 183 | total_correct=total_in_bbox, 184 | accuracy=accuracy if accuracy is not None else 0, 185 | ci=ci, 186 | accuracy_ci_low=accuracy_ci.low if accuracy_ci else None, 187 | accuracy_ci_high=accuracy_ci.high if accuracy_ci else None, 188 | ) 189 | 190 | return metrics 191 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | import warnings 7 | from typing import Any, Dict, Optional, Tuple 8 | 9 | from openai import OpenAI 10 | from PIL import Image 11 | from 
tenacity import retry, stop_after_attempt, wait_exponential 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | OPENAI_API_ENDPOINT = 'https://api.openai.com/v1' 16 | DEFAULT_MODEL = 'computer-use-preview' 17 | DEFAULT_DISPLAY_WIDTH = 1024 18 | DEFAULT_DISPLAY_HEIGHT = 768 19 | DEFAULT_ENVIRONMENT = 'mac' 20 | 21 | 22 | class OpenAICUAClient: 23 | def __init__( 24 | self, 25 | api_key: Optional[str] = None, 26 | api_endpoint: str = OPENAI_API_ENDPOINT, 27 | model: str = DEFAULT_MODEL, 28 | max_tokens: int = 4096, 29 | environment: str = DEFAULT_ENVIRONMENT, 30 | ): 31 | self.api_key = api_key or os.environ.get('OPENAI_API_KEY') 32 | if not self.api_key: 33 | raise ValueError( 34 | 'API key must be provided either as an argument or through the OPENAI_API_KEY environment variable' 35 | ) 36 | 37 | self.api_endpoint = api_endpoint 38 | self.model = model 39 | self.max_tokens = max_tokens 40 | self.environment = environment 41 | self.client = OpenAI(api_key=self.api_key) 42 | 43 | def _extract_image_dimensions(self, image_data_uri: str) -> Tuple[int, int]: 44 | try: 45 | if ',' in image_data_uri: 46 | base64_data = image_data_uri.split(',')[1] 47 | else: 48 | base64_data = image_data_uri 49 | 50 | image_data = base64.b64decode(base64_data) 51 | image_pil = Image.open(io.BytesIO(image_data)) 52 | width, height = image_pil.size 53 | return width, height 54 | except Exception as e: 55 | print(f'Error extracting image dimensions: {e}') 56 | return DEFAULT_DISPLAY_WIDTH, DEFAULT_DISPLAY_HEIGHT 57 | 58 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 59 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 60 | try: 61 | base64_data = image_data_uri.split(',')[1] if ',' in image_data_uri else image_data_uri 62 | width, height = self._extract_image_dimensions(base64_data) 63 | 64 | start_time = time.time() 65 | 66 | input_data = [ 67 | { 68 | 'role': 'user', 69 | 'content': [ 70 | { 71 | 'type': 'input_text', 72 | 'text': f'Perform the following task on the screen: {prompt} by clicking on a UI element. Do not ask for confirmation or clarifications. You have all the information you need to complete the task.', 73 | }, 74 | { 75 | 'type': 'input_image', 76 | 'image_url': f'data:image/png;base64,{base64_data}', 77 | }, 78 | ], 79 | } 80 | ] 81 | 82 | response = self.client.responses.create( 83 | model=self.model, 84 | tools=[ 85 | { 86 | 'type': 'computer_use_preview', # type: ignore 87 | 'display_width': width, 88 | 'display_height': height, 89 | 'environment': self.environment, 90 | } 91 | ], 92 | input=input_data, # type: ignore 93 | truncation='auto', 94 | ) 95 | 96 | print(response) 97 | print(f'Response: {response.output}') 98 | 99 | iteration = 0 100 | previous_response_id = response.id 101 | 102 | computer_call = None 103 | 104 | while iteration < 10: 105 | iteration += 1 106 | 107 | computer_calls = [item for item in response.output if item.type == 'computer_call'] 108 | print(f'Computer calls: {computer_calls}') 109 | if len(computer_calls) > 0: 110 | computer_call = computer_calls[0] 111 | if computer_call.action.type == 'click': 112 | print(f'Click action found: {computer_call.action}') 113 | break 114 | else: 115 | print(f'Non-click action found: {computer_call.action}. 
Continuing...') 116 | 117 | if len(computer_calls) > 0: 118 | computer_call = computer_calls[0] 119 | call_id = computer_call.call_id 120 | 121 | pending_safety_checks = getattr(computer_call, 'pending_safety_checks', []) 122 | acknowledged_safety_checks = [] 123 | 124 | if pending_safety_checks: 125 | print(f'Safety checks detected: {pending_safety_checks}') 126 | acknowledged_safety_checks = pending_safety_checks 127 | else: 128 | elapsed_time = time.time() - start_time 129 | return { 130 | 'elapsed_time': elapsed_time, 131 | 'response': response.model_dump(), 132 | 'computer_call': None, 133 | } 134 | 135 | input_data = [ 136 | { 137 | 'call_id': call_id, 138 | 'type': 'computer_call_output', 139 | 'output': { 140 | 'type': 'input_image', 141 | 'image_url': f'data:image/png;base64,{base64_data}', 142 | }, 143 | } 144 | ] 145 | 146 | if acknowledged_safety_checks: 147 | input_data[0]['acknowledged_safety_checks'] = acknowledged_safety_checks 148 | 149 | response = self.client.responses.create( 150 | model=self.model, 151 | previous_response_id=previous_response_id, 152 | tools=[ 153 | { 154 | 'type': 'computer_use_preview', # type: ignore 155 | 'display_width': width, 156 | 'display_height': height, 157 | 'environment': self.environment, 158 | } 159 | ], 160 | input=input_data, # type: ignore 161 | truncation='auto', 162 | ) 163 | 164 | print(f'[CUA Loop Iteration {iteration}] Response: {response.output}') 165 | 166 | previous_response_id = response.id 167 | 168 | elapsed_time = time.time() - start_time 169 | print(f'API call completed in {elapsed_time:.2f} seconds') 170 | 171 | return { 172 | 'elapsed_time': elapsed_time, 173 | 'response': response.model_dump(), 174 | 'computer_call': computer_call.model_dump() if computer_call else None, 175 | } 176 | except Exception as e: 177 | print(f'Error in CUA prediction: {e}') 178 | raise 179 | 180 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 181 | if not prediction or 'computer_call' not in prediction or not prediction['computer_call']: 182 | return { 183 | 'pred_x': None, 184 | 'pred_y': None, 185 | 'raw_response': json.dumps(prediction, default=str) if prediction else None, 186 | } 187 | 188 | computer_call = prediction['computer_call'] 189 | 190 | pred_x = None 191 | pred_y = None 192 | 193 | if computer_call and 'action' in computer_call and computer_call['action']['type'] == 'click': 194 | pred_x = computer_call['action']['x'] 195 | pred_y = computer_call['action']['y'] 196 | 197 | return { 198 | 'pred_x': pred_x, 199 | 'pred_y': pred_y, 200 | 'raw_response': json.dumps(prediction, default=str), 201 | } 202 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, List, Optional 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | CLAUDE_API_ENDPOINT = 'https://api.anthropic.com/v1/messages' 13 | DEFAULT_MODEL = 'claude-3-7-sonnet-20250219' 14 | 15 | 16 | class ClaudeComputerUseClient: 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | api_endpoint: str = CLAUDE_API_ENDPOINT, 21 | model: str = DEFAULT_MODEL, 22 | max_tokens: int = 4096, 23 | thinking_budget: Optional[int] = 1024, 24 | tool_version: str = '20250124', 25 | ): 26 | self.api_key = api_key 
or os.environ.get('ANTHROPIC_API_KEY') 27 | if not self.api_key: 28 | raise ValueError( 29 | 'API key must be provided either as an argument or through the ANTHROPIC_API_KEY environment variable' 30 | ) 31 | 32 | self.api_endpoint = api_endpoint 33 | self.model = model 34 | self.max_tokens = max_tokens 35 | self.thinking_budget = thinking_budget 36 | self.tool_version = tool_version 37 | self.beta_flag = ( 38 | 'computer-use-2025-01-24' if '20250124' in tool_version else 'computer-use-2024-10-22' 39 | ) 40 | self.display_width = None 41 | self.display_height = None 42 | 43 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 44 | try: 45 | image_data = base64.b64decode(base64_data) 46 | image = Image.open(io.BytesIO(image_data)) 47 | width, height = image.size 48 | return width, height 49 | except Exception as e: 50 | print(f'Error extracting image dimensions: {e}') 51 | return 1024, 768 52 | 53 | def _create_tools(self) -> List[Dict[str, Any]]: 54 | width = self.display_width or 1024 55 | height = self.display_height or 768 56 | 57 | return [ 58 | { 59 | 'type': f'computer_{self.tool_version}', 60 | 'name': 'computer', 61 | 'display_width_px': width, 62 | 'display_height_px': height, 63 | 'display_number': 1, 64 | }, 65 | ] 66 | 67 | def _create_thinking_config(self) -> Optional[Dict[str, Any]]: 68 | if self.thinking_budget is None: 69 | return None 70 | 71 | return {'type': 'enabled', 'budget_tokens': self.thinking_budget} 72 | 73 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 74 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 75 | headers = { 76 | 'Content-Type': 'application/json', 77 | 'x-api-key': self.api_key, 78 | 'anthropic-version': '2023-06-01', 79 | 'anthropic-beta': self.beta_flag, 80 | } 81 | 82 | if ',' in image_data_uri: 83 | base64_data = image_data_uri.split(',')[1] 84 | else: 85 | base64_data = image_data_uri 86 | 87 | self.display_width, self.display_height = self._extract_image_dimensions(base64_data) 88 | 89 | # Note: it is unclear if the Claude computer use agent expects the screenshot to be in the very first user message, 90 | # or is it elicited by the screenshot request tool call. During testing, the behavior was inconsistent. 91 | # The default behavior here is to not include the screenshot in the first user message, and then the same image 92 | # is sent in the tool result. You may wonder why we don't include the screenshot in the first user message. 93 | # During testing on the dev set, the evals are almost the same whether we include the screenshot in the first 94 | # user message or not. 
95 | payload = { 96 | 'model': self.model, 97 | 'max_tokens': self.max_tokens, 98 | 'messages': [ 99 | { 100 | 'role': 'user', 101 | 'content': [ 102 | { 103 | 'type': 'text', 104 | 'text': prompt, 105 | }, 106 | ], 107 | } 108 | ], 109 | 'tools': self._create_tools(), 110 | } 111 | 112 | thinking = self._create_thinking_config() 113 | if thinking: 114 | payload['thinking'] = thinking 115 | 116 | start_time = time.time() 117 | response = requests.post( 118 | self.api_endpoint, 119 | headers=headers, 120 | json=payload, 121 | ) 122 | 123 | if response.status_code != 200: 124 | print(f'API Error: {response.status_code} - {response.text}') 125 | response.raise_for_status() 126 | 127 | result = response.json() 128 | 129 | raw_response = json.dumps(result) 130 | 131 | print(result) 132 | 133 | tool_use = None 134 | tool_use_id = None 135 | for content_item in result.get('content', []): 136 | if content_item.get('type') == 'tool_use' and content_item.get('name') == 'computer': 137 | tool_use = content_item.get('input', {}) 138 | tool_use_id = content_item.get('id') 139 | break 140 | 141 | if not tool_use: 142 | print('No computer tool call found in the response') 143 | return None 144 | if tool_use.get('action') == 'screenshot': 145 | print('Claude requested a screenshot. Sending the same image again...') 146 | 147 | payload = { 148 | 'model': self.model, 149 | 'max_tokens': self.max_tokens, 150 | 'messages': [ 151 | { 152 | 'role': 'user', 153 | 'content': [ 154 | { 155 | 'type': 'text', 156 | 'text': prompt, 157 | }, 158 | ], 159 | }, 160 | { 161 | 'role': 'assistant', 162 | 'content': result.get('content', []), 163 | }, 164 | { 165 | 'role': 'user', 166 | 'content': [ 167 | { 168 | 'type': 'tool_result', 169 | 'tool_use_id': tool_use_id, 170 | 'content': [ 171 | { 172 | 'type': 'image', 173 | 'source': { 174 | 'type': 'base64', 175 | 'media_type': 'image/jpeg', 176 | 'data': base64_data, 177 | }, 178 | } 179 | ], 180 | } 181 | ], 182 | }, 183 | ], 184 | 'tools': self._create_tools(), 185 | } 186 | 187 | if thinking: 188 | payload['thinking'] = thinking 189 | 190 | response = requests.post( 191 | self.api_endpoint, 192 | headers=headers, 193 | json=payload, 194 | ) 195 | 196 | if response.status_code != 200: 197 | print(f'API Error: {response.status_code} - {response.text}') 198 | response.raise_for_status() 199 | 200 | result = response.json() 201 | 202 | raw_response_second = json.dumps(result) 203 | 204 | print('Second response after screenshot:') 205 | print(result) 206 | 207 | tool_use = None 208 | for content_item in result.get('content', []): 209 | if content_item.get('type') == 'tool_use' and content_item.get('name') == 'computer': 210 | tool_use = content_item.get('input', {}) 211 | break 212 | 213 | if not tool_use: 214 | print('No computer tool call found in the second response') 215 | return None 216 | 217 | tool_use['raw_responses'] = [raw_response, raw_response_second] 218 | else: 219 | tool_use['raw_responses'] = [raw_response] 220 | 221 | tool_use['latency'] = time.time() - start_time 222 | tool_use['model'] = self.model 223 | 224 | if 'thinking' in result: 225 | tool_use['thinking'] = result['thinking'] 226 | 227 | return tool_use 228 | 229 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 230 | try: 231 | assert isinstance(prediction, dict) 232 | 233 | action_kind = prediction.get('action', {}) 234 | 235 | if action_kind == 'left_click': 236 | coordinate = prediction.get('coordinate', {}) 237 | pred_x, pred_y = coordinate 238 | pred_type = 
'left_click' 239 | pred_text = None 240 | elif action_kind == 'type': 241 | pred_x, pred_y = None, None 242 | pred_type = 'type' 243 | pred_text = prediction.get('text') 244 | elif action_kind == 'screenshot': 245 | pred_x, pred_y = None, None 246 | pred_type = 'screenshot' 247 | pred_text = None 248 | else: 249 | pred_x, pred_y = None, None 250 | pred_type = action_kind 251 | pred_text = None 252 | 253 | result = { 254 | 'pred_type': pred_type, 255 | 'pred_x': pred_x, 256 | 'pred_y': pred_y, 257 | 'pred_text': pred_text, 258 | } 259 | 260 | if 'raw_responses' in prediction: 261 | result['raw_responses'] = prediction['raw_responses'] 262 | 263 | if 'thinking' in prediction: 264 | result['thinking'] = prediction['thinking'] 265 | 266 | return result 267 | 268 | except Exception as e: 269 | print(f'Error parsing prediction: {e}') 270 | return { 271 | 'pred_type': None, 272 | 'pred_x': None, 273 | 'pred_y': None, 274 | 'pred_text': None, 275 | } 276 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import re 5 | import time 6 | from typing import Any, Dict 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | 13 | class UITarsClient: 14 | def __init__( 15 | self, 16 | api_url: str, 17 | api_key: str = 'super-secret-key', 18 | max_tokens: int = 128, 19 | temperature: float = 0.0, 20 | frequency_penalty: float = 1.0, 21 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 22 | ): 23 | self.api_url = api_url.rstrip('/') 24 | self.api_key = api_key 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | self.frequency_penalty = frequency_penalty 28 | self.model_name = model_name 29 | 30 | def _encode_image(self, image_data_uri: str) -> str: 31 | if ',' in image_data_uri: 32 | base64_data = image_data_uri.split(',')[1] 33 | else: 34 | base64_data = image_data_uri 35 | 36 | return base64_data 37 | 38 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 39 | try: 40 | image_data = base64.b64decode(base64_data) 41 | image = Image.open(io.BytesIO(image_data)) 42 | width, height = image.size 43 | return width, height 44 | except Exception as e: 45 | print(f'Error extracting image dimensions: {e}') 46 | return 1024, 768 47 | 48 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 49 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 50 | base64_image = self._encode_image(image_data_uri) 51 | width, height = self._extract_image_dimensions(base64_image) 52 | 53 | # Note: UI-TARS is not a generalist VLM: prompting it with plain English will cause the model to severely collapse. 54 | # Hence, it is unclear how to change the given computer use prompt, so we just use the default one provided in the UI-TARS repo. 55 | # drag, right_single, hotkey, type, scroll, wait, finished, call_user are here but we won't use them and will treat it as a failure. 56 | prompt_template = f"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 57 | 58 | ## Output Format 59 | ```\nThought: ... 
60 | Action: ...\n``` 61 | 62 | ## Action Space 63 | 64 | click(start_box='<|box_start|>(x1,y1)<|box_end|>') 65 | left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') 66 | right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') 67 | drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') 68 | hotkey(key='') 69 | type(content='') #If you want to submit your input, use \"\ 70 | \" at the end of `content`. 71 | scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') 72 | wait() #Sleep for 5s and take a screenshot to check for any changes. 73 | finished() 74 | call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. 75 | 76 | 77 | ## Note 78 | - Use Chinese in `Thought` part. 79 | - Summarize your next action (with its target element) in one sentence in `Thought` part. 80 | 81 | ## User Instruction 82 | {prompt}""" 83 | 84 | # Prepare the multimodal message 85 | multimodal_message = { 86 | 'role': 'user', 87 | 'content': [ 88 | {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}, 89 | {'type': 'text', 'text': prompt_template}, 90 | ], 91 | } 92 | 93 | request_data = { 94 | 'messages': [multimodal_message], 95 | 'model': self.model_name, 96 | 'max_tokens': self.max_tokens, 97 | 'temperature': self.temperature, 98 | 'frequency_penalty': self.frequency_penalty, 99 | } 100 | 101 | headers = { 102 | 'Content-Type': 'application/json', 103 | 'Authorization': f'Bearer {self.api_key}', 104 | } 105 | 106 | try: 107 | start_time = time.time() 108 | 109 | response = requests.post( 110 | f'{self.api_url}/v1/chat/completions', 111 | json=request_data, 112 | headers=headers, 113 | timeout=7200, 114 | ) 115 | 116 | end_time = time.time() 117 | latency = end_time - start_time 118 | 119 | if response.status_code == 200: 120 | result = response.json() 121 | 122 | content = result.get('choices', [{}])[0].get('message', {}).get('content', '') 123 | 124 | print(content) 125 | return { 126 | 'raw_response': json.dumps(result), 127 | 'content': content, 128 | 'latency_seconds': latency, 129 | 'width': width, 130 | 'height': height, 131 | } 132 | else: 133 | print(response.text) 134 | error_text = response.text 135 | try: 136 | error_json = response.json() 137 | error_text = json.dumps(error_json) 138 | except: 139 | pass 140 | 141 | return { 142 | 'error': f'HTTP Error {response.status_code}', 143 | 'error_details': error_text, 144 | 'latency_seconds': latency, 145 | } 146 | 147 | except Exception as e: 148 | print(f'API Error: {str(e)}') 149 | return { 150 | 'error': f'API Error: {str(e)}', 151 | 'latency_seconds': 0, 152 | } 153 | 154 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 155 | if 'error' in prediction: 156 | return { 157 | 'pred_x': None, 158 | 'pred_y': None, 159 | 'error': prediction.get('error'), 160 | 'error_details': prediction.get('error_details', ''), 161 | 'raw_responses': prediction.get('raw_response', '{}'), 162 | } 163 | 164 | content = prediction.get('content', '') 165 | width = prediction.get('width', 1920) # Default to 1920 if width not provided 166 | height = prediction.get('height', 1080) # Default to 1080 if height not provided 167 | 168 | action_match = re.search(r'Action:\s*(.*?)(?:\n|$)', content, re.DOTALL) 169 | action_text = action_match.group(1).strip() if action_match else content 170 | 171 | click_match = re.search( 172 | 
r"click\(start_box='<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>'\)", action_text 173 | ) 174 | if click_match: 175 | rel_x = int(click_match.group(1)) 176 | rel_y = int(click_match.group(2)) 177 | pred_x = round(width * rel_x / 1000) 178 | pred_y = round(height * rel_y / 1000) 179 | return { 180 | 'pred_x': pred_x, 181 | 'pred_y': pred_y, 182 | 'content': content, 183 | 'raw_responses': prediction.get('raw_response', '{}'), 184 | } 185 | 186 | # Process double click action 187 | double_click_match = re.search( 188 | r"left_double\(start_box='<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>'\)", action_text 189 | ) 190 | if double_click_match: 191 | rel_x = int(double_click_match.group(1)) 192 | rel_y = int(double_click_match.group(2)) 193 | pred_x = round(width * rel_x / 1000) 194 | pred_y = round(height * rel_y / 1000) 195 | return { 196 | 'pred_x': pred_x, 197 | 'pred_y': pred_y, 198 | 'content': content, 199 | 'raw_responses': prediction.get('raw_response', '{}'), 200 | } 201 | 202 | # Process generic coordinate pattern 203 | coord_match = re.search(r'\((\d+),\s*(\d+)\)', content) 204 | if coord_match: 205 | rel_x = int(coord_match.group(1)) 206 | rel_y = int(coord_match.group(2)) 207 | pred_x = round(width * rel_x / 1000) 208 | pred_y = round(height * rel_y / 1000) 209 | return { 210 | 'pred_x': pred_x, 211 | 'pred_y': pred_y, 212 | 'content': content, 213 | 'raw_responses': prediction.get('raw_response', '{}'), 214 | } 215 | 216 | # Process x=X, y=Y format 217 | x_match = re.search(r'x\s*=\s*(\d+)', content, re.IGNORECASE) 218 | y_match = re.search(r'y\s*=\s*(\d+)', content, re.IGNORECASE) 219 | if x_match and y_match: 220 | rel_x = int(x_match.group(1)) 221 | rel_y = int(y_match.group(1)) 222 | pred_x = round(width * rel_x / 1000) 223 | pred_y = round(height * rel_y / 1000) 224 | return { 225 | 'pred_x': pred_x, 226 | 'pred_y': pred_y, 227 | 'content': content, 228 | 'raw_responses': prediction.get('raw_response', '{}'), 229 | } 230 | 231 | # Process box format 232 | box_match = re.search(r'<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>', content) 233 | if box_match: 234 | rel_x = int(box_match.group(1)) 235 | rel_y = int(box_match.group(2)) 236 | pred_x = round(width * rel_x / 1000) 237 | pred_y = round(height * rel_y / 1000) 238 | return { 239 | 'pred_x': pred_x, 240 | 'pred_y': pred_y, 241 | 'content': content, 242 | 'raw_responses': prediction.get('raw_response', '{}'), 243 | } 244 | 245 | return { 246 | 'pred_x': None, 247 | 'pred_y': None, 248 | 'content': content, 249 | 'raw_responses': prediction.get('raw_response', '{}'), 250 | 'error': 'No coordinates found in response', 251 | } 252 | -------------------------------------------------------------------------------- /clicks/README.md: -------------------------------------------------------------------------------- 1 | # showdown-clicks 2 | 3 | General Agents 4 | 5 | [🤗 Dataset](https://huggingface.co/datasets/generalagents/showdown-clicks) | [GitHub](https://github.com/generalagents/showdown) 6 | 7 | `showdown` is a suite of offline and online benchmarks for computer-use agents. 8 | 9 | `showdown-clicks` is a collection of 5,679 left clicks of humans performing various tasks in a macOS desktop environment. It is intended to evaluate instruction-following and low-level control capabilities of computer-use agents. 10 | 11 | As of March 2025, we are releasing a subset of the full set, `showdown-clicks-dev`, containing 557 clicks. All examples are annotated with the bounding box of viable click locations for the UI element. 
12 | 13 | The episodes range from tens of seconds to minutes, and screenshots are between WXGA (1280×800) and WSXGA+ (1680×1050). The recordings contain no PII and were collected in late 2024. 14 | 15 | | Column | Description | 16 | |--------|-------------| 17 | | id | Unique identifier for each data entry (alphanumeric string) | 18 | | image | Path to the screenshot image file showing the UI state | 19 | | instruction | Natural language instruction describing the task to be performed | 20 | | x1 | Top-left x-coordinate of the bounding box | 21 | | y1 | Top-left y-coordinate of the bounding box | 22 | | x2 | Bottom-right x-coordinate of the bounding box | 23 | | y2 | Bottom-right y-coordinate of the bounding box | 24 | | width | Width of the image | 25 | | height | Height of the image | 26 | 27 | ## `showdown-clicks-dev` Results 28 | 29 | | Model | Accuracy | 95% CI | Latency [^1] | 95% CI | 30 | |------------------------------------------------------|---------------|---------------------|--------------|-------------------------| 31 | | `ace-control-medium` | **77.56%** | +3.41%/-3.59% | 533ms | +8ms/-7ms | 32 | | `ace-control-small` | 72.89% | +3.59%/-3.77% | **324ms** | +7ms/-7ms | 33 | | Operator (OpenAI CUA, macOS) | 64.27% | +3.95%/-3.95% | 6385ms | +182ms/-177ms | 34 | | Molmo-72B-0924 | 54.76% | +4.13%/-4.13% | 6599ms | +113ms/-114ms | 35 | | Claude 3.7 Sonnet (Thinking, Computer Use) | 53.68% | +4.13%/-4.13% | 9656ms | +95ms/-97ms | 36 | | UI-TARS-72B-SFT | 54.4% | +4.13%/-4.13% | 1977ms | +15ms/-16ms | 37 | | OmniParser V2 + GPT-4o | 51.71% | +4.12%/-4.13% | 12642ms | +361ms/-349ms | 38 | | Gemini 2.0 Flash | 33.39% | +3.95%/-3.95% | 3069ms | +16ms/-16ms | 39 | | Qwen2.5-VL-72B-Instruct | 24.78% | +3.59%/-3.60% | 3790ms | +57ms/-55ms | 40 | | GPT-4o | 5.21% | +1.97%/-1.80% | 2500ms | +49ms/-48ms | 41 | 42 | ### Run evals 43 | ```bash 44 | uv run eval.py --model ace --dataset dev --num-workers 1 --run-id showdown-clicks-dev 45 | uv run eval.py --model claude --dataset dev --num-workers 16 --run-id showdown-clicks-dev 46 | uv run eval.py --model qwen --dataset dev --num-workers 3 --run-id showdown-clicks-dev 47 | uv run eval.py --model gemini --dataset dev --num-workers 16 --run-id showdown-clicks-dev 48 | uv run eval.py --model openai --dataset dev --num-workers 16 --run-id showdown-clicks-dev 49 | uv run eval.py --model openai-cua --dataset dev --num-workers 16 --run-id showdown-clicks-dev 50 | uv run eval.py --model molmo --dataset dev --num-workers 2 --run-id showdown-clicks-dev --api-url $YOUR_MOLMO_MODAL_API 51 | uv run eval.py --model ui-tars --dataset dev --run-id showdown-clicks-dev --api-url $YOUR_UITARS_MODAL_API --api-key $YOUR_UITARS_API_KEY --num-workers 1 --ui-tars-model bytedance-research/UI-TARS-72B-SFT 52 | uv run eval.py --model omniparser --dataset dev --run-id showdown-clicks-dev --omniparser-model gpt-4o-2024-05-13 --api-url $YOUR_OMNIPARSER_MODAL_API --num-workers 4 53 | ``` 54 | 55 | When you are done with the evals, go to Modal's UI and terminate the individual apps. 
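Beyond the `eval.py` CLI, each integration can also be driven directly from Python. Below is a minimal sketch using the OpenAI integration; the item values are hypothetical placeholders, and real rows come from `data/showdown-clicks-dev/data.csv`:

```python
from clicks.third_party.openai import get_openai_api_client

# Reads OPENAI_API_KEY from the environment when no api_key is passed.
client = get_openai_api_client(model='gpt-4o')

# Hypothetical example row; real items are loaded from the dataset CSV.
item = {
  'id': 'example-0001',
  'recording_id': 'example-recording',
  'instruction': 'Click the Submit button',
  'image': 'example_frame.png',  # resolved relative to frames_dir
  'x1': 100, 'y1': 200, 'x2': 180, 'y2': 230,
}

result = client.process_single_item(
  item,
  frames_dir='data/showdown-clicks-dev/frames',
  run_id='showdown-clicks-dev',
)
print(result.is_in_bbox, result.latency_seconds)
```

Every integration exposes the same `predict`, `parse_prediction`, and `process_single_item` interface, so the same pattern applies to the other clients.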
56 | 57 | ## Directory Structure 58 | 59 | The project is organized as follows: 60 | 61 | - `data/`: Input data 62 | - `showdown-clicks-dev/data.csv`: Records 63 | - `showdown-clicks-dev/frames`: Image frames 64 | 65 | - `results/`: Output data 66 | - CSV result files from evaluations 67 | - `showdown-clicks-dev/{$MODEL}/visualizations/`: Visualizations of model predictions 68 | - `report/`: Analysis reports and summary metrics 69 | 70 | - `scripts/`: Utility scripts 71 | - `calculate_latency.py`: Script to calculate latency metrics 72 | - `collect_runs.py`: Script to collect results from multiple runs 73 | 74 | - `src/clicks/`: Main source code 75 | - `api_client_base.py`: Base API client classes 76 | - `evaluate/`: Evaluation code 77 | - `ace.py`: Ace model implementation 78 | - `models.py`: Data models for evaluation 79 | - `utils.py`: Utilities for visualization and evaluation 80 | - `third_party/`: Third-party model integrations 81 | - `claude/`: Claude model integration 82 | - `gemini/`: Gemini model integration 83 | - `molmo/`: Molmo model integration 84 | - `omniparser/`: OmniParser model integration 85 | - `openai/`: OpenAI model integration 86 | - `openai_cua/`: OpenAI Computer Use Agent integration 87 | - `qwen/`: Qwen model integration 88 | - `ui_tars/`: UI-TARS model integration 89 | 90 | ## Usage 91 | 92 | To run the evaluation, use the `eval.py` script: 93 | 94 | ```bash 95 | # Run on the dev dataset (default) 96 | uv run eval.py 97 | 98 | # Run with a specific model (ace, claude, qwen, etc.) 99 | uv run eval.py --model claude --api-key YOUR_API_KEY 100 | 101 | # Run with a limited sample size (for testing) 102 | uv run eval.py --sample-size 10 103 | 104 | # Run with multiple workers for parallel processing 105 | uv run eval.py --num-workers 4 106 | 107 | # Run with a custom output file 108 | uv run eval.py --output-file results/custom_results.csv 109 | ``` 110 | 111 | ### Model-specific options 112 | 113 | #### Computer-use agents 114 | 115 | Claude: 116 | ```bash 117 | uv run eval.py --model claude --api-key YOUR_ANTHROPIC_API_KEY --claude-model claude-3-7-sonnet-20250219 --thinking-budget 1024 118 | ``` 119 | 120 | Qwen: 121 | ```bash 122 | uv run eval.py --model qwen --api-key YOUR_DASHSCOPE_API_KEY --qwen-model qwen2.5-vl-72b-instruct --max-tokens 4096 123 | ``` 124 | 125 | UI-TARS: 126 | ```bash 127 | uv run eval.py --model ui-tars --api-url YOUR_UITARS_API_URL --api-key YOUR_API_KEY --ui-tars-model bytedance-research/UI-TARS-72B-SFT --max-tokens 128 --temperature 0.0 --frequency-penalty 1.0 128 | ``` 129 | 130 | OmniParser: 131 | ```bash 132 | uv run eval.py --model omniparser --dataset dev --run-id showdown-clicks-dev --omniparser-model gpt-4o-2024-05-13 --api-url YOUR_OMNIPARSER_API_URL --omniparser-temperature 0.7 133 | ``` 134 | 135 | Operator: 136 | ```bash 137 | uv run eval.py --model openai-cua --dataset dev --run-id showdown-clicks-dev --environment mac 138 | ``` 139 | 140 | Ace (default): 141 | ```bash 142 | uv run eval.py --model ace 143 | ``` 144 | 145 | #### VLMs 146 | 147 | OpenAI: 148 | ```bash 149 | uv run eval.py --model openai --api-key YOUR_OPENAI_API_KEY --openai-model gpt-4o --dataset dev 150 | ``` 151 | 152 | Gemini: 153 | ```bash 154 | uv run eval.py --model gemini --api-key YOUR_GEMINI_API_KEY --gemini-model gemini-1.5-pro-latest --dataset dev 155 | ``` 156 | 157 | Molmo: 158 | ```bash 159 | uv run eval.py --model molmo --api-url YOUR_MOLMO_API_URL --dataset dev 160 | ``` 161 | 162 | ## Environment Variables 163 | 164 | Alternative to passing 
API keys as command-line arguments: 165 | 166 | - `ANTHROPIC_API_KEY`: API key for Claude 167 | - `DASHSCOPE_API_KEY`: API key for Qwen 168 | - `OPENAI_API_KEY`: API key for OpenAI and OpenAI CUA 169 | - `GEMINI_API_KEY`: API key for Gemini 170 | - `GENERALAGENTS_API_KEY`: API key for General Agents (Ace) 171 | 172 | ## Visualization 173 | 174 | The evaluation script generates visualizations of model predictions, showing both the ground truth click position, bounding box, and the predicted click position. These visualizations are saved in the `results/[run-id]/[model]/visualizations/` directory, organized by model and correctness. 175 | 176 | ## Results Format 177 | 178 | The evaluation results are saved as CSV files in the `results/[run-id]/` directory. Each row in the CSV file contains: 179 | 180 | | Column | Description | 181 | |--------|-------------| 182 | | id | Unique identifier for the evaluation item | 183 | | recording_id | Identifier for the recording session | 184 | | instruction | The instruction given to the model | 185 | | image_path | Path to the image file | 186 | | gt_x1 | Ground truth bounding box left X-coordinate | 187 | | gt_y1 | Ground truth bounding box top Y-coordinate | 188 | | gt_x2 | Ground truth bounding box right X-coordinate | 189 | | gt_y2 | Ground truth bounding box bottom Y-coordinate | 190 | | pred_x | Predicted X-coordinate | 191 | | pred_y | Predicted Y-coordinate | 192 | | is_in_bbox | Whether the prediction is within the ground truth bounding box | 193 | | latency_seconds | Time taken for the model to make the prediction | 194 | | visualization_path | Path to the visualization image | 195 | | raw_response | Raw response from the model | 196 | 197 | ## Metrics 198 | 199 | The evaluation script calculates the percentage of correct predictions (within the bounding box), with 95% confidence intervals created from bootstrapping. 200 | 201 | ## License 202 | 203 | This project is licensed under the MIT License - see the LICENSE file for details. 204 | 205 | ## Disclaimer 206 | 207 | The images used in this evaluation dataset may contain content that some users might find offensive, inappropriate, or objectionable. These images are included solely for the purpose of evaluating model performance on realistic computer use scenarios. 208 | 209 | We do not endorse, approve of, or claim responsibility for any content displayed in these images. The inclusion of any image in this dataset does not represent our views or opinions, and is not intended to promote any particular content, website, or viewpoint. 210 | 211 | Researchers and users of this evaluation framework should be aware of this possibility when reviewing results and visualizations. 212 | 213 | ## Citation 214 | 215 | If you use `showdown-clicks` in your research, please cite it as follows: 216 | 217 | ```bibtex 218 | @misc{showdown2025, 219 | title={The Showdown Computer Control Evaluation Suite}, 220 | author={General Agents Team}, 221 | year={2025}, 222 | url={https://github.com/generalagents/showdown}, 223 | } 224 | ``` 225 | 226 | [^1]: Latency values vary significantly by provider, demand, computational resources, geographical location, and other factors - most of which are opaque to us for models we don't have direct access to. Ace models are served via General Agent's API; Qwen, Claude, Gemini, and OpenAI models utilize their respective first-party APIs; while Molmo, UI-TARS, and OmniParser models are served through Modal. 
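## Appendix: Bootstrapping the Confidence Intervals

The 95% confidence intervals in the Metrics section above are produced by bootstrapping; a percentile bootstrap such as the sketch below is one common way to compute them. This is illustrative only, not the repository's `analyze_results` implementation, and the outcome vector here is synthetic.

```python
import numpy as np


def bootstrap_accuracy_ci(correct: np.ndarray, n_resamples: int = 10_000, alpha: float = 0.05, seed: int = 0):
  """Percentile bootstrap CI for in-bbox accuracy over a 0/1 outcome vector."""
  rng = np.random.default_rng(seed)
  n = len(correct)
  idx = rng.integers(0, n, size=(n_resamples, n))   # resample items with replacement
  accs = correct[idx].mean(axis=1)                  # accuracy of each bootstrap replicate
  low, high = np.quantile(accs, [alpha / 2, 1 - alpha / 2])
  return correct.mean(), low, high


# Synthetic example: 557 items with 432 correct (roughly the size of the dev set).
outcomes = np.array([1] * 432 + [0] * 125)
acc, low, high = bootstrap_accuracy_ci(outcomes)
print(f'accuracy {acc:.2%}, 95% CI [{low:.2%}, {high:.2%}]')
```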
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import math 5 | import os 6 | import re 7 | import time 8 | from typing import Any, Dict, Optional, Tuple 9 | 10 | from openai import OpenAI 11 | from PIL import Image 12 | from tenacity import retry, stop_after_attempt, wait_exponential 13 | 14 | DEFAULT_MODEL = 'qwen2.5-vl-72b-instruct' 15 | DASHSCOPE_API_ENDPOINT = 'https://dashscope.aliyuncs.com/compatible-mode/v1' 16 | 17 | # Qwen performs better with this slightly modified prompt adapted from the official cookbook 18 | QWEN_ACTION_SPACE = { 19 | 'type': 'function', 20 | 'function': { 21 | 'name_for_human': 'computer_use', 22 | 'name': 'computer_use', 23 | 'description': "Use a mouse and keyboard to interact with a computer, and take screenshots.\\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\\n* The screen's resolution is {RES_WIDTH}x{RES_HEIGHT}.\\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", 24 | 'parameters': { 25 | 'properties': { 26 | 'action': { 27 | 'description': 'The action to perform. The available actions are:\\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `left_click_drag`: Click and drag the cursor from a start coordinate to an end coordinate on the screen.\\n* `right_click`: Click the right mouse button.\\n* `double_click`: Double-click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `wait`: Wait for the change to happen.\\n* `terminate`: Terminate the current task when it is completed.', 28 | 'enum': [ 29 | 'key', 30 | 'type', 31 | 'mouse_move', 32 | 'left_click', 33 | 'left_click_drag', 34 | 'right_click', 35 | 'double_click', 36 | 'scroll', 37 | 'wait', 38 | 'terminate', 39 | ], 40 | 'type': 'string', 41 | }, 42 | 'keys': {'description': 'Required only by `action=key`.', 'type': 'array'}, 43 | 'text': {'description': 'Required only by `action=type`.', 'type': 'string'}, 44 | 'start_coordinate': { 45 | 'description': '(x, y): The starting x (pixels from the left edge) and y (pixels from the top edge) coordinates. Required only by `action=left_click_drag`.', 46 | 'type': 'array', 47 | }, 48 | 'end_coordinate': { 49 | 'description': '(x, y): The ending x (pixels from the left edge) and y (pixels from the top edge) coordinates. 
Required only by `action=left_click_drag`.', 50 | 'type': 'array', 51 | }, 52 | 'coordinate': { 53 | 'description': '(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required by `action=mouse_move, action=left_click, action=right_click, action=double_click`.', 54 | 'type': 'array', 55 | }, 56 | 'pixels': { 57 | 'description': 'The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.', 58 | 'type': 'number', 59 | }, 60 | }, 61 | 'required': ['action'], 62 | 'type': 'object', 63 | }, 64 | 'args_format': 'Format the arguments as a JSON object.', 65 | }, 66 | } 67 | 68 | BASE_PROMPT_TEMPLATE = ( 69 | """# Tools 70 | 71 | You MUST call a single function to assist with the user query. Do not call multiple functions, and do not answer the user's query without calling a function. 72 | 73 | You are provided with function signatures within XML tags: 74 | """ 75 | + json.dumps(QWEN_ACTION_SPACE) 76 | + """ 77 | 78 | For each function call, return a json object with function name and arguments within XML tags: 79 | 80 | {"name": , "arguments": } 81 | """ 82 | ) 83 | 84 | 85 | def smart_resize( 86 | height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 12845056 87 | ): 88 | if height < factor or width < factor: 89 | raise ValueError(f'height:{height} or width:{width} must be larger than factor:{factor}') 90 | elif max(height, width) / min(height, width) > 200: 91 | raise ValueError( 92 | f'absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}' 93 | ) 94 | h_bar = round(height / factor) * factor 95 | w_bar = round(width / factor) * factor 96 | if h_bar * w_bar > max_pixels: 97 | beta = math.sqrt((height * width) / max_pixels) 98 | h_bar = math.floor(height / beta / factor) * factor 99 | w_bar = math.floor(width / beta / factor) * factor 100 | elif h_bar * w_bar < min_pixels: 101 | beta = math.sqrt(min_pixels / (height * width)) 102 | h_bar = math.ceil(height * beta / factor) * factor 103 | w_bar = math.ceil(width * beta / factor) * factor 104 | return h_bar, w_bar 105 | 106 | 107 | class QwenVLClient: 108 | def __init__( 109 | self, 110 | api_key: Optional[str] = None, 111 | api_endpoint: str = DASHSCOPE_API_ENDPOINT, 112 | model: str = DEFAULT_MODEL, 113 | max_tokens: int = 4096, 114 | use_smart_resize: bool = True, 115 | resize_factor: int = 28, 116 | min_pixels: int = 3136, 117 | max_pixels: int = 12845056, 118 | ): 119 | self.api_key = api_key or os.environ.get('DASHSCOPE_API_KEY') 120 | if not self.api_key: 121 | raise ValueError( 122 | 'API key must be provided either as an argument or through the DASHSCOPE_API_KEY environment variable' 123 | ) 124 | 125 | self.api_endpoint = api_endpoint 126 | self.model = model 127 | self.max_tokens = max_tokens 128 | self.display_width = None 129 | self.display_height = None 130 | self.original_width = None 131 | self.original_height = None 132 | self.use_smart_resize = use_smart_resize 133 | self.resize_factor = resize_factor 134 | self.min_pixels = min_pixels 135 | self.max_pixels = max_pixels 136 | 137 | def _create_client(self) -> OpenAI: 138 | return OpenAI( 139 | api_key=self.api_key, 140 | base_url=self.api_endpoint, 141 | ) 142 | 143 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 144 | try: 145 | image_data = base64.b64decode(base64_data) 146 | image = Image.open(io.BytesIO(image_data)) 147 | width, height = image.size 148 | 
return width, height 149 | except Exception as e: 150 | print(f'Error extracting image dimensions: {e}') 151 | return 1024, 768 152 | 153 | def _resize_image(self, base64_data: str) -> Tuple[str, int, int]: 154 | try: 155 | image_data = base64.b64decode(base64_data) 156 | image = Image.open(io.BytesIO(image_data)) 157 | self.original_width, self.original_height = image.size 158 | 159 | new_height, new_width = smart_resize( 160 | self.original_height, 161 | self.original_width, 162 | factor=self.resize_factor, 163 | min_pixels=self.min_pixels, 164 | max_pixels=self.max_pixels, 165 | ) 166 | 167 | resized_image = image.resize((new_width, new_height), resample=2) 168 | 169 | buffer = io.BytesIO() 170 | resized_image.save(buffer, format=image.format or 'JPEG') 171 | buffer.seek(0) 172 | new_base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8') 173 | 174 | return new_base64_data, new_width, new_height 175 | except Exception as e: 176 | print(f'Error resizing image: {e}') 177 | width = self.original_width or 1024 178 | height = self.original_height or 768 179 | return base64_data, width, height 180 | 181 | def _translate_coordinates( 182 | self, x: Optional[int], y: Optional[int] 183 | ) -> Tuple[Optional[int], Optional[int]]: 184 | if x is None or y is None: 185 | return x, y 186 | 187 | orig_width = self.original_width or 1024 188 | orig_height = self.original_height or 768 189 | disp_width = self.display_width or 1024 190 | disp_height = self.display_height or 768 191 | 192 | x_scale = orig_width / disp_width 193 | y_scale = orig_height / disp_height 194 | 195 | original_x = round(x * x_scale) 196 | original_y = round(y * y_scale) 197 | 198 | return original_x, original_y 199 | 200 | def _create_system_prompt(self) -> str: 201 | width = self.display_width or 1024 202 | height = self.display_height or 768 203 | 204 | prompt = BASE_PROMPT_TEMPLATE.replace('{RES_WIDTH}', str(width)).replace( 205 | '{RES_HEIGHT}', str(height) 206 | ) 207 | 208 | return prompt 209 | 210 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 211 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 212 | if ',' in image_data_uri: 213 | base64_data = image_data_uri.split(',')[1] 214 | mime_type = image_data_uri.split(',')[0] 215 | else: 216 | base64_data = image_data_uri 217 | mime_type = 'data:image/jpeg;base64' 218 | 219 | self.original_width, self.original_height = self._extract_image_dimensions(base64_data) 220 | 221 | if self.use_smart_resize: 222 | base64_data, new_width, new_height = self._resize_image(base64_data) 223 | self.display_width, self.display_height = new_width, new_height 224 | else: 225 | self.display_width, self.display_height = self.original_width, self.original_height 226 | 227 | image_data_uri = f'{mime_type},{base64_data}' 228 | 229 | system_prompt = self._create_system_prompt() 230 | 231 | try: 232 | client = self._create_client() 233 | 234 | start_time = time.time() 235 | 236 | response = client.chat.completions.create( 237 | model=self.model, 238 | temperature=0.0, 239 | max_tokens=self.max_tokens, 240 | messages=[ 241 | { 242 | 'role': 'system', 243 | 'content': [ 244 | { 245 | 'type': 'text', 246 | 'text': 'You are a helpful assistant.', 247 | }, 248 | {'type': 'text', 'text': system_prompt}, 249 | ], 250 | }, 251 | { 252 | 'role': 'user', 253 | 'content': [ 254 | {'type': 'image_url', 'image_url': {'url': image_data_uri}}, 255 | {'type': 'text', 'text': prompt}, 256 | ], 257 | }, 258 | ], 259 | ) 260 | 261 | result 
= response.model_dump() 262 | 263 | raw_response = json.dumps(result) 264 | 265 | assistant_message = result.get('choices', [{}])[0].get('message', {}) 266 | content = assistant_message.get('content', '') 267 | 268 | return { 269 | 'raw_response': raw_response, 270 | 'content': content, 271 | 'latency': time.time() - start_time, 272 | 'original_width': self.original_width, 273 | 'original_height': self.original_height, 274 | 'display_width': self.display_width, 275 | 'display_height': self.display_height, 276 | } 277 | 278 | except Exception as e: 279 | print(f'API Error: {str(e)}') 280 | return None 281 | 282 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 283 | if not prediction or 'content' not in prediction: 284 | return { 285 | 'pred_x': None, 286 | 'pred_y': None, 287 | 'raw_responses': prediction.get('raw_response', '{}'), 288 | } 289 | 290 | content = prediction['content'] 291 | 292 | print(f'Content: {content}') 293 | 294 | tool_call_match = re.search(r'\s*(\{.*?\})\s*', content, flags=re.DOTALL) 295 | if not tool_call_match: 296 | return { 297 | 'pred_x': None, 298 | 'pred_y': None, 299 | 'raw_responses': prediction.get('raw_response', '{}'), 300 | } 301 | 302 | try: 303 | json_text = tool_call_match.group(1) 304 | data = json.loads(json_text) 305 | 306 | if 'arguments' not in data: 307 | return { 308 | 'pred_x': None, 309 | 'pred_y': None, 310 | 'raw_responses': prediction.get('raw_response', '{}'), 311 | } 312 | 313 | args = data['arguments'] 314 | action_str = args.get('action') 315 | 316 | if not action_str: 317 | return { 318 | 'pred_x': None, 319 | 'pred_y': None, 320 | 'raw_responses': prediction.get('raw_response', '{}'), 321 | } 322 | 323 | pred_x = None 324 | pred_y = None 325 | if ( 326 | 'coordinate' in args 327 | and isinstance(args['coordinate'], list) 328 | and len(args['coordinate']) == 2 329 | ): 330 | pred_x = int(args['coordinate'][0]) 331 | pred_y = int(args['coordinate'][1]) 332 | 333 | if self.use_smart_resize and pred_x is not None and pred_y is not None: 334 | pred_x, pred_y = self._translate_coordinates(pred_x, pred_y) 335 | 336 | return { 337 | 'pred_x': pred_x, 338 | 'pred_y': pred_y, 339 | 'raw_responses': prediction.get('raw_response', '{}'), 340 | 'original_width': self.original_width, 341 | 'original_height': self.original_height, 342 | 'display_width': self.display_width, 343 | 'display_height': self.display_height, 344 | } 345 | 346 | except Exception as e: 347 | print(f'Error parsing prediction: {e}') 348 | return { 349 | 'pred_x': None, 350 | 'pred_y': None, 351 | 'raw_responses': prediction.get('raw_response', '{}'), 352 | } 353 | -------------------------------------------------------------------------------- /clicks/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | from datetime import datetime 5 | from functools import partial 6 | from typing import Any, Dict, List, Optional 7 | 8 | import clicks.evaluate.ace as ace 9 | import pandas as pd 10 | from clicks.api_client_base import AbstractAPIClient 11 | from clicks.evaluate.models import EvaluationResult 12 | from clicks.evaluate.utils import analyze_results 13 | from clicks.third_party import ( 14 | get_claude_api_client, 15 | get_gemini_api_client, 16 | get_molmo_api_client, 17 | get_omniparser_api_client, 18 | get_openai_api_client, 19 | get_openai_cua_api_client, 20 | get_qwen_api_client, 21 | get_ui_tars_api_client, 22 | ) 23 | from colorama import Fore, 
Style, init 24 | from tqdm import tqdm 25 | 26 | init(autoreset=True) 27 | 28 | 29 | def process_item( 30 | item: Dict[str, Any], 31 | frames_dir: str, 32 | api_client: AbstractAPIClient, 33 | run_id: str, 34 | ) -> EvaluationResult: 35 | try: 36 | image_path = item.get('image', '') 37 | 38 | if not image_path or image_path.startswith('data:'): 39 | print(f'Skipping item {item.get("id", "unknown")}: Invalid image path: {image_path}') 40 | raise ValueError(f'Invalid image path: {image_path}') 41 | 42 | if hasattr(image_path, 'item'): 43 | image_path = image_path.item() 44 | 45 | image_path = str(image_path) 46 | result = api_client.process_single_item(item, frames_dir, run_id) 47 | 48 | return result 49 | except Exception as e: 50 | print(f'Error processing item {item.get("id", "unknown")}: {str(e)}') 51 | raise e 52 | 53 | 54 | def evaluate_csv( 55 | csv_file: str, 56 | frames_dir: str, 57 | api_client: Any, 58 | output_file: Optional[str] = None, 59 | sample_size: Optional[int] = None, 60 | num_workers: int = 1, 61 | run_id: str = datetime.now().strftime('%Y-%m-%d-%H-%M'), 62 | ) -> List[Dict[str, Any]]: 63 | results: List[Dict[str, Any]] = [] 64 | 65 | df = pd.read_csv(csv_file) 66 | print(f'Loaded {len(df)} items from {csv_file}') 67 | 68 | if sample_size is not None and sample_size < len(df): 69 | df = df.sample(sample_size, random_state=42) 70 | print(f'Sampled {len(df)} items for evaluation') 71 | 72 | items: List[Dict[str, Any]] = [ 73 | {str(k): v for k, v in item.items()} for item in df.to_dict('records') 74 | ] 75 | 76 | print(f'Using {num_workers} concurrent workers for processing') 77 | 78 | process_func = partial( 79 | process_item, 80 | frames_dir=frames_dir, 81 | api_client=api_client, 82 | run_id=run_id, 83 | ) 84 | 85 | try: 86 | multiprocessing.set_start_method('spawn', force=True) 87 | except RuntimeError: 88 | pass 89 | 90 | results = [] 91 | total_processed = 0 92 | total_in_bbox = 0 93 | 94 | try: 95 | with multiprocessing.Pool(processes=num_workers, maxtasksperchild=1) as pool: 96 | with tqdm(total=len(items), desc='Evaluating', unit='item') as pbar: 97 | for result in pool.imap_unordered(process_func, items): 98 | if result is not None: 99 | results.append(result.model_dump()) 100 | total_processed += 1 101 | if result.is_in_bbox: 102 | total_in_bbox += 1 103 | running_accuracy = (total_in_bbox / total_processed) * 100 if total_processed > 0 else 0 104 | pbar.set_postfix({'accuracy': f'{running_accuracy:.2f}%'}) 105 | 106 | pbar.update(1) 107 | except Exception as e: 108 | print(f'{Fore.RED}Error in multiprocessing: {e}{Style.RESET_ALL}') 109 | raise e 110 | 111 | if output_file and results: 112 | results_df = pd.DataFrame(results) 113 | results_df.to_csv(output_file, index=False) 114 | print(f'Results written to {output_file}') 115 | 116 | return results 117 | 118 | 119 | def main(): 120 | parser = argparse.ArgumentParser( 121 | description='Evaluate models on the clicks dataset with bounding box and pixel distance evaluation' 122 | ) 123 | parser.add_argument( 124 | '--dataset', 125 | type=str, 126 | choices=['dev', 'full'], 127 | default='dev', 128 | help='Dataset to evaluate on (dev or full)', 129 | ) 130 | parser.add_argument( 131 | '--model', 132 | type=str, 133 | choices=[ 134 | 'ace', 135 | 'claude', 136 | 'qwen', 137 | 'openai', 138 | 'openai-cua', 139 | 'gemini', 140 | 'molmo', 141 | 'ui-tars', 142 | 'omniparser', 143 | ], 144 | default='ace', 145 | help='Model to use for evaluation (ace, claude, qwen, openai, openai-cua, gemini, molmo, ui-tars, or 
omniparser)', 146 | ) 147 | parser.add_argument( 148 | '--api-url', 149 | type=str, 150 | default='', 151 | help='API endpoint for the model', 152 | ) 153 | parser.add_argument( 154 | '--api-key', 155 | type=str, 156 | default=None, 157 | help='API key for the model (if required)', 158 | ) 159 | parser.add_argument( 160 | '--claude-model', 161 | type=str, 162 | default='claude-3-7-sonnet-20250219', 163 | help='Claude model to use (default: claude-3-7-sonnet-20250219)', 164 | ) 165 | parser.add_argument( 166 | '--thinking-budget', 167 | type=int, 168 | default=1024, 169 | help='Budget for Claude thinking tokens (default: 1024, 0 to disable)', 170 | ) 171 | parser.add_argument( 172 | '--tool-version', 173 | type=str, 174 | default='20250124', 175 | help='Version of Claude computer use tools (default: 20250124)', 176 | ) 177 | parser.add_argument( 178 | '--qwen-model', 179 | type=str, 180 | default='qwen2.5-vl-72b-instruct', 181 | help='Qwen model to use (default: qwen2.5-vl-72b-instruct)', 182 | ) 183 | parser.add_argument( 184 | '--openai-model', 185 | type=str, 186 | default='gpt-4o', 187 | help='OpenAI model to use (default: gpt-4o)', 188 | ) 189 | parser.add_argument( 190 | '--openai-cua-model', 191 | type=str, 192 | default='computer-use-preview', 193 | help='OpenAI CUA model to use (default: computer-use-preview)', 194 | ) 195 | parser.add_argument( 196 | '--environment', 197 | type=str, 198 | default='mac', 199 | choices=['browser', 'mac', 'windows', 'ubuntu'], 200 | help='Environment for OpenAI CUA (default: browser)', 201 | ) 202 | parser.add_argument( 203 | '--reasoning-effort', 204 | type=str, 205 | default='medium', 206 | choices=['low', 'medium', 'high'], 207 | help='Reasoning effort for OpenAI (default: medium)', 208 | ) 209 | parser.add_argument( 210 | '--max-tokens', 211 | type=int, 212 | default=4096, 213 | help='Maximum tokens for model response (default: 4096)', 214 | ) 215 | parser.add_argument( 216 | '--sample-size', 217 | type=int, 218 | default=None, 219 | help='Number of samples to evaluate (optional, for testing)', 220 | ) 221 | parser.add_argument( 222 | '--output-file', type=str, default=None, help='Path to output CSV file (optional)' 223 | ) 224 | parser.add_argument( 225 | '--num-workers', 226 | type=int, 227 | default=1, 228 | help='Number of concurrent workers for processing (default: 1)', 229 | ) 230 | parser.add_argument( 231 | '--run-id', 232 | type=str, 233 | default=None, 234 | help='Custom run ID (optional, defaults to current timestamp)', 235 | ) 236 | parser.add_argument( 237 | '--gemini-model', 238 | type=str, 239 | default='gemini-1.5-pro-latest', 240 | help='Gemini model to use (default: gemini-1.5-pro-latest)', 241 | ) 242 | parser.add_argument( 243 | '--top-p', 244 | type=float, 245 | default=0.9, 246 | help='Top-p sampling parameter (default: 0.9)', 247 | ) 248 | parser.add_argument( 249 | '--top-k', 250 | type=int, 251 | default=50, 252 | help='Top-k sampling parameter (default: 50)', 253 | ) 254 | parser.add_argument( 255 | '--temperature', 256 | type=float, 257 | default=0.0, 258 | help='Temperature for sampling (default: 0.0)', 259 | ) 260 | parser.add_argument( 261 | '--frequency-penalty', 262 | type=float, 263 | default=1.0, 264 | help='Frequency penalty parameter for UI-TARS (default: 1.0)', 265 | ) 266 | parser.add_argument( 267 | '--ui-tars-model', 268 | type=str, 269 | default='bytedance-research/UI-TARS-72B-SFT', 270 | help='UI-TARS model to use (default: bytedance-research/UI-TARS-72B-SFT)', 271 | ) 272 | parser.add_argument( 273 | 
'--omniparser-model', 274 | type=str, 275 | default='gpt-4o-2024-05-13', 276 | help='OmniParser model to use (default: gpt-4o-2024-05-13)', 277 | ) 278 | parser.add_argument( 279 | '--omniparser-temperature', 280 | type=float, 281 | default=0.7, 282 | help='Temperature for OmniParser generation (default: 0.7)', 283 | ) 284 | parser.add_argument( 285 | '--ace-model', 286 | type=str, 287 | default='ace-control-medium', 288 | help='Ace model to use (default: ace-control-medium)', 289 | ) 290 | args = parser.parse_args() 291 | 292 | base_dir = os.path.dirname(os.path.abspath(__file__)) 293 | data_dir = os.path.join(base_dir, 'data') 294 | results_dir = os.path.join(base_dir, 'results') 295 | 296 | os.makedirs(results_dir, exist_ok=True) 297 | 298 | run_id = args.run_id or datetime.now().strftime('%Y-%m-%d-%H-%M') 299 | 300 | run_results_dir = os.path.join(results_dir, run_id) 301 | os.makedirs(run_results_dir, exist_ok=True) 302 | 303 | if args.dataset == 'dev': 304 | csv_file = os.path.join(data_dir, 'showdown-clicks-dev/data.csv') 305 | else: 306 | raise ValueError('Full dataset not currently supported') 307 | 308 | frames_dir = os.path.join(data_dir, 'showdown-clicks-dev') 309 | 310 | if args.output_file is None: 311 | if args.model == 'claude': 312 | model_name = args.claude_model.replace('-', '_') 313 | args.output_file = os.path.join( 314 | run_results_dir, f'claude_results_{model_name}_{args.dataset}.csv' 315 | ) 316 | elif args.model == 'qwen': 317 | model_name = args.qwen_model.replace('-', '_').replace('.', '_') 318 | args.output_file = os.path.join( 319 | run_results_dir, f'qwen_results_{model_name}_{args.dataset}.csv' 320 | ) 321 | elif args.model == 'openai': 322 | model_name = args.openai_model.replace('-', '_') 323 | args.output_file = os.path.join( 324 | run_results_dir, f'openai_results_{model_name}_{args.dataset}.csv' 325 | ) 326 | elif args.model == 'openai-cua': 327 | model_name = args.openai_cua_model.replace('-', '_') 328 | args.output_file = os.path.join( 329 | run_results_dir, f'openai_cua_results_{model_name}_{args.dataset}.csv' 330 | ) 331 | elif args.model == 'gemini': 332 | model_name = args.gemini_model.replace('-', '_').replace('.', '_') 333 | args.output_file = os.path.join( 334 | run_results_dir, f'gemini_results_{model_name}_{args.dataset}.csv' 335 | ) 336 | elif args.model == 'molmo': 337 | args.output_file = os.path.join(run_results_dir, f'molmo_results_{args.dataset}.csv') 338 | elif args.model == 'ui-tars': 339 | model_name = args.ui_tars_model.replace('/', '_').replace('-', '_') 340 | args.output_file = os.path.join( 341 | run_results_dir, f'ui_tars_results_{model_name}_{args.dataset}.csv' 342 | ) 343 | elif args.model == 'omniparser': 344 | model_name = args.omniparser_model.replace('-', '_') 345 | args.output_file = os.path.join( 346 | run_results_dir, f'omniparser_results_{model_name}_{args.dataset}.csv' 347 | ) 348 | else: 349 | model_name = args.ace_model.replace('-', '_') 350 | args.output_file = os.path.join( 351 | run_results_dir, f'ace_results_{model_name}_{args.dataset}.csv' 352 | ) 353 | 354 | print(f'{Fore.CYAN}Running evaluation with the following configuration:{Style.RESET_ALL}') 355 | print(f'{Fore.CYAN} Dataset: {args.dataset}{Style.RESET_ALL}') 356 | print(f'{Fore.CYAN} Model: {args.model}{Style.RESET_ALL}') 357 | print(f'{Fore.CYAN} CSV file: {csv_file}{Style.RESET_ALL}') 358 | print(f'{Fore.CYAN} Frames directory: {frames_dir}{Style.RESET_ALL}') 359 | print(f'{Fore.CYAN} Run ID: {run_id}{Style.RESET_ALL}') 360 | print(f'{Fore.CYAN} 
Results directory: {run_results_dir}{Style.RESET_ALL}') 361 | print(f'{Fore.CYAN} Output file: {args.output_file}{Style.RESET_ALL}') 362 | print(f'{Fore.CYAN} Concurrent workers: {args.num_workers}{Style.RESET_ALL}') 363 | 364 | if args.model == 'ace': 365 | print(f' API URL: {args.api_url}') 366 | print(f' Ace model: {args.ace_model}') 367 | elif args.model == 'claude': 368 | print(f' Claude model: {args.claude_model}') 369 | print(f' Thinking budget: {args.thinking_budget}') 370 | print(f' Tool version: {args.tool_version}') 371 | elif args.model == 'qwen': 372 | print(f' Qwen model: {args.qwen_model}') 373 | print(f' Max tokens: {args.max_tokens}') 374 | elif args.model == 'openai': 375 | print(f' OpenAI model: {args.openai_model}') 376 | print(f' Max tokens: {args.max_tokens}') 377 | print(f' Reasoning effort: {args.reasoning_effort}') 378 | elif args.model == 'openai-cua': 379 | print(f' OpenAI CUA model: {args.openai_cua_model}') 380 | print(f' Max tokens: {args.max_tokens}') 381 | print(f' Environment: {args.environment}') 382 | elif args.model == 'gemini': 383 | print(f' Gemini model: {args.gemini_model}') 384 | print(f' Max tokens: {args.max_tokens}') 385 | elif args.model == 'molmo': 386 | print(f' API URL: {args.api_url}') 387 | print(f' Max tokens: {args.max_tokens}') 388 | print(f' Temperature: {args.temperature}') 389 | print(f' Top-p: {args.top_p}') 390 | print(f' Top-k: {args.top_k}') 391 | elif args.model == 'ui-tars': 392 | print(f' API URL: {args.api_url}') 393 | print(f' UI-TARS model: {args.ui_tars_model}') 394 | print(f' Max tokens: {args.max_tokens}') 395 | print(f' Temperature: {args.temperature}') 396 | print(f' Frequency penalty: {args.frequency_penalty}') 397 | elif args.model == 'omniparser': 398 | print(f' API URL: {args.api_url or "https://omniparser-api-omniparser-api.modal.run"}') 399 | print(f' OmniParser model: {args.omniparser_model}') 400 | print(f' Temperature: {args.omniparser_temperature}') 401 | 402 | if args.sample_size: 403 | print(f' Sample size: {args.sample_size}') 404 | 405 | if not os.path.exists(csv_file): 406 | print(f'{Fore.RED}ERROR: CSV file not found: {csv_file}{Style.RESET_ALL}') 407 | return 408 | 409 | if not os.path.exists(frames_dir): 410 | print(f'{Fore.RED}ERROR: Frames directory not found: {frames_dir}{Style.RESET_ALL}') 411 | return 412 | 413 | try: 414 | df = pd.read_csv(csv_file) 415 | print('\nFirst few rows of the CSV file:') 416 | print(df.head(2)) 417 | print(f'\nTotal rows in CSV: {len(df)}') 418 | 419 | if not df.empty: 420 | sample_image_series = df.iloc[0]['image'] 421 | sample_image = ( 422 | sample_image_series.item() 423 | if hasattr(sample_image_series, 'item') 424 | else str(sample_image_series) 425 | ) 426 | print(f'Sample image path: {sample_image}') 427 | 428 | full_path = os.path.join(frames_dir, sample_image) 429 | print(f'Full sample image path: {full_path}') 430 | print(f'Image exists: {os.path.exists(full_path)}') 431 | 432 | if not os.path.exists(full_path): 433 | print(f'{Fore.YELLOW}WARNING: Sample image does not exist at {full_path}{Style.RESET_ALL}') 434 | print( 435 | f'{Fore.YELLOW}Please ensure all images are available in the frames directory:{Style.RESET_ALL}' 436 | ) 437 | print(f' - Extract frames: tar -xf {os.path.join(data_dir, "frames.tar")} -C {data_dir}/') 438 | except Exception as e: 439 | print(f'{Fore.RED}Error reading CSV file: {e}{Style.RESET_ALL}') 440 | 441 | if args.model == 'claude': 442 | api_key = args.api_key or os.environ.get('ANTHROPIC_API_KEY') 443 | if not api_key: 444 | 
print( 445 | f'{Fore.RED}ERROR: Anthropic API key not provided. Please set the ANTHROPIC_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 446 | ) 447 | return 448 | thinking_budget = args.thinking_budget if args.thinking_budget > 0 else None 449 | api_client = get_claude_api_client( 450 | api_key=api_key, 451 | model=args.claude_model, 452 | thinking_budget=thinking_budget, 453 | tool_version=args.tool_version, 454 | ) 455 | elif args.model == 'qwen': 456 | api_key = args.api_key or os.environ.get('DASHSCOPE_API_KEY') 457 | if not api_key: 458 | print( 459 | f'{Fore.RED}ERROR: DashScope API key not provided. Please set the DASHSCOPE_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 460 | ) 461 | return 462 | 463 | api_client = get_qwen_api_client( 464 | api_key=api_key, 465 | model=args.qwen_model, 466 | max_tokens=args.max_tokens, 467 | ) 468 | 469 | if args.num_workers > 1: 470 | print( 471 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Qwen. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 472 | ) 473 | elif args.model == 'openai': 474 | api_key = args.api_key or os.environ.get('OPENAI_API_KEY') 475 | if not api_key: 476 | print( 477 | f'{Fore.RED}ERROR: OpenAI API key not provided. Please set the OPENAI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 478 | ) 479 | return 480 | 481 | api_client = get_openai_api_client( 482 | api_key=api_key, 483 | model=args.openai_model, 484 | max_tokens=args.max_tokens, 485 | reasoning_effort=args.reasoning_effort, 486 | ) 487 | 488 | if args.num_workers > 1: 489 | print( 490 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OpenAI. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 491 | ) 492 | elif args.model == 'openai-cua': 493 | api_key = args.api_key or os.environ.get('OPENAI_API_KEY') 494 | if not api_key: 495 | print( 496 | f'{Fore.RED}ERROR: OpenAI API key not provided. Please set the OPENAI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 497 | ) 498 | return 499 | 500 | api_client = get_openai_cua_api_client( 501 | api_key=api_key, 502 | model=args.openai_cua_model, 503 | max_tokens=args.max_tokens, 504 | environment=args.environment, 505 | ) 506 | 507 | if args.num_workers > 1: 508 | print( 509 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OpenAI CUA. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 510 | ) 511 | elif args.model == 'gemini': 512 | api_key = args.api_key or os.environ.get('GEMINI_API_KEY') 513 | if not api_key: 514 | print( 515 | f'{Fore.RED}ERROR: Google API key not provided. Please set the GEMINI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 516 | ) 517 | return 518 | 519 | api_client = get_gemini_api_client( 520 | api_key=api_key, 521 | model=args.gemini_model, 522 | max_tokens=args.max_tokens, 523 | ) 524 | 525 | if args.num_workers > 1: 526 | print( 527 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Gemini. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 528 | ) 529 | elif args.model == 'molmo': 530 | if not args.api_url: 531 | print( 532 | f'{Fore.RED}ERROR: Molmo API URL not provided. 
Please provide it using --api-url.{Style.RESET_ALL}' 533 | ) 534 | return 535 | 536 | api_client = get_molmo_api_client( 537 | api_url=args.api_url, 538 | api_key=args.api_key, 539 | max_tokens=args.max_tokens, 540 | temperature=args.temperature, 541 | top_p=args.top_p, 542 | top_k=args.top_k, 543 | ) 544 | 545 | if args.num_workers > 1: 546 | print( 547 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Molmo. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 548 | ) 549 | elif args.model == 'ui-tars': 550 | if not args.api_url: 551 | print( 552 | f'{Fore.RED}ERROR: UI-TARS API URL not provided. Please provide it using --api-url.{Style.RESET_ALL}' 553 | ) 554 | return 555 | 556 | api_client = get_ui_tars_api_client( 557 | api_url=args.api_url, 558 | api_key=args.api_key or 'super-secret-key', 559 | max_tokens=args.max_tokens, 560 | temperature=args.temperature, 561 | frequency_penalty=args.frequency_penalty, 562 | model_name=args.ui_tars_model, 563 | ) 564 | 565 | if args.num_workers > 1: 566 | print( 567 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with UI-TARS. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 568 | ) 569 | elif args.model == 'omniparser': 570 | api_client = get_omniparser_api_client( 571 | api_endpoint=args.api_url or 'https://omniparser-api-omniparser-api.modal.run', 572 | model=args.omniparser_model, 573 | temperature=args.omniparser_temperature, 574 | ) 575 | 576 | if args.num_workers > 1: 577 | print( 578 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OmniParser. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 579 | ) 580 | else: 581 | api_key = args.api_key or os.environ.get('GENERALAGENTS_API_KEY') 582 | if not api_key: 583 | print( 584 | f'{Fore.RED}ERROR: General Agents API key not provided. 
Please set the GENERALAGENTS_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 585 | ) 586 | return 587 | 588 | api_client = ace.get_api_client(api_key, args.ace_model) 589 | 590 | try: 591 | results = evaluate_csv( 592 | csv_file, 593 | frames_dir, 594 | api_client, 595 | args.output_file, 596 | args.sample_size, 597 | args.num_workers, 598 | run_id, 599 | ) 600 | 601 | if results: 602 | print(f'\n{Fore.GREEN}Evaluation completed successfully!{Style.RESET_ALL}') 603 | print(f'Results: {len(results)} items processed') 604 | print(f'Results saved to: {args.output_file}') 605 | print(f'Visualizations saved to: {os.path.join(run_results_dir, "visualizations")}') 606 | 607 | results_analysis = analyze_results(results, run_id) 608 | 609 | metrics_dict = { 610 | 'run_id': run_id, 611 | 'model': args.model, 612 | 'ci': results_analysis.ci, 613 | 'accuracy': results_analysis.accuracy, 614 | 'accuracy_ci_low': results_analysis.accuracy_ci_low, 615 | 'accuracy_ci_high': results_analysis.accuracy_ci_high, 616 | 'total_processed': results_analysis.total_processed, 617 | } 618 | 619 | base_dir = os.path.dirname(os.path.abspath(__file__)) 620 | metrics_file = os.path.join(base_dir, 'results', 'all_metrics.csv') 621 | 622 | if os.path.exists(metrics_file): 623 | all_metrics_df = pd.read_csv(metrics_file) 624 | all_metrics_df = pd.concat( 625 | [all_metrics_df, pd.DataFrame([metrics_dict])], ignore_index=True 626 | ) 627 | else: 628 | all_metrics_df = pd.DataFrame([metrics_dict]) 629 | 630 | all_metrics_df.to_csv(metrics_file, index=False) 631 | print(f'\n{Fore.CYAN}Metrics saved to: {metrics_file}{Style.RESET_ALL}') 632 | 633 | else: 634 | print(f'\n{Fore.RED}Evaluation failed: No results returned.{Style.RESET_ALL}') 635 | 636 | except Exception as e: 637 | print(f'\n{Fore.RED}Evaluation failed with error: {e}{Style.RESET_ALL}') 638 | raise e 639 | 640 | 641 | if __name__ == '__main__': 642 | main() 643 | --------------------------------------------------------------------------------