├── clicks
│   ├── .gitignore
│   ├── src
│   │   └── clicks
│   │       ├── third_party
│   │       │   ├── ui_tars
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── molmo
│   │       │   │   ├── __init__.py
│   │       │   │   ├── client.py
│   │       │   │   └── integration.py
│   │       │   ├── qwen
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── gemini
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── openai
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── openai_cua
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── claude
│   │       │   │   ├── __init__.py
│   │       │   │   ├── integration.py
│   │       │   │   └── client.py
│   │       │   ├── omniparser
│   │       │   │   ├── __init__.py
│   │       │   │   ├── client.py
│   │       │   │   └── integration.py
│   │       │   ├── common.py
│   │       │   └── __init__.py
│   │       ├── evaluate
│   │       │   ├── __init__.py
│   │       │   ├── models.py
│   │       │   ├── ace.py
│   │       │   └── utils.py
│   │       └── api_client_base.py
│   ├── pyproject.toml
│   ├── LICENSE
│   ├── scripts
│   │   ├── collect_runs.py
│   │   └── calculate_latency.py
│   ├── README.md
│   └── eval.py
└── README.md
-------------------------------------------------------------------------------- /clicks/.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *results* 3 | *.csv 4 | *.json 5 | *.jsonl 6 | *.txt 7 | __pycache__ 8 | data/*
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | UITarsAPIClient, 3 | get_ui_tars_api_client, 4 | ) 5 | 6 | __all__ = ['UITarsAPIClient', 'get_ui_tars_api_client'] 7 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | MolmoAPIClient, 3 | get_molmo_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'MolmoAPIClient', 8 | 'get_molmo_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | QwenVLAPIClient, 3 | get_qwen_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'QwenVLAPIClient', 8 | 'get_qwen_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | GeminiAPIClient, 3 | get_gemini_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'GeminiAPIClient', 8 | 'get_gemini_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OpenAIAPIClient, 3 | get_openai_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OpenAIAPIClient', 8 | 'get_openai_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OpenAICUAAPIClient, 3 | get_openai_cua_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OpenAICUAAPIClient', 8 | 'get_openai_cua_api_client', 9 | ] 10 |
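Each provider package above follows the same convention: its `__init__.py` re-exports an `*APIClient` class and a `get_*_api_client` factory. As a hedged usage sketch (the sample item, frame path, and run id are hypothetical; the factory signature and environment-variable fallback are taken from `gemini/integration.py` further below):

```python
# Sketch only: select a backend by importing its factory. get_gemini_api_client
# falls back to the GEMINI_API_KEY environment variable when api_key is None.
from clicks.third_party.gemini import get_gemini_api_client

client = get_gemini_api_client(model='gemini-2.0-flash', temperature=0.0)

# process_single_item is the shared entry point defined by AbstractAPIClient;
# the item dict mirrors the EvaluationItem fields used throughout the repo.
item = {
    'id': 'example-0',            # hypothetical sample
    'recording_id': 'rec-0',
    'instruction': 'Click the Submit button',
    'image': 'frame_0001.jpg',    # must exist under frames_dir
    'x1': 100, 'y1': 200, 'x2': 180, 'y2': 240,
}
result = client.process_single_item(item, frames_dir='data/frames', run_id='demo-run')
print(result.is_in_bbox, result.latency_seconds)
```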
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | ClaudeComputerUseAPIClient, 3 | get_claude_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'ClaudeComputerUseAPIClient', 8 | 'get_claude_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import ( 2 | OmniParserAPIClient, 3 | get_omniparser_api_client, 4 | ) 5 | 6 | __all__ = [ 7 | 'OmniParserAPIClient', 8 | 'get_omniparser_api_client', 9 | ] 10 |
-------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .ace import AceAPIClient, get_api_client 2 | from .models import ( 3 | AcePrediction, 4 | Action, 5 | ClickAction, 6 | Coordinate, 7 | GroundTruth, 8 | ) 9 | 10 | __all__ = [ 11 | 'Action', 12 | 'ClickAction', 13 | 'AceAPIClient', 14 | 'AcePrediction', 15 | 'Coordinate', 16 | 'GroundTruth', 17 | 'get_api_client', 18 | ] 19 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # showdown 2 | 3 | General Agents 4 | 5 | `showdown` is a suite of offline and online benchmarks for computer-use agents. 6 | 7 | If you use `showdown` in your research, please cite it as follows: 8 | 9 | ```bibtex 10 | @misc{showdown2025, 11 | title={The Showdown Computer Control Evaluation Suite}, 12 | author={General Agents Team}, 13 | year={2025}, 14 | url={https://github.com/generalagents/showdown}, 15 | } 16 | ```
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/common.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | 4 | from PIL import Image 5 | 6 | 7 | def encode_image_to_base64_uri(image_path: str) -> str: 8 | try: 9 | with Image.open(image_path) as img: 10 | if img.mode != 'RGB': 11 | img = img.convert('RGB') 12 | 13 | buffer = BytesIO() 14 | img.save(buffer, format='JPEG', quality=95) 15 | buffer.seek(0) 16 | 17 | img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') 18 | 19 | return f'data:image/jpeg;base64,{img_base64}' 20 | except Exception as e: 21 | raise ValueError(f'Error encoding image to base64: {e}') 22 |
-------------------------------------------------------------------------------- /clicks/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "clicks" 3 | version = "0.1.0" 4 | description = "clicks evals" 5 | requires-python = "==3.12.*" 6 | dependencies = [ 7 | "requests>=2.31.0", 8 | "pillow>=10.0.0", 9 | "tenacity>=8.2.3", 10 | "colorama>=0.4.6", 11 | "tqdm>=4.66.1", 12 | "pandas>=2.0.0", 13 | "scipy>=1.11.1", 14 | "anthropic>=0.18.1", 15 | "openai>=1.66.0", 16 | "matplotlib>=3.7.0", 17 | "google-genai>=0.6.0", 18 | "generalagents>=0.1.0", 19 | ] 20 | 21 | [tool.hatch.build.targets.wheel] 22 | packages = ["src/clicks"] 23 | 24 | [build-system] 25 | requires = ["hatchling"] 26 | build-backend = "hatchling.build" 27 |
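For a quick illustration of the helper in `common.py` above, which normalizes every frame to an RGB JPEG data URI before it is sent to a provider client (the frame path below is hypothetical):

```python
# Sketch only: encode a local screenshot and verify the data-URI prefix
# produced by clicks.third_party.common.encode_image_to_base64_uri.
from clicks.third_party.common import encode_image_to_base64_uri

data_uri = encode_image_to_base64_uri('data/frames/frame_0001.jpg')  # hypothetical path
assert data_uri.startswith('data:image/jpeg;base64,')
```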
-------------------------------------------------------------------------------- /clicks/src/clicks/api_client_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict 3 | 4 | from clicks.evaluate.models import EvaluationResult, ParsedPrediction 5 | 6 | 7 | class AbstractAPIClient(ABC): 8 | client_type: str 9 | model_name: str = '' 10 | 11 | @abstractmethod 12 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 13 | pass 14 | 15 | @abstractmethod 16 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 17 | pass 18 | 19 | @abstractmethod 20 | def process_single_item( 21 | self, item: Dict[str, Any], frames_dir: str, run_id: str 22 | ) -> EvaluationResult: 23 | pass 24 | -------------------------------------------------------------------------------- /clicks/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 General Agents 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
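For orientation, `api_client_base.py` above defines the interface that every integration under `third_party/` implements. A minimal, hypothetical subclass (not part of the repository, shown only to illustrate the contract) might look like this:

```python
import time
from typing import Any, Dict

from clicks.api_client_base import AbstractAPIClient
from clicks.evaluate.models import EvaluationItem, EvaluationResult, ParsedPrediction


class CenterClickClient(AbstractAPIClient):
    """Toy baseline that always predicts a fixed click position."""

    client_type = 'center_baseline'   # illustrative values, not used elsewhere in the repo
    model_name = 'center-baseline'

    def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None:
        # A real client would call a model API here.
        return {'pred_x': 640, 'pred_y': 400}

    def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction:
        return ParsedPrediction(pred_x=prediction.get('pred_x'), pred_y=prediction.get('pred_y'))

    def process_single_item(self, item: Dict[str, Any], frames_dir: str, run_id: str) -> EvaluationResult:
        eval_item = EvaluationItem.model_validate(item)
        start = time.time()
        parsed = self.parse_prediction(self.predict('', eval_item.instruction) or {})
        return EvaluationResult(
            id=eval_item.id,
            recording_id=eval_item.recording_id,
            instruction=eval_item.instruction,
            image_path=f'{frames_dir}/{eval_item.image}',
            gt_x1=eval_item.x1, gt_y1=eval_item.y1, gt_x2=eval_item.x2, gt_y2=eval_item.y2,
            pred_x=parsed.pred_x,
            pred_y=parsed.pred_y,
            is_in_bbox=None,  # real integrations use clicks.evaluate.utils.check_prediction_in_bbox
            latency_seconds=time.time() - start,
        )
```

The real integrations that follow add the same extra steps around this skeleton: base64-encoding the frame, timing the API call, checking the prediction against the ground-truth box, and writing a visualization.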
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | from .claude import ( 2 | ClaudeComputerUseAPIClient, 3 | get_claude_api_client, 4 | ) 5 | from .gemini import ( 6 | GeminiAPIClient, 7 | get_gemini_api_client, 8 | ) 9 | from .molmo import ( 10 | MolmoAPIClient, 11 | get_molmo_api_client, 12 | ) 13 | from .omniparser import ( 14 | OmniParserAPIClient, 15 | get_omniparser_api_client, 16 | ) 17 | from .openai import ( 18 | OpenAIAPIClient, 19 | get_openai_api_client, 20 | ) 21 | from .openai_cua import ( 22 | OpenAICUAAPIClient, 23 | get_openai_cua_api_client, 24 | ) 25 | from .qwen import ( 26 | QwenVLAPIClient, 27 | get_qwen_api_client, 28 | ) 29 | from .ui_tars import ( 30 | UITarsAPIClient, 31 | get_ui_tars_api_client, 32 | ) 33 | 34 | __all__ = [ 35 | # Claude 36 | 'ClaudeComputerUseAPIClient', 37 | 'get_claude_api_client', 38 | # Qwen 39 | 'QwenVLAPIClient', 40 | 'get_qwen_api_client', 41 | # OpenAI 42 | 'OpenAIAPIClient', 43 | 'get_openai_api_client', 44 | # OpenAI CUA 45 | 'OpenAICUAAPIClient', 46 | 'get_openai_cua_api_client', 47 | # Gemini 48 | 'GeminiAPIClient', 49 | 'get_gemini_api_client', 50 | # Molmo 51 | 'MolmoAPIClient', 52 | 'get_molmo_api_client', 53 | # UI-TARS 54 | 'UITarsAPIClient', 55 | 'get_ui_tars_api_client', 56 | # OmniParser 57 | 'OmniParserAPIClient', 58 | 'get_omniparser_api_client', 59 | ] 60 | -------------------------------------------------------------------------------- /clicks/scripts/collect_runs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import pandas as pd 5 | from clicks.evaluate.utils import analyze_results 6 | 7 | 8 | def summarize_all_results(): 9 | base_dir = os.path.dirname(os.path.abspath(__file__)) 10 | results_dir = os.path.join(base_dir, '..', 'results', 'showdown-clicks-dev') 11 | 12 | output_file = os.path.join(results_dir, '..', 'report', 'metrics.csv') 13 | 14 | run_dirs = [d for d in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, d))] 15 | 16 | all_metrics = [] 17 | 18 | print(f'Found {len(run_dirs)} result directories to process.') 19 | 20 | result_files = glob.glob(os.path.join(results_dir, '*.csv')) 21 | 22 | for result_file in result_files: 23 | try: 24 | print(f'Processing {result_file}...') 25 | 26 | # Extract model name from filename 27 | filename = os.path.basename(result_file) 28 | 29 | # Skip if this is a summary file 30 | if filename.startswith('summary_') or 'all_metrics' in filename: 31 | continue 32 | 33 | # Parse model name from filename 34 | name = filename.split('.csv')[0] 35 | 36 | # Read the results 37 | results_df = pd.read_csv(result_file) 38 | 39 | # Convert DataFrame to list of dictionaries for analysis 40 | # Ensure all keys are strings to match analyze_results parameter type 41 | results = [{str(k): v for k, v in item.items()} for item in results_df.to_dict('records')] 42 | 43 | if not results: 44 | print(f'No results found in {result_file}, skipping.') 45 | continue 46 | 47 | # Apply the same analysis logic 48 | results_analysis = analyze_results(results) 49 | 50 | # Create metrics dictionary 51 | metrics_dict = { 52 | 'model': name, 53 | 'ci': results_analysis.ci, 54 | 'accuracy': results_analysis.accuracy, 55 | 'total_correct': results_analysis.total_correct, 56 | 'total_processed': results_analysis.total_processed, 57 | 'accuracy_ci_low': 
results_analysis.accuracy_ci_low, 58 | 'accuracy_ci_high': results_analysis.accuracy_ci_high, 59 | 'result_file': result_file, 60 | } 61 | 62 | all_metrics.append(metrics_dict) 63 | print(f'Added metrics for {result_file}') 64 | 65 | except Exception as e: 66 | print(f'Error processing {result_file}: {e}') 67 | 68 | if all_metrics: 69 | # Create DataFrame and save to CSV 70 | all_metrics_df = pd.DataFrame(all_metrics) 71 | all_metrics_df.to_csv(output_file, index=False) 72 | print(f'All metrics saved to: {output_file}') 73 | else: 74 | print('No metrics were generated.') 75 | 76 | 77 | if __name__ == '__main__': 78 | summarize_all_results() 79 | -------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Coordinate(BaseModel): 7 | x: Optional[int] = None 8 | y: Optional[int] = None 9 | 10 | 11 | class ClickAction(BaseModel): 12 | kind: str = 'left_click' 13 | coordinate: Coordinate 14 | 15 | 16 | class Action(BaseModel): 17 | kind: str 18 | coordinate: Optional[Coordinate] = None 19 | text: Optional[str] = None 20 | 21 | 22 | class AcePrediction(BaseModel): 23 | action: Action 24 | raw_response: Optional[str] = None 25 | 26 | 27 | class ParsedPrediction(BaseModel): 28 | pred_x: Optional[int] = None 29 | pred_y: Optional[int] = None 30 | raw_response: Optional[str] = None 31 | 32 | 33 | class GroundTruth(BaseModel): 34 | gt_x1: Optional[int] = None 35 | gt_y1: Optional[int] = None 36 | gt_x2: Optional[int] = None 37 | gt_y2: Optional[int] = None 38 | 39 | 40 | class EvaluationMetrics(BaseModel): 41 | total_processed: int 42 | total_correct: int 43 | accuracy: float 44 | ci: float 45 | accuracy_ci_low: Optional[float] = None 46 | accuracy_ci_high: Optional[float] = None 47 | 48 | 49 | class EvaluationItem(BaseModel): 50 | id: str 51 | recording_id: str 52 | instruction: str 53 | image: str 54 | x1: Optional[int] = None 55 | y1: Optional[int] = None 56 | x2: Optional[int] = None 57 | y2: Optional[int] = None 58 | width: Optional[int] = None 59 | height: Optional[int] = None 60 | 61 | 62 | class EvaluationResult(BaseModel): 63 | id: str 64 | recording_id: str 65 | instruction: str 66 | image_path: str 67 | gt_x1: Optional[int] = None 68 | gt_y1: Optional[int] = None 69 | gt_x2: Optional[int] = None 70 | gt_y2: Optional[int] = None 71 | pred_x: Optional[int] = None 72 | pred_y: Optional[int] = None 73 | is_in_bbox: Optional[bool] = None 74 | latency_seconds: float 75 | visualization_path: Optional[str] = None 76 | raw_response: Optional[str] = None 77 | 78 | 79 | class ModelConfig(BaseModel): 80 | api_endpoint: str = '' 81 | 82 | 83 | class AceModelConfig(ModelConfig): 84 | model: str = 'ace-control-medium' 85 | api_key: Optional[str] = None 86 | 87 | 88 | class ClaudeModelConfig(ModelConfig): 89 | model: str = 'claude-3-7-sonnet-20250219' 90 | api_key: Optional[str] = None 91 | thinking_budget: Optional[int] = 1024 92 | tool_version: str = '20250124' 93 | 94 | 95 | class QwenModelConfig(ModelConfig): 96 | model: str = 'qwen2.5-vl-72b-instruct' 97 | api_key: Optional[str] = None 98 | max_tokens: int = 4096 99 | use_smart_resize: bool = True 100 | resize_factor: int = 28 101 | min_pixels: int = 3136 102 | max_pixels: int = 12845056 103 | 104 | 105 | class OpenAIModelConfig(ModelConfig): 106 | model: str = 'o1' 107 | api_key: Optional[str] = None 108 | max_tokens: int = 
4096 109 | reasoning_effort: str = 'medium' 110 | environment: str = 'mac' 111 | 112 | 113 | class GeminiModelConfig(ModelConfig): 114 | model: str = 'gemini-2.0-flash' 115 | api_key: Optional[str] = None 116 | max_tokens: int = 4096 117 | temperature: float = 0.0 118 | 119 | 120 | class OmniParserModelConfig(ModelConfig): 121 | model: str = 'gpt-4o-2024-05-13' 122 | temperature: float = 0.7 123 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | from typing import Any, Dict 5 | 6 | import requests 7 | from PIL import Image 8 | from tenacity import retry, stop_after_attempt, wait_exponential 9 | 10 | OMNIPARSER_API_ENDPOINT = 'https://omniparser-api-omniparser-api.modal.run' 11 | DEFAULT_MODEL = 'gpt-4o-2024-05-13' 12 | 13 | 14 | class OmniParserClient: 15 | def __init__( 16 | self, 17 | api_endpoint: str = OMNIPARSER_API_ENDPOINT, 18 | model: str = DEFAULT_MODEL, 19 | temperature: float = 0.7, 20 | ): 21 | self.api_endpoint = api_endpoint 22 | self.model = model 23 | self.temperature = temperature 24 | 25 | def _convert_to_base64(self, image_path: str) -> str: 26 | """Convert image file to base64 string.""" 27 | try: 28 | with open(image_path, 'rb') as image_file: 29 | return base64.b64encode(image_file.read()).decode('utf-8') 30 | except Exception as e: 31 | print(f'Error converting image to base64: {e}') 32 | raise 33 | 34 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 35 | try: 36 | image_data = base64.b64decode(base64_data) 37 | image = Image.open(io.BytesIO(image_data)) 38 | width, height = image.size 39 | return width, height 40 | except Exception as e: 41 | print(f'Error extracting image dimensions: {e}') 42 | return 1024, 768 43 | 44 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 45 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 46 | try: 47 | if ',' in image_data_uri: 48 | base64_data = image_data_uri.split(',')[1] 49 | else: 50 | base64_data = image_data_uri 51 | 52 | width, height = self._extract_image_dimensions(base64_data) 53 | 54 | payload = { 55 | 'image': base64_data, 56 | 'instruction': prompt, 57 | 'model_name': self.model, 58 | 'temperature': self.temperature, 59 | 'width': width, 60 | 'height': height, 61 | } 62 | 63 | response = requests.post( 64 | f'{self.api_endpoint}/computer_use', 65 | json=payload, 66 | headers={'Content-Type': 'application/json'}, 67 | ) 68 | 69 | response.raise_for_status() 70 | result = response.json() 71 | result['model'] = self.model 72 | result['width'] = width 73 | result['height'] = height 74 | 75 | return result 76 | 77 | except Exception as e: 78 | print(f'Error making prediction: {e}') 79 | return {'error': str(e)} 80 | 81 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 82 | if not prediction or 'error' in prediction: 83 | return { 84 | 'pred_x': None, 85 | 'pred_y': None, 86 | 'raw_response': json.dumps(prediction) if prediction else None, 87 | } 88 | 89 | if 'point' in prediction and prediction['point'] and len(prediction['point']) == 2: 90 | pred_x, pred_y = prediction['point'] 91 | pred_x = int(pred_x * prediction['width']) 92 | pred_y = int(pred_y * prediction['height']) 93 | else: 94 | pred_x, pred_y = None, None 95 | 96 | return { 97 | 'pred_x': pred_x, 98 | 'pred_y': pred_y, 99 | 
'raw_response': json.dumps(prediction, default=str), 100 | } 101 | -------------------------------------------------------------------------------- /clicks/scripts/calculate_latency.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from scipy import stats 9 | 10 | 11 | @dataclass 12 | class LatencyMetrics: 13 | mean_latency: float 14 | ci: float 15 | latency_ci_low: Optional[float] 16 | latency_ci_high: Optional[float] 17 | 18 | 19 | def calculate_latency_metrics(latencies: List[float], ci: float = 0.95) -> LatencyMetrics: 20 | if not latencies: 21 | return LatencyMetrics( 22 | mean_latency=0, 23 | ci=ci, 24 | latency_ci_low=None, 25 | latency_ci_high=None, 26 | ) 27 | 28 | latencies = sorted(latencies) 29 | # Drop the last ten latency values to remove potential outliers or anomalies 30 | latencies = latencies[:-10] 31 | 32 | mean_latency = np.mean(latencies) 33 | 34 | def calculate_mean(data): 35 | return np.mean(data) 36 | 37 | latency_ci = None 38 | try: 39 | latencies_array = np.array(latencies) 40 | latency_bootstrap = stats.bootstrap( 41 | (latencies_array,), 42 | calculate_mean, 43 | confidence_level=ci, 44 | method='percentile', 45 | ) 46 | latency_ci = latency_bootstrap.confidence_interval 47 | except Exception as e: 48 | print(f'Error calculating latency confidence interval: {e}') 49 | 50 | return LatencyMetrics( 51 | mean_latency=float(mean_latency), 52 | ci=float(ci), 53 | latency_ci_low=float(latency_ci.low) if latency_ci else None, 54 | latency_ci_high=float(latency_ci.high) if latency_ci else None, 55 | ) 56 | 57 | 58 | def read_and_analyze_results(directory: str = '.') -> Dict[str, LatencyMetrics]: 59 | csv_files = glob.glob(os.path.join(directory, '*.csv')) 60 | 61 | if not csv_files: 62 | print(f'No CSV files found in {directory}') 63 | return {} 64 | 65 | results = {} 66 | 67 | for csv_file in csv_files: 68 | model_name = os.path.basename(csv_file).replace('.csv', '') 69 | try: 70 | df = pd.read_csv(csv_file) 71 | 72 | if 'latency_seconds' in df.columns: 73 | latencies = df['latency_seconds'].dropna().tolist() 74 | metrics = calculate_latency_metrics(latencies) 75 | results[model_name] = metrics 76 | else: 77 | print(f'No latency column found in {csv_file}') 78 | except Exception as e: 79 | print(f'Error processing {csv_file}: {e}') 80 | 81 | return results 82 | 83 | 84 | def print_latency_summary(results: Dict[str, LatencyMetrics]) -> None: 85 | if not results: 86 | print('No results to display') 87 | return 88 | 89 | data = [] 90 | for model_name, metrics in results.items(): 91 | ci_str = ( 92 | f'[{metrics.latency_ci_low:.2f}, {metrics.latency_ci_high:.2f}]' 93 | if metrics.latency_ci_low is not None 94 | else 'N/A' 95 | ) 96 | data.append( 97 | { 98 | 'Model': model_name, 99 | 'Mean (s)': f'{metrics.mean_latency:.2f}', 100 | '95% CI': ci_str, 101 | } 102 | ) 103 | 104 | df = pd.DataFrame(data) 105 | print('\nLatency Summary:') 106 | print('-' * 80) 107 | print(df.to_string(index=False)) 108 | # calculated from a separate script 109 | print('operator | 3.88178 | [3.77679, 3.98342]') 110 | 111 | 112 | current_dir = os.path.join( 113 | os.path.dirname(os.path.abspath(__file__)), '..', 'results', 'showdown-clicks-dev' 114 | ) 115 | 116 | results = read_and_analyze_results(current_dir) 117 | 118 | print_latency_summary(results) 119 | 120 | # Save results to CSV 121 | 
output_data = [] 122 | for model_name, metrics in results.items(): 123 | output_data.append( 124 | { 125 | 'model': model_name, 126 | 'mean_latency': metrics.mean_latency, 127 | 'ci_low': metrics.latency_ci_low, 128 | 'ci_high': metrics.latency_ci_high, 129 | } 130 | ) 131 | 132 | output_dir = os.path.join(current_dir, '..', 'report') 133 | os.makedirs(output_dir, exist_ok=True) 134 | output_df = pd.DataFrame(output_data) 135 | output_path = os.path.join(output_dir, 'latency_results.csv') 136 | output_df.to_csv(output_path, index=False) 137 | print(f'\nResults saved to: {output_path}') 138 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import re 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | 13 | class MolmoClient: 14 | def __init__( 15 | self, 16 | api_url: str, 17 | api_key: Optional[str] = None, 18 | max_tokens: int = 4096, 19 | temperature: float = 0.0, 20 | top_p: float = 0.9, 21 | top_k: int = 50, 22 | ): 23 | self.api_url = api_url.rstrip('/') 24 | self.api_key = api_key 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | self.top_p = top_p 28 | self.top_k = top_k 29 | 30 | def _encode_image(self, image_data_uri: str) -> str: 31 | if ',' in image_data_uri: 32 | base64_data = image_data_uri.split(',')[1] 33 | else: 34 | base64_data = image_data_uri 35 | return base64_data 36 | 37 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 38 | try: 39 | image_data = base64.b64decode(base64_data) 40 | image = Image.open(io.BytesIO(image_data)) 41 | width, height = image.size 42 | return width, height 43 | except Exception as e: 44 | print(f'Error extracting image dimensions: {e}') 45 | return 1024, 768 46 | 47 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 48 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 49 | base64_image = self._encode_image(image_data_uri) 50 | 51 | prompt_template = ( 52 | f"""Point at the UI element to click to achieve the following action: {prompt}""" 53 | ) 54 | 55 | width, height = self._extract_image_dimensions(base64_image) 56 | request_data = { 57 | 'images': [base64_image], 58 | 'text': prompt_template, 59 | 'max_new_tokens': self.max_tokens, 60 | 'temperature': self.temperature, 61 | 'top_p': self.top_p, 62 | 'top_k': self.top_k, 63 | } 64 | 65 | headers = {'Content-Type': 'application/json'} 66 | if self.api_key: 67 | headers['Authorization'] = f'Bearer {self.api_key}' 68 | 69 | try: 70 | start_time = time.time() 71 | 72 | response = requests.post( 73 | f'{self.api_url}/generate', 74 | json=request_data, 75 | headers=headers, 76 | timeout=3600, 77 | ) 78 | 79 | end_time = time.time() 80 | latency = end_time - start_time 81 | 82 | if response.status_code == 200: 83 | result = response.json() 84 | return { 85 | 'raw_response': json.dumps(result), 86 | 'content': result.get('generated_text', ''), 87 | 'latency_seconds': latency, 88 | 'width': width, 89 | 'height': height, 90 | } 91 | else: 92 | error_text = response.text 93 | try: 94 | error_json = response.json() 95 | error_text = json.dumps(error_json) 96 | except: 97 | pass 98 | 99 | return { 100 | 'error': f'HTTP Error {response.status_code}', 
101 | 'error_details': error_text, 102 | 'latency_seconds': latency, 103 | } 104 | 105 | except Exception as e: 106 | return { 107 | 'error': f'API Error: {str(e)}', 108 | 'latency_seconds': 0, 109 | } 110 | 111 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 112 | if 'error' in prediction: 113 | return { 114 | 'pred_x': None, 115 | 'pred_y': None, 116 | 'error': prediction.get('error'), 117 | 'error_details': prediction.get('error_details', ''), 118 | 'raw_responses': prediction.get('raw_response', '{}'), 119 | } 120 | 121 | content = prediction.get('content', '') 122 | # Molmo answers with a point tag whose x/y are percentages of the image size. 123 | point_match = re.search(r'<point x="([\d.]+)" y="([\d.]+)"', content) 124 | 125 | pred_x, pred_y = None, None 126 | width = prediction.get('width', 0) 127 | height = prediction.get('height', 0) 128 | 129 | if point_match: 130 | rel_x = float(point_match.group(1)) 131 | rel_y = float(point_match.group(2)) 132 | 133 | # Convert percentage coordinates to pixel coordinates. 134 | if width > 0 and height > 0: 135 | pred_x = int(rel_x * width / 100) 136 | pred_y = int(rel_y * height / 100) 137 | 138 | return { 139 | 'pred_x': pred_x, 140 | 'pred_y': pred_y, 141 | 'content': content, 142 | 'raw_responses': prediction.get('raw_response', '{}'), 143 | } 144 |
-------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/ace.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | AceModelConfig, 8 | AcePrediction, 9 | EvaluationItem, 10 | EvaluationResult, 11 | ParsedPrediction, 12 | ) 13 | from clicks.evaluate.utils import ( 14 | check_prediction_in_bbox, 15 | print_colored_result, 16 | visualize_prediction, 17 | ) 18 | from generalagents import Agent 19 | from PIL import Image 20 | 21 | 22 | class AceAPIClient(AbstractAPIClient): 23 | def __init__(self, config: AceModelConfig): 24 | self.config = config 25 | self.client_type = 'ace' 26 | self.agent = Agent(model=self.config.model, api_key=self.config.api_key or '') 27 | 28 | def predict( 29 | self, 30 | image_data_uri: str, 31 | prompt: str, 32 | model: Optional[str] = None, 33 | ) -> Dict[str, Any]: 34 | raise NotImplementedError('Ace API client does not support predict method') 35 | 36 | def predict_ace( 37 | self, 38 | image: Image.Image, 39 | prompt: str, 40 | ) -> ParsedPrediction: 41 | session = self.agent.start(prompt) 42 | action = session.plan(image) 43 | 44 | if action.kind == 'left_click': 45 | pred_x = action.coordinate.x 46 | pred_y = action.coordinate.y 47 | else: 48 | pred_x, pred_y = None, None 49 | 50 | return ParsedPrediction( 51 | pred_x=pred_x, 52 | pred_y=pred_y, 53 | raw_response=str(action), 54 | ) 55 | 56 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 57 | ace_prediction = AcePrediction.model_validate(prediction) 58 | if ace_prediction.action.kind == 'left_click': 59 | if ace_prediction.action.coordinate: 60 | pred_x = ace_prediction.action.coordinate.x 61 | pred_y = ace_prediction.action.coordinate.y 62 | else: 63 | pred_x, pred_y = None, None 64 | else: 65 | pred_x, pred_y = None, None 66 | 67 | return ParsedPrediction( 68 | pred_x=pred_x, 69 | pred_y=pred_y, 70 | raw_response=ace_prediction.raw_response, 71 | ) 72 | 73 | def process_single_item( 74 | self, 75 | item: Dict[str, Any], 76 | frames_dir: str, 77 | run_id: str, 78 | ) -> EvaluationResult: 79 | eval_item = item if isinstance(item, EvaluationItem) else EvaluationItem.model_validate(item) 80 | 81 | image_path = eval_item.image 82 | local_path = os.path.join(frames_dir, image_path) 83 | if not os.path.exists(local_path): 84 | raise FileNotFoundError(f'Local image not found at {local_path}') 85 | image = Image.open(local_path) 86 | 87 | try: 88 |
start_time = time.time() 89 | prediction = self.predict_ace(image, eval_item.instruction) 90 | latency = time.time() - start_time 91 | 92 | pred_x = prediction.pred_x 93 | pred_y = prediction.pred_y 94 | 95 | gt_x1 = eval_item.x1 96 | gt_y1 = eval_item.y1 97 | gt_x2 = eval_item.x2 98 | gt_y2 = eval_item.y2 99 | 100 | is_in_bbox = check_prediction_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 101 | 102 | visualization_path = visualize_prediction( 103 | local_path, 104 | pred_x, 105 | pred_y, 106 | eval_item.id, 107 | eval_item.recording_id, 108 | eval_item.instruction, 109 | self.config.model, 110 | run_id, 111 | gt_x1, 112 | gt_y1, 113 | gt_x2, 114 | gt_y2, 115 | is_in_bbox, 116 | ) 117 | 118 | print_colored_result( 119 | eval_item.id, 120 | eval_item.instruction, 121 | pred_x, 122 | pred_y, 123 | latency, 124 | is_in_bbox, 125 | ) 126 | 127 | result = EvaluationResult( 128 | id=eval_item.id, 129 | recording_id=eval_item.recording_id, 130 | instruction=eval_item.instruction, 131 | image_path=local_path, 132 | gt_x1=gt_x1, 133 | gt_y1=gt_y1, 134 | gt_x2=gt_x2, 135 | gt_y2=gt_y2, 136 | pred_x=pred_x, 137 | pred_y=pred_y, 138 | is_in_bbox=is_in_bbox, 139 | latency_seconds=latency, 140 | raw_response=prediction.raw_response, 141 | visualization_path=visualization_path, 142 | ) 143 | 144 | return result 145 | 146 | except Exception as e: 147 | print(f'API request failed for {eval_item.id}: {str(e)}') 148 | raise e 149 | 150 | 151 | def get_api_client(api_key: Optional[str] = None, model: Optional[str] = None) -> AceAPIClient: 152 | api_key = api_key or os.environ.get('GENERALAGENTS_API_KEY', '') 153 | config = AceModelConfig(api_key=api_key, model=model or 'ace-control-small') 154 | return AceAPIClient(config=config) 155 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/molmo/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | ) 11 | from clicks.evaluate.utils import ( 12 | check_prediction_in_bbox, 13 | print_colored_result, 14 | visualize_prediction, 15 | ) 16 | 17 | from ..common import encode_image_to_base64_uri 18 | from .client import MolmoClient 19 | 20 | 21 | class MolmoAPIClient(AbstractAPIClient): 22 | def __init__( 23 | self, 24 | api_url: str, 25 | api_key: Optional[str] = None, 26 | max_tokens: int = 4096, 27 | temperature: float = 0.0, 28 | top_p: float = 0.9, 29 | top_k: int = 50, 30 | ): 31 | self.client = MolmoClient( 32 | api_url=api_url, 33 | api_key=api_key, 34 | max_tokens=max_tokens, 35 | temperature=temperature, 36 | top_p=top_p, 37 | top_k=top_k, 38 | ) 39 | self.client_type = 'molmo' 40 | 41 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 42 | return self.client.predict(image_data_uri=image_data_uri, prompt=prompt) 43 | 44 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 45 | parsed = self.client.parse_prediction(prediction) 46 | 47 | return ParsedPrediction( 48 | pred_x=parsed.get('pred_x'), 49 | pred_y=parsed.get('pred_y'), 50 | raw_response=parsed.get('raw_responses', '{}'), 51 | ) 52 | 53 | def process_single_item( 54 | self, 55 | item: Dict[str, Any], 56 | frames_dir: str, 57 | run_id: str, 58 | ) -> EvaluationResult: 59 | eval_item = EvaluationItem( 60 | 
id=item['id'], 61 | recording_id=item['recording_id'], 62 | instruction=item['instruction'], 63 | image=item['image'], 64 | x1=item['x1'], 65 | y1=item['y1'], 66 | x2=item['x2'], 67 | y2=item['y2'], 68 | ) 69 | 70 | image_path = eval_item.image 71 | 72 | local_path = os.path.join(frames_dir, image_path) 73 | if not os.path.exists(local_path): 74 | raise FileNotFoundError(f'Local image not found at {local_path}') 75 | 76 | image_data_uri = encode_image_to_base64_uri(local_path) 77 | if image_data_uri is None: 78 | raise ValueError(f'Failed to encode image at {local_path}') 79 | 80 | try: 81 | start_time = time.time() 82 | prediction = self.predict(image_data_uri, eval_item.instruction) 83 | latency = time.time() - start_time 84 | 85 | pred_result = self.parse_prediction(prediction) 86 | pred_x = pred_result.pred_x 87 | pred_y = pred_result.pred_y 88 | 89 | is_in_bbox = check_prediction_in_bbox( 90 | pred_x, pred_y, eval_item.x1, eval_item.y1, eval_item.x2, eval_item.y2 91 | ) 92 | 93 | print_colored_result( 94 | item_id=eval_item.id, 95 | instruction=eval_item.instruction, 96 | pred_x=pred_x, 97 | pred_y=pred_y, 98 | latency=latency, 99 | is_in_bbox=is_in_bbox, 100 | ) 101 | 102 | visualization_path = visualize_prediction( 103 | image_path=local_path, 104 | pred_x=pred_x, 105 | pred_y=pred_y, 106 | item_id=eval_item.id, 107 | recording_id=eval_item.recording_id, 108 | instruction=eval_item.instruction, 109 | model_name='molmo', 110 | run_id=run_id, 111 | gt_x1=eval_item.x1, 112 | gt_y1=eval_item.y1, 113 | gt_x2=eval_item.x2, 114 | gt_y2=eval_item.y2, 115 | is_in_bbox=is_in_bbox, 116 | ) 117 | 118 | result = EvaluationResult( 119 | id=eval_item.id, 120 | recording_id=eval_item.recording_id, 121 | instruction=eval_item.instruction, 122 | image_path=local_path, 123 | gt_x1=eval_item.x1, 124 | gt_y1=eval_item.y1, 125 | gt_x2=eval_item.x2, 126 | gt_y2=eval_item.y2, 127 | pred_x=pred_x, 128 | pred_y=pred_y, 129 | is_in_bbox=is_in_bbox, 130 | latency_seconds=latency, 131 | raw_response=prediction.get('raw_response', '{}'), 132 | visualization_path=visualization_path, 133 | ) 134 | 135 | return result 136 | 137 | except Exception as e: 138 | print(f'Error processing item {eval_item.id}: {str(e)}') 139 | raise e 140 | 141 | 142 | def get_molmo_api_client( 143 | api_url: str, 144 | api_key: Optional[str] = None, 145 | max_tokens: int = 4096, 146 | temperature: float = 0.0, 147 | top_p: float = 0.9, 148 | top_k: int = 50, 149 | ) -> MolmoAPIClient: 150 | return MolmoAPIClient( 151 | api_url=api_url, 152 | api_key=api_key, 153 | max_tokens=max_tokens, 154 | temperature=temperature, 155 | top_p=top_p, 156 | top_k=top_k, 157 | ) 158 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | ) 11 | from clicks.evaluate.utils import ( 12 | check_prediction_in_bbox, 13 | print_colored_result, 14 | visualize_prediction, 15 | ) 16 | 17 | from ..common import encode_image_to_base64_uri 18 | from .client import UITarsClient 19 | 20 | 21 | class UITarsAPIClient(AbstractAPIClient): 22 | def __init__( 23 | self, 24 | api_url: str, 25 | api_key: str = 'super-secret-key', 26 | max_tokens: int = 128, 27 | temperature: float = 0.0, 
28 | frequency_penalty: float = 1.0, 29 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 30 | ): 31 | self.client = UITarsClient( 32 | api_url=api_url, 33 | api_key=api_key, 34 | max_tokens=max_tokens, 35 | temperature=temperature, 36 | frequency_penalty=frequency_penalty, 37 | model_name=model_name, 38 | ) 39 | self.model_name = model_name 40 | self.client_type = 'ui_tars' 41 | 42 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 43 | return self.client.predict(image_data_uri=image_data_uri, prompt=prompt) 44 | 45 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 46 | parsed_data = self.client.parse_prediction(prediction) 47 | 48 | return ParsedPrediction( 49 | pred_x=parsed_data.get('pred_x'), 50 | pred_y=parsed_data.get('pred_y'), 51 | raw_response=parsed_data.get('raw_responses', '{}'), 52 | ) 53 | 54 | def process_single_item( 55 | self, 56 | item: Dict[str, Any], 57 | frames_dir: str, 58 | run_id: str, 59 | ) -> EvaluationResult: 60 | eval_item = EvaluationItem( 61 | id=item['id'], 62 | recording_id=item['recording_id'], 63 | instruction=item['instruction'], 64 | image=item['image'], 65 | x1=item['x1'], 66 | y1=item['y1'], 67 | x2=item['x2'], 68 | y2=item['y2'], 69 | ) 70 | 71 | image_path = os.path.join(frames_dir, eval_item.image) 72 | 73 | image_data_uri = encode_image_to_base64_uri(image_path) 74 | if not image_data_uri: 75 | raise ValueError(f'Failed to encode image: {image_path}') 76 | 77 | print(f'Processing item {eval_item.id}: {eval_item.instruction}') 78 | start_time = time.time() 79 | prediction = self.predict(image_data_uri=image_data_uri, prompt=eval_item.instruction) 80 | end_time = time.time() 81 | latency = end_time - start_time 82 | 83 | parsed_prediction = self.parse_prediction(prediction) 84 | 85 | error = prediction.get('error') 86 | if error: 87 | print(f'Error: {error}') 88 | raise ValueError(error) 89 | 90 | is_in_bbox = check_prediction_in_bbox( 91 | pred_x=parsed_prediction.pred_x, 92 | pred_y=parsed_prediction.pred_y, 93 | gt_x1=eval_item.x1, 94 | gt_y1=eval_item.y1, 95 | gt_x2=eval_item.x2, 96 | gt_y2=eval_item.y2, 97 | ) 98 | 99 | print_colored_result( 100 | item_id=eval_item.id, 101 | instruction=eval_item.instruction, 102 | pred_x=parsed_prediction.pred_x, 103 | pred_y=parsed_prediction.pred_y, 104 | latency=latency, 105 | is_in_bbox=is_in_bbox, 106 | ) 107 | 108 | visualization_path = visualize_prediction( 109 | image_path=image_path, 110 | pred_x=parsed_prediction.pred_x, 111 | pred_y=parsed_prediction.pred_y, 112 | item_id=eval_item.id, 113 | recording_id=eval_item.recording_id, 114 | instruction=eval_item.instruction, 115 | model_name=self.model_name, 116 | run_id=run_id, 117 | gt_x1=eval_item.x1, 118 | gt_y1=eval_item.y1, 119 | gt_x2=eval_item.x2, 120 | gt_y2=eval_item.y2, 121 | is_in_bbox=is_in_bbox, 122 | ) 123 | 124 | return EvaluationResult( 125 | id=eval_item.id, 126 | recording_id=eval_item.recording_id, 127 | instruction=eval_item.instruction, 128 | image_path=image_path, 129 | gt_x1=eval_item.x1, 130 | gt_y1=eval_item.y1, 131 | gt_x2=eval_item.x2, 132 | gt_y2=eval_item.y2, 133 | pred_x=parsed_prediction.pred_x, 134 | pred_y=parsed_prediction.pred_y, 135 | is_in_bbox=is_in_bbox, 136 | latency_seconds=latency, 137 | raw_response=parsed_prediction.raw_response, 138 | visualization_path=visualization_path, 139 | ) 140 | 141 | 142 | def get_ui_tars_api_client( 143 | api_url: str, 144 | api_key: str = 'super-secret-key', 145 | max_tokens: int = 128, 146 | temperature: float = 0.0, 147 | 
frequency_penalty: float = 1.0, 148 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 149 | **kwargs, 150 | ) -> UITarsAPIClient: 151 | client = UITarsAPIClient( 152 | api_url=api_url, 153 | api_key=api_key, 154 | max_tokens=max_tokens, 155 | temperature=temperature, 156 | frequency_penalty=frequency_penalty, 157 | model_name=model_name, 158 | ) 159 | return client 160 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | GeminiModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import GeminiClient as GeminiBaseClient 21 | 22 | 23 | class GeminiAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[GeminiModelConfig] = None, 27 | ): 28 | self.config = config or GeminiModelConfig() 29 | 30 | self.api_key = self.config.api_key 31 | self.model = self.config.model 32 | self.max_tokens = self.config.max_tokens 33 | self.temperature = self.config.temperature 34 | self.client_type = 'gemini' 35 | 36 | def predict( 37 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 38 | ) -> Dict[str, Any] | None: 39 | client = GeminiBaseClient( 40 | api_key=self.api_key, 41 | model=model or self.model, 42 | max_tokens=self.max_tokens, 43 | temperature=self.temperature, 44 | ) 45 | 46 | return client.predict(image_data_uri, prompt) 47 | 48 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 49 | client = GeminiBaseClient( 50 | api_key=self.api_key, 51 | model=self.model, 52 | max_tokens=self.max_tokens, 53 | temperature=self.temperature, 54 | ) 55 | 56 | parsed = client.parse_prediction(prediction) 57 | 58 | return ParsedPrediction( 59 | pred_x=parsed['pred_x'], 60 | pred_y=parsed['pred_y'], 61 | raw_response=parsed['raw_response'], 62 | ) 63 | 64 | def process_single_item( 65 | self, 66 | item: Dict[str, Any], 67 | frames_dir: str, 68 | run_id: str, 69 | ) -> EvaluationResult: 70 | eval_item = EvaluationItem( 71 | id=item['id'], 72 | recording_id=item['recording_id'], 73 | instruction=item['instruction'], 74 | image=item['image'], 75 | x1=item['x1'], 76 | y1=item['y1'], 77 | x2=item['x2'], 78 | y2=item['y2'], 79 | ) 80 | 81 | image_path = os.path.join(frames_dir, eval_item.image) 82 | 83 | if not os.path.exists(image_path): 84 | raise FileNotFoundError(f'Image file not found: {image_path}') 85 | 86 | image_data_uri = encode_image_to_base64_uri(image_path) 87 | start_time = time.time() 88 | prediction = self.predict(image_data_uri, eval_item.instruction) 89 | end_time = time.time() 90 | latency = end_time - start_time 91 | 92 | if prediction: 93 | parsed_prediction = self.parse_prediction(prediction) 94 | 95 | is_in_bbox = check_prediction_in_bbox( 96 | pred_x=parsed_prediction.pred_x, 97 | pred_y=parsed_prediction.pred_y, 98 | gt_x1=eval_item.x1, 99 | gt_y1=eval_item.y1, 100 | gt_x2=eval_item.x2, 101 | gt_y2=eval_item.y2, 102 | ) 103 | 104 | print_colored_result( 105 | item_id=eval_item.id, 106 | 
instruction=eval_item.instruction, 107 | pred_x=parsed_prediction.pred_x, 108 | pred_y=parsed_prediction.pred_y, 109 | latency=latency, 110 | is_in_bbox=is_in_bbox, 111 | ) 112 | 113 | visualization_path = visualize_prediction( 114 | image_path=image_path, 115 | pred_x=parsed_prediction.pred_x, 116 | pred_y=parsed_prediction.pred_y, 117 | item_id=eval_item.id, 118 | recording_id=eval_item.recording_id, 119 | instruction=eval_item.instruction, 120 | model_name='gemini', 121 | run_id=run_id, 122 | gt_x1=eval_item.x1, 123 | gt_y1=eval_item.y1, 124 | gt_x2=eval_item.x2, 125 | gt_y2=eval_item.y2, 126 | is_in_bbox=is_in_bbox, 127 | ) 128 | 129 | result = EvaluationResult( 130 | id=eval_item.id, 131 | recording_id=eval_item.recording_id, 132 | instruction=eval_item.instruction, 133 | image_path=image_path, 134 | gt_x1=eval_item.x1, 135 | gt_y1=eval_item.y1, 136 | gt_x2=eval_item.x2, 137 | gt_y2=eval_item.y2, 138 | pred_x=parsed_prediction.pred_x, 139 | pred_y=parsed_prediction.pred_y, 140 | is_in_bbox=is_in_bbox, 141 | latency_seconds=latency, 142 | raw_response=parsed_prediction.raw_response, 143 | visualization_path=visualization_path, 144 | ) 145 | 146 | return result 147 | else: 148 | raise ValueError('Prediction is None') 149 | 150 | 151 | def get_gemini_api_client( 152 | api_key: Optional[str] = None, 153 | model: str = DEFAULT_MODEL, 154 | max_tokens: int = 4096, 155 | temperature: float = 0.0, 156 | ) -> GeminiAPIClient: 157 | config = GeminiModelConfig( 158 | model=model, 159 | api_key=api_key or os.environ.get('GEMINI_API_KEY'), 160 | max_tokens=max_tokens, 161 | temperature=temperature, 162 | ) 163 | 164 | return GeminiAPIClient(config=config) 165 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | ClaudeModelConfig, 8 | EvaluationItem, 9 | EvaluationResult, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL, ClaudeComputerUseClient 20 | 21 | 22 | class ClaudeComputerUseAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[ClaudeModelConfig] = None, 26 | ): 27 | if config is None: 28 | config = ClaudeModelConfig( 29 | api_endpoint='https://api.anthropic.com/v1/messages', 30 | model=DEFAULT_MODEL, 31 | api_key=os.environ.get('ANTHROPIC_API_KEY'), 32 | thinking_budget=1024, 33 | tool_version='20250124', 34 | ) 35 | 36 | self.claude_client = ClaudeComputerUseClient( 37 | api_key=config.api_key, 38 | api_endpoint=config.api_endpoint, 39 | model=config.model, 40 | thinking_budget=config.thinking_budget, 41 | tool_version=config.tool_version, 42 | ) 43 | self.config = config 44 | self.client_type = 'claude' 45 | 46 | def predict( 47 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 48 | ) -> Dict[str, Any] | None: 49 | return self.claude_client.predict(image_data_uri, prompt) 50 | 51 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 52 | result_dict = self.claude_client.parse_prediction(prediction) 53 | 54 | return ParsedPrediction( 55 | pred_x=result_dict.get('pred_x'), 56 
| pred_y=result_dict.get('pred_y'), 57 | raw_response=result_dict.get('raw_response'), 58 | ) 59 | 60 | def process_single_item( 61 | self, 62 | item: Dict[str, Any], 63 | frames_dir: str, 64 | run_id: str, 65 | ) -> EvaluationResult: 66 | try: 67 | eval_item = item if isinstance(item, EvaluationItem) else EvaluationItem.model_validate(item) 68 | 69 | item_id = eval_item.id 70 | recording_id = eval_item.recording_id 71 | instruction = eval_item.instruction 72 | image_path = eval_item.image 73 | 74 | gt_x1 = eval_item.x1 75 | gt_y1 = eval_item.y1 76 | gt_x2 = eval_item.x2 77 | gt_y2 = eval_item.y2 78 | 79 | local_path = os.path.join(frames_dir, image_path) 80 | if not os.path.exists(local_path): 81 | raise FileNotFoundError(f'Image file not found: {local_path}') 82 | 83 | image_data_uri = encode_image_to_base64_uri(local_path) 84 | print(f'Processing item {item_id} with instruction: {instruction}') 85 | 86 | start_time = time.time() 87 | prediction = self.predict(image_data_uri, instruction) 88 | end_time = time.time() 89 | latency = end_time - start_time 90 | 91 | if prediction is None: 92 | print(f'Claude returned None for item {item_id}') 93 | pred_result = ParsedPrediction() 94 | pred_x = None 95 | pred_y = None 96 | else: 97 | pred_result = self.parse_prediction(prediction) 98 | pred_x = pred_result.pred_x 99 | pred_y = pred_result.pred_y 100 | 101 | is_in_bbox = check_prediction_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 102 | 103 | print_colored_result( 104 | item_id, 105 | instruction, 106 | pred_x, 107 | pred_y, 108 | latency, 109 | is_in_bbox, 110 | ) 111 | 112 | visualization_path = visualize_prediction( 113 | local_path, 114 | pred_x, 115 | pred_y, 116 | item_id, 117 | recording_id, 118 | instruction, 119 | self.config.model, 120 | run_id, 121 | gt_x1, 122 | gt_y1, 123 | gt_x2, 124 | gt_y2, 125 | is_in_bbox, 126 | ) 127 | 128 | result = EvaluationResult( 129 | id=item_id, 130 | recording_id=recording_id, 131 | instruction=instruction, 132 | image_path=local_path, 133 | gt_x1=gt_x1, 134 | gt_y1=gt_y1, 135 | gt_x2=gt_x2, 136 | gt_y2=gt_y2, 137 | pred_x=pred_x, 138 | pred_y=pred_y, 139 | is_in_bbox=is_in_bbox, 140 | latency_seconds=latency, 141 | raw_response=pred_result.raw_response if pred_result else None, 142 | visualization_path=visualization_path, 143 | ) 144 | 145 | return result 146 | 147 | except Exception as e: 148 | print(f'API request failed: {str(e)}') 149 | raise e 150 | 151 | 152 | def get_claude_api_client( 153 | api_key: Optional[str] = None, 154 | api_endpoint: Optional[str] = None, 155 | model: str = DEFAULT_MODEL, 156 | thinking_budget: Optional[int] = 1024, 157 | tool_version: str = '20250124', 158 | ) -> ClaudeComputerUseAPIClient: 159 | config = ClaudeModelConfig( 160 | api_endpoint=api_endpoint or 'https://api.anthropic.com/v1/messages', 161 | model=model, 162 | api_key=api_key or os.environ.get('ANTHROPIC_API_KEY'), 163 | thinking_budget=thinking_budget, 164 | tool_version=tool_version, 165 | ) 166 | return ClaudeComputerUseAPIClient(config) 167 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/omniparser/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OmniParserModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from 
clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import OmniParserClient as OmniParserBaseClient 21 | 22 | 23 | class OmniParserAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[OmniParserModelConfig] = None, 27 | ): 28 | self.config = config or OmniParserModelConfig( 29 | api_endpoint=os.environ.get( 30 | 'OMNIPARSER_API_ENDPOINT', 'https://omniparser-api-omniparser-api.modal.run' 31 | ) 32 | ) 33 | 34 | self.api_endpoint = self.config.api_endpoint 35 | self.model = self.config.model 36 | self.temperature = self.config.temperature 37 | self.client_type = 'omniparser' 38 | 39 | def predict( 40 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 41 | ) -> Dict[str, Any] | None: 42 | client = OmniParserBaseClient( 43 | api_endpoint=self.api_endpoint, 44 | model=model or self.model, 45 | temperature=self.temperature, 46 | ) 47 | 48 | return client.predict(image_data_uri, prompt) 49 | 50 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 51 | client = OmniParserBaseClient( 52 | api_endpoint=self.api_endpoint, 53 | model=self.model, 54 | temperature=self.temperature, 55 | ) 56 | 57 | parsed = client.parse_prediction(prediction) 58 | 59 | return ParsedPrediction( 60 | pred_x=parsed['pred_x'], 61 | pred_y=parsed['pred_y'], 62 | raw_response=parsed['raw_response'], 63 | ) 64 | 65 | def process_single_item( 66 | self, 67 | item: Dict[str, Any], 68 | frames_dir: str, 69 | run_id: str, 70 | ) -> EvaluationResult: 71 | eval_item = EvaluationItem( 72 | id=item['id'], 73 | recording_id=item['recording_id'], 74 | instruction=item['instruction'], 75 | image=item['image'], 76 | x1=item['x1'], 77 | y1=item['y1'], 78 | x2=item['x2'], 79 | y2=item['y2'], 80 | ) 81 | 82 | image_path = os.path.join(frames_dir, eval_item.image) 83 | 84 | if not os.path.exists(image_path): 85 | raise FileNotFoundError(f'Image file not found at {image_path}') 86 | 87 | image_data_uri = encode_image_to_base64_uri(image_path) 88 | if image_data_uri is None: 89 | raise ValueError(f'Failed to encode image at {image_path}') 90 | 91 | start_time = time.time() 92 | prediction = self.predict(image_data_uri, eval_item.instruction) 93 | end_time = time.time() 94 | latency = end_time - start_time 95 | 96 | if prediction: 97 | parsed_prediction = self.parse_prediction(prediction) 98 | 99 | is_in_bbox = check_prediction_in_bbox( 100 | pred_x=parsed_prediction.pred_x, 101 | pred_y=parsed_prediction.pred_y, 102 | gt_x1=eval_item.x1, 103 | gt_y1=eval_item.y1, 104 | gt_x2=eval_item.x2, 105 | gt_y2=eval_item.y2, 106 | ) 107 | 108 | print_colored_result( 109 | item_id=eval_item.id, 110 | instruction=eval_item.instruction, 111 | pred_x=parsed_prediction.pred_x, 112 | pred_y=parsed_prediction.pred_y, 113 | latency=latency, 114 | is_in_bbox=is_in_bbox, 115 | ) 116 | 117 | visualization_path = visualize_prediction( 118 | image_path=image_path, 119 | pred_x=parsed_prediction.pred_x, 120 | pred_y=parsed_prediction.pred_y, 121 | item_id=eval_item.id, 122 | recording_id=eval_item.recording_id, 123 | instruction=eval_item.instruction, 124 | model_name='omniparser', 125 | run_id=run_id, 126 | gt_x1=eval_item.x1, 127 | gt_y1=eval_item.y1, 128 | gt_x2=eval_item.x2, 129 | gt_y2=eval_item.y2, 130 | is_in_bbox=is_in_bbox, 131 | ) 132 | 133 | result = EvaluationResult( 134 | id=eval_item.id, 135 | 
recording_id=eval_item.recording_id, 136 | instruction=eval_item.instruction, 137 | image_path=image_path, 138 | gt_x1=eval_item.x1, 139 | gt_y1=eval_item.y1, 140 | gt_x2=eval_item.x2, 141 | gt_y2=eval_item.y2, 142 | pred_x=parsed_prediction.pred_x, 143 | pred_y=parsed_prediction.pred_y, 144 | is_in_bbox=is_in_bbox, 145 | latency_seconds=latency, 146 | raw_response=parsed_prediction.raw_response, 147 | visualization_path=visualization_path, 148 | ) 149 | 150 | return result 151 | else: 152 | raise ValueError('Prediction is None') 153 | 154 | 155 | def get_omniparser_api_client( 156 | api_endpoint: Optional[str] = None, 157 | model: str = DEFAULT_MODEL, 158 | temperature: float = 0.7, 159 | ) -> OmniParserAPIClient: 160 | config = OmniParserModelConfig( 161 | api_endpoint=api_endpoint 162 | or os.environ.get('OMNIPARSER_API_ENDPOINT', 'https://omniparser-api-omniparser-api.modal.run'), 163 | model=model, 164 | temperature=temperature, 165 | ) 166 | 167 | return OmniParserAPIClient(config=config) 168 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OpenAIModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_ENVIRONMENT, DEFAULT_MODEL, OpenAICUAClient 20 | 21 | 22 | class OpenAICUAAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[OpenAIModelConfig] = None, 26 | ): 27 | self.config = config or OpenAIModelConfig( 28 | api_endpoint=os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1') 29 | ) 30 | 31 | self.api_key = self.config.api_key 32 | self.api_endpoint = self.config.api_endpoint 33 | self.model = self.config.model 34 | self.max_tokens = self.config.max_tokens 35 | self.environment = getattr(self.config, 'environment', DEFAULT_ENVIRONMENT) 36 | self.client_type = 'openai_cua' 37 | 38 | def predict( 39 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 40 | ) -> Dict[str, Any] | None: 41 | client = OpenAICUAClient( 42 | api_key=self.api_key, 43 | api_endpoint=self.api_endpoint, 44 | model=model or self.model, 45 | max_tokens=self.max_tokens, 46 | environment=self.environment, 47 | ) 48 | 49 | return client.predict(image_data_uri, prompt) 50 | 51 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 52 | client = OpenAICUAClient( 53 | api_key=self.api_key, 54 | api_endpoint=self.api_endpoint, 55 | model=self.model, 56 | max_tokens=self.max_tokens, 57 | environment=self.environment, 58 | ) 59 | 60 | parsed = client.parse_prediction(prediction) 61 | 62 | return ParsedPrediction( 63 | pred_x=parsed['pred_x'], 64 | pred_y=parsed['pred_y'], 65 | raw_response=parsed['raw_response'], 66 | ) 67 | 68 | def process_single_item( 69 | self, 70 | item: Dict[str, Any], 71 | frames_dir: str, 72 | run_id: str, 73 | ) -> EvaluationResult: 74 | eval_item = EvaluationItem( 75 | id=item['id'], 76 | recording_id=item['recording_id'], 77 | instruction=item['instruction'], 78 | image=item['image'], 79 | x1=item['x1'], 80 | 
y1=item['y1'], 81 | x2=item['x2'], 82 | y2=item['y2'], 83 | ) 84 | 85 | image_path = os.path.join(frames_dir, eval_item.image) 86 | 87 | if not os.path.exists(image_path): 88 | raise FileNotFoundError(f'Image file not found at {image_path}') 89 | 90 | image_data_uri = encode_image_to_base64_uri(image_path) 91 | if image_data_uri is None: 92 | raise ValueError(f'Failed to encode image at {image_path}') 93 | 94 | start_time = time.time() 95 | prediction = self.predict(image_data_uri, eval_item.instruction) 96 | end_time = time.time() 97 | latency = end_time - start_time 98 | 99 | if prediction: 100 | parsed_prediction = self.parse_prediction(prediction) 101 | 102 | is_in_bbox = check_prediction_in_bbox( 103 | pred_x=parsed_prediction.pred_x, 104 | pred_y=parsed_prediction.pred_y, 105 | gt_x1=eval_item.x1, 106 | gt_y1=eval_item.y1, 107 | gt_x2=eval_item.x2, 108 | gt_y2=eval_item.y2, 109 | ) 110 | 111 | print_colored_result( 112 | item_id=eval_item.id, 113 | instruction=eval_item.instruction, 114 | pred_x=parsed_prediction.pred_x, 115 | pred_y=parsed_prediction.pred_y, 116 | latency=latency, 117 | is_in_bbox=is_in_bbox, 118 | ) 119 | 120 | visualization_path = visualize_prediction( 121 | image_path=image_path, 122 | pred_x=parsed_prediction.pred_x, 123 | pred_y=parsed_prediction.pred_y, 124 | item_id=eval_item.id, 125 | recording_id=eval_item.recording_id, 126 | instruction=eval_item.instruction, 127 | model_name='openai_cua', 128 | run_id=run_id, 129 | gt_x1=eval_item.x1, 130 | gt_y1=eval_item.y1, 131 | gt_x2=eval_item.x2, 132 | gt_y2=eval_item.y2, 133 | is_in_bbox=is_in_bbox, 134 | ) 135 | 136 | result = EvaluationResult( 137 | id=eval_item.id, 138 | recording_id=eval_item.recording_id, 139 | instruction=eval_item.instruction, 140 | image_path=image_path, 141 | gt_x1=eval_item.x1, 142 | gt_y1=eval_item.y1, 143 | gt_x2=eval_item.x2, 144 | gt_y2=eval_item.y2, 145 | pred_x=parsed_prediction.pred_x, 146 | pred_y=parsed_prediction.pred_y, 147 | is_in_bbox=is_in_bbox, 148 | latency_seconds=latency, 149 | raw_response=parsed_prediction.raw_response, 150 | visualization_path=visualization_path, 151 | ) 152 | 153 | return result 154 | else: 155 | raise ValueError('Prediction is None') 156 | 157 | 158 | def get_openai_cua_api_client( 159 | api_key: Optional[str] = None, 160 | api_endpoint: Optional[str] = None, 161 | model: str = DEFAULT_MODEL, 162 | max_tokens: int = 4096, 163 | environment: str = DEFAULT_ENVIRONMENT, 164 | ) -> OpenAICUAAPIClient: 165 | # Create a custom config that includes CUA-specific parameters 166 | config = OpenAIModelConfig( 167 | api_endpoint=api_endpoint or os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1'), 168 | model=model, 169 | api_key=api_key, 170 | max_tokens=max_tokens, 171 | ) 172 | 173 | # Add CUA-specific attributes 174 | setattr(config, 'environment', environment) 175 | 176 | return OpenAICUAAPIClient(config=config) 177 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | ParsedPrediction, 10 | QwenModelConfig, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from 
..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL, QwenVLClient 20 | 21 | 22 | class QwenVLAPIClient(AbstractAPIClient): 23 | def __init__( 24 | self, 25 | config: Optional[QwenModelConfig] = None, 26 | ): 27 | if config is None: 28 | config = QwenModelConfig( 29 | api_endpoint='https://dashscope.aliyuncs.com/compatible-mode/v1', 30 | model=DEFAULT_MODEL, 31 | api_key=os.environ.get('DASHSCOPE_API_KEY'), 32 | max_tokens=4096, 33 | use_smart_resize=True, 34 | resize_factor=28, 35 | min_pixels=3136, 36 | max_pixels=12845056, 37 | ) 38 | 39 | self.qwen_client = QwenVLClient( 40 | api_key=config.api_key, 41 | api_endpoint=config.api_endpoint, 42 | model=config.model, 43 | max_tokens=config.max_tokens, 44 | use_smart_resize=config.use_smart_resize, 45 | resize_factor=config.resize_factor, 46 | min_pixels=config.min_pixels, 47 | max_pixels=config.max_pixels, 48 | ) 49 | self.config = config 50 | self.client_type = 'qwen' 51 | 52 | def predict( 53 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 54 | ) -> Dict[str, Any] | None: 55 | return self.qwen_client.predict(image_data_uri, prompt) 56 | 57 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 58 | result_dict = self.qwen_client.parse_prediction(prediction) 59 | 60 | return ParsedPrediction( 61 | pred_x=result_dict.get('pred_x'), 62 | pred_y=result_dict.get('pred_y'), 63 | raw_response=result_dict.get('raw_response'), 64 | ) 65 | 66 | def process_single_item( 67 | self, 68 | item: Dict[str, Any], 69 | frames_dir: str, 70 | run_id: str, 71 | ) -> EvaluationResult: 72 | try: 73 | eval_item = EvaluationItem( 74 | id=item['id'], 75 | recording_id=item['recording_id'], 76 | instruction=item['instruction'], 77 | image=item['image'], 78 | x1=item['x1'], 79 | y1=item['y1'], 80 | x2=item['x2'], 81 | y2=item['y2'], 82 | ) 83 | 84 | item_id = eval_item.id 85 | recording_id = eval_item.recording_id 86 | instruction = eval_item.instruction 87 | image_path = eval_item.image 88 | 89 | local_path = os.path.join(frames_dir, image_path) 90 | if not os.path.exists(local_path): 91 | raise FileNotFoundError(f'Image file not found: {local_path}') 92 | 93 | image_data_uri = encode_image_to_base64_uri(local_path) 94 | if image_data_uri is None: 95 | raise ValueError(f'Failed to encode image: {local_path}') 96 | 97 | print(f'Processing item {item_id} with instruction: {instruction}') 98 | start_time = time.time() 99 | prediction = self.predict(image_data_uri, instruction) 100 | latency = time.time() - start_time 101 | 102 | if prediction is None: 103 | print(f'Qwen returned None for item {item_id}') 104 | pred_result = ParsedPrediction() 105 | pred_x = None 106 | pred_y = None 107 | else: 108 | pred_result = self.parse_prediction(prediction) 109 | pred_x = pred_result.pred_x 110 | pred_y = pred_result.pred_y 111 | 112 | is_in_bbox = check_prediction_in_bbox( 113 | pred_x, pred_y, eval_item.x1, eval_item.y1, eval_item.x2, eval_item.y2 114 | ) 115 | 116 | print_colored_result( 117 | item_id, 118 | instruction, 119 | pred_x, 120 | pred_y, 121 | latency, 122 | is_in_bbox, 123 | ) 124 | 125 | visualization_path = visualize_prediction( 126 | local_path, 127 | pred_x, 128 | pred_y, 129 | item_id, 130 | recording_id, 131 | instruction, 132 | self.config.model, 133 | run_id, 134 | eval_item.x1, 135 | eval_item.y1, 136 | eval_item.x2, 137 | eval_item.y2, 138 | is_in_bbox, 139 | ) 140 | 141 | result = EvaluationResult( 142 | id=item_id, 143 | recording_id=recording_id, 144 | instruction=instruction, 
145 | image_path=local_path, 146 | gt_x1=eval_item.x1, 147 | gt_y1=eval_item.y1, 148 | gt_x2=eval_item.x2, 149 | gt_y2=eval_item.y2, 150 | pred_x=pred_x, 151 | pred_y=pred_y, 152 | is_in_bbox=is_in_bbox, 153 | latency_seconds=latency, 154 | raw_response=pred_result.raw_response, 155 | visualization_path=visualization_path, 156 | ) 157 | 158 | return result 159 | 160 | except Exception as e: 161 | print(f'Error processing item: {e}') 162 | raise e 163 | 164 | 165 | def get_qwen_api_client( 166 | api_key: Optional[str] = None, 167 | api_endpoint: Optional[str] = None, 168 | model: str = DEFAULT_MODEL, 169 | max_tokens: int = 4096, 170 | use_smart_resize: bool = True, 171 | resize_factor: int = 28, 172 | min_pixels: int = 3136, 173 | max_pixels: int = 12845056, 174 | ) -> QwenVLAPIClient: 175 | config = QwenModelConfig( 176 | api_endpoint=api_endpoint or 'https://dashscope.aliyuncs.com/compatible-mode/v1', 177 | model=model, 178 | api_key=api_key or os.environ.get('DASHSCOPE_API_KEY'), 179 | max_tokens=max_tokens, 180 | use_smart_resize=use_smart_resize, 181 | resize_factor=resize_factor, 182 | min_pixels=min_pixels, 183 | max_pixels=max_pixels, 184 | ) 185 | return QwenVLAPIClient(config) 186 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | from clicks.api_client_base import AbstractAPIClient 6 | from clicks.evaluate.models import ( 7 | EvaluationItem, 8 | EvaluationResult, 9 | OpenAIModelConfig, 10 | ParsedPrediction, 11 | ) 12 | from clicks.evaluate.utils import ( 13 | check_prediction_in_bbox, 14 | print_colored_result, 15 | visualize_prediction, 16 | ) 17 | 18 | from ..common import encode_image_to_base64_uri 19 | from .client import DEFAULT_MODEL 20 | from .client import OpenAIClient as OpenAIBaseClient 21 | 22 | 23 | class OpenAIAPIClient(AbstractAPIClient): 24 | def __init__( 25 | self, 26 | config: Optional[OpenAIModelConfig] = None, 27 | ): 28 | self.config = config or OpenAIModelConfig( 29 | api_endpoint=os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1') 30 | ) 31 | 32 | self.api_key = self.config.api_key 33 | self.api_endpoint = self.config.api_endpoint 34 | self.model = self.config.model 35 | self.max_tokens = self.config.max_tokens 36 | self.reasoning_effort = self.config.reasoning_effort 37 | self.client_type = 'openai' 38 | 39 | def predict( 40 | self, image_data_uri: str, prompt: str, model: Optional[str] = None 41 | ) -> Dict[str, Any] | None: 42 | client = OpenAIBaseClient( 43 | api_key=self.api_key, 44 | api_endpoint=self.api_endpoint, 45 | model=model or self.model, 46 | max_tokens=self.max_tokens, 47 | reasoning_effort=self.reasoning_effort, 48 | ) 49 | 50 | return client.predict(image_data_uri, prompt) 51 | 52 | def parse_prediction(self, prediction: Dict[str, Any]) -> ParsedPrediction: 53 | client = OpenAIBaseClient( 54 | api_key=self.api_key, 55 | api_endpoint=self.api_endpoint, 56 | model=self.model, 57 | max_tokens=self.max_tokens, 58 | reasoning_effort=self.reasoning_effort, 59 | ) 60 | 61 | parsed = client.parse_prediction(prediction) 62 | 63 | return ParsedPrediction( 64 | pred_x=parsed['pred_x'], 65 | pred_y=parsed['pred_y'], 66 | raw_response=parsed['raw_response'], 67 | ) 68 | 69 | def process_single_item( 70 | self, 71 | item: Dict[str, Any], 72 | frames_dir: str, 73 | run_id: str, 74 | ) -> 
EvaluationResult: 75 | try: 76 | eval_item = EvaluationItem( 77 | id=item['id'], 78 | recording_id=item['recording_id'], 79 | instruction=item['instruction'], 80 | image=item['image'], 81 | x1=item['x1'], 82 | y1=item['y1'], 83 | x2=item['x2'], 84 | y2=item['y2'], 85 | ) 86 | 87 | item_id = eval_item.id 88 | recording_id = eval_item.recording_id 89 | instruction = eval_item.instruction 90 | image_path = eval_item.image 91 | 92 | local_path = os.path.join(frames_dir, image_path) 93 | if not os.path.exists(local_path): 94 | raise FileNotFoundError(f'Image file not found at {local_path}') 95 | 96 | image_data_uri = encode_image_to_base64_uri(local_path) 97 | if image_data_uri is None: 98 | raise ValueError(f'Failed to encode image at {local_path}') 99 | 100 | start_time = time.time() 101 | prediction = self.predict(image_data_uri, instruction) 102 | end_time = time.time() 103 | latency = end_time - start_time 104 | 105 | if not prediction: 106 | raise ValueError('Prediction is None') 107 | 108 | parsed_prediction = self.parse_prediction(prediction) 109 | 110 | is_in_bbox = check_prediction_in_bbox( 111 | pred_x=parsed_prediction.pred_x, 112 | pred_y=parsed_prediction.pred_y, 113 | gt_x1=eval_item.x1, 114 | gt_y1=eval_item.y1, 115 | gt_x2=eval_item.x2, 116 | gt_y2=eval_item.y2, 117 | ) 118 | 119 | visualization_path = visualize_prediction( 120 | image_path=local_path, 121 | pred_x=parsed_prediction.pred_x, 122 | pred_y=parsed_prediction.pred_y, 123 | item_id=item_id, 124 | recording_id=recording_id, 125 | instruction=instruction, 126 | model_name=self.config.model, 127 | run_id=run_id, 128 | gt_x1=eval_item.x1, 129 | gt_y1=eval_item.y1, 130 | gt_x2=eval_item.x2, 131 | gt_y2=eval_item.y2, 132 | is_in_bbox=is_in_bbox, 133 | ) 134 | 135 | print_colored_result( 136 | item_id=item_id, 137 | instruction=instruction, 138 | pred_x=parsed_prediction.pred_x, 139 | pred_y=parsed_prediction.pred_y, 140 | latency=latency, 141 | is_in_bbox=is_in_bbox, 142 | ) 143 | 144 | return EvaluationResult( 145 | id=item_id, 146 | recording_id=recording_id, 147 | instruction=instruction, 148 | image_path=local_path, 149 | gt_x1=eval_item.x1, 150 | gt_y1=eval_item.y1, 151 | gt_x2=eval_item.x2, 152 | gt_y2=eval_item.y2, 153 | pred_x=parsed_prediction.pred_x, 154 | pred_y=parsed_prediction.pred_y, 155 | is_in_bbox=is_in_bbox, 156 | latency_seconds=latency, 157 | raw_response=parsed_prediction.raw_response, 158 | visualization_path=visualization_path, 159 | ) 160 | except Exception as e: 161 | print(f'API request failed: {str(e)}') 162 | raise e 163 | 164 | 165 | def get_openai_api_client( 166 | api_key: Optional[str] = None, 167 | api_endpoint: Optional[str] = None, 168 | model: str = DEFAULT_MODEL, 169 | max_tokens: int = 4096, 170 | reasoning_effort: str = 'medium', 171 | ) -> OpenAIAPIClient: 172 | config = OpenAIModelConfig( 173 | api_endpoint=api_endpoint or os.environ.get('OPENAI_API_ENDPOINT', 'https://api.openai.com/v1'), 174 | model=model, 175 | api_key=api_key, 176 | max_tokens=max_tokens, 177 | reasoning_effort=reasoning_effort, 178 | ) 179 | 180 | return OpenAIAPIClient(config=config) 181 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/gemini/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | from google import genai 9 | from google.genai.types import GenerateContentConfig, 
SafetySetting 10 | from PIL import Image 11 | from pydantic import BaseModel 12 | from tenacity import retry, stop_after_attempt, wait_exponential 13 | 14 | DEFAULT_MODEL = 'gemini-2.0-flash' 15 | 16 | 17 | class Point(BaseModel): 18 | point: list[int] 19 | label: str 20 | 21 | 22 | class GeminiClient: 23 | def __init__( 24 | self, 25 | api_key: Optional[str] = None, 26 | model: str = DEFAULT_MODEL, 27 | max_tokens: int = 4096, 28 | temperature: float = 0.0, 29 | ): 30 | self.api_key = api_key or os.environ.get('GEMINI_API_KEY') 31 | if not self.api_key: 32 | raise ValueError( 33 | 'API key must be provided either as an argument or through the GEMINI_API_KEY environment variable' 34 | ) 35 | 36 | self.model = model 37 | self.max_tokens = max_tokens 38 | self.temperature = temperature 39 | 40 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 41 | try: 42 | image_data = base64.b64decode(base64_data) 43 | image = Image.open(io.BytesIO(image_data)) 44 | width, height = image.size 45 | return width, height 46 | except Exception as e: 47 | print(f'Error extracting image dimensions: {e}') 48 | return 1024, 768 49 | 50 | def _encode_image(self, image_path: str) -> str: 51 | try: 52 | with open(image_path, 'rb') as image_file: 53 | return base64.b64encode(image_file.read()).decode('utf-8') 54 | except Exception as e: 55 | print(f'Error encoding image: {e}') 56 | return '' 57 | 58 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 59 | def predict(self, image_data_uri: str, prompt: str, **kwargs) -> Dict[str, Any]: 60 | try: 61 | if ',' in image_data_uri: 62 | base64_data = image_data_uri.split(',')[1] 63 | mime_type = image_data_uri.split(',')[0] 64 | else: 65 | base64_data = image_data_uri 66 | mime_type = 'data:image/jpeg;base64' 67 | 68 | image_data = base64.b64decode(base64_data) 69 | image_pil = Image.open(io.BytesIO(image_data)) 70 | width, height = image_pil.size 71 | 72 | start_time = time.time() 73 | 74 | client = genai.Client(api_key=self.api_key) 75 | 76 | config_localize = GenerateContentConfig( 77 | temperature=0.0, 78 | ) 79 | 80 | prompt_localize = f"""You are an AI assistant that helps users with their tasks. 81 | You are given an image and a task. Your job is to return a description of the UI element that should be clicked on to advance or complete the task. 82 | There will always be an UI element that can be clicked to advance or complete the task. Do not question this. 83 | The description should not be a question, or an action. For example, "the "-" button in the Target Membership section." is good, but "click the "-" button in the Target Membership section." is bad. 84 | You MUST remember that you are not describing the actions you will take, but the UI element that should be clicked. You should just describe the UI element. 85 | The task is: `{prompt}`. 86 | Return nothing else but the singular description.""" 87 | 88 | result_localize = client.models.generate_content( 89 | model=self.model, 90 | contents=[ 91 | image_pil, 92 | prompt_localize, 93 | ], 94 | config=config_localize, 95 | ) 96 | 97 | print(f'Localize: {result_localize.text}') 98 | 99 | prompt = ( 100 | """Point to the UI element matching the description: `""" 101 | + (result_localize.text or prompt) 102 | + """`, with no more than 1 item. The answer should follow the json format: [{'point': , "label": }, ...]. 
The points are in [y, x] format normalized to 0-1000.""" 103 | ) 104 | 105 | config = GenerateContentConfig( 106 | temperature=0.5, 107 | safety_settings=[ 108 | SafetySetting( 109 | category='HARM_CATEGORY_DANGEROUS_CONTENT', # type: ignore 110 | threshold='BLOCK_ONLY_HIGH', # type: ignore 111 | ), 112 | ], 113 | response_mime_type='application/json', 114 | response_schema=list[Point], 115 | ) 116 | 117 | response = client.models.generate_content( 118 | model=self.model, 119 | contents=[ 120 | image_pil, 121 | prompt, 122 | ], 123 | config=config, 124 | ) 125 | 126 | latency = time.time() - start_time 127 | 128 | return { 129 | 'response': response, 130 | 'latency': latency, 131 | 'width': width, 132 | 'height': height, 133 | } 134 | 135 | except Exception as e: 136 | print(f'API Error: {str(e)}') 137 | return {'response': None, 'latency': 0, 'width': 0, 'height': 0, 'error': str(e)} 138 | 139 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 140 | if not prediction or 'response' not in prediction: 141 | return { 142 | 'pred_x': None, 143 | 'pred_y': None, 144 | 'raw_response': json.dumps(prediction) if prediction else None, 145 | } 146 | 147 | response = prediction['response'] 148 | 149 | pred_x = None 150 | pred_y = None 151 | 152 | width = prediction['width'] 153 | height = prediction['height'] 154 | 155 | try: 156 | content = response.parsed 157 | 158 | point = content[0] 159 | 160 | y, x = point.point 161 | y = int(y / 1000 * height) 162 | x = int(x / 1000 * width) 163 | 164 | pred_x = x 165 | pred_y = y 166 | 167 | pred_x = int(pred_x) 168 | pred_y = int(pred_y) 169 | except Exception as e: 170 | print(f'Error parsing prediction: {e}') 171 | 172 | return { 173 | 'pred_x': pred_x, 174 | 'pred_y': pred_y, 175 | 'raw_response': json.dumps(prediction, default=str), 176 | } 177 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, Optional 7 | 8 | from openai import OpenAI 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | OPENAI_API_ENDPOINT = 'https://api.openai.com/v1' 13 | DEFAULT_MODEL = 'gpt-4o' 14 | 15 | 16 | class OpenAIClient: 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | api_endpoint: str = OPENAI_API_ENDPOINT, 21 | model: str = DEFAULT_MODEL, 22 | max_tokens: int = 4096, 23 | reasoning_effort: str = 'medium', 24 | ): 25 | self.api_key = api_key or os.environ.get('OPENAI_API_KEY') 26 | if not self.api_key: 27 | raise ValueError( 28 | 'API key must be provided either as an argument or through the OPENAI_API_KEY environment variable' 29 | ) 30 | 31 | self.api_endpoint = api_endpoint 32 | self.model = model 33 | self.max_tokens = max_tokens 34 | self.reasoning_effort = reasoning_effort 35 | self.client = OpenAI(api_key=self.api_key) 36 | 37 | def _extract_image_dimensions(self, image_data_uri: str) -> tuple[int, int]: 38 | try: 39 | image_data = base64.b64decode(image_data_uri) 40 | image_pil = Image.open(io.BytesIO(image_data)) 41 | width, height = image_pil.size 42 | return width, height 43 | except Exception as e: 44 | print(f'Error extracting image dimensions: {e}') 45 | return 1024, 768 46 | 47 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 48 | def predict(self, 
image_data_uri: str, prompt: str) -> Dict[str, Any]: 49 | try: 50 | if ',' in image_data_uri: 51 | base64_data = image_data_uri.split(',')[1] 52 | else: 53 | base64_data = image_data_uri 54 | 55 | width, height = self._extract_image_dimensions(base64_data) 56 | 57 | messages = [ 58 | { 59 | 'role': 'system', 60 | 'content': ( 61 | 'You are an AI assistant that can see and interact with a computer screen. ' 62 | 'You will be shown a screenshot and given an instruction. ' 63 | 'The instruction always requires you to interact with a UI element on the screen. You can assume that the UI element is always visible on the screen.' 64 | 'Return the position of the UI element that can be interacted with to advance the action specified in the instruction. ' 65 | f'The screen resolution is {width}x{height}. ' 66 | 'Your response must be in JSON format.' 67 | ), 68 | }, 69 | { 70 | 'role': 'user', 71 | 'content': [ 72 | { 73 | 'type': 'image_url', 74 | 'image_url': {'url': f'data:image/jpeg;base64,{base64_data}', 'detail': 'high'}, 75 | }, 76 | {'type': 'text', 'text': prompt}, 77 | ], 78 | }, 79 | ] 80 | 81 | start_time = time.time() 82 | 83 | params = { 84 | 'model': self.model, 85 | 'messages': messages, 86 | 'max_completion_tokens': self.max_tokens, 87 | 'response_format': {'type': 'json_object'}, 88 | 'seed': 42, 89 | 'tools': [ 90 | { 91 | 'type': 'function', 92 | 'function': { 93 | 'name': 'click_on_element', 94 | 'description': 'Click on the specified UI element', 95 | 'parameters': { 96 | 'type': 'object', 97 | 'properties': { 98 | 'x': {'type': 'integer', 'description': 'The x-coordinate for the click action'}, 99 | 'y': {'type': 'integer', 'description': 'The y-coordinate for the click action'}, 100 | }, 101 | 'required': ['x', 'y'], 102 | }, 103 | }, 104 | } 105 | ], 106 | 'tool_choice': 'auto', 107 | } 108 | 109 | if self.model == 'o1': 110 | params['reasoning_effort'] = self.reasoning_effort 111 | 112 | response = self.client.chat.completions.create(**params) 113 | 114 | print(response) 115 | 116 | end_time = time.time() 117 | latency = end_time - start_time 118 | 119 | result = { 120 | 'model': self.model, 121 | 'latency': latency, 122 | 'response': response, 123 | } 124 | 125 | return result 126 | 127 | except Exception as e: 128 | print(f'Error making prediction: {e}') 129 | return {'error': str(e)} 130 | 131 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 132 | if not prediction or 'response' not in prediction: 133 | return { 134 | 'pred_x': None, 135 | 'pred_y': None, 136 | 'raw_response': json.dumps(prediction) if prediction else None, 137 | } 138 | 139 | response = prediction['response'] 140 | 141 | pred_x = None 142 | pred_y = None 143 | 144 | if hasattr(response, 'choices') and len(response.choices) > 0: 145 | choice = response.choices[0] 146 | 147 | if ( 148 | hasattr(choice, 'message') 149 | and hasattr(choice.message, 'tool_calls') 150 | and choice.message.tool_calls 151 | ): 152 | tool_call = choice.message.tool_calls[0] 153 | if hasattr(tool_call, 'function') and hasattr(tool_call.function, 'arguments'): 154 | try: 155 | args = json.loads(tool_call.function.arguments) 156 | pred_x = args.get('x') 157 | pred_y = args.get('y') 158 | except Exception as e: 159 | print(f'Error parsing tool call arguments: {e}') 160 | 161 | elif ( 162 | hasattr(choice, 'message') and hasattr(choice.message, 'content') and choice.message.content 163 | ): 164 | try: 165 | content = json.loads(choice.message.content) 166 | pred_x = content.get('x') 167 | pred_y = 
content.get('y') 168 | except Exception as e: 169 | print(f'Error parsing message content: {e}') 170 | 171 | return { 172 | 'pred_x': pred_x, 173 | 'pred_y': pred_y, 174 | 'raw_response': json.dumps(prediction, default=str), 175 | } 176 | -------------------------------------------------------------------------------- /clicks/src/clicks/evaluate/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | import urllib.parse 4 | from typing import Any, Dict, List, Optional 5 | 6 | import numpy as np 7 | from clicks.evaluate.models import EvaluationMetrics 8 | from colorama import Fore, Style, init 9 | from PIL import Image, ImageDraw, ImageFont 10 | from scipy import stats 11 | 12 | init(autoreset=True) 13 | 14 | 15 | def is_point_in_bbox( 16 | x: Optional[int], 17 | y: Optional[int], 18 | x1: Optional[int], 19 | y1: Optional[int], 20 | x2: Optional[int], 21 | y2: Optional[int], 22 | ) -> bool: 23 | """Check if a point is within a bounding box.""" 24 | if x is None or y is None or x1 is None or y1 is None or x2 is None or y2 is None: 25 | return False 26 | return x1 <= x <= x2 and y1 <= y <= y2 27 | 28 | 29 | def visualize_prediction( 30 | image_path: str, 31 | pred_x: Optional[int], 32 | pred_y: Optional[int], 33 | item_id: str, 34 | recording_id: str, 35 | instruction: str, 36 | model_name: str, 37 | run_id: Optional[str], 38 | gt_x1: Optional[int], 39 | gt_y1: Optional[int], 40 | gt_x2: Optional[int], 41 | gt_y2: Optional[int], 42 | is_in_bbox: Optional[bool] = None, 43 | ) -> Optional[str]: 44 | try: 45 | base_dir = os.path.dirname( 46 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 47 | ) 48 | if run_id: 49 | vis_dir = os.path.join(base_dir, 'results', run_id, model_name, 'visualizations') 50 | else: 51 | vis_dir = os.path.join(base_dir, 'visualizations') 52 | os.makedirs(vis_dir, exist_ok=True) 53 | print(f'Visualization directory: {vis_dir}') 54 | 55 | img = Image.open(image_path) 56 | draw = ImageDraw.Draw(img) 57 | 58 | try: 59 | font = ImageFont.truetype('Arial.ttf', 16) 60 | except IOError: 61 | font = ImageFont.load_default() 62 | 63 | image_filename = os.path.basename(image_path) 64 | 65 | # Draw the bounding box if available 66 | if all(v is not None for v in [gt_x1, gt_y1, gt_x2, gt_y2]): 67 | draw.rectangle([(gt_x1, gt_y1), (gt_x2, gt_y2)], outline='blue', width=2) # type: ignore 68 | if gt_y1 is not None: 69 | draw.text((gt_x1, gt_y1 - 20), 'Bounding Box', fill='blue', font=font) # type: ignore 70 | else: 71 | draw.text((gt_x1, 0), 'Bounding Box', fill='blue', font=font) # type: ignore 72 | 73 | if pred_x is not None and pred_y is not None: 74 | # Different color based on whether it's in bbox or not 75 | outline_color = 'orange' if is_in_bbox else 'red' 76 | draw.ellipse( 77 | [(pred_x - 15, pred_y - 15), (pred_x + 15, pred_y + 15)], outline=outline_color, width=3 78 | ) 79 | draw.text((pred_x + 20, pred_y + 30), 'Prediction', fill=outline_color, font=font) 80 | 81 | draw.text( 82 | (10, img.height - 30), 83 | f'Item ID: {item_id} | Recording ID: {recording_id} | Instruction: {instruction}', 84 | fill='white', 85 | font=font, 86 | ) 87 | 88 | output_filename = ( 89 | f'rec{recording_id}_item{item_id}_{image_filename}_{urllib.parse.quote_plus(instruction)}.png' 90 | ) 91 | if is_in_bbox: 92 | output_dir = os.path.join(vis_dir, 'correct') 93 | else: 94 | output_dir = os.path.join(vis_dir, 'incorrect') 95 | os.makedirs(output_dir, exist_ok=True) 96 | output_path = 
os.path.join(output_dir, output_filename) 97 | img.save(output_path) 98 | 99 | return output_path 100 | except Exception as e: 101 | print(f'Error creating visualization: {e}') 102 | return None 103 | 104 | 105 | def encode_image_to_base64(image_path: str) -> Optional[str]: 106 | try: 107 | with open(image_path, 'rb') as f: 108 | image_data = f.read() 109 | encoded_data = base64.b64encode(image_data).decode('utf-8') 110 | mime_type = 'image/png' if image_path.lower().endswith('.png') else 'image/jpeg' 111 | return f'data:{mime_type};base64,{encoded_data}' 112 | except Exception as e: 113 | print(f'Error encoding image: {e}') 114 | return None 115 | 116 | 117 | def check_prediction_in_bbox( 118 | pred_x: Optional[int], 119 | pred_y: Optional[int], 120 | gt_x1: Optional[int], 121 | gt_y1: Optional[int], 122 | gt_x2: Optional[int], 123 | gt_y2: Optional[int], 124 | ) -> bool: 125 | return is_point_in_bbox(pred_x, pred_y, gt_x1, gt_y1, gt_x2, gt_y2) 126 | 127 | 128 | def print_colored_result( 129 | item_id: str, 130 | instruction: str, 131 | pred_x: Optional[int], 132 | pred_y: Optional[int], 133 | latency: float, 134 | is_in_bbox: Optional[bool] = None, 135 | ) -> None: 136 | color = Fore.GREEN if is_in_bbox else Fore.RED 137 | print( 138 | f'{color}ID: {item_id} | Instruction: {instruction} | ' 139 | f'Prediction: {pred_x} {pred_y} | ' 140 | f'Correct: {is_in_bbox} | Time: {latency:.2f}s{Style.RESET_ALL}' 141 | ) 142 | 143 | 144 | def analyze_results( 145 | results: List[Dict[str, Any]], run_id: Optional[str] = None 146 | ) -> EvaluationMetrics: 147 | if not results: 148 | raise ValueError('No results to summarize') 149 | 150 | total_processed = len(results) 151 | total_in_bbox = sum(1 for result in results if result.get('is_in_bbox', False)) 152 | 153 | bbox_results = np.array( 154 | [1 if result.get('is_in_bbox', False) else 0 for result in results if 'is_in_bbox' in result] 155 | ) 156 | 157 | accuracy = (total_in_bbox / total_processed) * 100 if total_processed > 0 else None 158 | 159 | accuracy_ci = None 160 | ci = 0.95 161 | 162 | def calculate_accuracy(data): 163 | return np.mean(data) * 100 164 | 165 | if len(bbox_results) > 0: 166 | try: 167 | accuracy_bootstrap = stats.bootstrap( 168 | (bbox_results,), calculate_accuracy, confidence_level=ci, method='percentile' 169 | ) 170 | accuracy_ci = accuracy_bootstrap.confidence_interval 171 | except Exception as e: 172 | print(f'Error calculating bounding box accuracy confidence interval: {e}') 173 | 174 | print('\nResults Summary:') 175 | print(f'Total Processed: {total_processed}') 176 | print(f'Total Correct: {total_in_bbox}') 177 | print(f'Accuracy: {accuracy:.2f}%') 178 | if accuracy_ci: 179 | print(f'95% CI: [{accuracy_ci.low:.2f}%, {accuracy_ci.high:.2f}%]') 180 | 181 | metrics = EvaluationMetrics( 182 | total_processed=total_processed, 183 | total_correct=total_in_bbox, 184 | accuracy=accuracy if accuracy is not None else 0, 185 | ci=ci, 186 | accuracy_ci_low=accuracy_ci.low if accuracy_ci else None, 187 | accuracy_ci_high=accuracy_ci.high if accuracy_ci else None, 188 | ) 189 | 190 | return metrics 191 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/openai_cua/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | import warnings 7 | from typing import Any, Dict, Optional, Tuple 8 | 9 | from openai import OpenAI 10 | from PIL import Image 11 | from 
tenacity import retry, stop_after_attempt, wait_exponential 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | OPENAI_API_ENDPOINT = 'https://api.openai.com/v1' 16 | DEFAULT_MODEL = 'computer-use-preview' 17 | DEFAULT_DISPLAY_WIDTH = 1024 18 | DEFAULT_DISPLAY_HEIGHT = 768 19 | DEFAULT_ENVIRONMENT = 'mac' 20 | 21 | 22 | class OpenAICUAClient: 23 | def __init__( 24 | self, 25 | api_key: Optional[str] = None, 26 | api_endpoint: str = OPENAI_API_ENDPOINT, 27 | model: str = DEFAULT_MODEL, 28 | max_tokens: int = 4096, 29 | environment: str = DEFAULT_ENVIRONMENT, 30 | ): 31 | self.api_key = api_key or os.environ.get('OPENAI_API_KEY') 32 | if not self.api_key: 33 | raise ValueError( 34 | 'API key must be provided either as an argument or through the OPENAI_API_KEY environment variable' 35 | ) 36 | 37 | self.api_endpoint = api_endpoint 38 | self.model = model 39 | self.max_tokens = max_tokens 40 | self.environment = environment 41 | self.client = OpenAI(api_key=self.api_key) 42 | 43 | def _extract_image_dimensions(self, image_data_uri: str) -> Tuple[int, int]: 44 | try: 45 | if ',' in image_data_uri: 46 | base64_data = image_data_uri.split(',')[1] 47 | else: 48 | base64_data = image_data_uri 49 | 50 | image_data = base64.b64decode(base64_data) 51 | image_pil = Image.open(io.BytesIO(image_data)) 52 | width, height = image_pil.size 53 | return width, height 54 | except Exception as e: 55 | print(f'Error extracting image dimensions: {e}') 56 | return DEFAULT_DISPLAY_WIDTH, DEFAULT_DISPLAY_HEIGHT 57 | 58 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 59 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 60 | try: 61 | base64_data = image_data_uri.split(',')[1] if ',' in image_data_uri else image_data_uri 62 | width, height = self._extract_image_dimensions(base64_data) 63 | 64 | start_time = time.time() 65 | 66 | input_data = [ 67 | { 68 | 'role': 'user', 69 | 'content': [ 70 | { 71 | 'type': 'input_text', 72 | 'text': f'Perform the following task on the screen: {prompt} by clicking on a UI element. Do not ask for confirmation or clarifications. You have all the information you need to complete the task.', 73 | }, 74 | { 75 | 'type': 'input_image', 76 | 'image_url': f'data:image/png;base64,{base64_data}', 77 | }, 78 | ], 79 | } 80 | ] 81 | 82 | response = self.client.responses.create( 83 | model=self.model, 84 | tools=[ 85 | { 86 | 'type': 'computer_use_preview', # type: ignore 87 | 'display_width': width, 88 | 'display_height': height, 89 | 'environment': self.environment, 90 | } 91 | ], 92 | input=input_data, # type: ignore 93 | truncation='auto', 94 | ) 95 | 96 | print(response) 97 | print(f'Response: {response.output}') 98 | 99 | iteration = 0 100 | previous_response_id = response.id 101 | 102 | computer_call = None 103 | 104 | while iteration < 10: 105 | iteration += 1 106 | 107 | computer_calls = [item for item in response.output if item.type == 'computer_call'] 108 | print(f'Computer calls: {computer_calls}') 109 | if len(computer_calls) > 0: 110 | computer_call = computer_calls[0] 111 | if computer_call.action.type == 'click': 112 | print(f'Click action found: {computer_call.action}') 113 | break 114 | else: 115 | print(f'Non-click action found: {computer_call.action}. 
Continuing...') 116 | 117 | if len(computer_calls) > 0: 118 | computer_call = computer_calls[0] 119 | call_id = computer_call.call_id 120 | 121 | pending_safety_checks = getattr(computer_call, 'pending_safety_checks', []) 122 | acknowledged_safety_checks = [] 123 | 124 | if pending_safety_checks: 125 | print(f'Safety checks detected: {pending_safety_checks}') 126 | acknowledged_safety_checks = pending_safety_checks 127 | else: 128 | elapsed_time = time.time() - start_time 129 | return { 130 | 'elapsed_time': elapsed_time, 131 | 'response': response.model_dump(), 132 | 'computer_call': None, 133 | } 134 | 135 | input_data = [ 136 | { 137 | 'call_id': call_id, 138 | 'type': 'computer_call_output', 139 | 'output': { 140 | 'type': 'input_image', 141 | 'image_url': f'data:image/png;base64,{base64_data}', 142 | }, 143 | } 144 | ] 145 | 146 | if acknowledged_safety_checks: 147 | input_data[0]['acknowledged_safety_checks'] = acknowledged_safety_checks 148 | 149 | response = self.client.responses.create( 150 | model=self.model, 151 | previous_response_id=previous_response_id, 152 | tools=[ 153 | { 154 | 'type': 'computer_use_preview', # type: ignore 155 | 'display_width': width, 156 | 'display_height': height, 157 | 'environment': self.environment, 158 | } 159 | ], 160 | input=input_data, # type: ignore 161 | truncation='auto', 162 | ) 163 | 164 | print(f'[CUA Loop Iteration {iteration}] Response: {response.output}') 165 | 166 | previous_response_id = response.id 167 | 168 | elapsed_time = time.time() - start_time 169 | print(f'API call completed in {elapsed_time:.2f} seconds') 170 | 171 | return { 172 | 'elapsed_time': elapsed_time, 173 | 'response': response.model_dump(), 174 | 'computer_call': computer_call.model_dump() if computer_call else None, 175 | } 176 | except Exception as e: 177 | print(f'Error in CUA prediction: {e}') 178 | raise 179 | 180 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 181 | if not prediction or 'computer_call' not in prediction or not prediction['computer_call']: 182 | return { 183 | 'pred_x': None, 184 | 'pred_y': None, 185 | 'raw_response': json.dumps(prediction, default=str) if prediction else None, 186 | } 187 | 188 | computer_call = prediction['computer_call'] 189 | 190 | pred_x = None 191 | pred_y = None 192 | 193 | if computer_call and 'action' in computer_call and computer_call['action']['type'] == 'click': 194 | pred_x = computer_call['action']['x'] 195 | pred_y = computer_call['action']['y'] 196 | 197 | return { 198 | 'pred_x': pred_x, 199 | 'pred_y': pred_y, 200 | 'raw_response': json.dumps(prediction, default=str), 201 | } 202 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/claude/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | from typing import Any, Dict, List, Optional 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | CLAUDE_API_ENDPOINT = 'https://api.anthropic.com/v1/messages' 13 | DEFAULT_MODEL = 'claude-3-7-sonnet-20250219' 14 | 15 | 16 | class ClaudeComputerUseClient: 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | api_endpoint: str = CLAUDE_API_ENDPOINT, 21 | model: str = DEFAULT_MODEL, 22 | max_tokens: int = 4096, 23 | thinking_budget: Optional[int] = 1024, 24 | tool_version: str = '20250124', 25 | ): 26 | self.api_key = api_key 
or os.environ.get('ANTHROPIC_API_KEY') 27 | if not self.api_key: 28 | raise ValueError( 29 | 'API key must be provided either as an argument or through the ANTHROPIC_API_KEY environment variable' 30 | ) 31 | 32 | self.api_endpoint = api_endpoint 33 | self.model = model 34 | self.max_tokens = max_tokens 35 | self.thinking_budget = thinking_budget 36 | self.tool_version = tool_version 37 | self.beta_flag = ( 38 | 'computer-use-2025-01-24' if '20250124' in tool_version else 'computer-use-2024-10-22' 39 | ) 40 | self.display_width = None 41 | self.display_height = None 42 | 43 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 44 | try: 45 | image_data = base64.b64decode(base64_data) 46 | image = Image.open(io.BytesIO(image_data)) 47 | width, height = image.size 48 | return width, height 49 | except Exception as e: 50 | print(f'Error extracting image dimensions: {e}') 51 | return 1024, 768 52 | 53 | def _create_tools(self) -> List[Dict[str, Any]]: 54 | width = self.display_width or 1024 55 | height = self.display_height or 768 56 | 57 | return [ 58 | { 59 | 'type': f'computer_{self.tool_version}', 60 | 'name': 'computer', 61 | 'display_width_px': width, 62 | 'display_height_px': height, 63 | 'display_number': 1, 64 | }, 65 | ] 66 | 67 | def _create_thinking_config(self) -> Optional[Dict[str, Any]]: 68 | if self.thinking_budget is None: 69 | return None 70 | 71 | return {'type': 'enabled', 'budget_tokens': self.thinking_budget} 72 | 73 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 74 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 75 | headers = { 76 | 'Content-Type': 'application/json', 77 | 'x-api-key': self.api_key, 78 | 'anthropic-version': '2023-06-01', 79 | 'anthropic-beta': self.beta_flag, 80 | } 81 | 82 | if ',' in image_data_uri: 83 | base64_data = image_data_uri.split(',')[1] 84 | else: 85 | base64_data = image_data_uri 86 | 87 | self.display_width, self.display_height = self._extract_image_dimensions(base64_data) 88 | 89 | # Note: it is unclear if the Claude computer use agent expects the screenshot to be in the very first user message, 90 | # or is it elicited by the screenshot request tool call. During testing, the behavior was inconsistent. 91 | # The default behavior here is to not include the screenshot in the first user message, and then the same image 92 | # is sent in the tool result. You may wonder why we don't include the screenshot in the first user message. 93 | # During testing on the dev set, the evals are almost the same whether we include the screenshot in the first 94 | # user message or not. 
95 | payload = { 96 | 'model': self.model, 97 | 'max_tokens': self.max_tokens, 98 | 'messages': [ 99 | { 100 | 'role': 'user', 101 | 'content': [ 102 | { 103 | 'type': 'text', 104 | 'text': prompt, 105 | }, 106 | ], 107 | } 108 | ], 109 | 'tools': self._create_tools(), 110 | } 111 | 112 | thinking = self._create_thinking_config() 113 | if thinking: 114 | payload['thinking'] = thinking 115 | 116 | start_time = time.time() 117 | response = requests.post( 118 | self.api_endpoint, 119 | headers=headers, 120 | json=payload, 121 | ) 122 | 123 | if response.status_code != 200: 124 | print(f'API Error: {response.status_code} - {response.text}') 125 | response.raise_for_status() 126 | 127 | result = response.json() 128 | 129 | raw_response = json.dumps(result) 130 | 131 | print(result) 132 | 133 | tool_use = None 134 | tool_use_id = None 135 | for content_item in result.get('content', []): 136 | if content_item.get('type') == 'tool_use' and content_item.get('name') == 'computer': 137 | tool_use = content_item.get('input', {}) 138 | tool_use_id = content_item.get('id') 139 | break 140 | 141 | if not tool_use: 142 | print('No computer tool call found in the response') 143 | return None 144 | if tool_use.get('action') == 'screenshot': 145 | print('Claude requested a screenshot. Sending the same image again...') 146 | 147 | payload = { 148 | 'model': self.model, 149 | 'max_tokens': self.max_tokens, 150 | 'messages': [ 151 | { 152 | 'role': 'user', 153 | 'content': [ 154 | { 155 | 'type': 'text', 156 | 'text': prompt, 157 | }, 158 | ], 159 | }, 160 | { 161 | 'role': 'assistant', 162 | 'content': result.get('content', []), 163 | }, 164 | { 165 | 'role': 'user', 166 | 'content': [ 167 | { 168 | 'type': 'tool_result', 169 | 'tool_use_id': tool_use_id, 170 | 'content': [ 171 | { 172 | 'type': 'image', 173 | 'source': { 174 | 'type': 'base64', 175 | 'media_type': 'image/jpeg', 176 | 'data': base64_data, 177 | }, 178 | } 179 | ], 180 | } 181 | ], 182 | }, 183 | ], 184 | 'tools': self._create_tools(), 185 | } 186 | 187 | if thinking: 188 | payload['thinking'] = thinking 189 | 190 | response = requests.post( 191 | self.api_endpoint, 192 | headers=headers, 193 | json=payload, 194 | ) 195 | 196 | if response.status_code != 200: 197 | print(f'API Error: {response.status_code} - {response.text}') 198 | response.raise_for_status() 199 | 200 | result = response.json() 201 | 202 | raw_response_second = json.dumps(result) 203 | 204 | print('Second response after screenshot:') 205 | print(result) 206 | 207 | tool_use = None 208 | for content_item in result.get('content', []): 209 | if content_item.get('type') == 'tool_use' and content_item.get('name') == 'computer': 210 | tool_use = content_item.get('input', {}) 211 | break 212 | 213 | if not tool_use: 214 | print('No computer tool call found in the second response') 215 | return None 216 | 217 | tool_use['raw_responses'] = [raw_response, raw_response_second] 218 | else: 219 | tool_use['raw_responses'] = [raw_response] 220 | 221 | tool_use['latency'] = time.time() - start_time 222 | tool_use['model'] = self.model 223 | 224 | if 'thinking' in result: 225 | tool_use['thinking'] = result['thinking'] 226 | 227 | return tool_use 228 | 229 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 230 | try: 231 | assert isinstance(prediction, dict) 232 | 233 | action_kind = prediction.get('action', {}) 234 | 235 | if action_kind == 'left_click': 236 | coordinate = prediction.get('coordinate', {}) 237 | pred_x, pred_y = coordinate 238 | pred_type = 
'left_click' 239 | pred_text = None 240 | elif action_kind == 'type': 241 | pred_x, pred_y = None, None 242 | pred_type = 'type' 243 | pred_text = prediction.get('text') 244 | elif action_kind == 'screenshot': 245 | pred_x, pred_y = None, None 246 | pred_type = 'screenshot' 247 | pred_text = None 248 | else: 249 | pred_x, pred_y = None, None 250 | pred_type = action_kind 251 | pred_text = None 252 | 253 | result = { 254 | 'pred_type': pred_type, 255 | 'pred_x': pred_x, 256 | 'pred_y': pred_y, 257 | 'pred_text': pred_text, 258 | } 259 | 260 | if 'raw_responses' in prediction: 261 | result['raw_responses'] = prediction['raw_responses'] 262 | 263 | if 'thinking' in prediction: 264 | result['thinking'] = prediction['thinking'] 265 | 266 | return result 267 | 268 | except Exception as e: 269 | print(f'Error parsing prediction: {e}') 270 | return { 271 | 'pred_type': None, 272 | 'pred_x': None, 273 | 'pred_y': None, 274 | 'pred_text': None, 275 | } 276 | -------------------------------------------------------------------------------- /clicks/src/clicks/third_party/ui_tars/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import re 5 | import time 6 | from typing import Any, Dict 7 | 8 | import requests 9 | from PIL import Image 10 | from tenacity import retry, stop_after_attempt, wait_exponential 11 | 12 | 13 | class UITarsClient: 14 | def __init__( 15 | self, 16 | api_url: str, 17 | api_key: str = 'super-secret-key', 18 | max_tokens: int = 128, 19 | temperature: float = 0.0, 20 | frequency_penalty: float = 1.0, 21 | model_name: str = 'bytedance-research/UI-TARS-72B-SFT', 22 | ): 23 | self.api_url = api_url.rstrip('/') 24 | self.api_key = api_key 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | self.frequency_penalty = frequency_penalty 28 | self.model_name = model_name 29 | 30 | def _encode_image(self, image_data_uri: str) -> str: 31 | if ',' in image_data_uri: 32 | base64_data = image_data_uri.split(',')[1] 33 | else: 34 | base64_data = image_data_uri 35 | 36 | return base64_data 37 | 38 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 39 | try: 40 | image_data = base64.b64decode(base64_data) 41 | image = Image.open(io.BytesIO(image_data)) 42 | width, height = image.size 43 | return width, height 44 | except Exception as e: 45 | print(f'Error extracting image dimensions: {e}') 46 | return 1024, 768 47 | 48 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 49 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any]: 50 | base64_image = self._encode_image(image_data_uri) 51 | width, height = self._extract_image_dimensions(base64_image) 52 | 53 | # Note: UI-TARS is not a generalist VLM: prompting it with plain English will cause the model to severely collapse. 54 | # Hence, it is unclear how to change the given computer use prompt, so we just use the default one provided in the UI-TARS repo. 55 | # drag, right_single, hotkey, type, scroll, wait, finished, call_user are here but we won't use them and will treat it as a failure. 56 | prompt_template = f"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 57 | 58 | ## Output Format 59 | ```\nThought: ... 
60 | Action: ...\n``` 61 | 62 | ## Action Space 63 | 64 | click(start_box='<|box_start|>(x1,y1)<|box_end|>') 65 | left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') 66 | right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') 67 | drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') 68 | hotkey(key='') 69 | type(content='') #If you want to submit your input, use \"\ 70 | \" at the end of `content`. 71 | scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') 72 | wait() #Sleep for 5s and take a screenshot to check for any changes. 73 | finished() 74 | call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. 75 | 76 | 77 | ## Note 78 | - Use Chinese in `Thought` part. 79 | - Summarize your next action (with its target element) in one sentence in `Thought` part. 80 | 81 | ## User Instruction 82 | {prompt}""" 83 | 84 | # Prepare the multimodal message 85 | multimodal_message = { 86 | 'role': 'user', 87 | 'content': [ 88 | {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}, 89 | {'type': 'text', 'text': prompt_template}, 90 | ], 91 | } 92 | 93 | request_data = { 94 | 'messages': [multimodal_message], 95 | 'model': self.model_name, 96 | 'max_tokens': self.max_tokens, 97 | 'temperature': self.temperature, 98 | 'frequency_penalty': self.frequency_penalty, 99 | } 100 | 101 | headers = { 102 | 'Content-Type': 'application/json', 103 | 'Authorization': f'Bearer {self.api_key}', 104 | } 105 | 106 | try: 107 | start_time = time.time() 108 | 109 | response = requests.post( 110 | f'{self.api_url}/v1/chat/completions', 111 | json=request_data, 112 | headers=headers, 113 | timeout=7200, 114 | ) 115 | 116 | end_time = time.time() 117 | latency = end_time - start_time 118 | 119 | if response.status_code == 200: 120 | result = response.json() 121 | 122 | content = result.get('choices', [{}])[0].get('message', {}).get('content', '') 123 | 124 | print(content) 125 | return { 126 | 'raw_response': json.dumps(result), 127 | 'content': content, 128 | 'latency_seconds': latency, 129 | 'width': width, 130 | 'height': height, 131 | } 132 | else: 133 | print(response.text) 134 | error_text = response.text 135 | try: 136 | error_json = response.json() 137 | error_text = json.dumps(error_json) 138 | except: 139 | pass 140 | 141 | return { 142 | 'error': f'HTTP Error {response.status_code}', 143 | 'error_details': error_text, 144 | 'latency_seconds': latency, 145 | } 146 | 147 | except Exception as e: 148 | print(f'API Error: {str(e)}') 149 | return { 150 | 'error': f'API Error: {str(e)}', 151 | 'latency_seconds': 0, 152 | } 153 | 154 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 155 | if 'error' in prediction: 156 | return { 157 | 'pred_x': None, 158 | 'pred_y': None, 159 | 'error': prediction.get('error'), 160 | 'error_details': prediction.get('error_details', ''), 161 | 'raw_responses': prediction.get('raw_response', '{}'), 162 | } 163 | 164 | content = prediction.get('content', '') 165 | width = prediction.get('width', 1920) # Default to 1920 if width not provided 166 | height = prediction.get('height', 1080) # Default to 1080 if height not provided 167 | 168 | action_match = re.search(r'Action:\s*(.*?)(?:\n|$)', content, re.DOTALL) 169 | action_text = action_match.group(1).strip() if action_match else content 170 | 171 | click_match = re.search( 172 | 
r"click\(start_box='<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>'\)", action_text 173 | ) 174 | if click_match: 175 | rel_x = int(click_match.group(1)) 176 | rel_y = int(click_match.group(2)) 177 | pred_x = round(width * rel_x / 1000) 178 | pred_y = round(height * rel_y / 1000) 179 | return { 180 | 'pred_x': pred_x, 181 | 'pred_y': pred_y, 182 | 'content': content, 183 | 'raw_responses': prediction.get('raw_response', '{}'), 184 | } 185 | 186 | # Process double click action 187 | double_click_match = re.search( 188 | r"left_double\(start_box='<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>'\)", action_text 189 | ) 190 | if double_click_match: 191 | rel_x = int(double_click_match.group(1)) 192 | rel_y = int(double_click_match.group(2)) 193 | pred_x = round(width * rel_x / 1000) 194 | pred_y = round(height * rel_y / 1000) 195 | return { 196 | 'pred_x': pred_x, 197 | 'pred_y': pred_y, 198 | 'content': content, 199 | 'raw_responses': prediction.get('raw_response', '{}'), 200 | } 201 | 202 | # Process generic coordinate pattern 203 | coord_match = re.search(r'\((\d+),\s*(\d+)\)', content) 204 | if coord_match: 205 | rel_x = int(coord_match.group(1)) 206 | rel_y = int(coord_match.group(2)) 207 | pred_x = round(width * rel_x / 1000) 208 | pred_y = round(height * rel_y / 1000) 209 | return { 210 | 'pred_x': pred_x, 211 | 'pred_y': pred_y, 212 | 'content': content, 213 | 'raw_responses': prediction.get('raw_response', '{}'), 214 | } 215 | 216 | # Process x=X, y=Y format 217 | x_match = re.search(r'x\s*=\s*(\d+)', content, re.IGNORECASE) 218 | y_match = re.search(r'y\s*=\s*(\d+)', content, re.IGNORECASE) 219 | if x_match and y_match: 220 | rel_x = int(x_match.group(1)) 221 | rel_y = int(y_match.group(1)) 222 | pred_x = round(width * rel_x / 1000) 223 | pred_y = round(height * rel_y / 1000) 224 | return { 225 | 'pred_x': pred_x, 226 | 'pred_y': pred_y, 227 | 'content': content, 228 | 'raw_responses': prediction.get('raw_response', '{}'), 229 | } 230 | 231 | # Process box format 232 | box_match = re.search(r'<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>', content) 233 | if box_match: 234 | rel_x = int(box_match.group(1)) 235 | rel_y = int(box_match.group(2)) 236 | pred_x = round(width * rel_x / 1000) 237 | pred_y = round(height * rel_y / 1000) 238 | return { 239 | 'pred_x': pred_x, 240 | 'pred_y': pred_y, 241 | 'content': content, 242 | 'raw_responses': prediction.get('raw_response', '{}'), 243 | } 244 | 245 | return { 246 | 'pred_x': None, 247 | 'pred_y': None, 248 | 'content': content, 249 | 'raw_responses': prediction.get('raw_response', '{}'), 250 | 'error': 'No coordinates found in response', 251 | } 252 | -------------------------------------------------------------------------------- /clicks/README.md: -------------------------------------------------------------------------------- 1 | # showdown-clicks 2 | 3 | General Agents 4 | 5 | [🤗 Dataset](https://huggingface.co/datasets/generalagents/showdown-clicks) | [GitHub](https://github.com/generalagents/showdown) 6 | 7 | `showdown` is a suite of offline and online benchmarks for computer-use agents. 8 | 9 | `showdown-clicks` is a collection of 5,679 left clicks of humans performing various tasks in a macOS desktop environment. It is intended to evaluate instruction-following and low-level control capabilities of computer-use agents. 10 | 11 | As of March 2025, we are releasing a subset of the full set, `showdown-clicks-dev`, containing 557 clicks. All examples are annotated with the bounding box of viable click locations for the UI element. 
12 | 13 | The episodes range from tens of seconds to minutes, and screenshots are between WXGA (1280×800) and WSXGA+ (1680×1050). The recordings contain no PII and were collected in late 2024. 14 | 15 | | Column | Description | 16 | |--------|-------------| 17 | | id | Unique identifier for each data entry (alphanumeric string) | 18 | | image | Path to the screenshot image file showing the UI state | 19 | | instruction | Natural language instruction describing the task to be performed | 20 | | x1 | Top-left x-coordinate of the bounding box | 21 | | y1 | Top-left y-coordinate of the bounding box | 22 | | x2 | Bottom-right x-coordinate of the bounding box | 23 | | y2 | Bottom-right y-coordinate of the bounding box | 24 | | width | Width of the image | 25 | | height | Height of the image | 26 | 27 | ## `showdown-clicks-dev` Results 28 | 29 | | Model | Accuracy | 95% CI | Latency [^1] | 95% CI | 30 | |------------------------------------------------------|---------------|---------------------|--------------|-------------------------| 31 | | `ace-control-medium` | **77.56%** | +3.41%/-3.59% | 533ms | +8ms/-7ms | 32 | | `ace-control-small` | 72.89% | +3.59%/-3.77% | **324ms** | +7ms/-7ms | 33 | | Operator (OpenAI CUA, macOS) | 64.27% | +3.95%/-3.95% | 6385ms | +182ms/-177ms | 34 | | Molmo-72B-0924 | 54.76% | +4.13%/-4.13% | 6599ms | +113ms/-114ms | 35 | | Claude 3.7 Sonnet (Thinking, Computer Use) | 53.68% | +4.13%/-4.13% | 9656ms | +95ms/-97ms | 36 | | UI-TARS-72B-SFT | 54.4% | +4.13%/-4.13% | 1977ms | +15ms/-16ms | 37 | | OmniParser V2 + GPT-4o | 51.71% | +4.12%/-4.13% | 12642ms | +361ms/-349ms | 38 | | Gemini 2.0 Flash | 33.39% | +3.95%/-3.95% | 3069ms | +16ms/-16ms | 39 | | Qwen2.5-VL-72B-Instruct | 24.78% | +3.59%/-3.60% | 3790ms | +57ms/-55ms | 40 | | GPT-4o | 5.21% | +1.97%/-1.80% | 2500ms | +49ms/-48ms | 41 | 42 | ### Run evals 43 | ```bash 44 | uv run eval.py --model ace --dataset dev --num-workers 1 --run-id showdown-clicks-dev 45 | uv run eval.py --model claude --dataset dev --num-workers 16 --run-id showdown-clicks-dev 46 | uv run eval.py --model qwen --dataset dev --num-workers 3 --run-id showdown-clicks-dev 47 | uv run eval.py --model gemini --dataset dev --num-workers 16 --run-id showdown-clicks-dev 48 | uv run eval.py --model openai --dataset dev --num-workers 16 --run-id showdown-clicks-dev 49 | uv run eval.py --model openai-cua --dataset dev --num-workers 16 --run-id showdown-clicks-dev 50 | uv run eval.py --model molmo --dataset dev --num-workers 2 --run-id showdown-clicks-dev --api-url $YOUR_MOLMO_MODAL_API 51 | uv run eval.py --model ui-tars --dataset dev --run-id showdown-clicks-dev --api-url $YOUR_UITARS_MODAL_API --api-key $YOUR_UITARS_API_KEY --num-workers 1 --ui-tars-model bytedance-research/UI-TARS-72B-SFT 52 | uv run eval.py --model omniparser --dataset dev --run-id showdown-clicks-dev --omniparser-model gpt-4o-2024-05-13 --api-url $YOUR_OMNIPARSER_MODAL_API --num-workers 4 53 | ``` 54 | 55 | When you are done with the evals, go to Modal's UI and terminate the individual apps. 
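Beyond the `eval.py` CLI, each integration can also be driven directly from Python. Below is a minimal sketch using the OpenAI integration; the item values are hypothetical placeholders, and real rows come from `data/showdown-clicks-dev/data.csv`:

```python
from clicks.third_party.openai import get_openai_api_client

# Reads OPENAI_API_KEY from the environment when no api_key is passed.
client = get_openai_api_client(model='gpt-4o')

# Hypothetical example row; real items are loaded from the dataset CSV.
item = {
  'id': 'example-0001',
  'recording_id': 'example-recording',
  'instruction': 'Click the Submit button',
  'image': 'example_frame.png',  # resolved relative to frames_dir
  'x1': 100, 'y1': 200, 'x2': 180, 'y2': 230,
}

result = client.process_single_item(
  item,
  frames_dir='data/showdown-clicks-dev/frames',
  run_id='showdown-clicks-dev',
)
print(result.is_in_bbox, result.latency_seconds)
```

Every integration exposes the same `predict`, `parse_prediction`, and `process_single_item` interface, so the same pattern applies to the other clients.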
56 | 57 | ## Directory Structure 58 | 59 | The project is organized as follows: 60 | 61 | - `data/`: Input data 62 | - `showdown-clicks-dev/data.csv`: Records 63 | - `showdown-clicks-dev/frames`: Image frames 64 | 65 | - `results/`: Output data 66 | - CSV result files from evaluations 67 | - `showdown-clicks-dev/{$MODEL}/visualizations/`: Visualizations of model predictions 68 | - `report/`: Analysis reports and summary metrics 69 | 70 | - `scripts/`: Utility scripts 71 | - `calculate_latency.py`: Script to calculate latency metrics 72 | - `collect_runs.py`: Script to collect results from multiple runs 73 | 74 | - `src/clicks/`: Main source code 75 | - `api_client_base.py`: Base API client classes 76 | - `evaluate/`: Evaluation code 77 | - `ace.py`: Ace model implementation 78 | - `models.py`: Data models for evaluation 79 | - `utils.py`: Utilities for visualization and evaluation 80 | - `third_party/`: Third-party model integrations 81 | - `claude/`: Claude model integration 82 | - `gemini/`: Gemini model integration 83 | - `molmo/`: Molmo model integration 84 | - `omniparser/`: OmniParser model integration 85 | - `openai/`: OpenAI model integration 86 | - `openai_cua/`: OpenAI Computer Use Agent integration 87 | - `qwen/`: Qwen model integration 88 | - `ui_tars/`: UI-TARS model integration 89 | 90 | ## Usage 91 | 92 | To run the evaluation, use the `eval.py` script: 93 | 94 | ```bash 95 | # Run on the dev dataset (default) 96 | uv run eval.py 97 | 98 | # Run with a specific model (ace, claude, qwen, etc.) 99 | uv run eval.py --model claude --api-key YOUR_API_KEY 100 | 101 | # Run with a limited sample size (for testing) 102 | uv run eval.py --sample-size 10 103 | 104 | # Run with multiple workers for parallel processing 105 | uv run eval.py --num-workers 4 106 | 107 | # Run with a custom output file 108 | uv run eval.py --output-file results/custom_results.csv 109 | ``` 110 | 111 | ### Model-specific options 112 | 113 | #### Computer-use agents 114 | 115 | Claude: 116 | ```bash 117 | uv run eval.py --model claude --api-key YOUR_ANTHROPIC_API_KEY --claude-model claude-3-7-sonnet-20250219 --thinking-budget 1024 118 | ``` 119 | 120 | Qwen: 121 | ```bash 122 | uv run eval.py --model qwen --api-key YOUR_DASHSCOPE_API_KEY --qwen-model qwen2.5-vl-72b-instruct --max-tokens 4096 123 | ``` 124 | 125 | UI-TARS: 126 | ```bash 127 | uv run eval.py --model ui-tars --api-url YOUR_UITARS_API_URL --api-key YOUR_API_KEY --ui-tars-model bytedance-research/UI-TARS-72B-SFT --max-tokens 128 --temperature 0.0 --frequency-penalty 1.0 128 | ``` 129 | 130 | OmniParser: 131 | ```bash 132 | uv run eval.py --model omniparser --dataset dev --run-id showdown-clicks-dev --omniparser-model gpt-4o-2024-05-13 --api-url YOUR_OMNIPARSER_API_URL --omniparser-temperature 0.7 133 | ``` 134 | 135 | Operator: 136 | ```bash 137 | uv run eval.py --model openai-cua --dataset dev --run-id showdown-clicks-dev --environment mac 138 | ``` 139 | 140 | Ace (default): 141 | ```bash 142 | uv run eval.py --model ace 143 | ``` 144 | 145 | #### VLMs 146 | 147 | OpenAI: 148 | ```bash 149 | uv run eval.py --model openai --api-key YOUR_OPENAI_API_KEY --openai-model gpt-4o --dataset dev 150 | ``` 151 | 152 | Gemini: 153 | ```bash 154 | uv run eval.py --model gemini --api-key YOUR_GEMINI_API_KEY --gemini-model gemini-1.5-pro-latest --dataset dev 155 | ``` 156 | 157 | Molmo: 158 | ```bash 159 | uv run eval.py --model molmo --api-url YOUR_MOLMO_API_URL --dataset dev 160 | ``` 161 | 162 | ## Environment Variables 163 | 164 | Alternative to passing 
API keys as command-line arguments: 165 | 166 | - `ANTHROPIC_API_KEY`: API key for Claude 167 | - `DASHSCOPE_API_KEY`: API key for Qwen 168 | - `OPENAI_API_KEY`: API key for OpenAI and OpenAI CUA 169 | - `GEMINI_API_KEY`: API key for Gemini 170 | - `GENERALAGENTS_API_KEY`: API key for General Agents (Ace) 171 | 172 | ## Visualization 173 | 174 | The evaluation script generates visualizations of model predictions, showing both the ground truth click position, bounding box, and the predicted click position. These visualizations are saved in the `results/[run-id]/[model]/visualizations/` directory, organized by model and correctness. 175 | 176 | ## Results Format 177 | 178 | The evaluation results are saved as CSV files in the `results/[run-id]/` directory. Each row in the CSV file contains: 179 | 180 | | Column | Description | 181 | |--------|-------------| 182 | | id | Unique identifier for the evaluation item | 183 | | recording_id | Identifier for the recording session | 184 | | instruction | The instruction given to the model | 185 | | image_path | Path to the image file | 186 | | gt_x1 | Ground truth bounding box left X-coordinate | 187 | | gt_y1 | Ground truth bounding box top Y-coordinate | 188 | | gt_x2 | Ground truth bounding box right X-coordinate | 189 | | gt_y2 | Ground truth bounding box bottom Y-coordinate | 190 | | pred_x | Predicted X-coordinate | 191 | | pred_y | Predicted Y-coordinate | 192 | | is_in_bbox | Whether the prediction is within the ground truth bounding box | 193 | | latency_seconds | Time taken for the model to make the prediction | 194 | | visualization_path | Path to the visualization image | 195 | | raw_response | Raw response from the model | 196 | 197 | ## Metrics 198 | 199 | The evaluation script calculates the percentage of correct predictions (within the bounding box), with 95% confidence intervals created from bootstrapping. 200 | 201 | ## License 202 | 203 | This project is licensed under the MIT License - see the LICENSE file for details. 204 | 205 | ## Disclaimer 206 | 207 | The images used in this evaluation dataset may contain content that some users might find offensive, inappropriate, or objectionable. These images are included solely for the purpose of evaluating model performance on realistic computer use scenarios. 208 | 209 | We do not endorse, approve of, or claim responsibility for any content displayed in these images. The inclusion of any image in this dataset does not represent our views or opinions, and is not intended to promote any particular content, website, or viewpoint. 210 | 211 | Researchers and users of this evaluation framework should be aware of this possibility when reviewing results and visualizations. 212 | 213 | ## Citation 214 | 215 | If you use `showdown-clicks` in your research, please cite it as follows: 216 | 217 | ```bibtex 218 | @misc{showdown2025, 219 | title={The Showdown Computer Control Evaluation Suite}, 220 | author={General Agents Team}, 221 | year={2025}, 222 | url={https://github.com/generalagents/showdown}, 223 | } 224 | ``` 225 | 226 | [^1]: Latency values vary significantly by provider, demand, computational resources, geographical location, and other factors - most of which are opaque to us for models we don't have direct access to. Ace models are served via General Agent's API; Qwen, Claude, Gemini, and OpenAI models utilize their respective first-party APIs; while Molmo, UI-TARS, and OmniParser models are served through Modal. 
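## Appendix: Bootstrapping the Confidence Intervals

The 95% confidence intervals in the Metrics section above are produced by bootstrapping; a percentile bootstrap such as the sketch below is one common way to compute them. This is illustrative only, not the repository's `analyze_results` implementation, and the outcome vector here is synthetic.

```python
import numpy as np


def bootstrap_accuracy_ci(correct: np.ndarray, n_resamples: int = 10_000, alpha: float = 0.05, seed: int = 0):
  """Percentile bootstrap CI for in-bbox accuracy over a 0/1 outcome vector."""
  rng = np.random.default_rng(seed)
  n = len(correct)
  idx = rng.integers(0, n, size=(n_resamples, n))   # resample items with replacement
  accs = correct[idx].mean(axis=1)                  # accuracy of each bootstrap replicate
  low, high = np.quantile(accs, [alpha / 2, 1 - alpha / 2])
  return correct.mean(), low, high


# Synthetic example: 557 items with 432 correct (roughly the size of the dev set).
outcomes = np.array([1] * 432 + [0] * 125)
acc, low, high = bootstrap_accuracy_ci(outcomes)
print(f'accuracy {acc:.2%}, 95% CI [{low:.2%}, {high:.2%}]')
```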
-------------------------------------------------------------------------------- /clicks/src/clicks/third_party/qwen/client.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import math 5 | import os 6 | import re 7 | import time 8 | from typing import Any, Dict, Optional, Tuple 9 | 10 | from openai import OpenAI 11 | from PIL import Image 12 | from tenacity import retry, stop_after_attempt, wait_exponential 13 | 14 | DEFAULT_MODEL = 'qwen2.5-vl-72b-instruct' 15 | DASHSCOPE_API_ENDPOINT = 'https://dashscope.aliyuncs.com/compatible-mode/v1' 16 | 17 | # Qwen performs better with this slightly modified prompt adapted from the official cookbook 18 | QWEN_ACTION_SPACE = { 19 | 'type': 'function', 20 | 'function': { 21 | 'name_for_human': 'computer_use', 22 | 'name': 'computer_use', 23 | 'description': "Use a mouse and keyboard to interact with a computer, and take screenshots.\\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\\n* The screen's resolution is {RES_WIDTH}x{RES_HEIGHT}.\\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", 24 | 'parameters': { 25 | 'properties': { 26 | 'action': { 27 | 'description': 'The action to perform. The available actions are:\\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `left_click_drag`: Click and drag the cursor from a start coordinate to an end coordinate on the screen.\\n* `right_click`: Click the right mouse button.\\n* `double_click`: Double-click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `wait`: Wait for the change to happen.\\n* `terminate`: Terminate the current task when it is completed.', 28 | 'enum': [ 29 | 'key', 30 | 'type', 31 | 'mouse_move', 32 | 'left_click', 33 | 'left_click_drag', 34 | 'right_click', 35 | 'double_click', 36 | 'scroll', 37 | 'wait', 38 | 'terminate', 39 | ], 40 | 'type': 'string', 41 | }, 42 | 'keys': {'description': 'Required only by `action=key`.', 'type': 'array'}, 43 | 'text': {'description': 'Required only by `action=type`.', 'type': 'string'}, 44 | 'start_coordinate': { 45 | 'description': '(x, y): The starting x (pixels from the left edge) and y (pixels from the top edge) coordinates. Required only by `action=left_click_drag`.', 46 | 'type': 'array', 47 | }, 48 | 'end_coordinate': { 49 | 'description': '(x, y): The ending x (pixels from the left edge) and y (pixels from the top edge) coordinates. 
Required only by `action=left_click_drag`.', 50 | 'type': 'array', 51 | }, 52 | 'coordinate': { 53 | 'description': '(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required by `action=mouse_move, action=left_click, action=right_click, action=double_click`.', 54 | 'type': 'array', 55 | }, 56 | 'pixels': { 57 | 'description': 'The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.', 58 | 'type': 'number', 59 | }, 60 | }, 61 | 'required': ['action'], 62 | 'type': 'object', 63 | }, 64 | 'args_format': 'Format the arguments as a JSON object.', 65 | }, 66 | } 67 | 68 | BASE_PROMPT_TEMPLATE = ( 69 | """# Tools 70 | 71 | You MUST call a single function to assist with the user query. Do not call multiple functions, and do not answer the user's query without calling a function. 72 | 73 | You are provided with function signatures within XML tags: 74 | """ 75 | + json.dumps(QWEN_ACTION_SPACE) 76 | + """ 77 | 78 | For each function call, return a json object with function name and arguments within XML tags: 79 | 80 | {"name": , "arguments": } 81 | """ 82 | ) 83 | 84 | 85 | def smart_resize( 86 | height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 12845056 87 | ): 88 | if height < factor or width < factor: 89 | raise ValueError(f'height:{height} or width:{width} must be larger than factor:{factor}') 90 | elif max(height, width) / min(height, width) > 200: 91 | raise ValueError( 92 | f'absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}' 93 | ) 94 | h_bar = round(height / factor) * factor 95 | w_bar = round(width / factor) * factor 96 | if h_bar * w_bar > max_pixels: 97 | beta = math.sqrt((height * width) / max_pixels) 98 | h_bar = math.floor(height / beta / factor) * factor 99 | w_bar = math.floor(width / beta / factor) * factor 100 | elif h_bar * w_bar < min_pixels: 101 | beta = math.sqrt(min_pixels / (height * width)) 102 | h_bar = math.ceil(height * beta / factor) * factor 103 | w_bar = math.ceil(width * beta / factor) * factor 104 | return h_bar, w_bar 105 | 106 | 107 | class QwenVLClient: 108 | def __init__( 109 | self, 110 | api_key: Optional[str] = None, 111 | api_endpoint: str = DASHSCOPE_API_ENDPOINT, 112 | model: str = DEFAULT_MODEL, 113 | max_tokens: int = 4096, 114 | use_smart_resize: bool = True, 115 | resize_factor: int = 28, 116 | min_pixels: int = 3136, 117 | max_pixels: int = 12845056, 118 | ): 119 | self.api_key = api_key or os.environ.get('DASHSCOPE_API_KEY') 120 | if not self.api_key: 121 | raise ValueError( 122 | 'API key must be provided either as an argument or through the DASHSCOPE_API_KEY environment variable' 123 | ) 124 | 125 | self.api_endpoint = api_endpoint 126 | self.model = model 127 | self.max_tokens = max_tokens 128 | self.display_width = None 129 | self.display_height = None 130 | self.original_width = None 131 | self.original_height = None 132 | self.use_smart_resize = use_smart_resize 133 | self.resize_factor = resize_factor 134 | self.min_pixels = min_pixels 135 | self.max_pixels = max_pixels 136 | 137 | def _create_client(self) -> OpenAI: 138 | return OpenAI( 139 | api_key=self.api_key, 140 | base_url=self.api_endpoint, 141 | ) 142 | 143 | def _extract_image_dimensions(self, base64_data: str) -> tuple[int, int]: 144 | try: 145 | image_data = base64.b64decode(base64_data) 146 | image = Image.open(io.BytesIO(image_data)) 147 | width, height = image.size 148 | 
return width, height 149 | except Exception as e: 150 | print(f'Error extracting image dimensions: {e}') 151 | return 1024, 768 152 | 153 | def _resize_image(self, base64_data: str) -> Tuple[str, int, int]: 154 | try: 155 | image_data = base64.b64decode(base64_data) 156 | image = Image.open(io.BytesIO(image_data)) 157 | self.original_width, self.original_height = image.size 158 | 159 | new_height, new_width = smart_resize( 160 | self.original_height, 161 | self.original_width, 162 | factor=self.resize_factor, 163 | min_pixels=self.min_pixels, 164 | max_pixels=self.max_pixels, 165 | ) 166 | 167 | resized_image = image.resize((new_width, new_height), resample=2) 168 | 169 | buffer = io.BytesIO() 170 | resized_image.save(buffer, format=image.format or 'JPEG') 171 | buffer.seek(0) 172 | new_base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8') 173 | 174 | return new_base64_data, new_width, new_height 175 | except Exception as e: 176 | print(f'Error resizing image: {e}') 177 | width = self.original_width or 1024 178 | height = self.original_height or 768 179 | return base64_data, width, height 180 | 181 | def _translate_coordinates( 182 | self, x: Optional[int], y: Optional[int] 183 | ) -> Tuple[Optional[int], Optional[int]]: 184 | if x is None or y is None: 185 | return x, y 186 | 187 | orig_width = self.original_width or 1024 188 | orig_height = self.original_height or 768 189 | disp_width = self.display_width or 1024 190 | disp_height = self.display_height or 768 191 | 192 | x_scale = orig_width / disp_width 193 | y_scale = orig_height / disp_height 194 | 195 | original_x = round(x * x_scale) 196 | original_y = round(y * y_scale) 197 | 198 | return original_x, original_y 199 | 200 | def _create_system_prompt(self) -> str: 201 | width = self.display_width or 1024 202 | height = self.display_height or 768 203 | 204 | prompt = BASE_PROMPT_TEMPLATE.replace('{RES_WIDTH}', str(width)).replace( 205 | '{RES_HEIGHT}', str(height) 206 | ) 207 | 208 | return prompt 209 | 210 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=15)) 211 | def predict(self, image_data_uri: str, prompt: str) -> Dict[str, Any] | None: 212 | if ',' in image_data_uri: 213 | base64_data = image_data_uri.split(',')[1] 214 | mime_type = image_data_uri.split(',')[0] 215 | else: 216 | base64_data = image_data_uri 217 | mime_type = 'data:image/jpeg;base64' 218 | 219 | self.original_width, self.original_height = self._extract_image_dimensions(base64_data) 220 | 221 | if self.use_smart_resize: 222 | base64_data, new_width, new_height = self._resize_image(base64_data) 223 | self.display_width, self.display_height = new_width, new_height 224 | else: 225 | self.display_width, self.display_height = self.original_width, self.original_height 226 | 227 | image_data_uri = f'{mime_type},{base64_data}' 228 | 229 | system_prompt = self._create_system_prompt() 230 | 231 | try: 232 | client = self._create_client() 233 | 234 | start_time = time.time() 235 | 236 | response = client.chat.completions.create( 237 | model=self.model, 238 | temperature=0.0, 239 | max_tokens=self.max_tokens, 240 | messages=[ 241 | { 242 | 'role': 'system', 243 | 'content': [ 244 | { 245 | 'type': 'text', 246 | 'text': 'You are a helpful assistant.', 247 | }, 248 | {'type': 'text', 'text': system_prompt}, 249 | ], 250 | }, 251 | { 252 | 'role': 'user', 253 | 'content': [ 254 | {'type': 'image_url', 'image_url': {'url': image_data_uri}}, 255 | {'type': 'text', 'text': prompt}, 256 | ], 257 | }, 258 | ], 259 | ) 260 | 261 | result 
= response.model_dump() 262 | 263 | raw_response = json.dumps(result) 264 | 265 | assistant_message = result.get('choices', [{}])[0].get('message', {}) 266 | content = assistant_message.get('content', '') 267 | 268 | return { 269 | 'raw_response': raw_response, 270 | 'content': content, 271 | 'latency': time.time() - start_time, 272 | 'original_width': self.original_width, 273 | 'original_height': self.original_height, 274 | 'display_width': self.display_width, 275 | 'display_height': self.display_height, 276 | } 277 | 278 | except Exception as e: 279 | print(f'API Error: {str(e)}') 280 | return None 281 | 282 | def parse_prediction(self, prediction: Dict[str, Any]) -> Dict[str, Any]: 283 | if not prediction or 'content' not in prediction: 284 | return { 285 | 'pred_x': None, 286 | 'pred_y': None, 287 | 'raw_responses': prediction.get('raw_response', '{}'), 288 | } 289 | 290 | content = prediction['content'] 291 | 292 | print(f'Content: {content}') 293 | 294 | tool_call_match = re.search(r'\s*(\{.*?\})\s*', content, flags=re.DOTALL) 295 | if not tool_call_match: 296 | return { 297 | 'pred_x': None, 298 | 'pred_y': None, 299 | 'raw_responses': prediction.get('raw_response', '{}'), 300 | } 301 | 302 | try: 303 | json_text = tool_call_match.group(1) 304 | data = json.loads(json_text) 305 | 306 | if 'arguments' not in data: 307 | return { 308 | 'pred_x': None, 309 | 'pred_y': None, 310 | 'raw_responses': prediction.get('raw_response', '{}'), 311 | } 312 | 313 | args = data['arguments'] 314 | action_str = args.get('action') 315 | 316 | if not action_str: 317 | return { 318 | 'pred_x': None, 319 | 'pred_y': None, 320 | 'raw_responses': prediction.get('raw_response', '{}'), 321 | } 322 | 323 | pred_x = None 324 | pred_y = None 325 | if ( 326 | 'coordinate' in args 327 | and isinstance(args['coordinate'], list) 328 | and len(args['coordinate']) == 2 329 | ): 330 | pred_x = int(args['coordinate'][0]) 331 | pred_y = int(args['coordinate'][1]) 332 | 333 | if self.use_smart_resize and pred_x is not None and pred_y is not None: 334 | pred_x, pred_y = self._translate_coordinates(pred_x, pred_y) 335 | 336 | return { 337 | 'pred_x': pred_x, 338 | 'pred_y': pred_y, 339 | 'raw_responses': prediction.get('raw_response', '{}'), 340 | 'original_width': self.original_width, 341 | 'original_height': self.original_height, 342 | 'display_width': self.display_width, 343 | 'display_height': self.display_height, 344 | } 345 | 346 | except Exception as e: 347 | print(f'Error parsing prediction: {e}') 348 | return { 349 | 'pred_x': None, 350 | 'pred_y': None, 351 | 'raw_responses': prediction.get('raw_response', '{}'), 352 | } 353 | -------------------------------------------------------------------------------- /clicks/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | from datetime import datetime 5 | from functools import partial 6 | from typing import Any, Dict, List, Optional 7 | 8 | import clicks.evaluate.ace as ace 9 | import pandas as pd 10 | from clicks.api_client_base import AbstractAPIClient 11 | from clicks.evaluate.models import EvaluationResult 12 | from clicks.evaluate.utils import analyze_results 13 | from clicks.third_party import ( 14 | get_claude_api_client, 15 | get_gemini_api_client, 16 | get_molmo_api_client, 17 | get_omniparser_api_client, 18 | get_openai_api_client, 19 | get_openai_cua_api_client, 20 | get_qwen_api_client, 21 | get_ui_tars_api_client, 22 | ) 23 | from colorama import Fore, 
Style, init 24 | from tqdm import tqdm 25 | 26 | init(autoreset=True) 27 | 28 | 29 | def process_item( 30 | item: Dict[str, Any], 31 | frames_dir: str, 32 | api_client: AbstractAPIClient, 33 | run_id: str, 34 | ) -> EvaluationResult: 35 | try: 36 | image_path = item.get('image', '') 37 | 38 | if not image_path or image_path.startswith('data:'): 39 | print(f'Skipping item {item.get("id", "unknown")}: Invalid image path: {image_path}') 40 | raise ValueError(f'Invalid image path: {image_path}') 41 | 42 | if hasattr(image_path, 'item'): 43 | image_path = image_path.item() 44 | 45 | image_path = str(image_path) 46 | result = api_client.process_single_item(item, frames_dir, run_id) 47 | 48 | return result 49 | except Exception as e: 50 | print(f'Error processing item {item.get("id", "unknown")}: {str(e)}') 51 | raise e 52 | 53 | 54 | def evaluate_csv( 55 | csv_file: str, 56 | frames_dir: str, 57 | api_client: Any, 58 | output_file: Optional[str] = None, 59 | sample_size: Optional[int] = None, 60 | num_workers: int = 1, 61 | run_id: str = datetime.now().strftime('%Y-%m-%d-%H-%M'), 62 | ) -> List[Dict[str, Any]]: 63 | results: List[Dict[str, Any]] = [] 64 | 65 | df = pd.read_csv(csv_file) 66 | print(f'Loaded {len(df)} items from {csv_file}') 67 | 68 | if sample_size is not None and sample_size < len(df): 69 | df = df.sample(sample_size, random_state=42) 70 | print(f'Sampled {len(df)} items for evaluation') 71 | 72 | items: List[Dict[str, Any]] = [ 73 | {str(k): v for k, v in item.items()} for item in df.to_dict('records') 74 | ] 75 | 76 | print(f'Using {num_workers} concurrent workers for processing') 77 | 78 | process_func = partial( 79 | process_item, 80 | frames_dir=frames_dir, 81 | api_client=api_client, 82 | run_id=run_id, 83 | ) 84 | 85 | try: 86 | multiprocessing.set_start_method('spawn', force=True) 87 | except RuntimeError: 88 | pass 89 | 90 | results = [] 91 | total_processed = 0 92 | total_in_bbox = 0 93 | 94 | try: 95 | with multiprocessing.Pool(processes=num_workers, maxtasksperchild=1) as pool: 96 | with tqdm(total=len(items), desc='Evaluating', unit='item') as pbar: 97 | for result in pool.imap_unordered(process_func, items): 98 | if result is not None: 99 | results.append(result.model_dump()) 100 | total_processed += 1 101 | if result.is_in_bbox: 102 | total_in_bbox += 1 103 | running_accuracy = (total_in_bbox / total_processed) * 100 if total_processed > 0 else 0 104 | pbar.set_postfix({'accuracy': f'{running_accuracy:.2f}%'}) 105 | 106 | pbar.update(1) 107 | except Exception as e: 108 | print(f'{Fore.RED}Error in multiprocessing: {e}{Style.RESET_ALL}') 109 | raise e 110 | 111 | if output_file and results: 112 | results_df = pd.DataFrame(results) 113 | results_df.to_csv(output_file, index=False) 114 | print(f'Results written to {output_file}') 115 | 116 | return results 117 | 118 | 119 | def main(): 120 | parser = argparse.ArgumentParser( 121 | description='Evaluate models on the clicks dataset with bounding box and pixel distance evaluation' 122 | ) 123 | parser.add_argument( 124 | '--dataset', 125 | type=str, 126 | choices=['dev', 'full'], 127 | default='dev', 128 | help='Dataset to evaluate on (dev or full)', 129 | ) 130 | parser.add_argument( 131 | '--model', 132 | type=str, 133 | choices=[ 134 | 'ace', 135 | 'claude', 136 | 'qwen', 137 | 'openai', 138 | 'openai-cua', 139 | 'gemini', 140 | 'molmo', 141 | 'ui-tars', 142 | 'omniparser', 143 | ], 144 | default='ace', 145 | help='Model to use for evaluation (ace, claude, qwen, openai, openai-cua, gemini, molmo, ui-tars, or 
omniparser)', 146 | ) 147 | parser.add_argument( 148 | '--api-url', 149 | type=str, 150 | default='', 151 | help='API endpoint for the model', 152 | ) 153 | parser.add_argument( 154 | '--api-key', 155 | type=str, 156 | default=None, 157 | help='API key for the model (if required)', 158 | ) 159 | parser.add_argument( 160 | '--claude-model', 161 | type=str, 162 | default='claude-3-7-sonnet-20250219', 163 | help='Claude model to use (default: claude-3-7-sonnet-20250219)', 164 | ) 165 | parser.add_argument( 166 | '--thinking-budget', 167 | type=int, 168 | default=1024, 169 | help='Budget for Claude thinking tokens (default: 1024, 0 to disable)', 170 | ) 171 | parser.add_argument( 172 | '--tool-version', 173 | type=str, 174 | default='20250124', 175 | help='Version of Claude computer use tools (default: 20250124)', 176 | ) 177 | parser.add_argument( 178 | '--qwen-model', 179 | type=str, 180 | default='qwen2.5-vl-72b-instruct', 181 | help='Qwen model to use (default: qwen2.5-vl-72b-instruct)', 182 | ) 183 | parser.add_argument( 184 | '--openai-model', 185 | type=str, 186 | default='gpt-4o', 187 | help='OpenAI model to use (default: gpt-4o)', 188 | ) 189 | parser.add_argument( 190 | '--openai-cua-model', 191 | type=str, 192 | default='computer-use-preview', 193 | help='OpenAI CUA model to use (default: computer-use-preview)', 194 | ) 195 | parser.add_argument( 196 | '--environment', 197 | type=str, 198 | default='mac', 199 | choices=['browser', 'mac', 'windows', 'ubuntu'], 200 | help='Environment for OpenAI CUA (default: browser)', 201 | ) 202 | parser.add_argument( 203 | '--reasoning-effort', 204 | type=str, 205 | default='medium', 206 | choices=['low', 'medium', 'high'], 207 | help='Reasoning effort for OpenAI (default: medium)', 208 | ) 209 | parser.add_argument( 210 | '--max-tokens', 211 | type=int, 212 | default=4096, 213 | help='Maximum tokens for model response (default: 4096)', 214 | ) 215 | parser.add_argument( 216 | '--sample-size', 217 | type=int, 218 | default=None, 219 | help='Number of samples to evaluate (optional, for testing)', 220 | ) 221 | parser.add_argument( 222 | '--output-file', type=str, default=None, help='Path to output CSV file (optional)' 223 | ) 224 | parser.add_argument( 225 | '--num-workers', 226 | type=int, 227 | default=1, 228 | help='Number of concurrent workers for processing (default: 1)', 229 | ) 230 | parser.add_argument( 231 | '--run-id', 232 | type=str, 233 | default=None, 234 | help='Custom run ID (optional, defaults to current timestamp)', 235 | ) 236 | parser.add_argument( 237 | '--gemini-model', 238 | type=str, 239 | default='gemini-1.5-pro-latest', 240 | help='Gemini model to use (default: gemini-1.5-pro-latest)', 241 | ) 242 | parser.add_argument( 243 | '--top-p', 244 | type=float, 245 | default=0.9, 246 | help='Top-p sampling parameter (default: 0.9)', 247 | ) 248 | parser.add_argument( 249 | '--top-k', 250 | type=int, 251 | default=50, 252 | help='Top-k sampling parameter (default: 50)', 253 | ) 254 | parser.add_argument( 255 | '--temperature', 256 | type=float, 257 | default=0.0, 258 | help='Temperature for sampling (default: 0.0)', 259 | ) 260 | parser.add_argument( 261 | '--frequency-penalty', 262 | type=float, 263 | default=1.0, 264 | help='Frequency penalty parameter for UI-TARS (default: 1.0)', 265 | ) 266 | parser.add_argument( 267 | '--ui-tars-model', 268 | type=str, 269 | default='bytedance-research/UI-TARS-72B-SFT', 270 | help='UI-TARS model to use (default: bytedance-research/UI-TARS-72B-SFT)', 271 | ) 272 | parser.add_argument( 273 | 
'--omniparser-model', 274 | type=str, 275 | default='gpt-4o-2024-05-13', 276 | help='OmniParser model to use (default: gpt-4o-2024-05-13)', 277 | ) 278 | parser.add_argument( 279 | '--omniparser-temperature', 280 | type=float, 281 | default=0.7, 282 | help='Temperature for OmniParser generation (default: 0.7)', 283 | ) 284 | parser.add_argument( 285 | '--ace-model', 286 | type=str, 287 | default='ace-control-medium', 288 | help='Ace model to use (default: ace-control-medium)', 289 | ) 290 | args = parser.parse_args() 291 | 292 | base_dir = os.path.dirname(os.path.abspath(__file__)) 293 | data_dir = os.path.join(base_dir, 'data') 294 | results_dir = os.path.join(base_dir, 'results') 295 | 296 | os.makedirs(results_dir, exist_ok=True) 297 | 298 | run_id = args.run_id or datetime.now().strftime('%Y-%m-%d-%H-%M') 299 | 300 | run_results_dir = os.path.join(results_dir, run_id) 301 | os.makedirs(run_results_dir, exist_ok=True) 302 | 303 | if args.dataset == 'dev': 304 | csv_file = os.path.join(data_dir, 'showdown-clicks-dev/data.csv') 305 | else: 306 | raise ValueError('Full dataset not currently supported') 307 | 308 | frames_dir = os.path.join(data_dir, 'showdown-clicks-dev') 309 | 310 | if args.output_file is None: 311 | if args.model == 'claude': 312 | model_name = args.claude_model.replace('-', '_') 313 | args.output_file = os.path.join( 314 | run_results_dir, f'claude_results_{model_name}_{args.dataset}.csv' 315 | ) 316 | elif args.model == 'qwen': 317 | model_name = args.qwen_model.replace('-', '_').replace('.', '_') 318 | args.output_file = os.path.join( 319 | run_results_dir, f'qwen_results_{model_name}_{args.dataset}.csv' 320 | ) 321 | elif args.model == 'openai': 322 | model_name = args.openai_model.replace('-', '_') 323 | args.output_file = os.path.join( 324 | run_results_dir, f'openai_results_{model_name}_{args.dataset}.csv' 325 | ) 326 | elif args.model == 'openai-cua': 327 | model_name = args.openai_cua_model.replace('-', '_') 328 | args.output_file = os.path.join( 329 | run_results_dir, f'openai_cua_results_{model_name}_{args.dataset}.csv' 330 | ) 331 | elif args.model == 'gemini': 332 | model_name = args.gemini_model.replace('-', '_').replace('.', '_') 333 | args.output_file = os.path.join( 334 | run_results_dir, f'gemini_results_{model_name}_{args.dataset}.csv' 335 | ) 336 | elif args.model == 'molmo': 337 | args.output_file = os.path.join(run_results_dir, f'molmo_results_{args.dataset}.csv') 338 | elif args.model == 'ui-tars': 339 | model_name = args.ui_tars_model.replace('/', '_').replace('-', '_') 340 | args.output_file = os.path.join( 341 | run_results_dir, f'ui_tars_results_{model_name}_{args.dataset}.csv' 342 | ) 343 | elif args.model == 'omniparser': 344 | model_name = args.omniparser_model.replace('-', '_') 345 | args.output_file = os.path.join( 346 | run_results_dir, f'omniparser_results_{model_name}_{args.dataset}.csv' 347 | ) 348 | else: 349 | model_name = args.ace_model.replace('-', '_') 350 | args.output_file = os.path.join( 351 | run_results_dir, f'ace_results_{model_name}_{args.dataset}.csv' 352 | ) 353 | 354 | print(f'{Fore.CYAN}Running evaluation with the following configuration:{Style.RESET_ALL}') 355 | print(f'{Fore.CYAN} Dataset: {args.dataset}{Style.RESET_ALL}') 356 | print(f'{Fore.CYAN} Model: {args.model}{Style.RESET_ALL}') 357 | print(f'{Fore.CYAN} CSV file: {csv_file}{Style.RESET_ALL}') 358 | print(f'{Fore.CYAN} Frames directory: {frames_dir}{Style.RESET_ALL}') 359 | print(f'{Fore.CYAN} Run ID: {run_id}{Style.RESET_ALL}') 360 | print(f'{Fore.CYAN} 
Results directory: {run_results_dir}{Style.RESET_ALL}') 361 | print(f'{Fore.CYAN} Output file: {args.output_file}{Style.RESET_ALL}') 362 | print(f'{Fore.CYAN} Concurrent workers: {args.num_workers}{Style.RESET_ALL}') 363 | 364 | if args.model == 'ace': 365 | print(f' API URL: {args.api_url}') 366 | print(f' Ace model: {args.ace_model}') 367 | elif args.model == 'claude': 368 | print(f' Claude model: {args.claude_model}') 369 | print(f' Thinking budget: {args.thinking_budget}') 370 | print(f' Tool version: {args.tool_version}') 371 | elif args.model == 'qwen': 372 | print(f' Qwen model: {args.qwen_model}') 373 | print(f' Max tokens: {args.max_tokens}') 374 | elif args.model == 'openai': 375 | print(f' OpenAI model: {args.openai_model}') 376 | print(f' Max tokens: {args.max_tokens}') 377 | print(f' Reasoning effort: {args.reasoning_effort}') 378 | elif args.model == 'openai-cua': 379 | print(f' OpenAI CUA model: {args.openai_cua_model}') 380 | print(f' Max tokens: {args.max_tokens}') 381 | print(f' Environment: {args.environment}') 382 | elif args.model == 'gemini': 383 | print(f' Gemini model: {args.gemini_model}') 384 | print(f' Max tokens: {args.max_tokens}') 385 | elif args.model == 'molmo': 386 | print(f' API URL: {args.api_url}') 387 | print(f' Max tokens: {args.max_tokens}') 388 | print(f' Temperature: {args.temperature}') 389 | print(f' Top-p: {args.top_p}') 390 | print(f' Top-k: {args.top_k}') 391 | elif args.model == 'ui-tars': 392 | print(f' API URL: {args.api_url}') 393 | print(f' UI-TARS model: {args.ui_tars_model}') 394 | print(f' Max tokens: {args.max_tokens}') 395 | print(f' Temperature: {args.temperature}') 396 | print(f' Frequency penalty: {args.frequency_penalty}') 397 | elif args.model == 'omniparser': 398 | print(f' API URL: {args.api_url or "https://omniparser-api-omniparser-api.modal.run"}') 399 | print(f' OmniParser model: {args.omniparser_model}') 400 | print(f' Temperature: {args.omniparser_temperature}') 401 | 402 | if args.sample_size: 403 | print(f' Sample size: {args.sample_size}') 404 | 405 | if not os.path.exists(csv_file): 406 | print(f'{Fore.RED}ERROR: CSV file not found: {csv_file}{Style.RESET_ALL}') 407 | return 408 | 409 | if not os.path.exists(frames_dir): 410 | print(f'{Fore.RED}ERROR: Frames directory not found: {frames_dir}{Style.RESET_ALL}') 411 | return 412 | 413 | try: 414 | df = pd.read_csv(csv_file) 415 | print('\nFirst few rows of the CSV file:') 416 | print(df.head(2)) 417 | print(f'\nTotal rows in CSV: {len(df)}') 418 | 419 | if not df.empty: 420 | sample_image_series = df.iloc[0]['image'] 421 | sample_image = ( 422 | sample_image_series.item() 423 | if hasattr(sample_image_series, 'item') 424 | else str(sample_image_series) 425 | ) 426 | print(f'Sample image path: {sample_image}') 427 | 428 | full_path = os.path.join(frames_dir, sample_image) 429 | print(f'Full sample image path: {full_path}') 430 | print(f'Image exists: {os.path.exists(full_path)}') 431 | 432 | if not os.path.exists(full_path): 433 | print(f'{Fore.YELLOW}WARNING: Sample image does not exist at {full_path}{Style.RESET_ALL}') 434 | print( 435 | f'{Fore.YELLOW}Please ensure all images are available in the frames directory:{Style.RESET_ALL}' 436 | ) 437 | print(f' - Extract frames: tar -xf {os.path.join(data_dir, "frames.tar")} -C {data_dir}/') 438 | except Exception as e: 439 | print(f'{Fore.RED}Error reading CSV file: {e}{Style.RESET_ALL}') 440 | 441 | if args.model == 'claude': 442 | api_key = args.api_key or os.environ.get('ANTHROPIC_API_KEY') 443 | if not api_key: 444 | 
print( 445 | f'{Fore.RED}ERROR: Anthropic API key not provided. Please set the ANTHROPIC_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 446 | ) 447 | return 448 | thinking_budget = args.thinking_budget if args.thinking_budget > 0 else None 449 | api_client = get_claude_api_client( 450 | api_key=api_key, 451 | model=args.claude_model, 452 | thinking_budget=thinking_budget, 453 | tool_version=args.tool_version, 454 | ) 455 | elif args.model == 'qwen': 456 | api_key = args.api_key or os.environ.get('DASHSCOPE_API_KEY') 457 | if not api_key: 458 | print( 459 | f'{Fore.RED}ERROR: DashScope API key not provided. Please set the DASHSCOPE_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 460 | ) 461 | return 462 | 463 | api_client = get_qwen_api_client( 464 | api_key=api_key, 465 | model=args.qwen_model, 466 | max_tokens=args.max_tokens, 467 | ) 468 | 469 | if args.num_workers > 1: 470 | print( 471 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Qwen. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 472 | ) 473 | elif args.model == 'openai': 474 | api_key = args.api_key or os.environ.get('OPENAI_API_KEY') 475 | if not api_key: 476 | print( 477 | f'{Fore.RED}ERROR: OpenAI API key not provided. Please set the OPENAI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 478 | ) 479 | return 480 | 481 | api_client = get_openai_api_client( 482 | api_key=api_key, 483 | model=args.openai_model, 484 | max_tokens=args.max_tokens, 485 | reasoning_effort=args.reasoning_effort, 486 | ) 487 | 488 | if args.num_workers > 1: 489 | print( 490 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OpenAI. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 491 | ) 492 | elif args.model == 'openai-cua': 493 | api_key = args.api_key or os.environ.get('OPENAI_API_KEY') 494 | if not api_key: 495 | print( 496 | f'{Fore.RED}ERROR: OpenAI API key not provided. Please set the OPENAI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 497 | ) 498 | return 499 | 500 | api_client = get_openai_cua_api_client( 501 | api_key=api_key, 502 | model=args.openai_cua_model, 503 | max_tokens=args.max_tokens, 504 | environment=args.environment, 505 | ) 506 | 507 | if args.num_workers > 1: 508 | print( 509 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OpenAI CUA. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 510 | ) 511 | elif args.model == 'gemini': 512 | api_key = args.api_key or os.environ.get('GEMINI_API_KEY') 513 | if not api_key: 514 | print( 515 | f'{Fore.RED}ERROR: Google API key not provided. Please set the GEMINI_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 516 | ) 517 | return 518 | 519 | api_client = get_gemini_api_client( 520 | api_key=api_key, 521 | model=args.gemini_model, 522 | max_tokens=args.max_tokens, 523 | ) 524 | 525 | if args.num_workers > 1: 526 | print( 527 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Gemini. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 528 | ) 529 | elif args.model == 'molmo': 530 | if not args.api_url: 531 | print( 532 | f'{Fore.RED}ERROR: Molmo API URL not provided. 
Please provide it using --api-url.{Style.RESET_ALL}' 533 | ) 534 | return 535 | 536 | api_client = get_molmo_api_client( 537 | api_url=args.api_url, 538 | api_key=args.api_key, 539 | max_tokens=args.max_tokens, 540 | temperature=args.temperature, 541 | top_p=args.top_p, 542 | top_k=args.top_k, 543 | ) 544 | 545 | if args.num_workers > 1: 546 | print( 547 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with Molmo. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 548 | ) 549 | elif args.model == 'ui-tars': 550 | if not args.api_url: 551 | print( 552 | f'{Fore.RED}ERROR: UI-TARS API URL not provided. Please provide it using --api-url.{Style.RESET_ALL}' 553 | ) 554 | return 555 | 556 | api_client = get_ui_tars_api_client( 557 | api_url=args.api_url, 558 | api_key=args.api_key or 'super-secret-key', 559 | max_tokens=args.max_tokens, 560 | temperature=args.temperature, 561 | frequency_penalty=args.frequency_penalty, 562 | model_name=args.ui_tars_model, 563 | ) 564 | 565 | if args.num_workers > 1: 566 | print( 567 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with UI-TARS. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 568 | ) 569 | elif args.model == 'omniparser': 570 | api_client = get_omniparser_api_client( 571 | api_endpoint=args.api_url or 'https://omniparser-api-omniparser-api.modal.run', 572 | model=args.omniparser_model, 573 | temperature=args.omniparser_temperature, 574 | ) 575 | 576 | if args.num_workers > 1: 577 | print( 578 | f'{Fore.YELLOW}Warning: Using multiple workers ({args.num_workers}) with OmniParser. If you encounter errors, try reducing the number of workers.{Style.RESET_ALL}' 579 | ) 580 | else: 581 | api_key = args.api_key or os.environ.get('GENERALAGENTS_API_KEY') 582 | if not api_key: 583 | print( 584 | f'{Fore.RED}ERROR: General Agents API key not provided. 
Please set the GENERALAGENTS_API_KEY environment variable or use --api-key.{Style.RESET_ALL}' 585 | ) 586 | return 587 | 588 | api_client = ace.get_api_client(api_key, args.ace_model) 589 | 590 | try: 591 | results = evaluate_csv( 592 | csv_file, 593 | frames_dir, 594 | api_client, 595 | args.output_file, 596 | args.sample_size, 597 | args.num_workers, 598 | run_id, 599 | ) 600 | 601 | if results: 602 | print(f'\n{Fore.GREEN}Evaluation completed successfully!{Style.RESET_ALL}') 603 | print(f'Results: {len(results)} items processed') 604 | print(f'Results saved to: {args.output_file}') 605 | print(f'Visualizations saved to: {os.path.join(run_results_dir, "visualizations")}') 606 | 607 | results_analysis = analyze_results(results, run_id) 608 | 609 | metrics_dict = { 610 | 'run_id': run_id, 611 | 'model': args.model, 612 | 'ci': results_analysis.ci, 613 | 'accuracy': results_analysis.accuracy, 614 | 'accuracy_ci_low': results_analysis.accuracy_ci_low, 615 | 'accuracy_ci_high': results_analysis.accuracy_ci_high, 616 | 'total_processed': results_analysis.total_processed, 617 | } 618 | 619 | base_dir = os.path.dirname(os.path.abspath(__file__)) 620 | metrics_file = os.path.join(base_dir, 'results', 'all_metrics.csv') 621 | 622 | if os.path.exists(metrics_file): 623 | all_metrics_df = pd.read_csv(metrics_file) 624 | all_metrics_df = pd.concat( 625 | [all_metrics_df, pd.DataFrame([metrics_dict])], ignore_index=True 626 | ) 627 | else: 628 | all_metrics_df = pd.DataFrame([metrics_dict]) 629 | 630 | all_metrics_df.to_csv(metrics_file, index=False) 631 | print(f'\n{Fore.CYAN}Metrics saved to: {metrics_file}{Style.RESET_ALL}') 632 | 633 | else: 634 | print(f'\n{Fore.RED}Evaluation failed: No results returned.{Style.RESET_ALL}') 635 | 636 | except Exception as e: 637 | print(f'\n{Fore.RED}Evaluation failed with error: {e}{Style.RESET_ALL}') 638 | raise e 639 | 640 | 641 | if __name__ == '__main__': 642 | main() 643 | --------------------------------------------------------------------------------