├── pre-requirements.txt
├── requirements.txt
├── ipynb
│   └── Quick_Demo.ipynb
├── README.md
├── LICENSE
└── app.py


/pre-requirements.txt:
--------------------------------------------------------------------------------
1 | pip>=23.0.0
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | gradio==6.1.0
 2 | transformers==4.57.1
 3 | huggingface-hub
 4 | numpy
 5 | torch
 6 | torchvision
 7 | accelerate
 8 | qwen-vl-utils
 9 | requests
10 | pillow
11 | spaces
12 | 


--------------------------------------------------------------------------------
/ipynb/Quick_Demo.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |   "nbformat": 4,
 3 |   "nbformat_minor": 0,
 4 |   "metadata": {
 5 |     "colab": {
 6 |       "provenance": []
 7 |     },
 8 |     "kernelspec": {
 9 |       "name": "python3",
10 |       "display_name": "Python 3"
11 |     },
12 |     "language_info": {
13 |       "name": "python"
14 |     }
15 |   },
16 |   "cells": [
17 |     {
18 |       "cell_type": "markdown",
19 |       "source": [
20 |         "## **Gliese-CUA-Tool-Call-8B-Localization**\n",
21 |         "\n",
22 |         "A Gradio-based demonstration for the prithivMLmods/Gliese-CUA-Tool-Call-8B model, specialized in GUI element localization. Users upload UI screenshots, provide task instructions (e.g., \"Click on the search bar\"), and receive predicted click coordinates in `Click(x, y)` format, visualized as crosshairs and labels on the image. Features model download to local directory for offline use, smart image resizing, and coordinate scaling to original resolution.\n"
23 |       ],
24 |       "metadata": {
25 |         "id": "X9DtFvrxTx3r"
26 |       }
27 |     },
28 |     {
29 |       "cell_type": "code",
30 |       "execution_count": null,
31 |       "metadata": {
32 |         "id": "xj3l-af5TvNo"
33 |       },
34 |       "outputs": [],
35 |       "source": [
36 |         "!git clone https://github.com/PRITHIVSAKTHIUR/Gliese-CUA-Tool-Call-8B-Localization.git"
37 |       ]
38 |     },
39 |     {
40 |       "cell_type": "code",
41 |       "source": [
42 |         "%cd Gliese-CUA-Tool-Call-8B-Localization"
43 |       ],
44 |       "metadata": {
45 |         "id": "IHwkj8sDT_ua"
46 |       },
47 |       "execution_count": null,
48 |       "outputs": []
49 |     },
50 |     {
51 |       "cell_type": "code",
52 |       "source": [
53 |         "!pip install -r requirements.txt"
54 |       ],
55 |       "metadata": {
56 |         "id": "yyTNDHk0UB9r"
57 |       },
58 |       "execution_count": null,
59 |       "outputs": []
60 |     },
61 |     {
62 |       "cell_type": "code",
63 |       "source": [
64 |         "!python app.py"
65 |       ],
66 |       "metadata": {
67 |         "id": "Y69P0JV2UD4y"
68 |       },
69 |       "execution_count": null,
70 |       "outputs": []
71 |     }
72 |   ]
73 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # **Gliese-CUA-Tool-Call-8B-Localization**
 2 | 
 3 | > A Gradio-based demonstration for the prithivMLmods/Gliese-CUA-Tool-Call-8B model, specialized in GUI element localization. Users upload UI screenshots, provide task instructions (e.g., "Click on the search bar"), and receive predicted click coordinates in `Click(x, y)` format, visualized as crosshairs and labels on the image. Features model download to local directory for offline use, smart image resizing, and coordinate scaling to original resolution.
 4 | 
 5 | ## Features
 6 | 
 7 | - **Element Localization**: Natural language tasks predict precise pixel coordinates for UI components (e.g., buttons, inputs).
 8 | - **Action Visualization**: Overlays red crosshairs with yellow labels on the output image using PIL for clear action points.
 9 | - **Smart Resizing**: Automatically resizes inputs based on model processor params (min/max pixels, patch/merge sizes) for optimal inference.
10 | - **Coordinate Scaling**: Adjusts resized coordinates back to original image dimensions for accurate absolute positioning.
 11 | - **Efficient Inference**: Uses bfloat16 on CUDA and float32 on CPU; generates up to 128 new tokens with deterministic output.
12 | - **Local Model Storage**: Downloads model via Hugging Face Hub snapshot to `./model/` for faster reloads and offline capability.
13 | - **Custom Theme**: OrangeRedTheme with gradients for an intuitive interface.
 14 | - **Queueing Support**: Queues up to 50 pending requests (`demo.queue(max_size=50)`).
15 | 
16 | ## Prerequisites
17 | 
18 | - Python 3.10 or higher.
19 | - CUDA-compatible GPU (recommended for bfloat16; falls back to CPU).
20 | - Stable internet for initial model download (subsequent runs use local cache).
21 | 
22 | ## Installation
23 | 
24 | 1. Clone the repository:
25 |    ```
26 |    git clone https://github.com/PRITHIVSAKTHIUR/Gliese-CUA-Tool-Call-8B-Localization.git
27 |    cd Gliese-CUA-Tool-Call-8B-Localization
28 |    ```
29 | 
30 | 2. Install dependencies:
 31 |    The repository already ships a `requirements.txt`; install the dependencies with:
32 |    ```
33 |    pip install -r requirements.txt
34 |    ```
35 | 
 36 |    **requirements.txt content (for reference):**
37 |    ```
38 |    gradio==6.1.0
39 |    transformers==4.57.1
40 |    huggingface-hub
41 |    numpy
42 |    torch
43 |    torchvision
44 |    accelerate
45 |    qwen-vl-utils
46 |    requests
47 |    pillow
48 |    spaces
49 |    ```
50 | 
51 | 3. Start the application:
52 |    ```
53 |    python app.py
54 |    ```
55 |    The demo launches at `http://localhost:7860` (or the provided URL if using Spaces). The first run downloads the model (~8B params) to `./model/Gliese-CUA-Tool-Call-8B`.
56 | 
57 | ## Usage
58 | 
 59 | 1. **Upload Image**: Provide a UI screenshot (e.g., a PNG of a web page or app). The upload widget is displayed at a height of 500px.
60 | 
61 | 2. **Enter Task**: Describe the target (e.g., "Locate the search bar" or "Find the submit button").
62 | 
63 | 3. **Call CUA Agent**: Click the button to run inference.
64 | 
65 | 4. **View Results**:
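 66 |    - Text: Raw model response (e.g., `Click(x, y)`). These coordinates refer to the resized image the model saw; only the annotation is rescaled to the original resolution.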
67 |    - Image: Annotated screenshot with crosshair visualization.
68 | 
69 | ### Example Workflow
70 | - Upload a browser screenshot.
71 | - Task: "Click on the search bar."
72 | - Output: `Click(250, 150)` and image with red crosshair on the bar.
73 | 
74 | ## Troubleshooting
75 | 
76 | - **Model Download Fails**: Check internet; resume with `resume_download=True`. Verify `allow_patterns="Localization-8B/**"`.
77 | - **Loading Errors**: Ensure transformers 4.57.1; check CUDA with `torch.cuda.is_available()`. Use `torch.float32` if bfloat16 OOM.
78 | - **No Coordinates Parsed**: Task must be localization-focused; raw output in console. Increase max_new_tokens if needed.
 79 | - **Resizing Issues**: `smart_resize` enforces the processor's min/max pixel limits. The app has no fallback if it raises, so crop or resize problematic screenshots before uploading.
80 | - **Visualization Problems**: PIL font fallback used; ensure RGB images.
81 | - **Queue Full**: Increase `max_size` in `demo.queue()`.
82 | - **Spaces Deployment**: Install `spaces`; set `show_error=True` for debugging.
83 | 
84 | ## Contributing
85 | 
86 | Contributions encouraged! Fork the repo, create a feature branch (e.g., for multi-target support), and submit PRs with tests. Focus areas:
87 | - Extension to tool-calling beyond localization.
88 | - Batch image processing.
89 | - Custom prompt templates.
90 | 
91 | Repository: [https://github.com/PRITHIVSAKTHIUR/Gliese-CUA-Tool-Call-8B-Localization.git](https://github.com/PRITHIVSAKTHIUR/Gliese-CUA-Tool-Call-8B-Localization.git)
92 | 
93 | ## License
94 | 
95 | Apache License 2.0. See [LICENSE](LICENSE) for details.
96 | 
97 | Built by Prithiv Sakthi. Report issues via the repository.
98 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import json
  4 | import time
  5 | import unicodedata
  6 | import gc
  7 | from io import BytesIO
  8 | from typing import Iterable, Tuple, Optional, List, Dict, Any
  9 | 
 10 | import gradio as gr
 11 | import numpy as np
 12 | import torch
 13 | import spaces
 14 | from PIL import Image, ImageDraw, ImageFont
 15 | from huggingface_hub import snapshot_download
 16 | 
 17 | from transformers import (
 18 |     AutoProcessor,
 19 |     AutoModelForImageTextToText,
 20 | )
 21 | from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 22 | 
 23 | from gradio.themes import Soft
 24 | from gradio.themes.utils import colors, fonts, sizes
 25 | 
 26 | colors.orange_red = colors.Color(
 27 |     name="orange_red",
 28 |     c50="#FFF0E5",
 29 |     c100="#FFE0CC",
 30 |     c200="#FFC299",
 31 |     c300="#FFA366",
 32 |     c400="#FF8533",
 33 |     c500="#FF4500",
 34 |     c600="#E63E00",
 35 |     c700="#CC3700",
 36 |     c800="#B33000",
 37 |     c900="#992900",
 38 |     c950="#802200",
 39 | )
 40 | 
 41 | class OrangeRedTheme(Soft):
 42 |     def __init__(
 43 |         self,
 44 |         *,
 45 |         primary_hue: colors.Color | str = colors.gray,
 46 |         secondary_hue: colors.Color | str = colors.orange_red,
 47 |         neutral_hue: colors.Color | str = colors.slate,
 48 |         text_size: sizes.Size | str = sizes.text_lg,
 49 |         font: fonts.Font | str | Iterable[fonts.Font | str] = (
 50 |             fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
 51 |         ),
 52 |         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
 53 |             fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
 54 |         ),
 55 |     ):
 56 |         super().__init__(
 57 |             primary_hue=primary_hue,
 58 |             secondary_hue=secondary_hue,
 59 |             neutral_hue=neutral_hue,
 60 |             text_size=text_size,
 61 |             font=font,
 62 |             font_mono=font_mono,
 63 |         )
 64 |         super().set(
 65 |             background_fill_primary="*primary_50",
 66 |             background_fill_primary_dark="*primary_900",
 67 |             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
 68 |             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
 69 |             button_primary_text_color="white",
 70 |             button_primary_text_color_hover="white",
 71 |             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
 72 |             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
 73 |             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
 74 |             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
 75 |             button_secondary_text_color="black",
 76 |             button_secondary_text_color_hover="white",
 77 |             button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
 78 |             button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
 79 |             button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
 80 |             button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
 81 |             slider_color="*secondary_500",
 82 |             slider_color_dark="*secondary_600",
 83 |             block_title_text_weight="600",
 84 |             block_border_width="3px",
 85 |             block_shadow="*shadow_drop_lg",
 86 |             button_primary_shadow="*shadow_drop_lg",
 87 |             button_large_padding="11px",
 88 |             color_accent_soft="*primary_100",
 89 |             block_label_background_fill="*primary_200",
 90 |         )
 91 | 
 92 | orange_red_theme = OrangeRedTheme()
 93 | 
 94 | device = "cuda" if torch.cuda.is_available() else "cpu"
 95 | print(f"Running on device: {device}")
 96 | 
 97 | print("🔄 Downloading Gliese-CUA-Tool-Call-8B model...")
 98 | local_dir = "./model/Gliese-CUA-Tool-Call-8B"
 99 | snapshot_download(
100 |     repo_id="prithivMLmods/Gliese-CUA-Tool-Call-8B",
101 |     local_dir=local_dir,
102 |     resume_download=True,
103 |     allow_patterns="Localization-8B/**",
104 | )
105 | model_path = os.path.join(local_dir, "Localization-8B")
106 | print("✅ Model downloaded.")
107 | 
108 | 
109 | print("🔄 Loading Gliese-CUA-Tool-Call-8B...")
110 | try:
111 |     processor_x = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
112 |     model_x = AutoModelForImageTextToText.from_pretrained(
113 |         model_path,
114 |         trust_remote_code=True,
115 |         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
116 |     ).to(device).eval()
117 | except Exception as e:
118 |     print(f"Failed to Gliese-CUA-Tool-Call-8B model: {e}")
119 |     model_x = None
120 |     processor_x = None
121 | 
122 | print("✅ Models loading sequence complete.")
123 | 
def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert a numpy image array into a PIL image.

    The array is cast to ``uint8`` before being handed to ``Image.fromarray``.

    Raises:
        ValueError: If ``image_array`` is ``None``.
    """
    if image_array is None:
        raise ValueError("No image provided.")
    pixels = np.uint8(image_array)
    return Image.fromarray(pixels)
127 | 
def get_image_proc_params(processor) -> Dict[str, int]:
    """Collect the resize parameters exposed by a processor's image processor.

    Values are read from ``processor.image_processor`` when present. If its
    ``size`` attribute is a dict, ``shortest_edge`` / ``longest_edge`` override
    ``min_pixels`` / ``max_pixels``. Any value that is missing or ``None``
    falls back to a built-in default, so the caller can always multiply
    ``patch_size * merge_size``.

    Args:
        processor: Object that may expose an ``image_processor`` attribute.

    Returns:
        Dict with the keys ``patch_size``, ``merge_size``, ``min_pixels`` and
        ``max_pixels``.
    """
    ip = getattr(processor, "image_processor", None)

    defaults = {
        "patch_size": 14,
        "merge_size": 2,
        "min_pixels": 256 * 256,
        "max_pixels": 1280 * 1280,
    }
    # getattr on a missing image processor (None) simply yields the defaults.
    params = {key: getattr(ip, key, fallback) for key, fallback in defaults.items()}

    size_config = getattr(ip, "size", {})
    if isinstance(size_config, dict):
        if "shortest_edge" in size_config:
            params["min_pixels"] = size_config["shortest_edge"]
        if "longest_edge" in size_config:
            params["max_pixels"] = size_config["longest_edge"]

    # An attribute can exist but be None; treat that like a missing one for
    # every parameter, not only the pixel limits.
    for key, fallback in defaults.items():
        if params[key] is None:
            params[key] = fallback

    return params
155 | 
def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
    """Render ``messages`` into a prompt using whichever chat-template API exists.

    The processor's own ``apply_chat_template`` is tried first, passing the
    ``thinking`` flag; if that call raises ``TypeError`` it is retried without
    the flag. When the processor has no such method, the ``apply_chat_template``
    of its ``tokenizer`` attribute is used instead.

    Raises:
        AttributeError: If neither the processor nor its tokenizer provides
            ``apply_chat_template``.
    """
    render_kwargs = {"tokenize": False, "add_generation_prompt": True}

    if hasattr(processor, "apply_chat_template"):
        try:
            return processor.apply_chat_template(messages, **render_kwargs, thinking=thinking)
        except TypeError:
            # Retry without the `thinking` keyword.
            return processor.apply_chat_template(messages, **render_kwargs)

    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None or not hasattr(tokenizer, "apply_chat_template"):
        raise AttributeError("Could not apply chat template.")
    return tokenizer.apply_chat_template(messages, **render_kwargs)
168 | 
def trim_generated(generated_ids, inputs):
    """Strip the prompt tokens from the front of each generated sequence.

    ``inputs`` may expose ``input_ids`` as an attribute or, when it is a dict,
    as a key. Each generated sequence is sliced by the length of its matching
    input sequence. If no ``input_ids`` can be found, ``generated_ids`` is
    returned unchanged.
    """
    prompt_ids = getattr(inputs, "input_ids", None)
    if prompt_ids is None and isinstance(inputs, dict):
        prompt_ids = inputs.get("input_ids", None)
    if prompt_ids is None:
        return generated_ids

    trimmed = []
    for prompt_seq, full_seq in zip(prompt_ids, generated_ids):
        trimmed.append(full_seq[len(prompt_seq):])
    return trimmed
176 | 
def get_localization_prompt(task, image):
    """Build the single-turn chat message list for a localization request.

    The user turn contains the image followed by one text part: the fixed
    localization guidelines, a newline, then ``task``.
    """
    guidelines = (
        "Localize an element on the GUI image according to my instructions "
        "and output a click position as Click(x, y) "
        "with x num pixels from the left edge "
        "and y num pixels from the top edge."
    )
    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": f"{guidelines}\n{task}"}
    user_turn = {"role": "user", "content": [image_part, text_part]}
    return [user_turn]
192 | 
def parse_click_response(text: str) -> List[Dict]:
    """Extract click coordinates from a model response.

    Three explicit formats are searched case-insensitively, in this order:
    ``click(x, y)`` (including the ``left_``/``right_``/``double_`` prefixed
    forms), ``point=[x, y]`` and ``start_box='(x, y)'``. Only when none of
    them matches are bare ``(x, y)`` tuples accepted as a fallback.

    Args:
        text: Raw response text.

    Returns:
        One dict per match, in match order, with the keys ``type`` (always
        ``"click"``), ``x``, ``y`` (ints), ``text`` (empty) and ``norm``
        (``False``). Empty list when nothing matches.
    """
    text = text.strip()

    explicit_patterns = (
        r"(?:click|left_click|right_click|double_click)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)",
        r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]",
        r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?",
    )
    # Lookarounds instead of consuming groups: the delimiter after one tuple
    # must stay available as the delimiter before the next, otherwise the
    # second tuple in "(1, 2) (3, 4)" is skipped.
    bare_tuple_pattern = r"(?<!\S)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?=$|\s|,)"

    def _find(pattern: str, flags: int = 0) -> List[Dict]:
        return [
            {"type": "click", "x": int(x), "y": int(y), "text": "", "norm": False}
            for x, y in re.findall(pattern, text, flags)
        ]

    actions: List[Dict] = []
    for pattern in explicit_patterns:
        actions.extend(_find(pattern, re.IGNORECASE))

    if not actions:
        actions = _find(bare_tuple_pattern)

    return actions
215 | 
def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
    """Draw a marker and a label for every action on a copy of the image.

    Each action gets a crosshair, a surrounding circle and a label box placed
    to the right of the point. ``original_image`` itself is not modified.

    Args:
        original_image: Image to annotate.
        actions: Dicts with ``x``, ``y`` and ``type`` keys; an optional
            non-empty ``text`` value is appended to the label. Coordinates are
            used as pixel positions on ``original_image``.

    Returns:
        The annotated copy, or ``None`` when ``actions`` is empty.
    """
    if not actions:
        return None

    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    try:
        font = ImageFont.load_default(size=18)
    except (TypeError, OSError):
        # TypeError: this Pillow's load_default() does not accept `size`.
        # OSError (IOError is an alias of it): the sized font could not be loaded.
        font = ImageFont.load_default()

    line_len = 15
    width = 4
    r = 20

    for act in actions:
        pixel_x, pixel_y = int(act['x']), int(act['y'])

        color = 'red' if 'click' in act['type'].lower() else 'blue'

        # Crosshair centred on the point.
        draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
        draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)

        # Circle around the crosshair.
        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)

        label = f"{act['type']}"
        if act.get('text'):
            label += f": \"{act['text']}\""

        text_pos = (pixel_x + 25, pixel_y - 15)

        try:
            bbox = draw.textbbox(text_pos, label, font=font)
            padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
            draw.rectangle(padded_bbox, fill="yellow", outline=color)
            draw.text(text_pos, label, fill="black", font=font)
        except Exception:
            # Deliberate best effort: if the boxed label cannot be drawn, fall
            # back to plain text with the default font rather than failing the
            # whole annotation.
            draw.text(text_pos, label, fill="white")

    return img_copy
256 | 
@spaces.GPU
def process_screenshot(input_numpy_image: np.ndarray, task: str):
    """Run one localization request and return the response plus annotated image.

    The screenshot is resized with ``smart_resize`` using the processor's
    parameters, sent to the model together with the localization prompt, and
    the click coordinates parsed from the response are scaled back to the
    original resolution before being drawn.

    Args:
        input_numpy_image: Uploaded screenshot as a numpy array, or ``None``.
        task: Natural-language instruction describing the target element.

    Returns:
        Tuple ``(text, image)``. ``text`` is the raw model response, or a
        warning/error message. ``image`` is the annotated screenshot, the
        unmodified screenshot when no coordinates were parsed, or ``None``
        when the request was rejected.
    """
    if input_numpy_image is None:
        return "⚠️ Please upload an image.", None
    # `not task` also covers a missing (None) instruction.
    if not task or not task.strip():
        return "⚠️ Please provide a task instruction.", None

    input_pil_image = array_to_image(input_numpy_image)
    orig_w, orig_h = input_pil_image.size

    if model_x is None:
        return "Error: Gliese-CUA-Tool-Call-8B model failed to load.", None
    print("Using Gliese-CUA-Tool-Call-8B Pipeline...")

    model, processor = model_x, processor_x
    ip_params = get_image_proc_params(processor)

    # Resize to dimensions that are multiples of patch_size * merge_size and
    # within the processor's pixel budget.
    resized_h, resized_w = smart_resize(
        input_pil_image.height, input_pil_image.width,
        factor=ip_params["patch_size"] * ip_params["merge_size"],
        min_pixels=ip_params["min_pixels"],
        max_pixels=ip_params["max_pixels"]
    )
    proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)

    messages = get_localization_prompt(task, proc_image)
    text_prompt = apply_chat_template_compat(processor, messages)

    inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Drop the prompt tokens so only the newly generated text is decoded.
    generated_ids = trim_generated(generated_ids, inputs)
    raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    actions = parse_click_response(raw_response)

    # The model saw the resized image; map its coordinates back to the
    # original resolution before drawing.
    if resized_w > 0 and resized_h > 0:
        scale_x = orig_w / resized_w
        scale_y = orig_h / resized_h
        for a in actions:
            a['x'] = int(a['x'] * scale_x)
            a['y'] = int(a['y'] * scale_y)

    print(f"Raw Output: {raw_response}")
    print(f"Parsed Actions: {actions}")

    output_image = input_pil_image
    if actions:
        vis = create_localized_image(input_pil_image, actions)
        if vis:
            output_image = vis

    return raw_response, output_image
311 | 
# Extra CSS passed to `demo.launch()`.
css = """
#col-container {
    margin: 0 auto;
    max-width: 960px;
}
#main-title h1 {font-size: 2.1em !important;}
"""

with gr.Blocks() as demo:
    gr.Markdown(
        "# **Gliese-CUA-Tool-Call-8B-Localization 🖥️**",
        elem_id="main-title",
    )

    with gr.Row():
        # Left column: screenshot upload, task instruction and submit button.
        with gr.Column(scale=2):
            input_image = gr.Image(
                label="Upload UI Image",
                type="numpy",
                height=500,
            )
            task_input = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g. Click on the search bar",
                lines=2,
            )
            submit_btn = gr.Button("Call CUA Agent", variant="primary")

        # Right column: annotated screenshot and the model's text response.
        with gr.Column(scale=3):
            output_image = gr.Image(
                label="Visualized Action Points",
                elem_id="out_img",
                height=500,
            )
            output_text = gr.Textbox(label="Agent Model Response", lines=10)

    # process_screenshot returns (text, image), hence the output order.
    submit_btn.click(
        process_screenshot,
        inputs=[input_image, task_input],
        outputs=[output_text, output_image],
    )

if __name__ == "__main__":
    queued_demo = demo.queue(max_size=50)
    queued_demo.launch(theme=orange_red_theme, css=css, show_error=True)
345 | 


--------------------------------------------------------------------------------