├── assets
    ├── overview.png
    └── first_image.png
├── scripts
    └── server.sh
├── .gitmodules
├── train
    └── sft.yaml
├── LICENSE
├── deploy
    ├── main.py
    ├── prompt.py
    ├── utils.py
    ├── env.py
    └── agent.py
├── README.md
└── postprocess
    ├── prepare.py
    ├── prompt.py
    ├── boost.py
    ├── refinement.py
    └── utils.py


/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/PC-Agent-E/HEAD/assets/overview.png


--------------------------------------------------------------------------------
/assets/first_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GAIR-NLP/PC-Agent-E/HEAD/assets/first_image.png


--------------------------------------------------------------------------------
/scripts/server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | vllm serve "henryhe0123/PC-Agent-E" --tensor-parallel-size 4 --port 8030
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "train/LLaMA-Factory"]
2 | 	path = train/LLaMA-Factory
3 | 	url = git@github.com:hiyouga/LLaMA-Factory.git
4 | 


--------------------------------------------------------------------------------
/train/sft.yaml:
--------------------------------------------------------------------------------
 1 | ### model
 2 | model_name_or_path: Qwen/Qwen2.5-VL-72B-Instruct
 3 | image_max_pixels: 1000000
 4 | 
 5 | ### method
 6 | stage: sft
 7 | do_train: true
 8 | finetuning_type: full
 9 | freeze_vision_tower: true
10 | deepspeed: examples/deepspeed/ds_z3_config.json
11 | 
12 | ### dataset
13 | dataset: pc-agent-e
14 | template: qwen2_vl
15 | cutoff_len: 8192
16 | overwrite_cache: true
17 | preprocessing_num_workers: 64
18 | 
19 | ### output
20 | output_dir: saves/pc-agent-e/Qwen2.5-VL-72B-sft
21 | logging_steps: 1
22 | save_steps: 100
23 | plot_loss: true
24 | overwrite_output_dir: true
25 | 
26 | ### train
27 | per_device_train_batch_size: 2
28 | gradient_accumulation_steps: 2
29 | learning_rate: 2.0e-6
30 | num_train_epochs: 2
31 | lr_scheduler_type: cosine
32 | warmup_ratio: 0.05
33 | bf16: true
34 | ddp_timeout: 180000000
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 SII - Generative Artificial Intelligence Research Lab (GAIR)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/deploy/main.py:
--------------------------------------------------------------------------------
 1 | # main.py
 2 | 
 3 | from openai import OpenAI
 4 | from agent import PCAgentE
 5 | from env import PCEnv
 6 | 
 7 | client = OpenAI(
 8 |     api_key="EMPTY",
 9 |     base_url="http://localhost:8030/v1",
10 | )
11 | model = "henryhe0123/PC-Agent-E"
12 | 
13 | 
14 | def run(task_description, max_steps=30):
15 |     # Initialize agent and environment
16 |     agent = PCAgentE(client, model, max_steps)
17 |     env = PCEnv()
18 |     
19 |     # Reset environment to get initial observation
20 |     obs = env.reset()
21 |     
22 |     # Run interaction loop
23 |     while True:
24 |         # Agent predicts next action based on current observation
25 |         actions, logs = agent.predict(task_description, obs)
26 |         if not actions:
27 |             print("Agent failed to generate valid actions, terminating execution")
28 |             return
29 |             
30 |         # Execute each action
31 |         for action in actions:
32 |             print(f"Executing action: {action}")
33 |             obs, done = env.step(action)
34 |             if done:
35 |                 return
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     task_description = input("Please enter task description: ")
40 |     run(task_description)
41 | 


--------------------------------------------------------------------------------
/deploy/prompt.py:
--------------------------------------------------------------------------------
 1 | # prompt.py
 2 | 
 3 | AGENT_PROMPT = """You are a helpful assistant who can help users complete computer tasks, with **full permission** to make any operations on the user's computer. 
 4 | Based on the provided current state, you need to suggest the next action to complete the task. Do not try to complete the entire task in one step. Break it down into smaller steps, and at each step you will get a new state to interact with.
 5 | 
 6 | IMPORTANT: You must strictly adhere to the following rules:
 7 | 1. Choose ONLY ONE action from the list below for each response, DO NOT perform more than one action per step.
 8 | 2. Follow the exact syntax format for the selected action, DO NOT create or use any actions other than those listed.
 9 | 3. Once the task is completed, output action finish.
10 | 
11 | Valid actions:
12 | 
13 | 1. click (x, y)
14 | click the element at the position (x, y) on the screen
15 | 
16 | 2. right click (x, y)
17 | right click the element at the position (x, y) on the screen
18 | 
19 | 3. double click (x, y)
20 | double click the element at the position (x, y) on the screen
21 | 
22 | 4. drag from (x1, y1) to (x2, y2)
23 | drag the element from position (x1, y1) to (x2, y2).
24 | 
25 | 5. scroll (x)
26 | scroll the screen vertically with pixel offset x. Positive values of x: scroll up, negative values of x: scroll down.
27 | 
28 | 6. press key: key_content
29 | press the key key_content on the keyboard.
30 | 
31 | 7. hotkey (key1, key2)
32 | press the hotkey composed of key1 and key2.
33 | 
34 | 8. hotkey (key1, key2, key3)
35 | press the hotkey composed of key1, key2, and key3.
36 | 
37 | 9. type text: text_content
38 | type content text_content on the keyboard.
39 | 
40 | 10. wait
41 | wait for some time, usually for the system to respond, screen to refresh, advertisement to finish.
42 | 
43 | 11. finish
44 | indicating that the task has been completed.
45 | 
46 | 12. fail
47 | indicating that the task has failed, of this task is infeasible because not enough information is provided.
48 | 
49 | Response Format: {Your thought process}\n\nAction: {The specific action you choose to take}
50 | 
51 | --------------------------------------------
52 | 
53 | """
54 | 


--------------------------------------------------------------------------------
/deploy/utils.py:
--------------------------------------------------------------------------------
 1 | # utils.py
 2 | 
 3 | import io
 4 | import base64
 5 | 
 6 | 
 7 | KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'space', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright']
 8 | 
 9 | 
10 | def encode_image(image):
11 |     # encode image to base64 string
12 |     buffered = io.BytesIO()
13 |     image.save(buffered, format="PNG")
14 |     return base64.b64encode(buffered.getvalue()).decode('utf-8')
15 | 
16 | 
17 | def save_screenshot(screenshot, path):
18 |     screenshot.save(path, format="PNG")
19 | 
20 | 
21 | def get_mllm_messages(instruction, base64_image):
22 |     messages = [
23 |         {
24 |             "role": "user",
25 |             "content": [
26 |                 {
27 |                     "type": "text",
28 |                     "text": instruction
29 |                 },
30 |                 {
31 |                     "type": "image_url",
32 |                     "image_url": {
33 |                         "url": f"data:image/png;base64,{base64_image}"
34 |                     },
35 |                 },
36 |             ],
37 |         },
38 |     ]
39 |     return messages
40 | 


--------------------------------------------------------------------------------
/deploy/env.py:
--------------------------------------------------------------------------------
 1 | # env.py
 2 | 
 3 | import time
 4 | import pyautogui
 5 | from io import BytesIO
 6 | from PIL import ImageGrab
 7 | 
 8 | class PCEnv:
 9 |     """
10 |     PC Environment class, encapsulates the local computer environment,
11 |     supports executing pyautogui code and capturing screenshots
12 |     """
13 |     def __init__(self, screenshot_size=(1280, 720)):
14 |         """
15 |         Initialize the environment
16 |         Args:
17 |             screenshot_size: Screenshot dimensions
18 |         """
19 |         self.screenshot_size = screenshot_size
20 |         # Ensure pyautogui has failsafe measures
21 |         pyautogui.FAILSAFE = True
22 |         print("Initializing PC Environment...")
23 |         
24 |     def step(self, action):
25 |         """
26 |         Execute an action and return new observation
27 |         Args:
28 |             action: Action to execute (pyautogui code string)
29 |         Returns:
30 |             obs: Observation containing new screenshot
31 |             done: Whether the task is completed
32 |         """
33 |         done = False
34 |         
35 |         # Handle special actions
36 |         if action == "WAIT":
37 |             time.sleep(3)
38 |         elif action == "DONE":
39 |             print("Task completed, terminating execution")
40 |             done = True
41 |             return {"screenshot": self.get_screenshot()}, done
42 |         elif action == "FAIL":
43 |             print("Task failed, terminating execution")
44 |             done = True
45 |             return {"screenshot": self.get_screenshot()}, done
46 |         else:
47 |             # Execute pyautogui code
48 |             try:
49 |                 # Since we've imported pyautogui at the module level,
50 |                 # exec can directly execute strings like "pyautogui.click(1, 1)"
51 |                 # The pyautogui module is available in the exec's namespace
52 |                 exec(action)
53 |                 # Wait briefly to let UI respond
54 |                 time.sleep(1)
55 |             except Exception as e:
56 |                 print(f"Action execution failed: {e}")
57 |         
58 |         # Return new observation (screenshot)
59 |         return {"screenshot": self.get_screenshot()}, done
60 |     
61 |     def get_screenshot(self):
62 |         """
63 |         Capture current screen screenshot
64 |         Returns:
65 |             screenshot: Binary data of the screenshot
66 |         """
67 |         # Take screenshot
68 |         screenshot = ImageGrab.grab()
69 |         
70 |         # Warning if size is not as expected
71 |         if screenshot.size != self.screenshot_size:
72 |             print(f"Warning: Screenshot size is not as expected. Expected {self.screenshot_size}, got {screenshot.size}")
73 |             
74 |         # Convert to binary
75 |         buffer = BytesIO()
76 |         screenshot.save(buffer, format='PNG')
77 |         return buffer.getvalue()
78 |     
79 |     def reset(self):
80 |         """
81 |         Reset the environment
82 |         Returns:
83 |             obs: Observation containing new screenshot
84 |         """
85 |         # Reset only needs to return current screenshot as initial observation
86 |         return {"screenshot": self.get_screenshot()} 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Efficient Agent Training for Computer Use
  2 | 
  3 | <p align="center">
  4 |   📄 <a href="https://arxiv.org/abs/2505.13909" target="_blank">Paper</a> &nbsp; | &nbsp;
  5 |   🌐 <a href="https://gair-nlp.github.io/PC-Agent-E" target="_blank">Website</a> &nbsp; | &nbsp;
  6 |   🤖 <a href="https://huggingface.co/henryhe0123/PC-Agent-E" target="_blank">Model</a> &nbsp; | &nbsp;
  7 |   🤗 <a href="https://huggingface.co/datasets/henryhe0123/PC-Agent-E" target="_blank">Dataset</a> &nbsp; | &nbsp;
  8 |   ⚔️ <a href="https://github.com/GAIR-NLP/WindowsAgentArena-V2" target="_blank">WindowsAgentArena-V2</a>
  9 | </p>
 10 | 
 11 | <p align="center">
 12 |   <img src="./assets/first_image.png" width="50%" alt="animation">
 13 | </p>
 14 | 
 15 | ## Demo
 16 | 
 17 | Check out our demo of PC Agent-E autonomously controlling a computer to complete tasks on Windows and Linux systems!
 18 | 
 19 | https://github.com/user-attachments/assets/9540d8cb-630d-41e2-a108-a96ca3fcb32e
 20 | 
 21 | https://github.com/user-attachments/assets/18b436e7-733f-49a5-8716-25c29a990766
 22 | 
 23 | ## Introduction
 24 | 
 25 | We introduce **PC Agent-E**, an efficient agent training framework that elicits strong computer use capabilities with remarkable **data efficiency**. 
 26 | This framework is implemented with four key components: 
 27 | 1. **Trajectory Collection**, gathering a small set of task trajectories from human annotators with [PC Tracker](https://github.com/GAIR-NLP/PC-Agent?tab=readme-ov-file#pc-tracker);
 28 | 2. **Thought Completion**, reconstructing the latent human thought process before each action;
 29 | 3. **Trajectory Boost**, synthesizing diverse alternative action decisions;
 30 | 4. **Agent Training**, training native agent model with augmented trajectories.
 31 | 
 32 | ![overview](./assets/overview.png)
 33 | 
 34 | ## Main Results
 35 | 
 36 | Table: Results of successful rate (%) for different models on [WindowsAgentArena-V2](https://github.com/GAIR-NLP/WindowsAgentArena-V2), an improved benchmark we also released.
 37 | 
 38 | | Models                   | LibreOffice | Chrome | Edge  | System | VS Code | VLC  | Utils | Total |
 39 | |--------------------------|-------------|--------|-------|--------|---------|------|--------|-------|
 40 | | **Number of Tasks**      | 42          | 17     | 13    | 24     | 19      | 14   | 12     | 141   |
 41 | | Qwen2.5-VL-72B           | 0.0         | 34.7   | 15.4  | 20.8   | 26.3    | 7.6  | 16.7   | 14.9  |
 42 | | UI-TARS-1.5-7B           | **7.1**     | 34.7   | 23.1  | 45.8   | 21.1    | 7.6  | 16.7   | 21.3  |
 43 | | UI-TARS-72B-DPO          | 0.0         | 40.6   | 38.5  | 58.3   | 36.8    | 7.6  | 25.0   | 26.2  |
 44 | | Claude 3.7 Sonnet        | 2.4         | 46.5   | **61.5** | 54.2   | 52.6    | 29.0 | 16.7   | 32.6  |
 45 | | Claude 3.7 Sonnet (thinking) | 2.4     | **64.1** | 46.2  | **66.7** | 52.6    | 21.9 | 25.0   | 35.4  |
 46 | | **PC Agent-E (Ours)**    | 4.8         | **64.1** | 46.2  | 50.0   | **57.9**| **35.7** | **33.3** | **36.0** |
 47 | 
 48 | ## Quick Start
 49 | 
 50 | ### Trajectory Collection
 51 | 
 52 | Collect raw human trajectory with PC Tracker. See usage [here](https://github.com/GAIR-NLP/PC-Agent?tab=readme-ov-file#pc-tracker).
 53 | 
 54 | ### Post Processing
 55 | 
 56 | To convert raw human trajectory into high-quality trajectories for training, follow these steps:
 57 | 1. Place recorded in the `data/` directory. 
 58 | 2. Run post processing pipeline:
 59 | ```bash
 60 | # Data refinement
 61 | python postprocess/refinement.py
 62 | 
 63 | # Thought completion and Trajectory Boost    
 64 | python postprocess/boost.py    
 65 | ```
 66 | 
 67 | Note: You need to prepare your API key in advance.
 68 | 
 69 | ### Agent Training
 70 | 
 71 | You can use [our dataset](https://huggingface.co/datasets/henryhe0123/PC-Agent-E) or build data set with above steps on your own. To prepare data for agent training, put the dataset in the `data/` directory, and run:
 72 | ```bash
 73 | python postprocess/prepare.py 
 74 | ```
 75 | 
 76 | We recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) for agent training. To launch distributed training across multiple nodes, you can run:
 77 | 
 78 | ```bash
 79 | FORCE_TORCHRUN=1 NNODES=4 NODE_RANK=${PET_NODE_RANK} MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train train/sft.yaml
 80 | ```
 81 | 
 82 | Replace PET_NODE_RANK with the rank of the current node (from 0 to 3).
 83 | 
 84 | ### Agent Deployment
 85 | 
 86 | We provide a reference implementation of our PC Agent-E scaffold in the `deploy/` directory. To deploy our agent on your computer, run:
 87 | 
 88 | ```bash
 89 | python deploy/main.py
 90 | ```
 91 | 
 92 | Reference scripts for model deployment can be found in `scripts/server.sh`.
 93 | 
 94 | ## Acknowledgments
 95 | 
 96 | We would like to express our sincere gratitude to Shijie Xia for his meticulous review and constructive
 97 | suggestions, which significantly improved the quality of this paper. This project is supported by SJTU SEIEE - ByteDance Large Language Model Joint Laboratory, SII.
 98 | 
 99 | ## Citation
100 | 
101 | If you find this work helpful, please consider citing:
102 | 
103 | ```
104 | @misc{he2025efficientagenttrainingcomputer,
105 |       title={Efficient Agent Training for Computer Use}, 
106 |       author={Yanheng He and Jiahe Jin and Pengfei Liu},
107 |       year={2025},
108 |       eprint={2505.13909},
109 |       archivePrefix={arXiv},
110 |       primaryClass={cs.AI},
111 |       url={https://arxiv.org/abs/2505.13909}, 
112 | }
113 | ```
114 | 


--------------------------------------------------------------------------------
/postprocess/prepare.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import copy
  4 | import traceback
  5 | from prompt import AGENT_PROMPT
  6 | from utils import refine_response, refine_thought, combine_thought_action_to_response, get_history_str
  7 | 
  8 | 
  9 | output_file = f"train/LLaMA-Factory/data/pc-agent-e.json"
 10 | all_data = []
 11 | BOOST = True
 12 | HUMAN = True
 13 | REMOVE_NO_FISISH = True
 14 | BOOST_CNT = 9
 15 | 
 16 | 
 17 | def get_instruction(task_description, action_history):
 18 |     prompt = AGENT_PROMPT + f"Your task is: {task_description}\n\n"
 19 |     prompt += f"History of the previous actions and thoughts you have done to reach the current screen: {action_history}\n\n"
 20 |     prompt += "--------------------------------------------\n\n"
 21 |     prompt += f"Given the screenshot. What's the next step that you will do to help with the task?<image>"
 22 |     return prompt
 23 | 
 24 | 
 25 | def check_boost_response(boost_response, action):
 26 |     if boost_response is None:
 27 |         return False
 28 | 
 29 |     if REMOVE_NO_FISISH and action == "finish" and not "finish" in boost_response:
 30 |         # print(f"last action for boost is not finish, remove it!")
 31 |         return False
 32 |     if "(x, y)" in boost_response or "(x,y)" in boost_response:
 33 |         return False
 34 |     
 35 |     return True
 36 | 
 37 | 
 38 | def process_task_jsonl_file(file_path, dir_path, task_description):
 39 |     with open(file_path, "r", encoding="utf-8") as file:
 40 |         lines = file.readlines()          
 41 |     
 42 |     response_history = []  # for action history in natural language
 43 |         
 44 |     for idx, line in enumerate(lines):
 45 |         formatted_task = {
 46 |             "messages": [],
 47 |             "images": "",
 48 |         }
 49 | 
 50 |         entry = json.loads(line)
 51 |         action = entry["action"]
 52 |         
 53 |         # Reorganize press key action
 54 |         if action.startswith("press key"):
 55 |             action = action.replace("press key", "press key:")
 56 | 
 57 |         screenshot_path = entry["screenshot"]
 58 |         screenshot_path = f"{dir_path}/{screenshot_path}"
 59 |         # Add image path
 60 |         formatted_task["images"] = [screenshot_path]
 61 |         
 62 |         # Add user message
 63 |         action_history = get_history_str(response_history)
 64 |             
 65 |         query = get_instruction(task_description, action_history)
 66 |         formatted_task["messages"].append({"role": "user", "content": query})
 67 |         
 68 |         # Add boost response
 69 |         if BOOST and "boost_responses" in entry:
 70 |             for id, boost_response in enumerate(entry["boost_responses"][:BOOST_CNT]):
 71 |                 if not check_boost_response(boost_response, action):
 72 |                     continue
 73 |                 boost_response_cleaned = refine_response(boost_response)
 74 |                 if boost_response_cleaned is None:
 75 |                     continue
 76 |                 formatted_task_copy = copy.deepcopy(formatted_task)
 77 |                 formatted_task_copy["messages"].append({"role": "assistant", "content": boost_response_cleaned})
 78 |                 all_data.append(formatted_task_copy)
 79 | 
 80 |         # Add assistant message
 81 |         thought = refine_thought(entry['thought'])
 82 |         if thought is not None:
 83 |             response = combine_thought_action_to_response(thought, action)
 84 |             formatted_task["messages"].append({"role": "assistant", "content": response})
 85 |             response_history.append(response)
 86 |             if HUMAN:
 87 |                 all_data.append(formatted_task)
 88 | 
 89 | 
 90 | def process_events_directories():
 91 |     # Get the parent directory of the current script
 92 |     root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 93 |     
 94 |     # Build the path to the data folder
 95 |     data_dir = os.path.join(root_dir, 'data')
 96 |     if not os.path.exists(data_dir):
 97 |         print(f"error: {data_dir} directory does not exist")
 98 |         exit()
 99 |         
100 |     # Events folder prefix
101 |     events_prefix = "events"
102 | 
103 |     # Process each subdirectory under /data
104 |     for item in os.listdir(data_dir):
105 |         item_path = os.path.join(data_dir, item)
106 | 
107 |         # Check if it's a directory and starts with specified name
108 |         if os.path.isdir(item_path) and item.startswith(events_prefix):
109 |             for filename in os.listdir(item_path):
110 |                 # Process each jsonl file under the directory
111 |                 if filename.endswith(".jsonl") and "task" in filename:
112 |                     file_path = os.path.join(item_path, filename)
113 |                     md_path = os.path.join(item_path, filename.replace(".jsonl", ".md"))
114 |                     with open(md_path, "r", encoding="utf-8") as file:
115 |                         lines = file.readlines()
116 |                     try:
117 |                         task_description = lines[1].replace("**Description:** ", "").strip()
118 |                     except:
119 |                         print(f"Error: Unable to extract task description from {md_path}")
120 |                         continue
121 |                     try:
122 |                         process_task_jsonl_file(file_path, item_path, task_description)
123 |                     except Exception as e:
124 |                         error_traceback = traceback.format_exc()
125 |                         print(f"{file_path} encountered error: {e}")
126 |                         print(f"{error_traceback}")
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     process_events_directories()
131 |     print(f"entries: {len(all_data)}")
132 |     with open(output_file, "w", encoding="utf-8") as file:
133 |         json.dump(all_data, file, indent=2, ensure_ascii=False)
134 |     


--------------------------------------------------------------------------------
/postprocess/prompt.py:
--------------------------------------------------------------------------------
  1 | THOUGHT_COMPLETION_PROMPT = """You are a helpful computer use agent designed to complete tasks on a computer. Your goal is to recreate your thought process behind a specific action.
  2 | 
  3 | You will be provided with:
  4 | 
  5 | 1. The task you are attempting to complete.
  6 | 2. A history of the steps you have already performed (up to 50, if any; none if it was the first action).
  7 | 3. The specific action you chose to take.
  8 | 4. The name of the element you clicked (if you clicked on an element). It might be too general or vague, you have to decied what to click based on the screenshot.
  9 | 5. A screenshot of the computer screen at the moment you decided to take the action.
 10 | 6. The red marks on the screenshot indicate the position of the click or drag action.
 11 | 
 12 | 
 13 | To formulate your thought process, consider:
 14 | 
 15 | 1. What do you observe on the screen? Consider your task and previous action when you analyzing current screenshot.
 16 | 2. Evaluate your previous action (if applicable):
 17 |    - Did it achieve the intended effect? If not, identify possible reasons (e.g., misclick, inactive element).
 18 |       Some typical examples for ineffective action:
 19 |        - misclick in an empty space
 20 |        - ineffective opening some elements without double click
 21 |        - ineffective type text/ press key because of inactivated input box
 22 |    - Did the result align with your previous plan, or did something unexpected happen?
 23 |       Some typical examples for ineffective action:
 24 |          - misclick in a wrong element
 25 |          - forget to clear existing text in input bar
 26 | 3. Based on your action history, assess your progress toward completing the overall task.
 27 | 4. Consider if you're exploring how to finish the task because of failed attempts in history steps.
 28 | 
 29 | 
 30 | Present your thought process as a clear, natural first-person narrative that explains your reasoning at that moment.
 31 | 
 32 | Important requirements:
 33 | 1. **DO NOT** mention the red marks in your response. These marks were **added after the fact** to indicate the position of your click or drag actions, and they were not on the screen when you made the decision. **DO NOT** mention "red box", "red square", "red circle", or "red arrow" in your response.
 34 | 2. Write as if you are thinking in real-time before taking the action. Do not include post-action evaluation or hindsight.
 35 | 
 36 | --------------------------------------------
 37 | """
 38 | 
 39 | 
 40 | TRAJECTORY_BOOST_PROMPT = """
 41 | You are a helpful assistant who can help users complete computer tasks, with **full permission** to make any operations on the user's computer. The operating system is windows.
 42 | Based on the provided current state, you need to suggest the next action to complete the task. Do not try to complete the entire task in one step. Break it down into smaller steps, and at each step you will get a new state to interact with.
 43 | 
 44 | IMPORTANT: You must strictly adhere to the following rules:
 45 | 
 46 | 1. Choose ONLY ONE action from the list below for each response, DO NOT perform more than one action per step.
 47 | 2. Follow the exact syntax format for the selected action, DO NOT create or use any actions other than those listed.
 48 | 3. Once the task is completed, output action finish.
 49 | 
 50 | Valid actions:
 51 | 
 52 | 1. click (x, y)
 53 |    click the element at the position (x, y) on the screen
 54 | 
 55 | 2. right click (x, y)
 56 |    right click the element at the position (x, y) on the screen
 57 | 
 58 | 3. double click (x, y)
 59 |    double click the element at the position (x, y) on the screen
 60 | 
 61 | 4. drag from (x1, y1) to (x2, y2)
 62 |    drag the element from position (x1, y1) to (x2, y2).
 63 | 
 64 | 5. scroll (x)
 65 |    scroll the screen vertically with pixel offset x. Positive values of x: scroll up, negative values of x: scroll down.
 66 | 
 67 | 6. press key: key_content
 68 |    press the key key_content on the keyboard.
 69 | 
 70 | 7. hotkey (key1, key2)
 71 |    press the hotkey composed of key1 and key2.
 72 | 
 73 | 8. hotkey (key1, key2, key3)
 74 |    press the hotkey composed of key1, key2, and key3.
 75 | 
 76 | 9. type text: text_content
 77 |    type content text_content on the keyboard. 
 78 |    Note that before typing text, you need to ensure the text box or input field is active/focused first. If the text box is not yet activated, you should first click on it to activate it, and then use type text in a separate step.
 79 | 
 80 | 10. wait
 81 |     wait for some time, usually for the system to respond, screen to refresh, advertisement to finish.
 82 | 
 83 | 11. finish
 84 |     indicating that the task has been completed.
 85 | 
 86 | 12. fail
 87 |     indicating that the task has failed, of this task is infeasible because not enough information is provided.
 88 |     
 89 |     
 90 | Before deciding your next action, you should think carefully about the current state of the screen and your history steps. Contain the following points in your thought process:
 91 | 
 92 | 1. What do you observe on the screen? Consider your task and previous action when you analyzing current screenshot.
 93 | 2. What's your previous plan and action (if applicable)? Evaluate your previous plan and action in three conditions:
 94 |    1. It didn't make any effect. You should dentify possible reasons (e.g., misclick, inactive element) and adjust your plan in this step.
 95 |       Some typical examples for ineffective action:
 96 |        - misclick in an empty space
 97 |        - ineffective opening some elements without double click
 98 |        - ineffective type text/ press key because of inactivated input box
 99 |    2. It made some effect, but the result does not align with previous plan. You should dentify possible reasons (e.g., misclick, inactive element) and correct it in this step.
100 |       Some typical examples for ineffective action:
101 |          - misclick in a wrong element
102 |          - forget to clear existing text in input bar
103 |    3. It made some effect, and it successfully align with previous plan. You should progress to the next step based on the current state.
104 | 3. Based on your action history, assess your progress toward completing the overall task.
105 | 4. Exploring new ways to finish the task if there are already failed attempts in history steps. **DO NOT repeat** the history actions.
106 | 
107 | 
108 | Response Format: Your thought process\n\nAction: The specific action you choose to take
109 | """
110 | 
111 | 
112 | AGENT_PROMPT = """You are a helpful assistant who can help users complete computer tasks, with **full permission** to make any operations on the user's computer. The operating system is windows. 
113 | Based on the provided current state, you need to suggest the next action to complete the task. Do not try to complete the entire task in one step. Break it down into smaller steps, and at each step you will get a new state to interact with.
114 | 
115 | IMPORTANT: You must strictly adhere to the following rules:
116 | 1. Choose ONLY ONE action from the list below for each response, DO NOT perform more than one action per step.
117 | 2. Follow the exact syntax format for the selected action, DO NOT create or use any actions other than those listed.
118 | 3. Once the task is completed, output action finish.
119 | 
120 | Valid actions:
121 | 
122 | 1. click (x, y)
123 | click the element at the position (x, y) on the screen
124 | 
125 | 2. right click (x, y)
126 | right click the element at the position (x, y) on the screen
127 | 
128 | 3. double click (x, y)
129 | double click the element at the position (x, y) on the screen
130 | 
131 | 4. drag from (x1, y1) to (x2, y2)
132 | drag the element from position (x1, y1) to (x2, y2).
133 | 
134 | 5. scroll (x)
135 | scroll the screen vertically with pixel offset x. Positive values of x: scroll up, negative values of x: scroll down.
136 | 
137 | 6. press key: key_content
138 | press the key key_content on the keyboard.
139 | 
140 | 7. hotkey (key1, key2)
141 | press the hotkey composed of key1 and key2.
142 | 
143 | 8. hotkey (key1, key2, key3)
144 | press the hotkey composed of key1, key2, and key3.
145 | 
146 | 9. type text: text_content
147 | type content text_content on the keyboard.
148 | 
149 | 10. wait
150 | wait for some time, usually for the system to respond, screen to refresh, advertisement to finish.
151 | 
152 | 11. finish
153 | indicating that the task has been completed.
154 | 
155 | 12. fail
156 | indicating that the task has failed, of this task is infeasible because not enough information is provided.
157 | 
158 | Response Format: {Your thought process}\n\nAction: {The specific action you choose to take}
159 | 
160 | --------------------------------------------
161 | 
162 | """
163 | 
164 | 


--------------------------------------------------------------------------------
/deploy/agent.py:
--------------------------------------------------------------------------------
  1 | # agent.py
  2 | 
  3 | import re
  4 | import time
  5 | from typing import Dict, List
  6 | from PIL import Image
  7 | from io import BytesIO
  8 | from utils import *
  9 | from prompt import * 
 10 | 
 11 | 
 12 | class PCAgentE:
 13 |     def __init__(
 14 |         self, client, model, max_steps=30, screenshot_size=(1280, 720), prompt=AGENT_PROMPT
 15 |     ):
 16 |         self.retry_click_elements = []
 17 |         self.history = []
 18 |         self.history_cut_off = 10
 19 |         self.client = client
 20 |         self.model = model
 21 |         self.max_steps = max_steps
 22 |         self.screenshot_size = screenshot_size
 23 |         self.prompt = prompt
 24 |         self.steps = 0
 25 |         print(f"Using model: {model}")
 26 |        
 27 |     def predict(self, instruction: str, obs: Dict):
 28 |         """
 29 |         Predict the next action based on the current observation
 30 |         Args:
 31 |             instruction: the task description
 32 |             obs: the current observation (obs['screenshot'])
 33 |         Returns:
 34 |             actions: the code of next action
 35 |             logs: the logs of next action
 36 |         """
 37 |         logs = {}
 38 |         self.task_description = instruction
 39 |         
 40 |         # get and process the screenshot
 41 |         image_file = BytesIO(obs['screenshot'])
 42 |         view_image = Image.open(image_file)
 43 | 
 44 |         # call the visual language model for planning
 45 |         self.screenshot_size = view_image.size
 46 |         try_time = 5
 47 |         feedback = ""
 48 |         while try_time > 0:
 49 |             plan, action = self.get_plan(view_image, self.task_description, feedback)
 50 |             action_code = self.get_action_code(action)
 51 |             if action_code is None:
 52 |                 print(f"Invalid action: {action}, Try again.")
 53 |                 feedback = f"\n\nNote: You have provided an invalid action before: {action}, please try again."
 54 |                 try_time -= 1
 55 |                 if try_time == 0:
 56 |                     raise ValueError(f"Fail to get valid action after 5 try: {action}")
 57 |             else:
 58 |                 self.add_to_history(f"Plan: {plan}\n\nAction: {action}")
 59 |                 break
 60 | 
 61 |         # check if the steps is greater than the max steps
 62 |         self.steps += 1
 63 |         if self.steps >= self.max_steps and action_code != "DONE":
 64 |             logs['plan_result'] = "Max steps reached"
 65 |             actions = ["FAIL"]
 66 |         else:
 67 |             logs['plan_result'] = f"Plan: {plan}\n\nAction: {action}"
 68 |             actions = [action_code]
 69 | 
 70 |         return actions, logs
 71 | 
 72 |     def reset(self):
 73 |         """Reset the agent state"""
 74 |         self.history.clear()
 75 |         pass
 76 | 
 77 |     def get_plan(self, screenshot, task_description, feedback=""):
 78 |         """
 79 |         get the next plan
 80 |         Args:
 81 |             screenshot: the screenshot
 82 |             task_description: task description
 83 |             retry_click_elements: the list of elements that failed to click before
 84 |         Returns:
 85 |             plan_str: plan description
 86 |             action_str: specific action
 87 |         """
 88 |         base64_image = encode_image(screenshot)
 89 |         try_time = 5
 90 |         while try_time > 0:
 91 |             try:
 92 |                 instruction = self.get_plan_instruction(task_description, feedback)
 93 |                 messages = get_mllm_messages(instruction, base64_image)
 94 |                 
 95 |                 completion = self.client.chat.completions.create(
 96 |                     model=self.model,
 97 |                     messages=messages,
 98 |                     max_tokens=512,
 99 |                     temperature=0.8
100 |                 )
101 |                 output_text = completion.choices[0].message.content
102 |                 print(f"Output from agent: {output_text}")
103 | 
104 |                 if not "Action" in output_text:
105 |                     feedback = f"\n\nNote: You should provide an action after 'Action:' in the response."
106 |                 
107 |                 return self.split_output(output_text)
108 |             
109 |             except Exception as e:
110 |                 print(f"Failed to get the plan: {e}, try again.")
111 |                 time.sleep(1)
112 |                 if try_time == 1:
113 |                     raise Exception(f"Failed to get the plan: {e}")
114 |                 
115 |             try_time -= 1
116 | 
117 |     def add_to_history(self, output):
118 |         """
119 |         add the output to the history
120 |         """
121 |         self.history.append(output)
122 | 
123 |     def get_action_history(self):
124 |         if len(self.history) > self.history_cut_off:
125 |             history_str = "\n\n".join(f"[{i+1}] {item}" for i, item in enumerate(self.history[-self.history_cut_off:]))
126 |         else:
127 |             history_str = "\n\n".join(f"[{i+1}] {item}" for i, item in enumerate(self.history))
128 |                 
129 |         if history_str == '':
130 |             history_str = "None"
131 |     
132 |         return history_str
133 |     
134 |     def get_plan_instruction(self, task_description, feedback=""):
135 |         """
136 |         generate the planning instruction
137 |         """
138 |         prompt = self.prompt + f"Your task is: {task_description}\n\n"
139 |         history_str = self.get_action_history()
140 |         prompt += f"History of the previous actions and thoughts you have done to reach the current screen: {history_str}\n\n"
141 |         prompt += "--------------------------------------------\n\n"
142 |         prompt += f"Given the screenshot. What's the next step that you will do to help with the task?"
143 |         prompt += feedback
144 |         return prompt
145 |     
146 |     def split_output(self, output):
147 |         """
148 |         split the output into plan and action
149 |         """
150 |         plan_str = output.split("Action:")[0].strip().strip('{}')
151 |         action_str = output.split("Action:")[1].strip().strip('{}')
152 |         return plan_str, action_str
153 |     
154 |     def get_action_code(self, action) -> str:
155 |         screen_width, screen_height = self.screenshot_size
156 |         # click
157 |         match = re.match(r"click \((-?\d+), (-?\d+)\)", action)
158 |         if match:
159 |             x = int(match.group(1))
160 |             y = int(match.group(2))
161 |             if 0 <= x < screen_width and 0 <= y < screen_height:
162 |                 return f"pyautogui.click({x}, {y})"
163 |             else:
164 |                 return None
165 | 
166 |         # right click
167 |         match = re.match(r"right click \((-?\d+), (-?\d+)\)", action)
168 |         if match:
169 |             x = int(match.group(1))
170 |             y = int(match.group(2))
171 |             if 0 <= x < screen_width and 0 <= y < screen_height:
172 |                 return f"pyautogui.rightClick({x}, {y})"
173 |             else:
174 |                 return None
175 | 
176 |         # double click
177 |         match = re.match(r"double click \((-?\d+), (-?\d+)\)", action)
178 |         if match:
179 |             x = int(match.group(1))
180 |             y = int(match.group(2))
181 |             if 0 <= x < screen_width and 0 <= y < screen_height:
182 |                 return f"pyautogui.doubleClick({x}, {y})"
183 |             else:
184 |                 return None
185 | 
186 |         # drag
187 |         match = re.match(r"drag from \((-?\d+), (-?\d+)\) to \((-?\d+), (-?\d+)\)", action)
188 |         if match:
189 |             x1 = int(match.group(1))  # start x coordinate
190 |             y1 = int(match.group(2))  # start y coordinate
191 |             x2 = int(match.group(3))  # target x coordinate
192 |             y2 = int(match.group(4))  # target y coordinate
193 |             if 0 <= x1 < screen_width and 0 <= y1 < screen_height and 0 <= x2 < screen_width and 0 <= y2 < screen_height:
194 |                 return f"pyautogui.mouseDown({x1}, {y1})\npyautogui.dragTo({x2}, {y2}, duration=0.5)"
195 |             else:
196 |                 return None
197 |             
198 |         # scroll
199 |         match = re.match(r"scroll \((-?\d+)\)", action)
200 |         if match:
201 |             y = int(match.group(1))  # vertical scroll distance
202 |             return f"pyautogui.scroll({y})"  # positive: scroll up, negative: scroll down
203 | 
204 |         # press key
205 |         match = re.match(r"press key: (.+)", action)
206 |         if match:
207 |             key_content = match.group(1).lower()
208 |             # Format error
209 |             if 'key' in key_content:
210 |                 return None
211 |             # if key is not in the valid keyboard keys list
212 |             if key_content not in KEYBOARD_KEYS:
213 |                 return None
214 |             return f"pyautogui.press('{key_content}')"
215 | 
216 |         # hotkey
217 |         match = re.match(r"hotkey \((.+), (.+), (.+)\)", action)
218 |         if match:
219 |             key1 = match.group(1).strip("'").lower()
220 |             key2 = match.group(2).strip("'").lower()
221 |             key3 = match.group(3).strip("'").lower()
222 |             # Format error
223 |             if 'key' in key1 or 'key' in key2 or 'key' in key3:
224 |                 return None
225 |             return f"pyautogui.hotkey('{key1}', '{key2}', '{key3}')"
226 |         
227 |         match = re.match(r"hotkey \((.+), (.+)\)", action)
228 |         if match:
229 |             key1 = match.group(1).strip("'").lower()
230 |             key2 = match.group(2).strip("'").lower()
231 |             # Format error
232 |             if 'key' in key1 or 'key' in key2:
233 |                 return None
234 |             return f"pyautogui.hotkey('{key1}', '{key2}')"
235 |         
236 |         # type text
237 |         match = re.match(r"type text: (.+)", action)
238 |         if match:
239 |             text_content = match.group(1).strip("'").strip("\"")
240 |             text_content = text_content.replace("\"", "\\\"")
241 |             text_content = text_content.replace("\'", "\\\'")
242 |             # Format error
243 |             if "text_content" in text_content:
244 |                 return None
245 |             return f"pyautogui.write(\"{text_content}\")"
246 | 
247 |         # wait
248 |         if action == "wait":
249 |             return "WAIT"
250 |             
251 |         # finish
252 |         if action == "finish":
253 |             return "DONE"
254 | 
255 |         # fail
256 |         if action == "fail":
257 |             return "FAIL"
258 |         
259 |         return None
260 | 


--------------------------------------------------------------------------------
/postprocess/boost.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import sys
  4 | import random
  5 | import concurrent.futures
  6 | import argparse
  7 | import traceback
  8 | import time
  9 | from datetime import datetime
 10 | from openai import OpenAI
 11 | from concurrent.futures import ThreadPoolExecutor
 12 | from prompt import *
 13 | from utils import *
 14 | 
 15 | THOUGHT = True
 16 | BOOST = True
 17 | CONCURRENT_NUM = 18
 18 | RE_GENERATE = False
 19 | MAX_CONTEXT_ENTRIES = 30
 20 | DETAILED_OUTPUT = True
 21 | BOOST_COUNT = 9
 22 | 
 23 | 
 24 | client = OpenAI()
 25 | model = "claude-3-7-sonnet-20250219"
 26 | print(f"Using model: {model}")
 27 | 
 28 | 
 29 | def call_model(query, base64_image=None):
 30 |     messages = [
 31 |             {
 32 |                 "role": "user",
 33 |                 "content": [
 34 |                     {
 35 |                         "type": "image_url",
 36 |                         "image_url": {
 37 |                             "url": f"data:image/jpeg;base64,{base64_image}"
 38 |                         }
 39 |                     } if base64_image else None,
 40 |                     {
 41 |                         "type": "text",
 42 |                         "text": query
 43 |                     },
 44 |                 ],
 45 |             },
 46 |         ]
 47 | 
 48 |     retry_time = 10
 49 |     while retry_time > 0:
 50 |         try:
 51 |             completion = client.chat.completions.create(
 52 |                 model=model,
 53 |                 messages=messages,
 54 |                 max_tokens=1000
 55 |             )
 56 |             reply = completion.choices[0].message.content
 57 |             
 58 |             return reply
 59 |         except Exception as e:
 60 |             if retry_time == 1:
 61 |                 raise e
 62 |             else:
 63 |                 pass
 64 |         retry_time -= 1
 65 |          
 66 | 
 67 | def process_concurrently(data_dir, events_prefix, function):
 68 |     tasks = []
 69 |     
 70 |     for item in os.listdir(data_dir):
 71 |         item_path = os.path.join(data_dir, item)
 72 | 
 73 |         if os.path.isdir(item_path) and item.startswith(events_prefix):
 74 |             print(f'Processing directory: {item_path}')
 75 |             for filename in os.listdir(item_path):
 76 |                 if filename.endswith('.jsonl') and 'task' in filename:
 77 |                     file_path = os.path.join(item_path, filename)   
 78 |                     md_path = os.path.join(item_path, filename.replace('.jsonl', '.md'))
 79 |                     try:
 80 |                         with open(md_path, 'r', encoding='utf-8') as file:
 81 |                             lines = file.readlines()
 82 |                         task_description = lines[1].replace('**Description:** ', '').strip()
 83 |                         tasks.append((file_path, task_description))
 84 |                     except Exception as e:
 85 |                         print(f"error: failed to extract task description from {md_path}: {e}")
 86 | 
 87 |     random.shuffle(tasks)
 88 |     with ThreadPoolExecutor(max_workers=CONCURRENT_NUM) as executor:
 89 |         futures = [executor.submit(function, file_path, task_description) 
 90 |                   for file_path, task_description in tasks]
 91 |         concurrent.futures.wait(futures)
 92 |         
 93 | 
 94 | def get_history_str_for_boost(history_steps):
 95 |     """
 96 |     no context limit, extra 
 97 |     """
 98 |     history_str = ""
 99 |     for i, step in enumerate(history_steps):
100 |         step_id, step_content = step
101 |         if i == len(history_steps) - 1:
102 |             history_str += f"**Your Previous Step**: Step {step_id}: {step_content}"
103 |         else:
104 |             history_str += f"Step {step_id}: {step_content}\n\n"
105 |     return history_str
106 | 
107 | 
108 | def get_thought(task_description, entry, history_steps, marked_screenshot_path):
109 |     """
110 |     Generate thought for the action.
111 |     """
112 |     base64_image = encode_image(marked_screenshot_path)
113 |     action = entry["action"]
114 |     element_description = entry["element"]
115 |     history_str = get_history_str_for_boost(history_steps)
116 | 
117 |     query = THOUGHT_COMPLETION_PROMPT \
118 |         + f"The task you are attempting to complete: {task_description}\n\n" \
119 |         + f"Your performing history:\n{history_str}\n\n" \
120 |         + f"The specific action you chose to perform: {action}\n\n"
121 |         
122 |     if element_description and element_description != "Unknown":
123 |         query += f"The element you clicked: {element_description}\n\n"
124 | 
125 |     while True:
126 |         thought = call_model(query, base64_image)
127 |         thought = refine_thought(thought)
128 |         if thought is not None:
129 |             return thought
130 | 
131 | 
132 | def get_boost_responses(task_description, entry, history_steps, screenshot_path, num):
133 |     """
134 |     Generate boost responses
135 |     """
136 |     base64_image = encode_image(screenshot_path)
137 |     history_str = get_history_str_for_boost(history_steps)
138 |     
139 |     query = TRAJECTORY_BOOST_PROMPT \
140 |         + f"The task you are attempting to complete: {task_description}\n\n" \
141 |         + f"Your performing history:\n{history_str}\n\n" \
142 |         + f"Given the screenshot as below. What's the next step that you will do to help with the task?"
143 | 
144 |     responses = []
145 |     
146 |     # Add more boost responses one by one
147 |     for i in range(num-len(responses)):
148 |         try_time = 5
149 |         while try_time > 0:
150 |             response = call_model(query, base64_image)
151 |             response = refine_response(response)
152 |             if response is not None:
153 |                 responses.append(response)
154 |                 break
155 |             try_time -= 1
156 |         responses.append(None)
157 |             
158 |     return responses
159 |           
160 |     
161 | def add_entry_for_file(file_path, task_description):
162 |     print(f"begin add entry for {file_path}")
163 |     entries = []
164 | 
165 |     try:
166 |         with open(file_path, 'r', encoding='utf-8') as file:
167 |             entries = [json.loads(line) for line in file]
168 |     except Exception as e:
169 |         print(f"error: failed to read file {file_path}: {e}")
170 |         return
171 |         
172 |     try:
173 |         for id, entry in enumerate(entries):
174 |             # check marked screenshot available
175 |             if 'marked_screenshot' not in entry:
176 |                 print(f"error: marked_screenshot field not found: {file_path}")
177 |                 continue
178 |             
179 |             marked_screenshot_path = os.path.join(os.path.dirname(file_path), entry['marked_screenshot'])
180 |             screenshot_path = os.path.join(os.path.dirname(file_path), entry['screenshot'])
181 |             if not os.path.isfile(marked_screenshot_path):
182 |                 print(f"error: screenshot file not found: {marked_screenshot_path}")
183 |                 continue
184 |             
185 |             # build history steps
186 |             history_steps = []
187 |             start_idx = max(0, id - MAX_CONTEXT_ENTRIES)
188 |             for hist_id in range(start_idx, id):
189 |                 hist_entry = entries[hist_id]
190 |                 if 'thought' in hist_entry:
191 |                     history_steps.append((hist_id+1, combine_thought_action_to_response(hist_entry['thought'], hist_entry['action'])))
192 |             
193 |             # get thought completion
194 |             if THOUGHT:
195 |                 try:
196 |                     field = "thought"
197 |                     if field in entry:
198 |                         if RE_GENERATE:
199 |                             entry[field] = get_thought(task_description, entry, history_steps, marked_screenshot_path)
200 |                         else:
201 |                             # try refine thought
202 |                             thought = refine_thought(entry[field])
203 |                             # re-generate if not qualified
204 |                             if thought is None:
205 |                                 entry[field] = get_thought(task_description, entry, history_steps, marked_screenshot_path)
206 |                     else:
207 |                         entry[field] = get_thought(task_description, entry, history_steps, marked_screenshot_path)
208 |                 except Exception as e:
209 |                     print(f"error: failed to add thought file {file_path}: {e}")  
210 |             
211 |             # get boost responses
212 |             if BOOST:
213 |                 try:
214 |                     field = "boost_responses"
215 |                     if field in entry:
216 |                         if RE_GENERATE:
217 |                             entry[field] = get_boost_responses(task_description, entry, history_steps, screenshot_path, BOOST_COUNT)
218 |                         else:
219 |                             responses = []
220 |                             # append existing reponse after refinement
221 |                             for response in entry[field]:
222 |                                 # remove empty response
223 |                                 if response is None:
224 |                                     continue
225 |                                 response = refine_response(response)
226 |                                 responses.append(response)
227 |                             # add new reponses if not enough
228 |                             if len(responses) < BOOST_COUNT:
229 |                                 print(f"append new boost response\n")
230 |                                 new_reponses = get_boost_responses(task_description, entry, history_steps, screenshot_path, BOOST_COUNT - len(responses))
231 |                                 responses.extend(new_reponses)
232 |                                 
233 |                             entry[field] = responses
234 |                     else:
235 |                         entry[field] = get_boost_responses(task_description, entry, history_steps, screenshot_path, BOOST_COUNT)    
236 |                 except Exception as e:
237 |                     print(f"error: failed to boost file {file_path}: {e}") 
238 |                     raise 
239 |                 
240 |             if DETAILED_OUTPUT:
241 |                 print(f"boost finished for entry {id} in file {file_path}")
242 |          
243 |             with open(file_path, 'w', encoding='utf-8') as file:
244 |                 for entry in entries:
245 |                     json.dump(entry, file, ensure_ascii=False)
246 |                     file.write('\n')
247 | 
248 |         rewrite_markdown_file_by_jsonl(file_path)
249 |         print(f"finished adding thought for {file_path}")
250 | 
251 |     except Exception as e:
252 |         traceback.print_exc()
253 |         print(f"error: failed to process file {file_path}: {e}")
254 |         if "Expecting" in str(e) or "Invalid control character" in str(e):
255 |             print(f"file {file_path} is corrupted, deleting...")
256 |             try:
257 |                 os.remove(file_path)
258 |                 print(f"deleted corrupted file: {file_path}")
259 |             except OSError as delete_error:
260 |                 print(f"error: failed to delete corrupted file: {delete_error}")
261 | 
262 | 
263 | if __name__ == "__main__": 
264 |     parser = argparse.ArgumentParser(description="Choose which model to use.")
265 |     parser.add_argument(
266 |         "--specific_data_dir",
267 |         type=str,
268 |         default=None,
269 |         help="Optional path to a specific data directory.",
270 |     )
271 |     parser.add_argument(
272 |         "--events_prefix",
273 |         type=str,
274 |         default=None,
275 |         help="Events prefix",
276 |     )
277 |     parser.add_argument(
278 |         "--boost_count",
279 |         type=int,
280 |         default=None,
281 |         help="Optional number of items to boost. If None, boost all."
282 |     )
283 |     
284 |     args = parser.parse_args()
285 |     
286 |     start_time = datetime.now()
287 |     print(f"start time: {start_time}")
288 | 
289 |     # Get parent directory of current script
290 |     root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
291 | 
292 |     # Build total data folder path
293 |     data_dir = os.path.join(root_dir, 'data')
294 |     if not os.path.exists(data_dir):
295 |         print(f"error: {data_dir} directory does not exist")
296 |         exit()
297 |     
298 |     # Events folder prefix
299 |     events_prefix = "events" if args.events_prefix is None else args.events_prefix
300 |     
301 |     process_concurrently(data_dir, events_prefix, add_entry_for_file)
302 |     
303 |     print("process events finished!")
304 |     
305 |     end_time = datetime.now()
306 |     print(f"end time: {end_time}")
307 |     print(f"Total time: {end_time - start_time}")
308 | 


--------------------------------------------------------------------------------
/postprocess/refinement.py:
--------------------------------------------------------------------------------
  1 | # multi-function script for data refinement
  2 | # 1. rewrite screenshot path
  3 | # 2. clean fail and error record
  4 | # 3. check last action finish
  5 | # 4. merge press and drag
  6 | # 5. remove redundant actions
  7 | # 6. remove meaningless actions
  8 | # 7. rewrite scroll
  9 | # 8. resize screenshot and coordinates to 1080p -> 720p
 10 | # 9. clean tracker interface
 11 | # 10. mark screenshot with red rect and point
 12 | # 11. rewrite markdown file
 13 | # 12. statistics
 14 | # support interrupt
 15 | 
 16 | 
 17 | import os
 18 | import json
 19 | import sys
 20 | import numpy as np
 21 | from PIL import Image
 22 | from utils import *
 23 | 
 24 | OVERWRITE_MARKED = True
 25 | REMOVE_FAIL_RECORD = True
 26 | DETAIL_OUTPUT = False 
 27 | 
 28 | 
 29 | def screenshot_of_tracker(screenshot_path, sample_size=100):
 30 |     """
 31 |     check if the screenshot is a Tracker interface.
 32 |     """
 33 |     if get_file_size_kb(screenshot_path) > 83:  # magic number
 34 |         return False
 35 | 
 36 |     bg_color = "#f0f0f0"
 37 |     bg_threshold = 0.8
 38 |     top_offset = 40  # top area offset
 39 |     bottom_offset = 80  # bottom area offset
 40 | 
 41 |     with Image.open(screenshot_path) as img:
 42 |         width, height = img.size
 43 | 
 44 |         # define the sampling regions
 45 |         sample_regions = [
 46 |             (0, top_offset, sample_size, sample_size + top_offset),  # top left corner
 47 |             (width - sample_size, top_offset, width, sample_size + top_offset),  # top right corner
 48 |             (0, height - sample_size - bottom_offset, sample_size, height - bottom_offset),  # bottom left corner
 49 |             (width - sample_size, height - sample_size - bottom_offset, width, height - bottom_offset)  # bottom right corner
 50 |         ]
 51 | 
 52 |         # convert the background color to numpy array
 53 |         bg_color_rgb = np.array([int(bg_color[i:i + 2], 16) for i in (1, 3, 5)])
 54 | 
 55 |         # check the four regions
 56 |         for region in sample_regions:
 57 |             sample_region = img.crop(region)
 58 |             sample_array = np.array(sample_region)[:, :, :3]
 59 |             matches = np.all(sample_array == bg_color_rgb, axis=2)
 60 |             bg_ratio = np.sum(matches) / matches.size
 61 | 
 62 |             if bg_ratio < bg_threshold:
 63 |                 return False
 64 | 
 65 |     return True
 66 | 
 67 | 
 68 | def clean_tracker_interface(file_path):
 69 |     """
 70 |     clean the action records of the Tracker interface.
 71 | 
 72 |     return the number of actions after cleaning, -1 means the file is deleted
 73 |     """
 74 |     if DETAIL_OUTPUT:
 75 |         print(f"Clean tracker interface: {file_path}")
 76 |     screenshot_paths = []
 77 |     entries = []
 78 | 
 79 |     with open(file_path, 'r', encoding='utf-8') as file:
 80 |         for line in file:
 81 |             entry = json.loads(line)
 82 |             full_path = os.path.join(os.path.dirname(file_path), entry['screenshot'])
 83 |             screenshot_paths.append(full_path)
 84 |             entries.append(entry)
 85 | 
 86 |     last_entry_action = entries[-1].get('action')
 87 |     markdown_path = file_path.replace('.jsonl', '.md')
 88 | 
 89 |     # scan and identify the action of the Tracker interface
 90 |     begin = -1
 91 |     interval_list = []  # [begin, end)
 92 |     for index, screenshot_path in enumerate(screenshot_paths):
 93 |         # find the screenshot of the Tracker interface
 94 |         if screenshot_of_tracker(screenshot_path):
 95 |             if begin == -1:
 96 |                 begin = index
 97 |         else:
 98 |             # back to the screenshot of non-Tracker interface, end the interval
 99 |             if begin != -1:
100 |                 interval_list.append((begin, index))
101 |                 begin = -1
102 | 
103 |     interval_list.append((begin, len(screenshot_paths)))  # the last interval (begin maybe -1)
104 | 
105 |     # delete the last interval (finish/fail)
106 |     begin, end = interval_list.pop()
107 |     if begin != -1:
108 |         entries = entries[:begin]
109 |         try:
110 |             entries[-1]['action'] = last_entry_action
111 |             entries[-1]['element'] = None
112 |             entries[-1]['rect'] = None
113 |         except Exception as e:  # empty data
114 |             print(f"[ERROR] delete related records (probably empty): {e}")
115 |             # delete the JSONL file
116 |             os.remove(file_path)
117 |             # delete the Markdown file
118 |             os.remove(markdown_path)
119 |             # delete the screenshot files
120 |             for screenshot_path in screenshot_paths:
121 |                 remove_screenshot(screenshot_path)
122 |             return -1
123 | 
124 |         for i in range(begin, end):
125 |             remove_screenshot(screenshot_paths[i])
126 | 
127 |     # delete other intervals
128 |     to_remove_entry_set = set()
129 |     for begin, end in interval_list:
130 |         for i in range(begin - 1, end):
131 |             remove_screenshot(screenshot_paths[i])
132 |             to_remove_entry_set.add(i)
133 | 
134 |     entries = [entry for i, entry in enumerate(entries) if i not in to_remove_entry_set]
135 | 
136 |     # save the updated JSONL file
137 |     with open(file_path, 'w', encoding='utf-8') as file:
138 |         for entry in entries:
139 |             json.dump(entry, file, ensure_ascii=False)
140 |             file.write('\n')
141 | 
142 |     return len(entries)
143 | 
144 | 
145 | def clean_fail_and_error(file_path):
146 |     """
147 |     clean the records without corresponding Markdown files or the last action is 'fail' or there is None in action.
148 | 
149 |     return True if the file is deleted, False otherwise.
150 |     """
151 |     markdown_path = file_path.replace('.jsonl', '.md')
152 |     if DETAIL_OUTPUT:
153 |         print(f"Clean fail: {file_path}")
154 |     try:
155 |         with open(file_path, 'r', encoding='utf-8') as infile:
156 |             entries = [json.loads(line) for line in infile]
157 |     except Exception as e:
158 |         print(f"[ERROR] Failed to read file {file_path}: {e}")
159 |         return False
160 | 
161 |     screenshot_paths = [os.path.join(os.path.dirname(file_path), entry['screenshot']) for entry in entries]
162 |     last_entry_action = entries[-1]['action'] if entries else ''
163 | 
164 |     # delete the records without corresponding Markdown files
165 |     if not os.path.exists(markdown_path):
166 |         if DETAIL_OUTPUT:
167 |             print(f"File {file_path} has no corresponding Markdown file")
168 |             print("Delete related records...")
169 |         # delete the JSONL file
170 |         os.remove(file_path)
171 |         # delete the screenshot files
172 |         for screenshot_path in screenshot_paths:
173 |             remove_screenshot(screenshot_path)
174 |         return True
175 | 
176 |     # clean the fail records (optional)
177 |     if REMOVE_FAIL_RECORD and last_entry_action == 'fail':
178 |         if DETAIL_OUTPUT:
179 |             print(f"File {file_path} ends with fail action")
180 |             print("Delete related records...")
181 |         # delete the JSONL file
182 |         os.remove(file_path)
183 |         # delete the Markdown file
184 |         os.remove(markdown_path)
185 |         # delete the screenshot files
186 |         for screenshot_path in screenshot_paths:
187 |             remove_screenshot(screenshot_path)
188 |         return True
189 |     
190 |     # check if there is None in action
191 |     for entry in entries:
192 |         if entry['action'] is None or "None" in entry['action']:
193 |             if DETAIL_OUTPUT:
194 |                 print(f"File {file_path} has None in action")
195 |                 print("Delete related records...")
196 |             # delete the JSONL file
197 |             os.remove(file_path)
198 |             # delete the Markdown file
199 |             os.remove(markdown_path)
200 |             # delete the screenshot files
201 |             for screenshot_path in screenshot_paths:
202 |                 remove_screenshot(screenshot_path)
203 |             return True
204 |     
205 |     return False
206 | 
207 | 
208 | def resize(file_path):
209 |     if DETAIL_OUTPUT:
210 |         print(f"Resize file: {file_path}")
211 | 
212 |     # get the directory of the file
213 |     task_dir = os.path.dirname(file_path)
214 | 
215 |     # read the screenshot path of the last entry
216 |     try:
217 |         with open(file_path, 'r', encoding='utf-8') as infile:
218 |             lines = infile.readlines()
219 |             last_line = lines[-1]
220 |             last_entry = json.loads(last_line)
221 |             screenshot_path = os.path.join(task_dir, last_entry['screenshot'])
222 |     except Exception as e:
223 |         print(f"[ERROR] Failed to read the screenshot path of the last entry: {e}")
224 |         return
225 | 
226 |     if not os.path.exists(screenshot_path):
227 |         print(f"[ERROR] The screenshot file does not exist: {screenshot_path}")
228 |         return
229 | 
230 |     # get the resolution of the screenshot
231 |     try:
232 |         with Image.open(screenshot_path) as img:
233 |             original_width, original_height = img.size
234 |             if DETAIL_OUTPUT:
235 |                 print(f"Original resolution: {original_width}x{original_height}")
236 |     except Exception as e:
237 |         print(f"[ERROR] Failed to open the screenshot file {screenshot_path}: {e}")
238 |         return
239 | 
240 |     # original_width, original_height = 2560, 1440
241 | 
242 |     # target resolution
243 |     target_width, target_height = 1280, 720
244 |     if original_width == target_width and original_height == target_height:
245 |         if DETAIL_OUTPUT:
246 |             print(f"The screenshot resolution is the same as the target resolution, no need to resize")
247 |         return
248 | 
249 |     scale_x = target_width / original_width
250 |     scale_y = target_height / original_height
251 |     if DETAIL_OUTPUT:
252 |         print(f"Resize ratio - X: {scale_x:.4f}, Y: {scale_y:.4f}")
253 | 
254 |     # process the JSONL file
255 |     modified_lines = []
256 |     for line in lines:
257 |         try:
258 |             data = json.loads(line)
259 | 
260 |             # process the screenshot
261 |             screenshot_path = os.path.join(task_dir, data['screenshot'])
262 |             assert resize_to_720p(screenshot_path), "Error occured!"
263 | 
264 |             # process the action
265 |             data['action'] = resize_action(data['action'], scale_x, scale_y)
266 | 
267 |             # process the rect
268 |             if 'rect' in data and isinstance(data['rect'], dict):
269 |                 rect = data['rect']
270 |                 rect['left'] = round(rect['left'] * scale_x)
271 |                 rect['top'] = round(rect['top'] * scale_y)
272 |                 rect['right'] = round(rect['right'] * scale_x)
273 |                 rect['bottom'] = round(rect['bottom'] * scale_y)
274 |                 if DETAIL_OUTPUT:
275 |                     print(f"Resize rect: {rect}")
276 | 
277 |             modified_lines.append(json.dumps(data, ensure_ascii=False) + '\n')
278 |         except Exception as e:
279 |             print(f"[WARNING] Error when processing the line: {line.strip()} - {e}")
280 |             modified_lines.append(line)
281 | 
282 |     # directly write the modified content, overwrite the original file
283 |     try:
284 |         with open(file_path, 'w', encoding='utf-8') as outfile:
285 |             outfile.writelines(modified_lines)
286 |         if DETAIL_OUTPUT:
287 |             print(f"Saved the modified file: {file_path}")
288 |     except Exception as e:
289 |         print(f"[ERROR] Failed to write the file {file_path}: {e}")
290 | 
291 | 
292 | def mark(file_path):
293 |     if DETAIL_OUTPUT:
294 |         print(f"Mark file: {file_path}")
295 |     
296 |     # get the directory of the file
297 |     task_dir = os.path.dirname(file_path)
298 |     
299 |     # process the JSONL file
300 |     modified_lines = []
301 |     with open(file_path, 'r', encoding='utf-8') as infile:
302 |         for line in infile:
303 |             entry = json.loads(line)
304 | 
305 |             if not OVERWRITE_MARKED and 'marked_screenshot' in entry:
306 |                 if DETAIL_OUTPUT:
307 |                     print(f"Already marked: {entry['marked_screenshot']}")
308 |                 modified_lines.append(line)
309 |                 continue
310 | 
311 |             screenshot = os.path.join(task_dir, entry.get('screenshot'))
312 |             action = entry.get('action')
313 |             rect = entry.get('rect')
314 | 
315 |             if rect is not None and action != "finish": # click or drag
316 |                 click_action_name, coordinates = parse_click_action(action)
317 |                 if click_action_name != None: # click related action
318 |                     x, y = coordinates
319 |                     marked_screenshot = mark_image(is_click_action=True, image_path=screenshot, rect=rect, point1={'x': x, 'y': y})
320 |                     entry['marked_screenshot'] = marked_screenshot
321 |                 else: # drag related action
322 |                     (x1, y1), (x2, y2) = parse_drag_action(action)
323 |                     marked_screenshot = mark_image(is_click_action=False, image_path=screenshot, rect=rect, point1={'x': x1, 'y': y1}, point2={'x': x2, 'y': y2})
324 |                     entry['marked_screenshot'] = marked_screenshot
325 |             else:
326 |                 # rect is None, copy the original screenshot path
327 |                 entry['marked_screenshot'] = screenshot
328 | 
329 |             # remove the task_dir prefix of marked_screenshot
330 |             entry['marked_screenshot'] = entry['marked_screenshot'].replace(
331 |                 task_dir + '/', '')
332 | 
333 |             modified_lines.append(json.dumps(entry, ensure_ascii=False) + '\n')
334 | 
335 |     # write the modified content, overwrite the original file
336 |     with open(file_path, 'w', encoding='utf-8') as outfile:
337 |         outfile.writelines(modified_lines)
338 | 
339 | 
340 | def rewrite_screenshot_path(file_path):
341 |     if DETAIL_OUTPUT:
342 |         print(f"Rewrite screenshot path: {file_path}")
343 | 
344 |     modified_lines = []
345 |     with open(file_path, 'r', encoding='utf-8') as file:
346 |         for line in file:
347 |             entry = json.loads(line)
348 | 
349 |             # process the screenshot field, remove the possible prefix 'events\\'
350 |             if entry['screenshot'].startswith('events\\'):
351 |                 entry['screenshot'] = entry['screenshot'][7:]  # remove the 'events\\' prefix
352 | 
353 |             # replace the backslash with the forward slash (Linux format)
354 |             if "\\" in entry['screenshot']:
355 |                 entry['screenshot'] = entry['screenshot'].replace("\\", "/")
356 | 
357 |             modified_lines.append(json.dumps(entry, ensure_ascii=False) + '\n')
358 | 
359 |     with open(file_path, 'w', encoding='utf-8') as outfile:
360 |         outfile.writelines(modified_lines)
361 | 
362 | 
363 | duplicate_clicks = 0
364 | adjacent_clicks = 0
365 | 
366 | 
367 | def remove_redundant_actions(file_path):
368 |     if DETAIL_OUTPUT:
369 |         print(f"Remove redundant actions: {file_path}")
370 |     ctrl_cnt = 0
371 |     shift_cnt = 0
372 |     wait_cnt = 0
373 |     caps_lock_cnt = 0
374 |     all_entries = []
375 |     kept_entries = []
376 |     screenshot_paths = []
377 |     continuous_wait_at_begin = False
378 | 
379 |     with open(file_path, 'r', encoding='utf-8') as file:
380 |         for line in file:
381 |             entry = json.loads(line)
382 |             all_entries.append(entry)
383 |             
384 |     total_cnt = len(all_entries)
385 |     skip = False
386 |     for id, entry in enumerate(all_entries):
387 |         if skip:
388 |             skip = False
389 |             continue
390 |         # check the continuous adjacent clicks
391 |         screenshot_path = os.path.join(os.path.dirname(file_path), entry['screenshot'])
392 |         if entry != all_entries[-1] and 'click' in entry['action'] and 'click' in all_entries[id+1]['action']:
393 |             _, (x1, y1) = parse_click_action(entry['action'])
394 |             _, (x2, y2) = parse_click_action(all_entries[id+1]['action'])
395 |             global adjacent_clicks
396 |             global duplicate_clicks
397 |             if entry['action'] == all_entries[id+1]['action']:
398 |                 duplicate_clicks += 1;
399 |             elif abs(x1-x2) + abs(y1-y2) < 5:
400 |                 adjacent_clicks += 1;
401 |         
402 |         # delete the continuous wait at the beginning
403 |         if entry['action'] != 'wait':
404 |             continuous_wait_at_begin = False
405 |         if entry['action'] == 'wait' and (id == 0 or continuous_wait_at_begin):
406 |             wait_cnt += 1
407 |             screenshot_paths.append(screenshot_path)
408 |             continuous_wait_at_begin = True
409 |         # delete the redundant ctrl and shift
410 |         elif entry['action'] == 'press key ctrl' and (entry == all_entries[-1] or all_entries[id+1]['action'] == 'press key ctrl' or all_entries[id+1]['action'].startswith("hotkey (Ctrl,")):
411 |             ctrl_cnt += 1
412 |             screenshot_paths.append(screenshot_path)
413 |         elif entry['action'] == 'press key shift' and (entry == all_entries[-1] or all_entries[id+1]['action'] == 'press key shift' or all_entries[id+1]['action'].startswith('type')):
414 |             shift_cnt += 1
415 |             screenshot_paths.append(screenshot_path)
416 |         elif entry['action'] == 'press key ctrl' and all_entries[id+1]['action'] == 'press key shift':
417 |             # this action and the next action should be deleted
418 |             ctrl_cnt += 1
419 |             shift_cnt += 1
420 |             screenshot_paths.append(screenshot_path)
421 |             screenshot_paths.append(os.path.join(os.path.dirname(file_path), all_entries[id+1]['screenshot']))
422 |             if DETAIL_OUTPUT:
423 |                 print(f"remove ctrl + shift in {file_path} action {id}")
424 |             skip = True
425 |         elif entry['action'] == 'press key caps_lock':
426 |             caps_lock_cnt += 1
427 |             screenshot_paths.append(screenshot_path)
428 |         else:
429 |             kept_entries.append(entry)
430 |                 
431 |     with open(file_path, 'w', encoding='utf-8') as file:
432 |         for entry in kept_entries:
433 |             json.dump(entry, file, ensure_ascii=False)
434 |             file.write('\n')
435 |                 
436 |     if len(kept_entries) == len(all_entries):
437 |         if DETAIL_OUTPUT:
438 |             print(f"File {file_path} has no redundant actions")
439 |         return
440 |     if DETAIL_OUTPUT:
441 |         if wait_cnt != 0:
442 |             print(f"File {file_path} has {wait_cnt}/{total_cnt} redundant wait, removed")
443 |         if ctrl_cnt != 0:
444 |             print(f"File {file_path} has {ctrl_cnt}/{total_cnt} redundant ctrl, removed")
445 |         if shift_cnt != 0:
446 |             print(f"File {file_path} has {shift_cnt}/{total_cnt} redundant shift, removed")
447 |         if caps_lock_cnt != 0:
448 |             print(f"File {file_path} has {caps_lock_cnt}/{total_cnt} redundant caps_lock, removed")
449 |     
450 |     # delete the screenshot files
451 |     for screenshot_path in screenshot_paths:
452 |         os.remove(screenshot_path)
453 | 
454 | 
455 | def remove_meaningless_actions(file_path):
456 |     if DETAIL_OUTPUT:
457 |         print(f"Remove meaningless actions: {file_path}")
458 |     all_entries = []
459 |     kept_entries = []
460 |     screenshot_paths = []
461 | 
462 |     with open(file_path, 'r', encoding='utf-8') as file:
463 |         for line in file:
464 |             entry = json.loads(line)
465 |             all_entries.append(entry)
466 |             
467 |     for id, entry in enumerate(all_entries):
468 |         # check the similarity of two continuous screenshots
469 |         if entry != all_entries[-1] and (entry['action'] == 'wait' or 'click' in entry['action']):
470 |             screenshot_path1 = os.path.join(os.path.dirname(file_path), entry['screenshot'])
471 |             screenshot_path2 = os.path.join(os.path.dirname(file_path), all_entries[id+1]['screenshot'])
472 |             if are_screenshots_identical(screenshot_path1, screenshot_path2):
473 |                 screenshot_paths.append(screenshot_path1)
474 |                 if DETAIL_OUTPUT:
475 |                     print(f"action {id}: {entry['action']} in {file_path} is a meaningless action, it has been removed")
476 |             else:
477 |                 kept_entries.append(entry)
478 |         else:
479 |             kept_entries.append(entry)
480 |             
481 |     if len(kept_entries) == len(all_entries):
482 |         if DETAIL_OUTPUT:
483 |             print(f"File {file_path} has no meaningless actions")
484 |         return
485 |     
486 |     # rewrite the JSON file       
487 |     with open(file_path, 'w', encoding='utf-8') as file:
488 |         for entry in kept_entries:
489 |             json.dump(entry, file, ensure_ascii=False)
490 |             file.write('\n')
491 | 
492 |     # delete the screenshot files
493 |     for screenshot_path in screenshot_paths:
494 |         os.remove(screenshot_path)
495 | 
496 |      
497 | def merge_press_drag(file_path):
498 |     if DETAIL_OUTPUT:
499 |         print(f"Merge press and drag: {file_path}")
500 |     
501 |     all_entries = []
502 |     kept_entries = []
503 |     screenshot_paths = []
504 | 
505 |     with open(file_path, 'r', encoding='utf-8') as file:
506 |         for line in file:
507 |             entry = json.loads(line)
508 |             all_entries.append(entry)
509 |     
510 |     id = 0       
511 |     while id < len(all_entries):
512 |         # check the press action
513 |         if id != len(all_entries) - 1 and all_entries[id]['action'].startswith("press ("):
514 |             if id == len(all_entries) - 2 and all_entries[id+1]['action'] == "finish":
515 |                 # delete this press action
516 |                 id += 1
517 |                 continue
518 |             # the next action must be drag to
519 |             assert all_entries[id+1]['action'].startswith("drag"), f"Error: In file {file_path}, action {id+1} should start with 'drag', but it's {all_entries[id+1]['action']}"
520 |             x1, y1 = extract_coordinates(all_entries[id]['action'])
521 |             x2, y2 = extract_coordinates(all_entries[id+1]['action'])
522 |             if abs(x1-x2) + abs(y1-y2) <= 10:
523 |                 if DETAIL_OUTPUT:
524 |                     print(f"delta: {abs(x1-x2) + abs(y1-y2)} in {file_path} action {id} is too small, it's merged into a single click")
525 |                 all_entries[id]['action'] = f"click ({x2}, {y2})"
526 |             else:
527 |                 if DETAIL_OUTPUT:
528 |                     print(f"action {id}: {all_entries[id]['action']} in {file_path} has been merged with action {id+1}: {all_entries[id+1]['action']}")
529 |                 all_entries[id]['action'] = f"drag from ({x1}, {y1}) to ({x2}, {y2})"
530 |             screenshot_paths.append(os.path.join(os.path.dirname(file_path), all_entries[id+1]['screenshot']))
531 |             kept_entries.append(all_entries[id])
532 |             id += 1 # skip the next action
533 |         else:
534 |             kept_entries.append(all_entries[id])
535 |         
536 |         id += 1
537 |         
538 |     if len(kept_entries) == len(all_entries):
539 |         if DETAIL_OUTPUT:
540 |             print(f"File {file_path} has no press and drag to be merged")
541 |         return
542 |     
543 |     # rewrite the JSON file       
544 |     with open(file_path, 'w', encoding='utf-8') as file:
545 |         for entry in kept_entries:
546 |             json.dump(entry, file, ensure_ascii=False)
547 |             file.write('\n')
548 | 
549 |     # delete the screenshot files
550 |     for screenshot_path in screenshot_paths:
551 |         os.remove(screenshot_path)
552 | 
553 | def rewrite_scroll(file_path):
554 |     if DETAIL_OUTPUT:
555 |         print(f"Rewrite Scroll: {file_path}")
556 |     
557 |     all_entries = []
558 |     kept_entries = []
559 | 
560 |     with open(file_path, 'r', encoding='utf-8') as file:
561 |         for line in file:
562 |             entry = json.loads(line)
563 |             all_entries.append(entry)
564 |     
565 |     for entry in all_entries:
566 |         if entry['action'].startswith("scroll"):
567 |             new_pattern = r'^scroll \(-?\d+\)$'
568 |             match = re.match(new_pattern, entry['action'])
569 |             if match: # already in the new pattern
570 |                 kept_entries.append(entry)
571 |             else:
572 |                 dx, dy = extract_coordinates(entry['action'])
573 |                 if dy != 0:
574 |                     action = f"scroll ({dy})"
575 |                     entry['action'] = action
576 |                     kept_entries.append(entry)
577 |         else:
578 |             kept_entries.append(entry)
579 |                 
580 |     
581 |     # rewrite the JSON file       
582 |     with open(file_path, 'w', encoding='utf-8') as file:
583 |         for entry in kept_entries:
584 |             json.dump(entry, file, ensure_ascii=False)
585 |             file.write('\n')
586 | 
587 |     
588 | def check_finish(file_path):
589 |     if DETAIL_OUTPUT:
590 |         print(f"Check finish: {file_path}")
591 |     
592 |     # read all lines
593 |     try:
594 |         with open(file_path, 'r', encoding='utf-8') as infile:
595 |             lines = infile.readlines()
596 |             last_line = lines[-1]
597 |             last_entry = json.loads(last_line)
598 |     except Exception as e:
599 |         print(f"[ERROR] Failed to read the file content: {e}")
600 |         return
601 | 
602 |     # replace the last action with finish
603 |     if last_entry.get('action') == 'finish':
604 |         if DETAIL_OUTPUT:
605 |             print("The last entry is already 'finish'")
606 |         return
607 |     else:
608 |         if DETAIL_OUTPUT:
609 |             print("The last entry is ", last_entry.get('action'))
610 |             print("Modify the last entry to 'finish'")
611 |         last_entry['action'] = 'finish'
612 | 
613 |     # update the last line
614 |     lines[-1] = json.dumps(last_entry, ensure_ascii=False) + '\n'
615 | 
616 |     # write back to file
617 |     try:
618 |         with open(file_path, 'w', encoding='utf-8') as outfile:
619 |             outfile.writelines(lines)
620 |         if DETAIL_OUTPUT:
621 |             print(f"Saved the modified file: {file_path}")
622 |     except Exception as e:
623 |         print(f"[ERROR] Failed to write the file {file_path}: {e}")
624 |         
625 | 
626 | def process_task_jsonl_file(file_path):
627 |     if DETAIL_OUTPUT:
628 |         print(f"Process task jsonl file: {file_path}")
629 |     rewrite_screenshot_path(file_path)
630 |     if clean_fail_and_error(file_path):
631 |         return -1  # the file is deleted
632 |     check_finish(file_path)
633 |     merge_press_drag(file_path)
634 |     remove_redundant_actions(file_path)
635 |     remove_meaningless_actions(file_path)
636 |     rewrite_scroll(file_path)
637 |     resize(file_path)
638 |     cnt = clean_tracker_interface(file_path)
639 |     if cnt != -1:
640 |         mark(file_path)
641 |         rewrite_markdown_file_by_jsonl(file_path)
642 |     return cnt
643 | 
644 | 
645 | def process_events_directories():
646 |     # Get parent directory of current script
647 |     root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
648 |     
649 |     # Build the path to the data folder
650 |     data_dir = os.path.join(root_dir, 'data')
651 |     if not os.path.exists(data_dir):
652 |         print(f"error: {data_dir} directory does not exist")
653 |         exit()
654 |         
655 |     # Events folder prefix
656 |     events_prefix = "events_"
657 | 
658 |     total_action_cnt = 0
659 |     total_record_cnt = 0
660 |     max_action_cnt = 0
661 | 
662 |     # traverse all subdirectories of the data folder
663 |     for item in os.listdir(data_dir):
664 |         item_path = os.path.join(data_dir, item)
665 | 
666 |         # check if it's a directory and starts with the specified name
667 |         if os.path.isdir(item_path) and item.startswith(events_prefix):
668 |             print(f'Processing directory: {item_path}')
669 |             for filename in os.listdir(item_path):
670 |                 # task jsonl file
671 |                 if filename.endswith('.jsonl') and 'task' in filename:
672 |                     file_path = os.path.join(item_path, filename)
673 |                     cnt = process_task_jsonl_file(file_path)
674 |                     if cnt != -1:
675 |                         total_action_cnt += cnt
676 |                         total_record_cnt += 1
677 |                         max_action_cnt = max(max_action_cnt, cnt)
678 |     
679 |     average_action_cnt = total_action_cnt / total_record_cnt
680 |     print(f"Total records: {total_record_cnt}")
681 |     print(f"Average actions per record: {average_action_cnt:.2f}")
682 |     print(f"Maximum actions: {max_action_cnt}")
683 |     
684 | 
685 | if __name__ == "__main__":
686 |     process_events_directories()


--------------------------------------------------------------------------------
/postprocess/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import json
  4 | import base64
  5 | import cv2
  6 | import numpy as np
  7 | from PIL import Image, ImageDraw
  8 | 
  9 | POINT_RADIUS = 2
 10 | CIRCLE_RADIUS = 18
 11 | CIRCLE_WIDTH = 2
 12 | RECT_WIDTH = 2
 13 | 
 14 | KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'space', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright']
 15 | 
 16 | 
 17 | def refine_response(response):
 18 |     # Returns: refined response or None
 19 |     if response is None:
 20 |         return None
 21 |     response = response.replace("**Action:**", "Action:").strip()
 22 |     response = response.replace("### Action:\nAction:", "Action:").strip()
 23 |     thought, action = parse_thought_action_from_response(response)
 24 |     thought = refine_thought(thought)
 25 |     action = refine_action(action)
 26 |     if thought is None or action is None:
 27 |         return None
 28 |     
 29 |     return combine_thought_action_to_response(thought, action)
 30 | 
 31 | 
 32 | def refine_action(action):
 33 |     # Returns: refined action or None
 34 |     if action is None:
 35 |         return None
 36 |     
 37 |     action = remove_comments_from_action(action)
 38 |     
 39 |     # check if valid
 40 |     if get_action_code(action) is None:
 41 |         return None
 42 |     
 43 |     return action
 44 | 
 45 | 
 46 | def remove_comments_from_action(action):
 47 |     if action is None:
 48 |         return None
 49 |     # Find '#'
 50 |     pos_hash = action.find('#')
 51 |     if pos_hash != -1:
 52 |         action = action[:pos_hash]
 53 |     # Find '//'
 54 |     pos_slash = action.find('//')
 55 |     if pos_slash != -1:
 56 |         action = action[:pos_slash]
 57 |     
 58 |     return action.strip()
 59 | 
 60 | 
 61 | def refine_thought(thought):
 62 |     # Returns: refined thought or None
 63 |     
 64 |     # rule 0: check None
 65 |     if thought is None:
 66 |         return None
 67 |     
 68 |     # rule 1: check 'I can't assist'
 69 |     if "sorry, I can\'t assist" in thought:
 70 |         return None
 71 |     
 72 |     thought = thought.replace("**Thought Process:**", "").strip()
 73 |     
 74 |     # rule 2: check 'Action:' in thought
 75 |     if "Action:" in thought:
 76 |         thought = thought.split("Action:")[0].strip()
 77 |     if "*Action*:" in thought:
 78 |         thought = thought.split("*Action*:")[0].strip()
 79 |     if "**Action:**" in thought:
 80 |         thought = thought.split("**Action:**")[0].strip()
 81 |     
 82 |     # rule 3: check useless title with #
 83 |     if thought.startswith("# Thought Process") or thought.startswith("# My Thought Process"):
 84 |         newline_index = thought.find("\n")
 85 |         if newline_index != -1:
 86 |             thought = thought[newline_index+1:].strip()
 87 |         else:
 88 |             return None
 89 |     
 90 |     # rule 4: check if thought is enclosed in {}
 91 |     if thought.startswith("{") and thought.endswith("}"):
 92 |         thought = thought[1:-1].strip()  # remove the outer {}
 93 |         
 94 |     # rule 5: check if start with Your thought process
 95 |     unwanted_contents = ["{Your thought process}", "Your thought process", "## Thought Process", "# Thought process", "#*# Thought Process", "Thought process:", "Thought process", "My thought process:", "My thought process", "#\n", "#:\n", ":\n"]
 96 |     for unwanted in unwanted_contents:
 97 |         if unwanted in thought:
 98 |             thought = thought.replace(unwanted, "").strip()
 99 |     
100 |     # rule 6: check too short thought
101 |     if len(thought)< 15:
102 |         return None
103 |     
104 |     return thought
105 | 
106 | 
107 | def rewrite_markdown_file_by_jsonl(jsonl_path):
108 |     """
109 |     rewrite markdown file by jsonl file
110 |     """
111 |     with open(jsonl_path, 'r', encoding='utf-8') as file:
112 |         lines = file.readlines()
113 | 
114 |     entries = [json.loads(line) for line in lines]
115 |     markdown_path = jsonl_path.replace('.jsonl', '.md')
116 |     rewrite_markdown_file(markdown_path, entries)
117 | 
118 | 
119 | def rewrite_markdown_file(markdown_path, entries):
120 |     """
121 |     rewrite markdown file by entries, use marked_screenshot if exists
122 |     """
123 |     prompt = '''Given the screenshot as below. What's the next step that you will do to help with the task?'''
124 |     with open(markdown_path, 'r', encoding='utf-8') as file:
125 |         lines = file.readlines()
126 | 
127 |     # keep the first 5 lines
128 |     kept_lines = lines[:5]
129 | 
130 |     # add new lines after the kept lines
131 |     for index, entry in enumerate(entries):
132 |         action = get_full_action(entry)
133 |         screenshot_path = entry['marked_screenshot'] if 'marked_screenshot' in entry else entry['screenshot']
134 |         thought = entry['thought'] if 'thought' in entry else None
135 |         # boost_responses = entry['boost_responses'] if 'boost_responses' in entry else []
136 | 
137 |         kept_lines.append(f'### Step {index+1}\n')
138 |         kept_lines.append(f'**Input:** \n\n{prompt}\n\n')
139 |         kept_lines.append(
140 |             f'<img src="{screenshot_path}" width="100%" height="100%">\n\n')
141 |         
142 |         if thought:
143 |             kept_lines.append(f'**Thought:** \n\n{thought}\n\n')
144 |         
145 |         kept_lines.append(f'**Output:** \n\n{action}\n\n')
146 |                 
147 |     # rewrite the file
148 |     with open(markdown_path, 'w', encoding='utf-8') as file:
149 |         file.writelines(kept_lines)
150 | 
151 | 
152 | def remove_screenshot(screenshot_path):
153 |     """
154 |     remove the screenshot file and the possible _marked file
155 |     """
156 |     if os.path.exists(screenshot_path):
157 |         os.remove(screenshot_path)
158 | 
159 |     # remove the possible _marked file
160 |     marked_screenshot_path = screenshot_path.replace('.png', '_marked.png')
161 |     if os.path.exists(marked_screenshot_path):
162 |         os.remove(marked_screenshot_path)
163 | 
164 | 
165 | def get_full_action(entry):
166 |     """
167 |     get the full action string from entry
168 |     """
169 |     action = entry['action']
170 |     element = entry['element']
171 |     if element:
172 |         target = 'click'
173 |         index = action.find(target)
174 |         if index != -1:
175 |             # find the end position of 'click'
176 |             insert_position = index + len(target)
177 |             # insert ':' after 'click'
178 |             action = action[:insert_position] + \
179 |                 f' element {element} at' + action[insert_position:]
180 |     return action
181 | 
182 | 
183 | def encode_image(image_path):
184 |     """
185 |     encode image to base64
186 |     """
187 |     with open(image_path, "rb") as image_file:
188 |         return base64.b64encode(image_file.read()).decode('utf-8')
189 | 
190 | 
191 | def get_file_size_kb(file_path):
192 |     file_size_bytes = os.path.getsize(file_path)
193 |     file_size_kb = file_size_bytes / 1024  # convert to KB
194 |     return round(file_size_kb, 1)  # keep 1 decimal place
195 | 
196 | 
197 | def mark_image(is_click_action, image_path, rect, point1, point2=None):
198 |     """
199 |     mark the image and save as a new file, return the new file path
200 |     """
201 |     # open the image
202 |     with Image.open(image_path) as image:
203 |         if is_click_action:
204 |             # create a drawable object
205 |             draw = ImageDraw.Draw(image)
206 | 
207 |             # draw a rectangle
208 |             draw.rectangle(
209 |                 [(rect["left"], rect["top"]), (rect["right"], rect["bottom"])],
210 |                 outline="red",
211 |                 width=RECT_WIDTH  # line width
212 |             )
213 | 
214 |             # draw a point
215 |             draw_point(point1["x"], point1["y"], draw)
216 | 
217 |             # draw a circle
218 |             draw_circle(point1["x"], point1["y"], draw)
219 | 
220 |             # draw a short arrow
221 |             draw_short_arrow(point1["x"], point1["y"], draw)
222 | 
223 |         else:
224 |             draw = ImageDraw.Draw(image)
225 | 
226 |             # draw a point
227 |             draw_point(point1["x"], point1["y"], draw)
228 |             draw_point(point2["x"], point2["y"], draw)
229 | 
230 |             if (abs(point1["x"] - point2["x"]) + abs(point1["y"] - point2["y"])) > 15:
231 |                 # draw a circle
232 |                 draw_circle(point1["x"], point1["y"], draw)
233 |                 draw_circle(point2["x"], point2["y"], draw)
234 |             else:
235 |                 print(f"the distance between point1 and point2 in image {image_path} is too small, skip drawing circles")
236 |                 
237 |             # draw a long arrow
238 |             draw_long_arrow(point1["x"], point1["y"], point2["x"], point2["y"], draw)
239 | 
240 |     # generate the output path, add "_marked" to the original file name
241 |     base, ext = os.path.splitext(image_path)
242 |     output_path = f"{base}_marked{ext}"
243 | 
244 |     # save the marked image
245 |     image.save(output_path)
246 |     # print(f"marked image saved to: {output_path}")
247 |     return output_path
248 | 
249 | 
250 | def mark_image_for_boost(is_click_action, image_path, boost_idx, point1, point2=None):
251 |     """
252 |     mark the image and save as a new file, return the new file path
253 |     """
254 |     # open the image
255 |     with Image.open(image_path) as image:
256 |         if is_click_action:
257 |             # create a drawable object
258 |             draw = ImageDraw.Draw(image)
259 | 
260 |             # draw a point
261 |             draw_point(point1["x"], point1["y"], draw)
262 | 
263 |             # draw a circle
264 |             draw_circle(point1["x"], point1["y"], draw)
265 | 
266 |             # draw a short arrow
267 |             draw_short_arrow(point1["x"], point1["y"], draw)
268 | 
269 |         else:
270 |             draw = ImageDraw.Draw(image)
271 | 
272 |             # draw a point
273 |             draw_point(point1["x"], point1["y"], draw)
274 |             draw_point(point2["x"], point2["y"], draw)
275 | 
276 |             if (abs(point1["x"] - point2["x"]) + abs(point1["y"] - point2["y"])) > 15:
277 |                 # draw a circle
278 |                 draw_circle(point1["x"], point1["y"], draw)
279 |                 draw_circle(point2["x"], point2["y"], draw)
280 |             else:
281 |                 print(f"the distance between point1 and point2 in image {image_path} is too small, skip drawing circles")
282 |                 
283 |             # draw a long arrow
284 |             draw_long_arrow(point1["x"], point1["y"], point2["x"], point2["y"], draw)
285 | 
286 |     # generate the output path, add "_marked" to the original file name
287 |     base, ext = os.path.splitext(image_path)
288 |     output_path = f"{base}_marked_boost_{boost_idx}{ext}"
289 | 
290 |     # save the marked image
291 |     image.save(output_path)
292 |     # print(f"marked image saved to: {output_path}")
293 |     return output_path
294 | 
295 | 
296 | def resize_to_720p(image_path):
297 |     """
298 |     check and resize the image to fixed 1280x720 resolution, return whether success
299 |     """
300 |     try:
301 |         with Image.open(image_path) as img:
302 |             img.verify()  # verify the image integrity
303 |     except:
304 |         print(f"[ERROR] image corrupted: {image_path}")
305 |         return False
306 | 
307 |     # open the image
308 |     with Image.open(image_path) as img:
309 |         if img.size == (1280, 720):
310 |             print(f"image is already 720p, no need to resize: {image_path}")
311 |             return True
312 | 
313 |         try:
314 |             resized_img = img.resize((1280, 720), Image.LANCZOS)
315 |         except:
316 |             print(f"[ERROR] cannot resize image: {image_path}")
317 |             return False
318 | 
319 |         # save the resized image, overwrite the original file
320 |         resized_img.save(image_path, optimize=True)
321 |         print(f"image resized to 720p and saved: {image_path}")
322 |         return True
323 | 
324 | 
325 | def resize_to_1080p(image_path):
326 |     """
327 |     check and resize the image to fixed 1920x1080 resolution, return whether success
328 |     """
329 |     try:
330 |         with Image.open(image_path) as img:
331 |             img.verify()  # verify the image integrity
332 |     except:
333 |         print(f"[ERROR] image corrupted: {image_path}")
334 |         return False
335 | 
336 |     # open the image
337 |     with Image.open(image_path) as img:
338 |         # check if the image is already 1080p
339 |         if img.size == (1920, 1080):
340 |             print(f"image is already 1080p, no need to resize: {image_path}")
341 |             return True
342 | 
343 |         # resize the image to fixed 1920x1080 resolution
344 |         try:
345 |             resized_img = img.resize((1920, 1080), Image.LANCZOS)
346 |         except:
347 |             print(f"[ERROR] cannot resize image: {image_path}")
348 |             return False
349 | 
350 |         # save the resized image, overwrite the original file
351 |         resized_img.save(image_path, optimize=True)
352 |         print(f"image resized and saved: {image_path}")
353 |         return True
354 | 
355 | 
356 | def resize_action(action_str, scale_x, scale_y):
357 |     """
358 |     extract coordinates from the action string, scale them, and replace the coordinate part in the original string.
359 |     supports both single-point actions (e.g. "double click (1415, 741)") and 
360 |     drag actions (e.g. "drag from (1230, 26) to (1209, 26)").
361 | 
362 |     :param action_str: action string
363 |     :param scale_x: X axis scale factor
364 |     :param scale_y: Y axis scale factor
365 |     :return: the scaled action string
366 |     """
367 |     # use regex to match coordinate pairs
368 |     pattern = r'\((\d+),\s*(\d+)\)'
369 |     
370 |     def scale_coords(match):
371 |         original_x = float(match.group(1))
372 |         original_y = float(match.group(2))
373 |         scaled_x = round(original_x * scale_x)
374 |         scaled_y = round(original_y * scale_y)
375 |         print(f"scale coordinates: ({original_x}, {original_y}) -> ({scaled_x}, {scaled_y})")
376 |         return f"({scaled_x}, {scaled_y})"
377 |     
378 |     # replace all coordinate pairs using the callback function
379 |     new_action_str = re.sub(pattern, scale_coords, action_str)
380 |     return new_action_str
381 | 
382 | 
383 | def are_screenshots_identical(screenshot_path1, screenshot_path2):
384 |     """
385 |     check if two screenshots are identical
386 |     """
387 |     # read the images
388 |     img1 = cv2.imread(screenshot_path1)
389 |     img2 = cv2.imread(screenshot_path2)
390 | 
391 |     # check if the images are successfully read
392 |     if img1 is None or img2 is None:
393 |         print(f"cannot read image: {screenshot_path1} or {screenshot_path2}")
394 |         return False
395 | 
396 |     # check if the images have the same size
397 |     if img1.shape != img2.shape:
398 |         return False
399 | 
400 |     # check if the images are identical
401 |     difference = cv2.subtract(img1, img2)
402 |     return not np.any(difference)
403 | 
404 | 
405 | def parse_click_action(action):
406 |     pattern = r'((?:double |right )?click)\s*\((\d+),\s*(\d+)\)'
407 |     match = re.match(pattern, action)
408 |     
409 |     if match:
410 |         action = match.group(1)  # extract the action name
411 |         x = int(match.group(2))  # extract x coordinate and convert to integer
412 |         y = int(match.group(3))  # extract y coordinate and convert to integer
413 |         return action, (x, y)
414 |     else:
415 |         return None, None
416 | 
417 | 
418 | def parse_drag_action(action):
419 |     assert action.startswith('drag from'), f"error: action '{action}' is not a drag action"
420 |     start1 = action.find('from (') + 6
421 |     end1 = action.find(') to (')
422 |     start2 = action.find('to (') + 4
423 |     end2 = len(action) - 1
424 |     
425 |     # extract two sets of coordinates
426 |     coord1 = action[start1:end1]
427 |     coord2 = action[start2:end2]
428 |     
429 |     # split and convert to integers
430 |     x1, y1 = map(int, coord1.split(', '))
431 |     x2, y2 = map(int, coord2.split(', '))
432 |     
433 |     return (x1, y1), (x2, y2)
434 | 
435 | 
436 | def extract_coordinates(text):
437 |     # Pattern for drag/press/scroll coordinates
438 |     coord_pattern_1 = r'(?:drag to|press|scroll) \((\-?\d+), (\-?\d+)\)'
439 |     coord_match = re.search(coord_pattern_1, text)
440 |     if coord_match:
441 |         x, y = map(int, coord_match.groups())
442 |         return x, y
443 |     
444 |     # Pattern for scroll with dx and dy
445 |     coord_pattern_2 = r'scroll dx\s*=\s*(\-?\d+),\s*dy\s*=\s*(\-?\d+)'
446 |     coord_match = re.search(coord_pattern_2, text)
447 |     if coord_match:
448 |         dx, dy = map(int, coord_match.groups())
449 |         return dx, dy
450 |     
451 |     # If no match is found, return None
452 |     return None
453 | 
454 | 
455 | def draw_point(x, y, draw):
456 |     radius = POINT_RADIUS
457 |     left = x - radius
458 |     top = y - radius
459 |     right = x + radius
460 |     bottom = y + radius
461 | 
462 |     draw.ellipse(
463 |         [(left, top), (right, bottom)],
464 |         fill="red"
465 |     )
466 | 
467 | 
468 | def draw_circle(x, y, draw):
469 |     radius = CIRCLE_RADIUS
470 |     left = x - radius
471 |     top = y - radius
472 |     right = x + radius
473 |     bottom = y + radius
474 | 
475 |     draw.ellipse(
476 |         [(left, top), (right, bottom)],
477 |         outline="red",
478 |         width=CIRCLE_WIDTH
479 |     )
480 | 
481 | 
482 | def draw_short_arrow(x, y, draw):
483 |     arrow_length = 50  # arrow length
484 |     arrow_gap = CIRCLE_RADIUS + 2  # arrow gap
485 |     arrow_width = 10   # arrow width
486 |     angle = np.radians(30)  # arrow angle
487 |     cos_angle = np.cos(angle)
488 |     sin_angle = np.sin(angle)
489 | 
490 |     # draw the arrow body
491 |     start_x = x - arrow_length * cos_angle
492 |     start_y = y - arrow_length * sin_angle
493 |     end_x = x - arrow_gap * cos_angle
494 |     end_y = y - arrow_gap * sin_angle
495 |     draw.line([(start_x, start_y), (end_x, end_y)],
496 |               fill="red", width=3)
497 | 
498 |     # draw the arrow head
499 |     arrow_point1 = (
500 |         int(end_x - arrow_width),
501 |         int(end_y)
502 |     )
503 |     arrow_point2 = (
504 |         int(end_x - arrow_width * sin_angle),
505 |         int(end_y - arrow_width * cos_angle)
506 |     )
507 | 
508 |     draw.polygon([
509 |         (end_x, end_y),
510 |         arrow_point1,
511 |         arrow_point2
512 |     ], fill="red")
513 | 
514 | 
515 | def draw_long_arrow(x1, y1, x2, y2, draw):
516 |     head_length = 18  # arrow head length
517 |     head_angle = np.radians(30)  # arrow head angle
518 | 
519 |     # calculate the midpoint of the line
520 |     mid_x = (x1 + x2) / 2
521 |     mid_y = (y1 + y2) / 2
522 | 
523 |     # draw the arrow body
524 |     draw.line([(x1, y1), (x2, y2)], fill="red", width=3)
525 | 
526 |     # arrow head direction vector
527 |     vector_x = x2 - x1
528 |     vector_y = y2 - y1
529 |     length = np.hypot(vector_x, vector_y)
530 |     unit_vector_x = vector_x / length
531 |     unit_vector_y = vector_y / length
532 | 
533 |     # calculate the positions of the two points of the arrow head (now based on the midpoint)
534 |     left_x = mid_x - head_length * \
535 |         (unit_vector_x * np.cos(head_angle) +
536 |          unit_vector_y * np.sin(head_angle))
537 |     left_y = mid_y - head_length * \
538 |         (unit_vector_y * np.cos(head_angle) -
539 |          unit_vector_x * np.sin(head_angle))
540 | 
541 |     right_x = mid_x - head_length * \
542 |         (unit_vector_x * np.cos(head_angle) -
543 |          unit_vector_y * np.sin(head_angle))
544 |     right_y = mid_y - head_length * \
545 |         (unit_vector_y * np.cos(head_angle) +
546 |          unit_vector_x * np.sin(head_angle))
547 | 
548 |     # use the midpoint as the vertex of the arrow head
549 |     draw.polygon([(mid_x, mid_y), (left_x, left_y),
550 |                   (right_x, right_y)], fill="red")
551 | 
552 | 
553 | def parse_thought_action_from_response(response):
554 |     """
555 |     Parse thought, action from response by finding the last occurrence of 'Action:'.
556 |     """
557 |     if response is None:
558 |         return None, None
559 |     
560 |     # Find the last occurrence of 'Action:'
561 |     index = response.rfind("Action:")
562 |     if index == -1:
563 |         return response.strip(), None
564 |     
565 |     # Split the response into thought and action
566 |     thought = response[:index].strip()
567 |     action_start = index + len("Action:")
568 |     action = response[action_start:].strip()
569 |     
570 |     return thought, action
571 | 
572 | 
573 | def combine_thought_action_to_response(thought, action):
574 |     return f"{thought}\n\nAction: {action}"
575 | 
576 | 
577 | def get_mllm_messages(instruction, base64_image=None):
578 |     messages = [
579 |         {
580 |             "role": "user",
581 |             "content": [
582 |                 {
583 |                     "type": "image_url",
584 |                     "image_url": {
585 |                         "url": f"data:image/jpeg;base64,{base64_image}"
586 |                     }
587 |                 },
588 |                 {
589 |                     "type": "text",
590 |                     "text": instruction
591 |                 },
592 |             ] if base64_image else [
593 |                 {
594 |                     "type": "text",
595 |                     "text": instruction
596 |                 }
597 |             ]
598 |         },
599 |     ]
600 |     return messages
601 | 
602 | 
603 | def match(action, gt_entry):
604 |     """
605 |     Determine if the predicted action is equivalent to the ground truth entry
606 |     
607 |     Args:
608 |         action (str): Predicted action string
609 |         gt_entry (dict): Dictionary containing ground truth action and related information
610 |     
611 |     Returns:
612 |         bool: Returns True if actions are equivalent, False otherwise
613 |     """
614 |     # Handle edge cases first
615 |     if action is None or gt_entry is None or "action" not in gt_entry:
616 |         return False
617 |     
618 |     gt_action = gt_entry["action"]
619 |     
620 |     # Handle all click-type actions (click, right click, double click)
621 |     click_types = ["click", "right click", "double click"]
622 |     
623 |     for click_type in click_types:
624 |         if action.startswith(click_type) and gt_action.startswith(click_type):
625 |             # After confirming click type match, check coordinates
626 |             try:
627 |                 # Extract coordinates from predicted action
628 |                 import re
629 |                 coord_match = re.search(r'\((\d+),\s*(\d+)\)', action)
630 |                 if not coord_match:
631 |                     return False
632 |                 
633 |                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
634 |                 
635 |                 # Check if coordinates are within gt_entry's rect range
636 |                 if "rect" in gt_entry:
637 |                     rect = gt_entry["rect"]
638 |                     # Check rect format, usually [x1, y1, x2, y2] representing top-left and bottom-right coordinates
639 |                     if isinstance(rect, list) and len(rect) == 4:
640 |                         x1, y1, x2, y2 = rect
641 |                         return x1 <= x <= x2 and y1 <= y <= y2
642 |             except Exception as e:
643 |                 print(f"Error in matching click coordinates: {e}")
644 |                 return False
645 |     
646 |     # For all other action types, directly compare if strings are identical
647 |     return action == gt_action
648 | 
649 | 
650 | def get_action_code(action) -> str:
651 |         screen_width, screen_height = 1280, 720
652 |         # click
653 |         match = re.match(r"click \((-?\d+), (-?\d+)\)", action)
654 |         if match:
655 |             x = int(match.group(1))
656 |             y = int(match.group(2))
657 |             if 0 <= x < screen_width and 0 <= y < screen_height:
658 |                 return f"pyautogui.click({x}, {y})"
659 |             else:
660 |                 return None
661 | 
662 |         # right click
663 |         match = re.match(r"right click \((-?\d+), (-?\d+)\)", action)
664 |         if match:
665 |             x = int(match.group(1))
666 |             y = int(match.group(2))
667 |             if 0 <= x < screen_width and 0 <= y < screen_height:
668 |                 return f"pyautogui.rightClick({x}, {y})"
669 |             else:
670 |                 return None
671 | 
672 |         # double click
673 |         match = re.match(r"double click \((-?\d+), (-?\d+)\)", action)
674 |         if match:
675 |             x = int(match.group(1))
676 |             y = int(match.group(2))
677 |             if 0 <= x < screen_width and 0 <= y < screen_height:
678 |                 return f"pyautogui.doubleClick({x}, {y})"
679 |             else:
680 |                 return None
681 | 
682 |         # drag
683 |         match = re.match(r"drag from \((-?\d+), (-?\d+)\) to \((-?\d+), (-?\d+)\)", action)
684 |         if match:
685 |             x1 = int(match.group(1))  # start x coordinate
686 |             y1 = int(match.group(2))  # start y coordinate
687 |             x2 = int(match.group(3))  # target x coordinate
688 |             y2 = int(match.group(4))  # target y coordinate
689 |             if 0 <= x1 < screen_width and 0 <= y1 < screen_height and 0 <= x2 < screen_width and 0 <= y2 < screen_height:
690 |                 return f"pyautogui.mouseDown({x1}, {y1})\npyautogui.dragTo({x2}, {y2}, duration=0.5)"
691 |             else:
692 |                 return None
693 |         
694 |         # scroll
695 |         match = re.match(r"scroll \((-?\d+)\)", action)
696 |         if match:
697 |             y = int(match.group(1))  # vertical scroll distance
698 |             return f"pyautogui.scroll({y})"  # positive: scroll up, negative: scroll down
699 | 
700 |         # press key
701 |         match = re.match(r"press key: (.+)", action)
702 |         if match:
703 |             key_content = match.group(1).lower()
704 |             # Format error
705 |             if 'key' in key_content:
706 |                 return None
707 |             # If key is not in the valid key list
708 |             if key_content not in KEYBOARD_KEYS:
709 |                 return None
710 |             return f"pyautogui.press('{key_content}')"
711 | 
712 |         # hotkey
713 |         match = re.match(r"hotkey \((.+), (.+), (.+)\)", action)
714 |         if match:
715 |             key1 = match.group(1).strip("'").lower()
716 |             key2 = match.group(2).strip("'").lower()
717 |             key3 = match.group(3).strip("'").lower()
718 |             # Format error
719 |             if 'key' in key1 or 'key' in key2 or 'key' in key3:
720 |                 return None
721 |             return f"pyautogui.hotkey('{key1}', '{key2}', '{key3}')"
722 |         
723 |         match = re.match(r"hotkey \((.+), (.+)\)", action)
724 |         if match:
725 |             key1 = match.group(1).strip("'").lower()
726 |             key2 = match.group(2).strip("'").lower()
727 |             # Format error
728 |             if 'key' in key1 or 'key' in key2:
729 |                 return None
730 |             return f"pyautogui.hotkey('{key1}', '{key2}')"
731 |         
732 |         # type text
733 |         match = re.match(r"type text: (.+)", action)
734 |         if match:
735 |             text_content = match.group(1).strip("'").strip("\"")
736 |             text_content = text_content.replace("\"", "\\\"")
737 |             text_content = text_content.replace("\'", "\\\'")
738 |             # Format error
739 |             if "text_content" in text_content:
740 |                 return None
741 |             return f"pyautogui.write(\"{text_content}\")"
742 | 
743 |         # wait
744 |         if action == "wait":
745 |             return "WAIT"
746 |             
747 |         # finish
748 |         if action == "finish":
749 |             return "DONE"
750 | 
751 |         # fail
752 |         if action == "fail":
753 |             return "FAIL"
754 |         
755 |         return None
756 |     
757 | 
758 | def get_history_str(history: list):
759 |     history_cut_off = 10
760 |     if len(history) > history_cut_off:
761 |         history_str = "\n\n".join(f"[{i+1}] {item}" for i, item in enumerate(history[-history_cut_off:]))
762 |     else:
763 |         history_str = "\n\n".join(f"[{i+1}] {item}" for i, item in enumerate(history))
764 |             
765 |     if history_str == '':
766 |         history_str = "None"
767 |     
768 |     return history_str


--------------------------------------------------------------------------------