├── android_env2
    ├── __init__.py
    ├── exception.py
    ├── constant.py
    ├── config.py
    ├── env.py
    ├── observation.py
    ├── reward.py
    ├── README.md
    ├── phone.py
    ├── simulator.py
    ├── actions.py
    └── xml_tool.py
├── scripts
    ├── prepare_files
    │   ├── image.jpg
    │   ├── image1.jpeg
    │   ├── sample.pdf
    │   └── sample1.pdf
    ├── env_setup.py
    └── env_setup_crossapp.py
├── trajectory
    ├── GPT4-Search_for_large_language_model_in_Firefox.pdf
    ├── GPT4-Add_a_new_city_Beijing_to_World_Clock_in_Google_Clock.pdf
    ├── GPT4-Change_the_default_web_browser_to_chrome_on_my_Android_device.pdf
    ├── GPT4-Get_directions_from_my_current_location_to_Microsoft_SVC_Building.pdf
    └── README.md
├── app_configs
    ├── weather.yaml
    ├── message.yaml
    ├── chrome.yaml
    ├── clock.yaml
    ├── contacts.yaml
    ├── gmail.yaml
    ├── camera.yaml
    ├── settings.yaml
    ├── firefox.yaml
    ├── youtube.yaml
    ├── googlemaps.yaml
    ├── photos.yaml
    ├── slack.yaml
    ├── google-drive.yaml
    ├── calendar.yaml
    └── phone.yaml
├── tasks
    ├── firefox.yaml
    ├── photos.yaml
    ├── slack.yaml
    ├── camera.yaml
    ├── clock.yaml
    ├── youtube.yaml
    ├── contacts.yaml
    ├── google-maps.yaml
    ├── google-drive.yaml
    ├── weather.yaml
    ├── settings.yaml
    ├── messages.yaml
    ├── gmail.yaml
    ├── cross-app.yaml
    ├── calendar.yaml
    ├── constrain.yaml
    └── MTG.py
├── requirements.txt
├── agents
    ├── agent_base.py
    ├── replay_agent.py
    ├── lm_reward.py
    ├── action_parser.py
    ├── utils.py
    ├── tasks.py
    ├── replay_buffer.py
    ├── prompt.py
    └── lm_agent.py
├── run_replay_agent.py
├── run_lm_agent.py
├── README.md
└── run_evaluator.py


/android_env2/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scripts/prepare_files/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/scripts/prepare_files/image.jpg


--------------------------------------------------------------------------------
/scripts/prepare_files/image1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/scripts/prepare_files/image1.jpeg


--------------------------------------------------------------------------------
/scripts/prepare_files/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/scripts/prepare_files/sample.pdf


--------------------------------------------------------------------------------
/scripts/prepare_files/sample1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/scripts/prepare_files/sample1.pdf


--------------------------------------------------------------------------------
/trajectory/GPT4-Search_for_large_language_model_in_Firefox.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/trajectory/GPT4-Search_for_large_language_model_in_Firefox.pdf


--------------------------------------------------------------------------------
/trajectory/GPT4-Add_a_new_city_Beijing_to_World_Clock_in_Google_Clock.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/trajectory/GPT4-Add_a_new_city_Beijing_to_World_Clock_in_Google_Clock.pdf


--------------------------------------------------------------------------------
/app_configs/weather.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Weather
3 |   package: com.weather.Weather
4 |   description: Get weather information.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/trajectory/GPT4-Change_the_default_web_browser_to_chrome_on_my_Android_device.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/trajectory/GPT4-Change_the_default_web_browser_to_chrome_on_my_Android_device.pdf


--------------------------------------------------------------------------------
/android_env2/exception.py:
--------------------------------------------------------------------------------
 1 | class OutputParserException(Exception):
 2 |     pass
 3 | 
 4 | 
 5 | class AndroidActionException(Exception):
 6 |     pass
 7 | 
 8 | 
 9 | class ActionInputParserException(Exception):
10 |     pass
11 | 


--------------------------------------------------------------------------------
/trajectory/GPT4-Get_directions_from_my_current_location_to_Microsoft_SVC_Building.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndroidArenaAgent/AndroidArena/HEAD/trajectory/GPT4-Get_directions_from_my_current_location_to_Microsoft_SVC_Building.pdf


--------------------------------------------------------------------------------
/app_configs/message.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Messages
3 |   package: com.google.android.apps.messaging
4 |   description: Simple, helpful messaging by Google.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/chrome.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Chrome
3 |   package: com.chrome.beta
4 |   description: The speed and simplicity of Chrome, now on your Android device.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/clock.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Clock
3 |   package: com.google.android.deskclock
4 |   description: Check the time at different timezone and set alarm.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/contacts.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Contacts
3 |   package: com.google.android.contacts
4 |   description: Back up your contacts & access them anywhere.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/gmail.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Gmail
3 |   package: com.google.android.gm
4 |   description: Connect, create and collaborate with Gmail, part of Google Workspace.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/camera.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Camera
3 |   package: com.android.camera2
4 |   description: Native Android camera APP, useful for taking photos and recording videos.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/settings.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Settings
3 |   package: com.android.settings
4 |   description: The phone settings. You can check and modify phone settings here.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/firefox.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Firefox
3 |   package: org.mozilla.firefox
4 |   description: Firefox web browser. You can search for any news, events and knowledge here.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/youtube.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: YouTube
3 |   package: com.google.android.youtube
4 |   description: Enjoy your favourite videos and channels with the official YouTube app.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/googlemaps.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Google Maps
3 |   package: com.google.android.apps.maps
4 |   description: Real-time GPS navigation & local suggestions for food, events & activities.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/photos.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Photos
3 |   package: com.google.android.apps.photos
4 |   description: The home for all your photos and videos, automatically organized and easy to share.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/android_env2/constant.py:
--------------------------------------------------------------------------------
1 | ASCII_CHARSET = "".join(chr(x) for x in range(32, 128))
2 | FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000))
3 | UTTERANCE_MAX_LENGTH = 8192
4 | ATTRIBUTE_MAX_LENGTH = 256
5 | TEXT_MAX_LENGTH = 256
6 | TYPING_MAX_LENGTH = 64
7 | URL_MAX_LENGTH = 256
8 | MAX_ANSWER_LENGTH = 512
9 | 


--------------------------------------------------------------------------------
/app_configs/slack.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Slack
3 |   package: com.Slack
4 |   description: Slack brings team communication and collaboration into one place so you can get more work done, whether you belong to a large enterprise or a small business.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/app_configs/google-drive.yaml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: Google Drive
3 |   package: com.google.android.apps.docs
4 |   description: a safe place to back up and access all your files from any device. Easily invite others to view, edit, or leave comments on any of your files or folders.
5 |   permissions:
6 |     - GPS
7 |     - FileSystem
8 |   activities:
9 | activities:


--------------------------------------------------------------------------------
/tasks/firefox.yaml:
--------------------------------------------------------------------------------
 1 | type: firefox
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Search for "large language model" in Firefox.
 8 |   - instruction: Navigate to the webpage "https://www.wikipedia.org/".
 9 |   - instruction: Search the key word "GPT" in website "https://openai.com/blog".
10 |   - instruction: View my bookmarks in Firefox.
11 |   - instruction: View my history in Firefox.


--------------------------------------------------------------------------------
/app_configs/calendar.yaml:
--------------------------------------------------------------------------------
 1 | app:
 2 |   name: Calendar
 3 |   package: com.google.android.calendar
 4 |   description: useful for creating, querying, editing and deleting calendar events. For example, when the query is "please remind me of the meeting on July 18.", you should create an event with content "meeting" and date "July 18".
 5 |   permissions:
 6 |     - GPS
 7 |     - FileSystem
 8 |   activities:
 9 |     - page1
10 |     - page2
11 |     - page3
12 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | adbutils==1.2.12
 2 | aiolimiter==1.1.0
 3 | beautifulsoup4==4.12.3
 4 | cairosvg==2.7.1
 5 | colorama==0.4.6
 6 | gymnasium==0.28.1
 7 | html2text==2020.1.16
 8 | langchain==0.1.4
 9 | langchain_core==0.1.17
10 | lxml==4.9.2
11 | matplotlib==3.7.1
12 | numpy==1.24.3
13 | openai==0.27.6
14 | pandas==2.0.2
15 | pydantic==1.10.9
16 | pygal==3.0.4
17 | python-dotenv==1.0.1
18 | PyYAML==6.0.1
19 | Requests==2.31.0
20 | seaborn==0.13.2
21 | spacy==3.7.2
22 | streamlit==1.27.2
23 | tiktoken==0.5.1
24 | tqdm==4.65.0
25 | trafilatura==1.6.1
26 | transformers==4.29.0
27 | uiautomator2
28 | xmltodict==0.13.0
29 | 


--------------------------------------------------------------------------------
/tasks/photos.yaml:
--------------------------------------------------------------------------------
 1 | type: photos
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Group similar faces together in my photos using the facial recognition feature.
 8 |   - instruction: Create an album named "animals" for the first photo.
 9 |   - instruction: Delete the album named "animals" in Photos.
10 |   - instruction: Create a quick video from first two photos using the animation feature.
11 |   - instruction: Delete the first photo from my Google Photos.
12 |   - instruction: View my Google Photos information, including storage usage and account settings.
13 | 


--------------------------------------------------------------------------------
/android_env2/config.py:
--------------------------------------------------------------------------------
 1 | from functools import lru_cache
 2 | 
 3 | from pydantic import BaseSettings
 4 | 
 5 | 
 6 | class Settings(BaseSettings):
 7 |     android_image: str = ""
 8 |     emulator_path: str = "<YOUR_EMULATOR_PATH>, e.g., XXXX\\android_sdk\\emulator\\emulator.exe"
 9 |     avd_name: str = "<ANDROID_VIRTUAL_DEVICE_NAME>"
10 |     adb_ip: str = "127.0.0.1"
11 |     adb_port: int = 5555
12 |     emulator_name: str = "emulator-5554"
13 | 
14 |     early_stop: bool = True
15 |     max_step: int = 50
16 | 
17 |     logger_path: str = "android_env_log/"
18 | 
19 |     phone_config_path = "app_configs/phone.yaml"
20 | 
21 | 
22 | @lru_cache
23 | def get_settings():
24 |     settings = Settings()
25 |     return settings
26 | 


--------------------------------------------------------------------------------
/tasks/slack.yaml:
--------------------------------------------------------------------------------
 1 | type: slack
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |  - instruction: Create a new workspace with the name "myspace" for project "myproject" in Slack and do not invite anyone.
 8 |  - instruction: Create a new public channel with the name "work_channel" in Slack.
 9 |  - instruction: Invite "bob@example.com" to join channel "work_channel" in Slack.
10 |   - instruction: Send a message "hello" to "bob" in Slack.
11 |   - instruction: Mark a "heart" emoji to the most recent message from "bob" in Slack.
12 |   - instruction: Share the most recent file with "bob" in Slack.
13 |   - instruction: Search for a specific conversation containing "hello" in Slack.
14 |   - instruction: Mute the specific channel "work_channel" in Slack.
15 |   - instruction: Change the slack theme to the dark mode.
16 | 


--------------------------------------------------------------------------------
/tasks/camera.yaml:
--------------------------------------------------------------------------------
 1 | type: camera
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Take a picture.
 8 |   - instruction: Start recording a video.
 9 |   - instruction: Set the flash mode to off for the Camera.
10 |   - instruction: Set the flash mode to on for the Camera.
11 |   - instruction: Turn on save location function for the Camera.
12 |   - instruction: Turn off save location function for the Camera.
13 |   - instruction: Turn on the manual exposure compensation for the Camera.
14 |   - instruction: Turn off the manual exposure compensation for the Camera.
15 |   - instruction: Set the migapixels to maximum for the Camera.
16 |   - instruction: Set the migapixels to minimum for the Camera.
17 |   - instruction: Set the resolution to "HD 720p" for the Camera.
18 |   - instruction: Set the resolution to "QVGA" for the Camera.
19 |   - instruction: Change the timer to 10 seconds for the Camera.
20 |   - instruction: Change the timer to 3 seconds for the Camera.
21 | 
22 | 


--------------------------------------------------------------------------------
/tasks/clock.yaml:
--------------------------------------------------------------------------------
 1 | type: clock
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Open Google Clock.
 8 |   - instruction: Set an alarm for 3PM with the label "meeting" using Google Clock.
 9 |   - instruction: Turn off the alarm with the label "meeting" using Google Clock.
10 |   - instruction: Delete the alarm with the label "meeting" using Google Clock.
11 |   - instruction: Set a timer for 5 seconds using Google Clock.
12 |   - instruction: Pause the timer using Google Clock.
13 |   - instruction: Resume the timer using Google Clock.
14 |   - instruction: Cancel the timer using Google Clock.
15 |   - instruction: Start the stopwatch using Google Clock.
16 |   - instruction: Stop the stopwatch using Google Clock.
17 |   - instruction: Reset the stopwatch using Google Clock.
18 |   - instruction: View the current time in London in Google Clock.
19 |   - instruction: Add a new city Beijing to World Clock in Google Clock.
20 |   - instruction: Delete the city Beijing to World Clock in Google Clock.


--------------------------------------------------------------------------------
/trajectory/README.md:
--------------------------------------------------------------------------------
 1 | ## Trajectory
 2 | 
 3 | 
 4 | This folder contains HTML and PDF visualization examples of trajectory.
 5 | 
 6 | 1. Add a new city Beijing to World Clock in Google Clock
 7 | 
 8 | [HTML](./GPT4-Add_a_new_city_Beijing_to_World_Clock_in_Google_Clock.html)  [PDF](./GPT4-Add_a_new_city_Beijing_to_World_Clock_in_Google_Clock.pdf)
 9 | 
10 | 
11 | 2. Change the default web browser to chrome on my Android device
12 | 
13 | [HTML](./GPT4-Change_the_default_web_browser_to_chrome_on_my_Android_device.html)  [PDF](./GPT4-Change_the_default_web_browser_to_chrome_on_my_Android_device.pdf)
14 | 
15 | 
16 | 3. Get directions from my current location to "Microsoft SVC Building".
17 | 
18 | [HTML](./GPT4-Get_directions_from_my_current_location_to_Microsoft_SVC_Building.html)  [PDF](./GPT4-Get_directions_from_my_current_location_to_Microsoft_SVC_Building.pdf)
19 | 
20 | 
21 | 4. Search for "large language model" in Firefox.
22 | 
23 | [HTML](./GPT4-Search_for_large_language_model_in_Firefox.html)  [PDF](./GPT4-Search_for_large_language_model_in_Firefox.pdf)
24 | 


--------------------------------------------------------------------------------
/tasks/youtube.yaml:
--------------------------------------------------------------------------------
 1 | type: youtube
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Search for videos using keywords "ChatGPT".
 8 |   - instruction: Play the first video from the first page of YouTube.
 9 |   - instruction: Like the YouTube video titled "ChatGPT Explained Completely".
10 |   - instruction: Leave a comment "Great!" on the video titled "ChatGPT Explained Completely".
11 |   - instruction: Copy the share link of the YouTube video titled "ChatGPT Explained Completely".
12 |  - instruction: Subscribe to the YouTube channel "Ted-Ed".
13 |  - instruction: Unsubscribe from the YouTube channel "Ted-Ed".
14 |   - instruction: Create a YouTube playlist "work".
15 |   - instruction: Add the online video titled "ChatGPT Explained Completely" to the YouTube playlist "work".
16 |   - instruction: Remove the video titled "ChatGPT Explained Completely" from the YouTube playlist "work".
17 |   - instruction: Rename the YouTube playlist "work" to "test".
18 |   - instruction: Delete the YouTube playlist "test".


--------------------------------------------------------------------------------
/tasks/contacts.yaml:
--------------------------------------------------------------------------------
 1 | type: contacts
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Create a new contact with the first name "John", last name "Smith", email address "john@example.com", and phone number "010-123456" in Google Contacts.
 8 |   - instruction: Add the Field Company as "Microsoft" to the contact "John" in Google Contacts.
 9 |   - instruction: Create a new label called "colleague" in Google Contacts.
10 |   - instruction: Add "John" to the existing label "colleague" in Google Contacts.
11 |   - instruction: Remove "John" from the existing label "colleague" in Google Contacts.
12 |   - instruction: Delete the label "colleague" in Google Contacts.
13 |   - instruction: Export all contacts from Google Contacts to a VCF file.
14 |   - instruction: Search for contacts with the name "John" in Google Contacts.
15 |   - instruction: Merge duplicate contacts in Google Contacts.
16 |   - instruction: Delete "John" from Google Contacts.
17 |   - instruction: Sort contacts by first name in Google Contacts.
18 | 


--------------------------------------------------------------------------------
/tasks/google-maps.yaml:
--------------------------------------------------------------------------------
 1 | type: google-maps
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Get directions from my current location to "Microsoft SVC Building".
 8 |   - instruction: Show me the traffic conditions on my route to "Microsoft SVC Building".
 9 |   - instruction: Switch to satellite view.
10 |   - instruction: Find the nearest gas station.
11 |   - instruction: Show me the Street View of "Microsoft SVC Building".
12 |   - instruction: Find the best route for walking to "Microsoft SVC Building".
13 |   - instruction: Show me the public transportation options to "Microsoft SVC Building".
14 |   - instruction: Save the address "Los Altos Gardens" for quick access.
15 |   - instruction: Find the nearest restaurant available now.
16 |   - instruction: Show me the distance from "Los Altos Gardens" to "Microsoft SVC Building".
17 |   - instruction: Find the nearest ATM.
18 |   - instruction: Find the nearest parking lot.
19 |   - instruction: Find the nearest hospital.
20 |   - instruction: Find the nearest hotel.


--------------------------------------------------------------------------------
/agents/agent_base.py:
--------------------------------------------------------------------------------
 1 | from gymnasium import Env
 2 | from gymnasium.core import ObsType, ActType
 3 | 
 4 | from agents.replay_buffer import Trajectory
 5 | from agents.tasks import Task
 6 | 
 7 | 
 8 | class BaseAgent:
 9 |     def __init__(self, env: Env[ObsType, ActType], args):
10 |         self.env = env
11 |         self.args = args
12 |         self.trajectory = None
13 |         self.terminated = False
14 |         self.cur_step = 1
15 | 
16 |     def _reset_agent(self):
17 |         self.cur_step = 1
18 |         self.trajectory = None
19 |         self.terminated = False
20 | 
21 |     def select_action(self) -> ActType:
22 |         pass
23 | 
24 |     def learn(self):
25 |         pass
26 | 
27 |     def run(self, task: Task):
28 |         self.trajectory = Trajectory(task=task)
29 |         obs, info = self.env.reset()
30 |         self.trajectory.add(state=obs)
31 |         while not self.terminated:
32 |             action = self.select_action()
33 |             self.trajectory.add(action=action)
34 |             next_obs, reward, self.terminated, truncated, info = self.env.step(action)
35 |             self.trajectory.add(state=next_obs, reward=reward)
36 | 


--------------------------------------------------------------------------------
/tasks/google-drive.yaml:
--------------------------------------------------------------------------------
 1 | type: google-drive
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Upload the document "sample.pdf" in the Download Folder to Google Drive.
 8 |   - instruction: Share the file "sample.pdf" with "bob@example.com" on Google Drive.
 9 |   - instruction: Create a new folder "test" on Google Drive.
10 |   - instruction: Move the file "sample.pdf" to folder "test" on Google Drive.
11 |   - instruction: Rename the file "sample.pdf" to "test.pdf" in the "test" folder on Google Drive.
12 |   - instruction: Make the file "test.pdf" in the "test" folder available offline on Google Drive.
13 |   - instruction: Share the file "test.pdf" in the "test" folder with "bob@example.com" and "bob2@example.com" on Google Drive.
14 |   - instruction: Add a comment "read" to the file "test.pdf"  in the "test" folder on Google Drive.
15 |   - instruction: Search for file "test.pdf" on Google Drive.
16 |   - instruction: Delete the "test" folder on Google Drive.
17 |   - instruction: Set the layout of Google Drive to dark theme.
18 |   - instruction: Set the layout of Google Drive to light theme.
19 | 
20 | 


--------------------------------------------------------------------------------
/tasks/weather.yaml:
--------------------------------------------------------------------------------
 1 | type: weather
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Get the current weather for my current location.
 8 |   - instruction: Check the weather forecast for the next 3 days.
 9 |   - instruction: View the wind speed and direction for the current weather.
10 |   - instruction: Check the chance of rain for the current weather.
11 |   - instruction: View the humidity level for the current weather.
12 |   - instruction: Check the UV index for the current weather.
13 |   - instruction: Find out the sunrise and sunset times for my current location.
14 |   - instruction: Switch from Celsius to Fahrenheit units in the Weather APP.
15 |   - instruction: Switch from Fahrenheit to Celsius units in the Weather APP.
16 |   - instruction: Get a detailed hourly forecast for the next 3 hours.
17 |   - instruction: View the weather radar for my current location.
18 |   - instruction: Get the weather for "London" by searching for it.
19 |   - instruction: Turn on significant weather forecast alerts for severe weather conditions.
20 |   - instruction: Turn off significant weather forecast alerts for severe weather conditions.
21 | 
22 | 


--------------------------------------------------------------------------------
/agents/replay_agent.py:
--------------------------------------------------------------------------------
 1 | from colorama import Fore
 2 | 
 3 | from agents.agent_base import BaseAgent
 4 | from agents.replay_buffer import Trajectory, save_trajectory
 5 | 
 6 | 
 7 | class ReplayAgent(BaseAgent):
 8 |     def __init__(self, env, args):
 9 |         super().__init__(env, args)
10 |         pass
11 | 
12 |     @save_trajectory(folder="tj_replay")
13 |     def run(self, task):
14 |         self._reset_agent()
15 |         print(Fore.RED + f"Task: {task.instruction}" + Fore.RESET, end="\n\n")
16 |         self.trajectory = Trajectory(task=task)
17 |         obs, info = self.env.reset()
18 |         print(Fore.YELLOW + f"Obs: {obs['text'] if isinstance(obs, dict) else obs}" + Fore.RESET, end="\n\n")
19 |         self.trajectory.add(state=obs)
20 |         for action in task.action_sequence[:-1]:
21 |             print(Fore.BLUE + f"Action: {action}" + Fore.RESET, end="\n\n")
22 |             self.trajectory.add(action=action)
23 |             obs, reward, terminated, truncated, info = self.env.step(action)
24 |             print(Fore.YELLOW + f"Obs: {obs['text'] if isinstance(obs, dict) else obs}" + Fore.RESET, end="\n\n")
25 |             self.trajectory.add(state=obs, reward=reward)
26 |             self.cur_step += 1
27 | 


--------------------------------------------------------------------------------
/run_replay_agent.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from agents.tasks import load_tasks_from_files
 4 | from android_env2.actions import AndroidActionWrapper
 5 | from android_env2.config import get_settings
 6 | from android_env2.env import AndroidEnv
 7 | from android_env2.observation import MixObsWrapper
 8 | from android_env2.reward import DummyRewardWrapper
 9 | from agents.replay_agent import ReplayAgent
10 | 
11 | 
12 | def get_args():
13 |     args = argparse.ArgumentParser(description='replay_agent')
14 |     args.add_argument('--test_app', default="calendar", type=str, help='test_apps')
15 |     return args.parse_args()
16 | 
17 | 
18 | def get_env(reward_lm=None):
19 |     settings = get_settings()
20 |     env = AndroidEnv(settings)
21 | 
22 |     env = MixObsWrapper(env)
23 |     env = DummyRewardWrapper(env)
24 |     env = AndroidActionWrapper(env)
25 |     return env
26 | 
27 | 
28 | def run(args=get_args()):
29 |     replay_agent = ReplayAgent(env=get_env(), args=args)
30 |     task_list = load_tasks_from_files(filename=f"tasks/{args.test_app}.yaml")
31 |     for task in task_list:
32 |         if not task.action_sequence:
33 |             continue
34 |         replay_agent.run(task)
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     run()
39 | 


--------------------------------------------------------------------------------
/android_env2/env.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import gymnasium as gym
 4 | 
 5 | from android_env2.actions import Action, ActionType
 6 | from android_env2.config import Settings
 7 | from android_env2.phone import Phone
 8 | from android_env2.simulator import Simulator
 9 | from android_env2.xml_tool import UIXMLTree
10 | 
11 | 
12 | class AndroidEnv(gym.Env):
13 |     def __init__(self, config: Settings):
14 |         self.config = config
15 |         self.simulator = Simulator(config)
16 |         self.phone = Phone()
17 |         self.phone.load_from_yaml(config.phone_config_path)
18 |         self.cur_ui_xml_tree = UIXMLTree()
19 |         self.trajectory = None
20 | 
21 |     def set_traj(self, traj):
22 |         self.trajectory = traj
23 | 
24 |     def reset(self, **kwargs):
25 |         self.simulator.reset()
26 |         if not self.phone.device_info:
27 |             self.phone.set_device_info(self.simulator.driver.device_info)
28 |         return None, {}
29 | 
30 |     def step(self, action: Action):
31 |         terminated, truncated = False, False
32 |         if action.action_type != ActionType.FINISH:
33 |             self.simulator.execute_action(action)
34 |         else:
35 |             terminated = True
36 |         return None, None, terminated, truncated, {"action": action}
37 | 
38 |     def close(self):
39 |         self.simulator.stop_avd()
40 | 


--------------------------------------------------------------------------------
/tasks/settings.yaml:
--------------------------------------------------------------------------------
 1 | type: settings
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Turn off Bluetooth.
 8 |   - instruction: Turn on Bluetooth.
 9 |   - instruction: Change the wallpaper to the first photo in the 'Life' category.
10 |   - instruction: Enable Wi-Fi.
11 |   - instruction: Disable Wi-Fi.
12 |   - instruction: Check the battery usage.
13 |   - instruction: Add 'Aghem' language of my Android device.
14 |   - instruction: Change the phone ringtone to "Pixel Sounds Copycat" on my Android device.
15 |   - instruction: Disable location services on my Android device.
16 |   - instruction: Enable location services on my Android device.
17 |   - instruction: Check the storage usage of my Android device.
18 |   - instruction: Change the font size on my Android device and make the text bigger.
19 |   - instruction: Enable airplane mode on my Android device.
20 |   - instruction: Disable airplane mode on my Android device.
21 |   - instruction: Change the screen timeout to '1 minute' on my Android device.
22 |   - instruction: Change the screen timeout to '30 seconds' on my Android device.
23 |   - instruction: Check for system updates on my Android device.
24 |   - instruction: Change the default web browser to chrome on my Android device.
25 |   - instruction: Change the default web browser to firefox on my Android device.
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/tasks/messages.yaml:
--------------------------------------------------------------------------------
 1 | type: messages
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Send a text message to "Bob" saying "hello" using Google Messages.
 8 |   - instruction: Send a text message to "Bob" saying "here is a good YouTube video 'ChatGPT Explained Completely'" using Google Messages.
 9 |   - instruction: Schedule a message "hello" to be sent to "Bob" at tomorrow morning using Google Messages.
10 |   - instruction: Search for the messages containing "hello" using Google Messages.
11 |   - instruction: Mark the message containing "hello" as important using Google Messages.
12 |   - instruction: Archive the message containing "hello" using Google Messages.
13 |   - instruction: Delete the message containing "hello" using Google Messages.
14 |   - instruction: Block a specific phone number "010-654321" using Google Messages.
15 |   - instruction: Unblock the specific phone number "010-654321" using Google Messages.
16 |   - instruction: Create a chat group called "work" with the first two people on the contact list using Google Messages.
17 |   - instruction: Pin the specific chat "work" to the top of my chat list using Google Messages.
18 |   - instruction: Delete the chat group called "work" in Google Messages.
19 |   - instruction: Mute notifications for all chats using Google Messages.
20 |   - instruction: Unmute notifications for all chats using Google Messages.
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/app_configs/phone.yaml:
--------------------------------------------------------------------------------
 1 | user:
 2 |   name: Alice
 3 |   # generated by ChatGPT
 4 |   self_introduction: I prefer to use apps that cater to my interests and make my life easier. For travel, I like to use Airbnb, as it offer a wide range of accommodation options and allow me to book my stay easily. For transportation, I prefer Uber, as it provides a convenient and reliable way to get around.  For online shopping, I like to use Amazon, as its offer a vast selection of products and has a user-friendly interface. Additionally, it has a secure payment system and offers fast shipping options. For music, I enjoy using apps such as Spotify or Apple Music, as they have a vast collection of songs and playlists to choose from. I also appreciate apps that offer personalized recommendations based on my listening habits. Overall, I prioritize apps that provide a seamless and enjoyable experience for my travel, shopping, and entertainment needs, while also ensuring my security and privacy.
 5 | apps:
 6 |   firefox: firefox.yaml
 7 | #  chrome: chrome.yaml # only for constrain evaluation
 8 |   calendar: calendar.yaml
 9 |   camera: camera.yaml
10 |   gmail: gmail.yaml
11 |   contacts: contacts.yaml
12 |   clock: clock.yaml
13 |   googlemaps: googlemaps.yaml
14 |   google-drive: google-drive.yaml
15 |   message: message.yaml
16 |   weather: weather.yaml
17 |   youtube: youtube.yaml
18 |   settings: settings.yaml
19 |   photos: photos.yaml
20 |   slack: slack.yaml
21 | device_support:
22 |   - Camera
23 |   - FileSystem
24 |   - GPS
25 |   - NFC
26 |   - MIC
27 |   - Bluetooth
28 |   - WIFI
29 | 


--------------------------------------------------------------------------------
/tasks/gmail.yaml:
--------------------------------------------------------------------------------
 1 | type: gmail
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Compose a draft email with the subject "meeting details" and the content "weekly meeting on 13 Oct, Room 101" to be sent to "bob@example.com".
 8 |   - instruction: Schedule an email with the subject "meeting details" and the content "weekly meeting on 13 Oct, Room 101" to be sent to "bob@example.com" at tomorrow morning in Gmail.
 9 |   - instruction: Add the first file in the file system to the saved draft email with the subject "meeting details" in Gmail.
10 |   - instruction: Open the latest draft email send to "bob@example.com" in Gmail.
11 |   - instruction: Star the latest email in sent box in Gmail.
12 |   - instruction: Search for emails containing "meeting" in Gmail.
13 |   - instruction: Mark the latest email sent from Google as unread in Gmail.
14 |   - instruction: Reply to the latest email sent from Google with content "Got it." in Gmail.
15 |   - instruction: Forward the latest email sent from Google to "bob@example.com" in Gmail.
16 |   - instruction: Search for emails sent to "bob@example.com" in Gmail.
17 |   - instruction: Delete the latest draft email to be send to "bob@example.com" in Gmail.
18 |   - instruction: Mark important to the latest email sent from Google in Gmail.
19 |   - instruction: Archive the latest email sent from Google in Gmail.
20 |   - instruction: Search for all emails sent from Google in Gmail.
21 |   - instruction: Open Gmail settings.
22 |   - instruction: Turn off notifications of the current account for Gmail.
23 |   - instruction: Change the conversation list density of emails to "Comfortable" in Gmail.
24 |   - instruction: Change the theme to "Dark" of Gmail.
25 | 


--------------------------------------------------------------------------------
/agents/lm_reward.py:
--------------------------------------------------------------------------------
 1 | import tiktoken
 2 | from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 3 | from langchain.schema import SystemMessage
 4 | 
 5 | from agents.prompt import REWARD_SYSTEM, REWARD_PROMPT
 6 | from agents.tasks import Task
 7 | from agents.utils import load_llm_agent, load_tokenizer
 8 | 
 9 | 
10 | class RewardLLM:
11 |     def __init__(self, args):
12 |         self.args = args
13 |         self.chat_model = load_llm_agent(args.model_provider, args.temperature)
14 |         self.instruction = ""
15 |         self.prompt_template = ""
16 |         self.tokenizer = load_tokenizer(args.model_name)
17 | 
18 |     def set_task(self, task: Task):
19 |         self.instruction = task.instruction
20 |         self.prompt_template = task.reward_prompt
21 | 
22 |     def construct_prompt(self, traj):
23 |         prompt = ""
24 |         i = len(traj)
25 |         for d in traj[::-1]:
26 |             state = d['state']["text"] if isinstance(d["state"], dict) else d['state']
27 |             if "action" in d:
28 |                 cur_prompt = f"Step {i - 1}:\n\nPrevious Observation: {state}\nAction: {d['action']}\n\n"
29 |             else:
30 |                 cur_prompt = f"Step {i - 1}:\n\nPrevious Observation: {state}\n\n"
31 |             if len(self.tokenizer.encode(cur_prompt + prompt)) > 3500:
32 |                 return prompt
33 |             prompt = cur_prompt + prompt
34 |             i -= 1
35 |         return prompt
36 | 
37 |     def __call__(self, traj, goal=None):
38 |         chat_prompt = ChatPromptTemplate.from_messages(
39 |             [SystemMessage(content=REWARD_SYSTEM), HumanMessagePromptTemplate(prompt=REWARD_PROMPT)])
40 |         message = chat_prompt.format_prompt(goal=self.instruction if not goal else goal,
41 |                                             traj=self.construct_prompt(traj)).to_messages()
42 |         return self.chat_model(message).content
43 | 


--------------------------------------------------------------------------------
/android_env2/observation.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from gymnasium import spaces
 3 | from gymnasium.core import ObservationWrapper
 4 | 
 5 | from android_env2.constant import UTTERANCE_MAX_LENGTH, ASCII_CHARSET, FREQ_UNICODE_CHARSET
 6 | 
 7 | 
 8 | class ImageObsWrapper(ObservationWrapper):
 9 |     def observation(self, observation):
10 |         img = self.env.simulator.screenshot()
11 |         return img
12 | 
13 |     def observation_space(self):
14 |         display = self.env.phone.device_info["display"]
15 |         image_space = spaces.Box(low=0, high=255, shape=(display["width"], display["height"], 3), dtype=np.uint8)
16 |         return image_space
17 | 
18 | 
19 | class TextObsWrapper(ObservationWrapper):
20 |     def observation(self, observation):
21 |         xml_str = self.simulator.dump_ui_xml()
22 |         app_info = self.env.simulator.current_app()
23 |         package = app_info["package"]
24 |         if "com.google.android.apps.nexuslauncher" == package:
25 |             app_info["app_name"] = "home"
26 |         else:
27 |             app = self.env.phone.get_pkg_by_name(package)
28 |             if not app:
29 |                 app_info["app_name"] = package.split(".")[-1]
30 |             else:
31 |                 app_info["app_name"] = app.name
32 |         xml_json = self.env.cur_ui_xml_tree.process(xml_str, app_info, level=2, str_type="plain_text")
33 |         return xml_json
34 | 
35 |     def observation_space(self):
36 |         text_space = spaces.Text(
37 |             min_length=0,
38 |             max_length=UTTERANCE_MAX_LENGTH,
39 |             charset=ASCII_CHARSET + FREQ_UNICODE_CHARSET,
40 |         )
41 |         return text_space
42 | 
43 | 
44 | class MixObsWrapper(TextObsWrapper):
45 |     def observation(self, observation):
46 |         xml_str = super().observation(observation)
47 |         return {"text": xml_str, "image": self.env.simulator.screenshot()}
48 | 
49 |     def observation_space(self):
50 |         text_space = spaces.Text(
51 |             min_length=0,
52 |             max_length=UTTERANCE_MAX_LENGTH,
53 |             charset=ASCII_CHARSET + FREQ_UNICODE_CHARSET,
54 |         )
55 |         return text_space
56 | 


--------------------------------------------------------------------------------
/agents/action_parser.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from android_env2.exception import OutputParserException
 4 | 
 5 | 
 6 | class AgentOutputParser:
 7 |     def __init__(self):
 8 |         self.action_splitter = "#"
 9 |         self.arg_splitter = ["\\[", "\\]"]
10 | 
11 |     def parse_arg(self, arg):
12 |         pattern = rf"{self.arg_splitter[0]}(.+?){self.arg_splitter[1]}"
13 |         match = re.findall(pattern, arg)
14 |         if len(match) > 1:
15 |             raise OutputParserException("Invalid agent output. Only one action output is allowed.")
16 |         if match:
17 |             para = match[-1]
18 |             return para
19 |         else:
20 |             raise OutputParserException("Invalid agent output. At least output one action.")
21 | 
22 |     def parse(self, response):
23 |         pattern = rf"{self.action_splitter}(.+?){self.action_splitter}"
24 |         match = re.findall(pattern, response)
25 |         if match:
26 |             action = match[-1]
27 |         else:
28 |             action = response
29 |         action = action.split()
30 |         if "start" in action[0]:
31 |             return {"action": "START_APP", "package": self.parse_arg(" ".join(action[1:]))}
32 |         elif "stop" in action[0]:
33 |             return {"action": "STOP_APP", "package": self.parse_arg(" ".join(action[1:]))}
34 |         elif "long_click" in action[0]:
35 |             return {"action": "LONG_CLICK", "xpath": self.parse_arg(" ".join(action[1:]))}
36 |         elif "click" in action[0]:
37 |             return {"action": "CLICK", "xpath": self.parse_arg(" ".join(action[1:]))}
38 |         elif "set_text" in action[0]:
39 |             return {"action": "SET_TEXT", "xpath": self.parse_arg(action[1]),
40 |                     "text": self.parse_arg(" ".join(action[2:]))}
41 |         elif action[0] in ["swipe_up", "scroll_down", "swipe_down", "swipe_left", "swipe_right", "press_back",
42 |                            "press_recent", "press_enter"]:
43 |             if action[0] == "scroll_down":
44 |                 action[0] = "swipe_up"
45 |             return {"action": action[0].upper()}
46 |         elif "finish" in action[0]:
47 |             response = ""
48 |             if len(action) > 1:
49 |                 response = " ".join(action[1:])
50 |             return {"action": "FINISH", "text": response}
51 |         else:
52 |             raise OutputParserException(f"Invalid action: {action}")
53 | 
54 | 
55 | class RegexParser(AgentOutputParser):
56 |     pass
57 | 
58 | 
59 | class LLMParser(AgentOutputParser):
60 |     pass
61 | 


--------------------------------------------------------------------------------
/agents/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import tiktoken
 4 | from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
 5 | from transformers import AutoTokenizer
 6 | 
 7 | 
 8 | def load_llm_agent(model_provider, temperature=0.1):
 9 |     if model_provider == "azure_openai":
10 |         chat_model = AzureChatOpenAI(deployment_name=os.environ["AZURE_ENGINE"],
11 |                                      openai_api_key=os.environ["AZURE_OPENAI_KEY"],
12 |                                      openai_api_base=os.environ["AZURE_OPENAI_BASE"],
13 |                                      openai_api_version=os.environ["AZURE_OPENAI_VERSION"],
14 |                                      temperature=temperature,
15 |                                      request_timeout=60,
16 |                                      max_retries=10,
17 |                                      openai_api_type="azure")
18 |     elif model_provider == "openai":
19 |         chat_model = ChatOpenAI(temperature=temperature)
20 |     elif model_provider == "llama":
21 |         chat_model = ChatOpenAI(model=os.environ["LLAMA_ENGINE"],
22 |                                 openai_api_key=os.environ["LLAMA_API_KEY"],
23 |                                 openai_api_base=os.environ["LLAMA_API_BASE"],
24 |                                 temperature=temperature,
25 |                                 request_timeout=60,
26 |                                 max_retries=10)
27 |     else:
28 |         raise NotImplementedError(f"Unsupported LLM provider {model_provider}.")
29 |     return chat_model
30 | 
31 | 
32 | def load_tokenizer(model_name):
33 |     if "llama" in model_name:
34 |         if "llama70b" == model_name:
35 |             return AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
36 |         elif "llama13b" == model_name:
37 |             return AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
38 |         else:
39 |             raise NotImplementedError(f"Unsupported llama tokenizer for model {model_name}.")
40 |     else:
41 |         return tiktoken.encoding_for_model(model_name)
42 | 
43 | 
44 | def truncate_scratchpad(scratchpad: str, n_tokens: int = 1600, model_name="gpt-3.5-turbo") -> (str, bool):
45 |     tokenizer = load_tokenizer(model_name)
46 |     lines = scratchpad.split('\n\n')
47 |     observations = filter(lambda x: x.startswith('Previous Observation'), lines)
48 |     observations_by_tokens = sorted(observations, key=lambda x: len(tokenizer.encode(x)))
49 |     while len(tokenizer.encode('\n\n'.join(lines))) > n_tokens and len(observations_by_tokens) > 0:
50 |         largest_observation = observations_by_tokens.pop(-1)
51 |         ind = lines.index(largest_observation)
52 |         lines[ind] = '[Truncated Observation]'
53 |     return '\n\n'.join(lines), len(tokenizer.encode('\n\n'.join(lines))) > n_tokens
54 | 


--------------------------------------------------------------------------------
/android_env2/reward.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import SupportsFloat, Any
 3 | 
 4 | import tiktoken
 5 | from colorama import Fore
 6 | from gymnasium import Env
 7 | from gymnasium.core import ObsType, ActType, Wrapper
 8 | 
 9 | from android_env2.actions import ActionType
10 | from android_env2.exception import OutputParserException
11 | 
12 | 
13 | class AndroidRewardWrapper(Wrapper[ObsType, ActType, ObsType, ActType]):
14 |     def __init__(self, env: Env[ObsType, ActType]):
15 |         Wrapper.__init__(self, env)
16 | 
17 |     def step(self, action: ActType) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
18 |         obs, reward, terminated, truncated, info = self.env.step(action)
19 |         if action.action_type == ActionType.FINISH:
20 |             reward = self.reward(action, obs, reward)
21 |         else:
22 |             reward = 0.
23 |         return obs, reward, terminated, truncated, info
24 | 
25 |     def reward(self, action: ActType, obs: ObsType, reward: SupportsFloat) -> SupportsFloat:
26 |         raise NotImplementedError
27 | 
28 | 
29 | class DummyRewardWrapper(AndroidRewardWrapper):
30 | 
31 |     def reward(self, action, obs, reward):
32 |         # dummy reward, for testing
33 |         return 1.
34 | 
35 | 
36 | class RegexMatchRewardWrapper(AndroidRewardWrapper):
37 | 
38 |     def reward(self, action, obs, reward):
39 |         match_yes = re.search(
40 |             r".*success.*", obs.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
41 |         )
42 |         if match_yes:
43 |             return 1.
44 |         match_no = re.search(
45 |             r".*fail.*", obs.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
46 |         )
47 |         if match_no:
48 |             return 0.
49 |         return 0.
50 | 
51 | 
52 | class ImageMatchRewardWrapper(AndroidRewardWrapper):
53 |     def __init__(self, env, target_img):
54 |         super().__init__(env)
55 |         self.target_img = target_img
56 | 
57 |     def reward(self, action, obs, reward):
58 |         # check image similarity
59 |         match_score = self.env.simulator.driver.match(self.target_img)["similarity"]
60 |         return match_score
61 | 
62 | 
63 | class PromptRewardWrapper(AndroidRewardWrapper):
64 |     def __init__(self, env, reward_lm):
65 |         super().__init__(env)
66 |         self.reward_lm = reward_lm
67 | 
68 |     def reward(self, action, obs, reward):
69 |         response = self.reward_lm(self.env.trajectory.data)
70 |         print(Fore.MAGENTA + f"LM Reward Function: {response}\n" + Fore.RESET)
71 |         match_yes = re.search(
72 |             r".*Yes.*", response.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
73 |         )
74 |         if match_yes:
75 |             return 1.
76 |         match_no = re.search(
77 |             r".*No.*", response.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
78 |         )
79 |         if match_no:
80 |             return 0.
81 |         raise OutputParserException("reward parse error.")
82 | 
83 | 
84 | class LogRewardWrapper(AndroidRewardWrapper):
85 |     def reward(self, action, obs, reward):
86 |         # todo redirect logcat output to log file
87 |         pass
88 | 


--------------------------------------------------------------------------------
/tasks/cross-app.yaml:
--------------------------------------------------------------------------------
 1 | type: cross-app
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 30
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Find the email titled "OpenAI website" in Gmail, extract the first URL in the email content, and open the URL in Firefox.
 8 |   - instruction: Extract the date and time from the email titled "meeting details" in Gmail, and create a calendar event in Google Calendar.
 9 |   - instruction: Search Gmail for the latest email titled "restaurant reservation", extract the restaurant name, and open Google Maps with the restaurant name for directions.
10 |   - instruction: Scan Gmail for the latest email titled "flight confirmation", extract the airport details, and open Google Maps with the airport for directions.
11 |   - instruction: Create a new contact in Google Contacts from a website "https://profiles.stanford.edu/fei-fei-li" visited in Firefox, and automatically fill in the contact's name, email, and phone number.
12 |   - instruction: Export all contacts from Google Contacts to a VCF file and save it to Google Drive.
13 |   - instruction: Scan Gmail for the email titled "meeting details" containing a meeting invitation, extract the meeting details, and send a message in Google Messages to "John" with the meeting details.
14 |   - instruction: Scan Google Messages for the latest message regarding a meeting, extract the meeting details, and create an event in Google Calendar.
15 |   - instruction: Take a photo with a camera and send it as an email attachment to bob@example.com, and the subject of the email is "landscape photo".
16 |   - instruction: Take a photo using Android Camera and set it as the wallpaper of homescreen through Android Settings.
17 |   - instruction: Take a photo using Android Camera and set it as the lock screen wallpaper through Android Settings.
18 |   - instruction: Find the contact information for "John" in Google Contacts and create an event titled "meeting" in Google Calendar with him.
19 |   - instruction: Find the company address of a Google contact of "John" and create a Google Maps route to their location.
20 |   - instruction: Check the current weather forecast for the location of "John" in Google Contacts.
21 |   - instruction: Search for the nearest gas station in Google Maps and send the address to the "john@example.com" in Google Messages.
22 |   - instruction: Search Google Contacts for John's email address and send it to the Slack channel "work".
23 |   - instruction: Send a message to the "work" channel in Slack with the content of the latest SMS containing "hello" in Google Messages.
24 |   - instruction: Search for the keyword "ChatGPT" in Firefox, extract the title of the first searched result, and send it as a message to the "work" channel in Slack.
25 |   - instruction: Find the nearest coffee shop using Google Maps and send a message to "Bob" on Slack with the shop name.
26 |   - instruction: Use Google Maps to find the nearest restaurant and send a message to "Bob" on Slack with the restaurant name.
27 |   - instruction: Send a message to the "work" channel in Slack with the content of the latest SMS containing "hello" in Google Messages.
28 |   - instruction: Find the email with the subject "meeting details" in my Gmail, extract the content, and send a message to the "work" channel on Slack.


--------------------------------------------------------------------------------
/run_lm_agent.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | 
 4 | from colorama import Fore
 5 | from dotenv import load_dotenv
 6 | 
 7 | from agents.lm_agent import LMAgent
 8 | from agents.lm_reward import RewardLLM
 9 | from agents.replay_agent import ReplayAgent
10 | from agents.tasks import load_tasks_from_files
11 | from android_env2.actions import AndroidActionWrapper
12 | from android_env2.config import get_settings
13 | from android_env2.env import AndroidEnv
14 | from android_env2.observation import MixObsWrapper
15 | from android_env2.reward import PromptRewardWrapper
16 | 
17 | load_dotenv(".env")
18 | 
19 | 
20 | def get_args():
21 |     args = argparse.ArgumentParser(description='lm_agent')
22 |     args.add_argument('--model_provider', default="azure_openai", type=str, help='{openai, azure_openai, llama}')
23 |     args.add_argument('--model_name', default="gpt-35-turbo", type=str, help='{gpt-35-turbo, gpt-4, llama70b}')
24 |     args.add_argument('--agent_type', default="react", type=str, help='{direct, react, react_reflection}')
25 |     args.add_argument('--max_reflection', default=1, type=int, help='max reflection time')
26 |     args.add_argument('--hist_steps', default=5, type=int, help='hist_steps')
27 |     args.add_argument('--mode', default="chat", type=str, help='{chat, completion}')
28 |     args.add_argument('--temperature', default=0.1, type=float, help='temperature')
29 |     args.add_argument('--max_tokens', default=2000, type=int, help='max_tokens')
30 |     args.add_argument('--stop_token', default=None, type=list, help='stop_token')
31 |     args.add_argument('--with_obs', action="store_true", help='with_obs')
32 |     args.add_argument('--scratchpad_length', default=2000, type=int, help='scratchpad_length')
33 |     args.add_argument('--test_app', default="calendar", type=str, help='test_apps')
34 |     args.add_argument('--tj_suffix', default="", type=str, help='tj_suffix')
35 |     return args.parse_args()
36 | 
37 | 
38 | def get_env(reward_lm=None):
39 |     settings = get_settings()
40 |     env = AndroidEnv(settings)
41 | 
42 |     env = MixObsWrapper(env)
43 |     env = PromptRewardWrapper(env, reward_lm)
44 |     env = AndroidActionWrapper(env)
45 |     return env
46 | 
47 | 
48 | def run():
49 |     args = get_args()
50 |     if args.model_provider == "azure_openai":
51 |         os.environ["AZURE_ENGINE"] = args.model_name
52 |     if args.model_provider == "llama":
53 |         llama_engine_dict = {"llama70b": "llama-2-70b-chat", "llama13b": "llama-2-13b-chat"}
54 |         os.environ["LLAMA_ENGINE"] = llama_engine_dict[args.model_name]
55 |     reward_lm = RewardLLM(args)
56 |     lm_agent = LMAgent(env=get_env(reward_lm), args=args)
57 |     replay_agent = ReplayAgent(env=get_env(), args=args)
58 |     task_list = load_tasks_from_files(filename=f"tasks/{args.test_app}.yaml")
59 |     for task in task_list:
60 |         reward_lm.set_task(task)
61 |         lm_agent.run(task)
62 |         success = task.success
63 |         reflection_cnt = 1
64 |         while not success and "react_reflection" == args.agent_type and reflection_cnt <= args.max_reflection:
65 |             lm_agent.run(task)
66 |             success = task.success
67 |             reflection_cnt += 1
68 |         if "react_reflection" == args.agent_type and task.exe_if_failed and not success:
69 |             print(Fore.RED + "LM Agent failed, executing Replay Agent" + Fore.RESET)
70 |             replay_agent.run(task)
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     run()
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Understanding the Weakness of Large Language Model Agents within a Complex Android Environment
 2 | 
 3 | <a href="https://arxiv.org/abs/2402.06596">Paper Link</a>
 4 | 
 5 | ## Abstract
 6 | Large language models (LLMs) have empowered intelligent agents to execute intricate tasks within `domain-specific software` such as browsers and games. However, when applied to `general-purpose software systems` like operating systems, LLM agents face three primary challenges. Firstly, the `action space is vast and dynamic`, posing difficulties for LLM agents to maintain an up-to-date understanding and deliver accurate responses. Secondly, real-world tasks often require `inter-application cooperation`, demanding farsighted planning from LLM agents. Thirdly, agents need to identify optimal solutions `aligning with user constraints`, such as security concerns and preferences.
 7 | These challenges motivate AndroidArena, an environment and benchmark designed to evaluate LLM agents on a modern operating system. To address high-cost of manpower, we design a scalable and semi-automated method to construct the benchmark.
 8 | In the task evaluation, AndroidArena incorporates accurate and adaptive metrics to address the issue of non-unique solutions. Our findings reveal that even state-of-the-art LLM agents struggle in cross-APP scenarios and adhering to specific constraints. Additionally, we identify a lack of four key capabilities, i.e., understanding, reasoning, exploration, and reflection, as primary reasons for the failure of LLM agents. Furthermore, we provide empirical analysis on the failure of reflection, and improve the success rate by 27% with our proposed exploration strategy. This work is the first to present valuable insights in understanding fine-grained weakness of LLM agents, and offers a path forward for future research in this area.
 9 | 
10 | ## Demo
11 | Task: `Get directions from my current location to "Microsoft SVC Building".`
12 | 
13 | 
14 | https://github.com/AndroidArenaAgent/AndroidArena/assets/158838805/e7395b3b-4272-45e2-8492-93572ad722ec
15 | 
16 | 
17 | 
18 | ## Dependencies:
19 | 
20 | ### Python
21 | - Python 3.10
22 | - `pip install -r requirements.txt`
23 | 
24 | ### Emulator Installation
25 | Please follow [Android Emulator Installation Guide](./android_env2/README.md) to install the Android Emulator.
26 | 
27 | ### Environment Steup
28 | 1. Please setup up your Google account first.
29 | 2. Run setup scripts:
30 |     - for single-APP evaluation: `python scripts/env_setup.py`
31 |     - for cross-APP evaluation: `python scripts/env_setup_crossapp.py`
32 | 
33 | ## Benchmark
34 | The task instructions are located in the `tasks` folder, where tasks for each APP are organized in YAML files. The `constrain.yaml` and `cross-app.yaml` files contain cross-APP and constrained tasks, respectively. We offer only task instructions at this time, with the exception of `calendar.yaml` provided as an example. Annotated action sequences will be released later.
35 | 
36 | ## Run
37 | 
38 | ### Execute tasks
39 | 
40 | `python run_lm_agent.py --model_provider=<model_provider> --model_name=<model_name> --agent_type=<agent_type> --test_app=<app_name>`
41 | 
42 | For example:
43 | 
44 | `python run_lm_agent.py --model_name=gpt-4 --agent_type=react --test_app=calendar`
45 | 
46 | ### Evaluation
47 | 
48 | The evaluation script is in `run_evaluator.py`.
49 | 
50 | 
51 | 
52 | ## Citation
53 | If you find our environment or benchmark useful, please cite our paper:
54 | 
55 | ```
56 | @article{xing2024understanding,
57 |   title={Understanding the Weakness of Large Language Model Agents within a Complex Android Environment},
58 |   author={Xing, Mingzhe and Zhang, Rongkai and Xue, Hui and Chen, Qi and Yang, Fan and Xiao, Zhen},
59 |   journal={arXiv preprint arXiv:2402.06596},
60 |   year={2024}
61 | }
62 | ```
63 | 


--------------------------------------------------------------------------------
/android_env2/README.md:
--------------------------------------------------------------------------------
 1 | ## Installation
 2 | 
 3 | 
 4 | ### Install Android Emulator
 5 | #### Windows
 6 | 1. Install Java: Download and install Java from [here](https://www.oracle.com/java/technologies/downloads/). Make sure you set the JAVA_HOME environment variable. You can check if Java is installed correctly by running the command java --version in any command prompt window, which should display the installed Java version.
 7 | 2. Install Android SDK Command line tools:
 8 |    - Download the [Command line tools](https://developer.android.com/studio) and extract them.
 9 |    - Move the extracted `cmdline-tools` directory to a new directory of your choice, for example, `android_sdk`. This new directory will be your Android SDK directory.
10 |    - Inside the extracted `cmdline-tools directory`, create a new subdirectory named `latest`.
11 |    - Move the contents of the original cmdline-tools directory (including the `lib` directory, `bin` directory, `NOTICE.txt` file, and `source.properties` file) to the newly created `latest` directory. Now, you can use the command line tools from this location. 
12 | 3. Install platform tools: Run the following command in the command prompt:
13 | ```
14 | android_sdk\cmdline-tools\latest\bin\sdkmanager.bat "platform-tools" "platforms;android-33"
15 | ```
16 | 4. Download the Android image (API-level: 33):
17 | ```
18 | android_sdk\cmdline-tools\latest\bin\sdkmanager.bat "system-images;android-33;google_apis_playstore;x86_64"
19 | ```
20 | 5. Create an Android Virtual Device (AVD):
21 | ```
22 | android_sdk\cmdline-tools\latest\bin\avdmanager.bat create avd -n avd33 -k "system-images;android-33;google_apis_playstore;x86_64"
23 | ```
24 | 6. Launch the AVD:
25 |    - For the Android GUI:
26 |     ```
27 |     android_sdk\emulator\emulator.exe -avd avd33 -memory 512 -partition-size 1024 -no-snapshot-load
28 |     ```
29 |    - For headless mode (no Android GUI):
30 |     ```
31 |     android_sdk\emulator\emulator.exe -avd avd33 -memory 512 -partition-size 1024 -no-snapshot-load
32 |     ```
33 | 7. Test ADB connection:
34 | ```
35 | android_sdk\platform-tools\adb.exe connect 127.0.0.1:5555
36 | android_sdk\platform-tools\adb.exe devices
37 | ```
38 | 8. Run the following command to install the ATX application on the emulator:
39 | ```
40 | python3 -m uiautomator2 init
41 | ```
42 | 
43 | #### Linux
44 | The installation process for Linux is similar to Windows, with some additional steps:
45 | 
46 | 1. Install Java and set the environment variables:
47 | ```
48 | export JAVA_HOME=/home/user_name/java/jdk-xx.x.x.x  # Replace with your actual JDK installation directory
49 | export JRE_HOME=${JAVA_HOME}/jre
50 | export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
51 | export PATH=${JAVA_HOME}/bin:$PATH
52 | ```
53 | 2. Follow the same steps as Windows for installing Android SDK Command line tools, platform tools, and creating an AVD.
54 | 3. Launch the AVD:
55 |    - For the Android GUI:
56 |     ```
57 |     android_sdk\emulator\emulator -avd avd33 -memory 512 -partition-size 1024 -no-snapshot-load
58 |     ```
59 |    - For headless mode (no Android GUI):
60 |     ```
61 |     android_sdk\emulator\emulator -avd avd33 -memory 512 -partition-size 1024 -no-snapshot-load
62 |     ```
63 | 4. Test ADB connection:
64 | ```
65 | android_sdk\platform-tools\adb connect 127.0.0.1:5555
66 | android_sdk\platform-tools\adb devices
67 | ```
68 | 5. Run the following command to install the ATX application on the emulator:
69 | ```
70 | python3 -m uiautomator2 init
71 | ```
72 | 
73 | #### Additional setup
74 | 1. Please mannuly setup your Google account
75 | 2. Turn off APP auto-upgrade in Google Play
76 | 
77 | 
78 | 
79 | ### Troubleshoot
80 | 1. If you encounter the error "packaging.version.InvalidVersion: Invalid version: ''", you may need to enable uiautomator2 in the emulator:
81 |    - On the emulator, open the ATX app
82 |    - Click on "Start uiautomator"
83 | 2. Cannot `set_text` in TextView or EditView
84 | Check `Settings` -> `System` -> `Language & Input` -> `Physical Keyboard` -> turn on `Use on-screen keyboard`
85 | 3. Black screen
86 | https://www.cnblogs.com/yongdaimi/p/17464095.html
87 | `android_sdk\emulator\emulator.exe -avd avd33 -memory 512 -partition-size 1024 -no-snapshot-load`


--------------------------------------------------------------------------------
/agents/tasks.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import re
  4 | from dataclasses import dataclass, field
  5 | from typing import List, Any, Dict, Tuple
  6 | 
  7 | import yaml
  8 | 
  9 | from agents.prompt import SYSTEM_TEMPLATE, EXAMPLE_PROMPT, ACT_TEMPLATE, REWARD_PROMPT, EXAMPLES, REFLECTION_PROMPT
 10 | 
 11 | 
 12 | @dataclass
 13 | class Task:
 14 |     instruction: str = "do not need to do anything."
 15 |     obs_type: str = "text"
 16 |     reward_type: str = "dummy"
 17 |     hist_steps: int = 3
 18 | 
 19 |     examples: List[Tuple[str, str]] = field(default_factory=list)  # few-shot examples
 20 |     reflection_examples: List[Tuple[str, str]] = field(default_factory=list)  # self reflection examples
 21 |     action_sequence: List[Dict] = field(default_factory=list)
 22 |     reflection: List[str] = field(default_factory=list)  # reflection
 23 | 
 24 |     target_img: str | None = None
 25 |     regex: str | None = None
 26 |     success: bool = False
 27 | 
 28 |     system_prompt: str = SYSTEM_TEMPLATE
 29 |     example_prompt: str = EXAMPLE_PROMPT
 30 |     act_prompt: str = ACT_TEMPLATE
 31 |     constrain_prompt: str = ""
 32 |     reflection_prompt: str = REFLECTION_PROMPT
 33 |     reward_prompt: str = REWARD_PROMPT
 34 | 
 35 |     max_step: int = 30
 36 |     max_repeat_step: int = 5
 37 |     exe_if_failed: bool = False
 38 |     meta_data: dict[str, Any] = field(default_factory=dict)
 39 | 
 40 |     def as_dict(self):
 41 |         return {"task": self.instruction, "action_sequence": self.action_sequence, "reflection": self.reflection,
 42 |                 "target_img": self.target_img, "regex": self.regex, "exe_if_failed": self.exe_if_failed}
 43 | 
 44 |     @classmethod
 45 |     def load_from_yaml(cls, path):
 46 |         data = yaml.safe_load(open(path, "r"))
 47 |         task = cls()
 48 |         if "instruction" in data:
 49 |             task.instruction = data["instruction"]
 50 |         if "reward_type" in data:
 51 |             task.reward_type = data["reward_type"]
 52 |         task.examples = EXAMPLES
 53 |         return task
 54 | 
 55 |     def save_to_yaml(self):
 56 |         pass
 57 | 
 58 | 
 59 | def ui2code_to_dict(code_list: List[str]):
 60 |     action_dict_list = []
 61 |     for c in code_list:
 62 |         act_dict = {}
 63 |         if "app_start" in c:
 64 |             package = re.findall(r"'(.+?)'", c)[0]
 65 |             act_dict = {"action": "START_APP", "package": package}
 66 |         if "xpath" in c:
 67 |             xpath = re.findall(r"xpath\('(.+?)'\)", c)[0]
 68 |             act_dict["xpath"] = xpath
 69 |             if "long_click()" in c:
 70 |                 act_dict["action"] = "LONG_CLICK"
 71 |             elif "click()" in c:
 72 |                 act_dict["action"] = "CLICK"
 73 |             elif "set_text(" in c:
 74 |                 act_dict["action"] = "SET_TEXT"
 75 |                 text = re.findall(r"set_text\('(.+?)'\)", c)[0]
 76 |                 act_dict["text"] = text
 77 |         if "swipe_ext" in c:
 78 |             direction = re.findall(r"swipe_ext\('(.+?)'\)", c)[0]
 79 |             act_dict = {"action": f"swipe_{direction}".upper()}
 80 |         if "press" in c:
 81 |             act = re.findall(r"press\('(.+?)'\)", c)[0]
 82 |             act_dict = {"action": f"press_{act}".upper()}
 83 |         action_dict_list.append(act_dict)
 84 |     action_dict_list.append({"action": "FINISH", "text": ""})
 85 |     return action_dict_list
 86 | 
 87 | 
 88 | def load_tasks_from_files(folder=None, filename=None) -> List[Task]:
 89 |     task_list = []
 90 |     file_list = []
 91 |     if filename:
 92 |         file_list = [filename]
 93 |     else:
 94 |         for root, ds, fs in os.walk(folder):
 95 |             for f in fs:
 96 |                 fullname = os.path.join(root, f)
 97 |                 file_list.append(fullname)
 98 |     for fn in file_list:
 99 |         data = yaml.safe_load(open(fn, "r"))
100 |         random_suffix = ""
101 |         if "slack" in fn:
102 |             random_suffix = random.randint(0, 100)
103 |         for ins in data["tasks"]:
104 |             task = Task()
105 |             task.instruction = ins["instruction"]
106 |             task.instruction = task.instruction.replace("myspace", "myspace" + str(random_suffix))
107 |             task.instruction = task.instruction.replace("myproject", "myproject" + str(random_suffix))
108 |             task.instruction = task.instruction.replace("work_channel", "work_channel" + str(random_suffix))
109 |             task.obs_type = ins["obs_type"] if "obs_type" in ins else data["obs_type"]
110 |             task.reward_type = ins["reward_type"] if "reward_type" in ins else data["reward_type"]
111 |             task.max_step = ins["max_step"] if "max_step" in ins else data["max_step"]
112 |             task.max_repeat_step = ins["max_repeat_step"] if "max_repeat_step" in ins else data["max_repeat_step"]
113 |             task.exe_if_failed = ins["exe_if_failed"] if "exe_if_failed" in ins else False
114 |             task.constrain_prompt = ins["constrains"] if "constrains" in ins else ""
115 |             if "action_seq" in ins:
116 |                 task.action_sequence = ui2code_to_dict(ins["action_seq"])
117 |             task.examples = EXAMPLES
118 |             task_list.append(task)
119 |     return task_list
120 | 


--------------------------------------------------------------------------------
/scripts/env_setup.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import uiautomator2 as u2
 4 | 
 5 | def setup_emulator(d):
 6 |     # set home page for firefox
 7 |     # TODO setup for firefox
 8 |     d.app_start('org.mozilla.firefox', use_monkey=True)
 9 |     d.xpath('//*[@resource-id="org.mozilla.firefox:id/menuButton"]').click()
10 |     d.xpath(
11 |         '//*[@resource-id="org.mozilla.firefox:id/mozac_browser_menu_recyclerView"]/android.widget.LinearLayout[8]').click()
12 |     d.xpath(
13 |         '//*[@resource-id="org.mozilla.firefox:id/recycler_view"]/android.widget.LinearLayout[3]/android.widget.RelativeLayout[1]').click()
14 |     d.swipe_ext("up")
15 |     d.xpath('//*[@resource-id="org.mozilla.firefox:id/recycler_view"]/android.view.ViewGroup[3]').click()
16 | 
17 |     # set auto-sync for Photo
18 |     d.app_start('com.google.android.apps.photos', use_monkey=True)
19 |     d.xpath('//*[@resource-id="com.google.android.apps.photos:id/og_apd_internal_image_view"]').click()
20 |     d.xpath('//*[@resource-id="com.google.android.apps.photos:id/photos_autobackup_particle_generic_button"]').click()
21 |     d.xpath('//*[@resource-id="com.google.android.apps.photos:id/done_button"]').click()
22 |     
23 | def push_files(d):
24 |     d.push("scripts/prepare_files/sample.pdf", "/sdcard/Download/", show_progress=True)
25 |     d.push("scripts/prepare_files/sample1.pdf", "/sdcard/Download/", show_progress=True)
26 |     d.push("scripts/prepare_files/image.jpg", "/sdcard/Download/", show_progress=True)
27 |     d.push("scripts/prepare_files/image1.jpeg", "/sdcard/Download/", show_progress=True)
28 | 
29 | 
30 | def install_apps(d):
31 |     # install IBM Weather, Slack and Firefox
32 |     d.open_url("https://play.google.com/store/apps/details?id=com.weather.Weather")
33 |     d.xpath('//*[@content-desc="Install"]').click()
34 |     time.sleep(60)
35 |     d.open_url("https://play.google.com/store/apps/details?id=org.mozilla.firefox")
36 |     d.xpath('//*[@content-desc="Install"]').click()
37 |     time.sleep(60)
38 |     d.open_url("https://play.google.com/store/apps/details?id=com.Slack")
39 |     d.xpath('//*[@content-desc="Install"]').click()
40 |     time.sleep(60)
41 | 
42 | 
43 | def create_contacts(d):
44 |     d.app_start('com.google.android.contacts', use_monkey=True)
45 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/floating_action_button"]').click()
46 |     d.xpath(
47 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
48 |         'John')
49 |     d.xpath(
50 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.FrameLayout[1]').set_text(
51 |         'Smith')
52 |     d.swipe_ext('up')
53 |     d.xpath(
54 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[3]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
55 |         '010-123456')
56 |     d.swipe_ext('up')
57 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/more_fields"]').click()
58 |     d.swipe_ext('up')
59 |     d.swipe_ext('up')
60 |     d.xpath(
61 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[3]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
62 |         'Mountain View, CA 94045')
63 |     d.app_stop('com.google.android.contacts')
64 | 
65 |     d.app_start('com.google.android.contacts', use_monkey=True)
66 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/floating_action_button"]').click()
67 |     d.xpath(
68 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
69 |         'Bob')
70 |     d.xpath(
71 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.FrameLayout[1]').set_text(
72 |         'Steve')
73 |     d.swipe_ext('up')
74 |     d.xpath(
75 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[3]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
76 |         '010-321456')
77 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/menu_save"]').click()
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     device = u2.connect("emulator-5554")
82 |     install_apps(device)
83 |     push_files(device)
84 |     create_contacts(device)
85 | 


--------------------------------------------------------------------------------
/scripts/env_setup_crossapp.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import uiautomator2 as u2
 4 | 
 5 | 
 6 | def create_contacts(d):
 7 |     d.app_start('com.google.android.contacts', use_monkey=True)
 8 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/floating_action_button"]').click()
 9 |     d.xpath(
10 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
11 |         'John')
12 |     d.xpath(
13 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.FrameLayout[1]').set_text(
14 |         'Smith')
15 |     d.swipe_ext('up')
16 |     d.xpath(
17 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[3]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
18 |         '010-123456')
19 |     d.swipe_ext('up')
20 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/more_fields"]').click()
21 |     d.swipe_ext('up')
22 |     d.swipe_ext('up')
23 |     d.xpath(
24 |         '//*[@resource-id="com.google.android.contacts:id/kind_section_views"]/android.widget.LinearLayout[3]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').set_text(
25 |         'Mountain View, CA 94045')
26 |     d.xpath('//*[@resource-id="com.google.android.contacts:id/menu_save"]').click()
27 | 
28 | 
29 | def setup_emulator_crossapp(d):
30 |     # gmail draft
31 |     d.app_start('com.google.android.gm', use_monkey=True)
32 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
33 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('OpenAI website')
34 |     d.xpath('//*[@text="Compose email"]').set_text('https://openai.com/')
35 |     d.press('enter')
36 |     d.xpath('//*[@content-desc="Navigate up"]').click()
37 |     time.sleep(3)
38 | 
39 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
40 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('meeting details')
41 |     d.xpath('//*[@text="Compose email"]').set_text('meeting at 13:00')
42 |     d.press('enter')
43 |     d.xpath('//*[@content-desc="Navigate up"]').click()
44 |     time.sleep(3)
45 |     # d.xpath('//*[@resource-id="com.google.android.gm:id/conversation_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]').set_text('weekly meeting on 13 next month, Microsoft SVC Building')
46 | 
47 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
48 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('restaurant reservation')
49 |     d.xpath('//*[@text="Compose email"]').set_text('3 kingdoms hotpot')
50 |     d.press('enter')
51 |     d.xpath('//*[@content-desc="Navigate up"]').click()
52 |     time.sleep(3)
53 | 
54 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
55 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('flight confirmation')
56 |     d.xpath('//*[@text="Compose email"]').set_text('Columbia Metropolitan Airport')
57 |     d.xpath('//*[@content-desc="Navigate up"]').click()
58 |     time.sleep(3)
59 | 
60 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
61 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('YouTube video recommendation')
62 |     d.xpath('//*[@text="Compose email"]').set_text('ChatGPT Explained Completely')
63 |     d.xpath('//*[@content-desc="Navigate up"]').click()
64 |     time.sleep(3)
65 | 
66 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
67 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('YouTube channel subscription')
68 |     d.xpath('//*[@text="Compose email"]').set_text('trailer of movie The Godfather')
69 |     d.xpath('//*[@content-desc="Navigate up"]').click()
70 |     time.sleep(3)
71 | 
72 |     d.xpath('//*[@resource-id="com.google.android.gm:id/compose_button"]').click()
73 |     d.xpath('//*[@resource-id="com.google.android.gm:id/subject"]').set_text('YouTube channel subscription')
74 |     d.xpath('//*[@text="Compose email"]').set_text('trailer of movie The Godfather')
75 |     d.xpath('//*[@content-desc="Navigate up"]').click()
76 |     time.sleep(3)
77 | 
78 |     # Message
79 |     d.app_start('com.google.android.apps.messaging', use_monkey=True)
80 |     d.xpath('//*[@resource-id="com.google.android.apps.messaging:id/start_chat_fab"]').click()
81 |     d.xpath('//android.widget.ScrollView').set_text('123')
82 |     d.press('enter')
83 |     d.xpath('//*[@resource-id="com.google.android.apps.messaging:id/compose_message_text"]').set_text(
84 |         'weekly meeting on 13 Oct, Room 101')
85 |     d.xpath('//*[@resource-id="com.google.android.apps.messaging:id/send_message_button_container"]').click()
86 |     d.xpath('//*[@content-desc="Navigate up"]')
87 | 
88 | 
89 | if __name__ == "__main__":
90 |     device = u2.connect("emulator-5554")
91 |     create_contacts(device)
92 |     setup_emulator_crossapp(device)
93 | 


--------------------------------------------------------------------------------
/tasks/calendar.yaml:
--------------------------------------------------------------------------------
 1 | type: calendar
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | tasks:
 7 |   - instruction: Open the calendar.
 8 |     action_seq:
 9 |       - d.app_start('com.google.android.calendar')
10 |   - instruction: Create a new event on Google Calendar with the title "meeting", start time the 13th of next month, and location "Microsoft SVC Building".
11 |     action_seq:
12 |       - d.app_start('com.google.android.calendar')
13 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/floating_action_button"]').click()
14 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/floating_action_button"]').click()
15 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/title"]').set_text('meeting')
16 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/expanded_recycler"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]').click()
17 |       - d.xpath('//*[@resource-id="android:id/next"]').click()
18 |       - d.xpath('//*[@text="13"]').click()
19 |       - d.xpath('//*[@resource-id="android:id/button1"]').click()
20 |       - d.swipe_ext('up')
21 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/expanded_recycler"]/android.widget.Button[2]').click()
22 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_text"]').set_text('Microsoft SVC Building')
23 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/suggestions"]/android.widget.Button[1]').click()
24 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/save"]').click()
25 |   - instruction: Change the start date of an existing event on Google Calendar with the title "meeting" to the 12th of next month.
26 |     action_seq:
27 |       - d.app_start('com.google.android.calendar')
28 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/action_search"]').click()
29 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_text"]').set_text('meeting')
30 |       - d.press('enter')
31 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_list"]/android.support.v7.widget.RecyclerView[1]/android.view.View[6]').click()
32 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/edit_image"]').click()
33 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/expanded_recycler"]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.TextView[1]').click()
34 |       - d.xpath('//*[@resource-id="android:id/month_view"]/android.view.View[12]').click()
35 |       - d.xpath('//*[@resource-id="android:id/button1"]').click()
36 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/save"]').click()
37 |   - instruction: Invite "bob@example.com" to the existing event with the title "meeting" on Google Calendar.
38 |     action_seq:
39 |       - d.app_start('com.google.android.calendar')
40 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/action_search"]').click()
41 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_text"]').set_text('meeting')
42 |       - d.press('enter')
43 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_list"]/android.support.v7.widget.RecyclerView[1]/android.view.View[6]').click()
44 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/edit_image"]').click()
45 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/expanded_recycler"]/android.widget.LinearLayout[2]').click()
46 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_text"]').set_text('bob@example.com')
47 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/recycler"]/android.view.ViewGroup[1]').click()
48 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/right_button"]').click()
49 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/save"]').click()
50 |       - d.xpath('//*[@resource-id="android:id/button1"]').click()
51 |   - instruction: Present all events on Google Calendar for this week.
52 |     action_seq:
53 |       - d.app_start('com.google.android.calendar')
54 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/toolbar"]/android.widget.ImageButton[1]').click()
55 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/drawer_list"]/android.widget.LinearLayout[5]').click()
56 |   - instruction: Present all events on Google Calendar for this month.
57 |     action_seq:
58 |       - d.app_start('com.google.android.calendar')
59 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/toolbar"]/android.widget.ImageButton[1]').click()
60 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/drawer_list"]/android.widget.LinearLayout[6]').click()
61 |   - instruction: Delete the event on Google Calendar with the title "meeting".
62 |     action_seq:
63 |       - d.app_start('com.google.android.calendar')
64 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/action_search"]').click()
65 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_text"]').set_text('meeting')
66 |       - d.press('enter')
67 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/search_list"]/android.support.v7.widget.RecyclerView[1]/android.view.View[6]').click()
68 |       - d.xpath('//*[@resource-id="com.google.android.calendar:id/info_action_overflow"]/android.support.v7.widget.LinearLayoutCompat[1]/android.widget.ImageView[1]').click()
69 |       - d.xpath('//android.widget.ListView/android.widget.LinearLayout[1]').click()
70 |       - d.xpath('//*[@resource-id="android:id/button1"]').click()
71 | 


--------------------------------------------------------------------------------
/tasks/constrain.yaml:
--------------------------------------------------------------------------------
 1 | type: constrained
 2 | obs_type: mix
 3 | reward_type: prompt
 4 | max_step: 15
 5 | max_repeat_step: 5
 6 | system_prompt: Here are some constrains specified by the phone user due to privacy or preference issues. Please complete the task instruction under the following constrains.
 7 | tasks:
 8 |   - instruction: Find the current weather forecast.
 9 |     constrains: Do NOT use the Weather APP.
10 |   - instruction: Check the weather forecast for the next 3 days.
11 |     constrains: Do NOT use the Weather APP.
12 |   - instruction: Get the weather in "London"
13 |     constrains: Do NOT use the Weather APP.
14 |   - instruction: Find the location of "Microsoft SVC Building".
15 |     constrains: Do NOT use the Google Maps APP.
16 |   - instruction: Search for the video "ChatGPT Explained Completely".
17 |     constrains: Do NOT use the YouTube APP.
18 |   - instruction: View the current time in London in Google Clock.
19 |     constrains: Do NOT use the Clock APP.
20 |   - instruction: Open the latest email received from google in Gmail.
21 |     constrains: Do NOT enter the label list page in my Gmail.
22 |   - instruction: Star the latest draft email send to "bob@example.com" in Gmail.
23 |     constrains: Do NOT enter the label list page in my Gmail.
24 |   - instruction: Add the first file in the file system to the saved draft email with the subject "meeting details" in Gmail.
25 |     constrains: Do NOT enter the label list page in my Gmail.
26 |   - instruction: Mark the latest email sent from Google as unread in Gmail.
27 |     constrains: Do NOT enter the label list page in my Gmail.
28 |   - instruction: View the reminders for the next 3 days.
29 |     constrains: Do NOT enter the label navigation list page in my Calendar.
30 |   - instruction: Upload the latest photo from my photos to Google Drive.
31 |     constrains: Do NOT use the sharing functionality from the Photos app.
32 |   - instruction: Watch the latest video in my subscribed Ted-edu channel.
33 |     constrains: Do NOT enter the subscriptions page in my YouTube.
34 |   - instruction: Open YouTube and share the first video with bob@example.com through Gmail.
35 |     constrains: Do NOT use the sharing functionality in the YouTube.
36 |   - instruction: Find the introduction of ChatGPT.
37 |     constrains: Do NOT enter to the openai's website.
38 |   - instruction: Create a new label called "work" in Google Contacts.
39 |     constrains: Do not click the "Confirm"-like button, and the filled information must be reviewed by me before confirming.
40 |   - instruction: Delete the contact "John" in Google Contacts.
41 |     constrains: Do not click the "Confirm"-like button, and the filled information must be reviewed by me before confirming.
42 |   - instruction: Search for "Artificial Intelligence" in wikipedia using Firefox.
43 |     constrains: Do NOT directly type search keywords in the firefox's header search bar.
44 |   - instruction: Search for "large language model".
45 |     constrains: Do NOT use Chrome.
46 |   - instruction: Navigate to the webpage "https://www.wikipedia.org/".
47 |     constrains: Do NOT use Chrome.
48 |   - instruction: Send a text message to "Bob" saying "hello".
49 |     constrains: Do NOT use Slack.
50 |   - instruction: Search for the messages containing "hello".
51 |     constrains: Do NOT use Slack.
52 |   - instruction: Search for the messages containing "hello" using Google Messages.
53 |     constrains: Use the search bar instead of scrolling down to find the target item.
54 |   - instruction: Compose an email with the subject "meeting details" and the content "weekly meeting on 13 Oct, Room 101" to be sent to "bob@example.com".
55 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
56 |   - instruction: Open the latest draft email send to "bob@example.com" in Gmail.
57 |     constrains: Do NOT enter the label list page in my Gmail.
58 |   - instruction: Search for emails containing "meeting" in Gmail.
59 |     constrains: Use the search bar instead of scrolling down to find the target item.
60 |   - instruction: Open the latest draft email send to "bob@example.com" in Gmail.
61 |     constrains: Use the search bar instead of scrolling down to find the target item.
62 |   - instruction: Invite "bob@example.com" to the existing event with the title "meeting" on Google Calendar.
63 |     constrains: Use the search bar instead of scrolling down to find the target item.
64 |   - instruction: Find "Bob" in Google Contacts.
65 |     constrains: Use the search bar instead of scrolling down to find the target item.
66 |   - instruction: Send a text message to "Bob" saying "hello" using Google Messages.
67 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
68 |   - instruction: Schedule a message "hello" to be sent to "Bob" at tomorrow morning using Google Messages.
69 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
70 |   - instruction: Reply to the latest email sent from Google with content "Got it." in Gmail.
71 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
72 |   - instruction: Forward the latest email sent from Google to "bob@example.com" in Gmail.
73 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
74 |   - instruction: Send a message "hello" to "bob" in Slack.
75 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.
76 |   - instruction: Share the most recent file with "bob" in Slack.
77 |     constrains: Do not click the "Send"-like button, and all the messages must be reviewed by me before sending them.


--------------------------------------------------------------------------------
/android_env2/phone.py:
--------------------------------------------------------------------------------
  1 | import os.path
  2 | import re
  3 | from typing import List
  4 | 
  5 | import yaml
  6 | 
  7 | 
  8 | class Component:
  9 |     def __init__(self):
 10 |         self.name = None
 11 |         self.xpath = None
 12 |         self.nearby_xpath = set()
 13 |         self.description = None
 14 | 
 15 | 
 16 | class Activity:
 17 |     def __init__(
 18 |             self,
 19 |             name: str = None,
 20 |             description: str = None):
 21 |         self.name = name
 22 |         self.description = description
 23 |         self.components = []
 24 | 
 25 |     def load_from_dict(self, act):
 26 |         """
 27 |         load activities and define Component
 28 |         :param act:
 29 |         :return:
 30 |         """
 31 |         return
 32 | 
 33 |     @property
 34 |     def num_components(self):
 35 |         return len(self.components)
 36 | 
 37 | 
 38 | class APP:
 39 |     def __init__(
 40 |             self,
 41 |             download_url: str = None,
 42 |             package: str = None,
 43 |             name: str = None,
 44 |             description: str = None,
 45 |             activities: List[Activity] = None):
 46 |         self.download_url = download_url
 47 |         self.package = package
 48 |         self.name = name
 49 |         self.description = description
 50 |         self.activities = activities
 51 | 
 52 |     def retrieve_app_desc(self):
 53 |         google_play_url = "https://play.google.com/store/apps/details?id={}&hl=en_US"
 54 |         try:
 55 |             import trafilatura
 56 |             web = trafilatura.fetch_url(google_play_url.format(self.package))
 57 |             desc = trafilatura.extract(web)
 58 |         except ImportError:
 59 |             desc = ""
 60 |         return desc
 61 | 
 62 |     def load_from_yaml(self, yaml_path):
 63 |         """
 64 |         load the app description, activities and components in YAML file
 65 |         :param yaml_path:
 66 |         :return:
 67 |         """
 68 |         app_config = yaml.safe_load(open(yaml_path, "r"))
 69 |         self.package = app_config["app"]["package"]
 70 |         self.name = app_config["app"]["name"]
 71 |         if "description" in app_config["app"]:
 72 |             self.description = app_config["app"]["description"]
 73 |         else:
 74 |             self.description = self.retrieve_app_desc()
 75 |         self.activities = []
 76 |         activities = app_config["app"]["activities"]
 77 |         if activities:
 78 |             for act in activities:
 79 |                 self.activities.append(Activity().load_from_dict(act))
 80 | 
 81 |     def dump_to_yaml(self, yaml_path):
 82 |         pass
 83 | 
 84 |     def update_property(self, k, v):
 85 |         pass
 86 | 
 87 |     def update_activity(self):
 88 |         pass
 89 | 
 90 |     @property
 91 |     def num_activities(self):
 92 |         return len(self.activities)
 93 | 
 94 |     @property
 95 |     def num_components(self):
 96 |         return sum([act.num_components for act in self.activities])
 97 | 
 98 | 
 99 | class User:
100 |     def __init__(
101 |             self,
102 |             name: str = None,
103 |             description: str = None):
104 |         self.name = name
105 |         self.description = description
106 |         self.personality = None
107 |         self.preference = None
108 | 
109 |     def update_from_history(self):
110 |         """
111 |         using LM to update user's personality and preference from behavior history
112 |         :return:
113 |         """
114 |         pass
115 | 
116 | 
117 | class UserTrace:
118 |     def __init__(self):
119 |         pass
120 | 
121 | 
122 | class Phone:
123 |     def __init__(self):
124 |         self.user = User()
125 |         self.apps = {}
126 |         self.device_support = []
127 |         self.device_info = {}
128 | 
129 |     def set_device_info(self, info):
130 |         """
131 |         {'udid': 'EMULATOR32X1X14X0-02:15:b2:00:00:00-sdk_gphone64_x86_64',
132 |         'version': '13',
133 |         'serial': 'EMULATOR32X1X14X0',
134 |         'brand': 'google',
135 |         'model': 'sdk_gphone64_x86_64',
136 |         'hwaddr': '02:15:b2:00:00:00',
137 |         'sdk': 33,
138 |         'agentVersion': '0.10.0',
139 |         'display': {'width': 320, 'height': 640},
140 |         'battery': {'acPowered': False, 'usbPowered': False, 'wirelessPowered': False, 'status': 4, 'health': 2, 'present': True, 'level': 100, 'scale': 100, 'voltage': 5000, 'temperature': 250, 'technology': 'Li-ion'},
141 |         'memory': {'total': 2013524, 'around': '2 GB'},
142 |         'arch': '',
143 |         'owner': None,
144 |         'presenceChangedAt': '0001-01-01T00:00:00Z',
145 |         'usingBeganAt': '0001-01-01T00:00:00Z',
146 |         'product': None,
147 |         'provider': None}
148 |         """
149 |         self.device_info = info
150 | 
151 |     def add_app(self, app: APP):
152 |         self.apps[app.name] = app
153 | 
154 |     def remove_app(self, app: APP):
155 |         self.apps.pop(app.name)
156 | 
157 |     def load_from_yaml(self, yaml_path):
158 |         phone_config = yaml.safe_load(open(yaml_path, "r"))
159 |         self.user.name = phone_config["user"]["name"]
160 |         self.user.description = phone_config["user"]["self_introduction"]
161 |         for app_name, app_path in phone_config["apps"].items():
162 |             app_obj = APP()
163 |             app_obj.load_from_yaml(os.path.join(os.path.dirname(yaml_path), app_path))
164 |             self.add_app(app_obj)
165 | 
166 |     @property
167 |     def num_apps(self):
168 |         return len(self.apps)
169 | 
170 |     @property
171 |     def num_activities(self):
172 |         return sum([app.num_activities for app in self.apps])
173 | 
174 |     @property
175 |     def num_components(self):
176 |         return sum([act.num_components for app in self.apps for act in app.activities])
177 | 
178 |     def get_pkg_by_name(self, name) -> APP | None:
179 |         if name in self.apps.keys():
180 |             return self.apps[name]
181 |         for app in self.apps.values():
182 |             if app.package == name:
183 |                 return app
184 |         return
185 | 


--------------------------------------------------------------------------------
/android_env2/simulator.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import subprocess
  3 | import time
  4 | 
  5 | import adbutils
  6 | import uiautomator2 as u2
  7 | 
  8 | from android_env2.actions import Action, ActionType
  9 | from android_env2.config import Settings
 10 | 
 11 | 
 12 | class Simulator:
 13 |     def __init__(
 14 |             self,
 15 |             config: Settings,
 16 |             **kwargs):
 17 |         self.emulator_path = config.emulator_path
 18 |         self.avd_name = config.avd_name
 19 |         self.adb_ip = config.adb_ip if config.adb_ip else "127.0.0.1"
 20 |         self.adb_port = config.adb_port
 21 |         self.emulator_name = config.emulator_name
 22 |         self.driver = None
 23 | 
 24 |         self.excluded_app = kwargs.get("excluded_app", [])
 25 | 
 26 |         self._prepare_device()
 27 | 
 28 |     def _prepare_device(self):
 29 |         devices = adbutils.adb.list()
 30 |         if len(devices) == 0:
 31 |             return
 32 |         if not self.driver:
 33 |             self.driver = u2.connect(self.emulator_name)
 34 |             print(f"uiautomator is connected to {self.emulator_name}...")
 35 | 
 36 |     def stop_avd(self):
 37 |         pass
 38 | 
 39 |     def reset(self):
 40 |         # self.driver.healthcheck()
 41 |         exclude_apps = ['com.github.uiautomator', 'com.github.uiautomator.test',
 42 |                         'com.google.android.apps.nexuslauncher', 'com.google.android.providers.media.module',
 43 |                         'com.android.remoteprovisioner', 'com.google.android.ext.services',
 44 |                         'com.google.android.permissioncontroller', 'com.android.bluetooth',
 45 |                         'com.google.android.apps.wellbeing', 'com.android.emulator.multidisplay',
 46 |                         'com.google.android.ims', 'com.google.android.adservices.api', 'com.android.vending',
 47 |                         'com.android.systemui', 'com.android.se']
 48 |         self.driver.app_stop_all(excludes=exclude_apps)
 49 |         self.driver.press("home")
 50 |         self.driver.set_fastinput_ime(True)
 51 | 
 52 |     def execute_action(self, action: Action):
 53 | 
 54 |         if action.action_type == ActionType.NONE or action.action_type == ActionType.FINISH:
 55 |             return
 56 | 
 57 |         elif action.action_type == ActionType.INSTALL_APP:
 58 |             self.driver.app_install(action.app.download_url)
 59 | 
 60 |         elif action.action_type == ActionType.START_APP:
 61 |             retry_time = 0
 62 |             while self.driver.app_current()["package"] != action.app.package and retry_time < 3:
 63 |                 self.driver.app_start(action.app.package, use_monkey=True, wait=True)
 64 |                 retry_time = retry_time + 1
 65 |                 time.sleep(3)
 66 | 
 67 |         elif action.action_type == ActionType.STOP_APP:
 68 |             self.driver.app_stop(action.app.package)
 69 | 
 70 |         elif action.action_type == ActionType.CLICK:
 71 |             self.driver.xpath(action.component.xpath).click()
 72 | 
 73 |         elif action.action_type == ActionType.LONG_CLICK:
 74 |             self.driver.xpath(action.component.xpath).long_click()
 75 | 
 76 |         elif action.action_type == ActionType.DOUBLE_CLICK:
 77 |             self.driver.xpath(action.component.xpath).double_click()
 78 | 
 79 |         elif action.action_type == ActionType.SET_TEXT:
 80 |             self.driver.xpath(action.component.xpath).set_text(action.action_para["text"])
 81 | 
 82 |         elif action.action_type == ActionType.PRESS_BACK:
 83 |             self.driver.press("back")
 84 | 
 85 |         elif action.action_type == ActionType.PRESS_HOME:
 86 |             self.driver.press("home")
 87 | 
 88 |         elif action.action_type == ActionType.PRESS_ENTER:
 89 |             self.driver.press("enter")
 90 | 
 91 |         elif action.action_type == ActionType.SCREEN_ON:
 92 |             self.driver.screen_on()
 93 | 
 94 |         elif action.action_type == ActionType.SCREEN_OFF:
 95 |             self.driver.screen_off()
 96 | 
 97 |         elif action.action_type == ActionType.VOLUME_UP:
 98 |             self.driver.press("volume_up")
 99 | 
100 |         elif action.action_type == ActionType.VOLUME_DOWN:
101 |             self.driver.press("volume_down")
102 | 
103 |         elif action.action_type == ActionType.VOLUME_MUTE:
104 |             self.driver.press("volume_mute")
105 | 
106 |         elif action.action_type == ActionType.SET_ORIENTATION:
107 |             self.driver.set_orientation(action.action_para["orientation"])
108 | 
109 |         elif action.action_type == ActionType.FREEZE_ROTATION:
110 |             self.driver.freeze_rotation()
111 | 
112 |         elif action.action_type == ActionType.UNFREEZE_ROTATION:
113 |             self.driver.freeze_rotation(False)
114 | 
115 |         elif action.action_type == ActionType.SCREENSHOT:
116 |             im = self.driver.screenshot()
117 |             im.save(action.action_para["img_path"])
118 | 
119 |         elif action.action_type == ActionType.SWIPE_UP:
120 |             self.driver.swipe_ext("up")
121 | 
122 |         elif action.action_type == ActionType.SWIPE_DOWN:
123 |             self.driver.swipe_ext("down")
124 | 
125 |         elif action.action_type == ActionType.SWIPE_LEFT:
126 |             self.driver.swipe_ext("left")
127 | 
128 |         elif action.action_type == ActionType.SWIPE_RIGHT:
129 |             self.driver.swipe_ext("right")
130 | 
131 |         elif action.action_type == ActionType.SWIPE:
132 |             self.driver.swipe(action.action_para['sx'],
133 |                               action.action_para['sy'],
134 |                               action.action_para['ex'],
135 |                               action.action_para['ey'])
136 | 
137 |         elif action.action_type == ActionType.RECENT:
138 |             self.driver.press("recent")
139 | 
140 |         elif action.action_type == ActionType.DRAG:
141 |             self.driver.drag(action.action_para['sx'],
142 |                              action.action_para['sy'],
143 |                              action.action_para['ex'],
144 |                              action.action_para['ey'])
145 | 
146 |         time.sleep(3)
147 | 
148 |     def current_app(self):
149 |         return self.driver.app_current()
150 | 
151 |     def dump_ui_xml(self):
152 |         xml = self.driver.dump_hierarchy()
153 |         return xml
154 | 
155 |     def screenshot(self):
156 |         im = self.driver.screenshot()
157 |         return im
158 | 
159 |     def adb_shell(self, shell_cmd):
160 |         output, exit_code = self.driver.shell(shell_cmd)
161 | 
162 |     def avd_log(self, log_path):
163 |         pass
164 | 


--------------------------------------------------------------------------------
/agents/replay_buffer.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import os.path
  3 | import pickle
  4 | import re
  5 | import time
  6 | import traceback
  7 | from collections import defaultdict
  8 | from io import BytesIO
  9 | 
 10 | from agents.tasks import Task
 11 | 
 12 | HTML_TEMPLATE = """
 13 | <!DOCTYPE html>
 14 | <html>
 15 | <head>
 16 |     <title>{title}</title>
 17 |     <style>
 18 |         body {{
 19 |             font-family: Arial, sans-serif;
 20 |             margin: 20px;
 21 |         }}
 22 | 
 23 |         pre {{
 24 |             background-color: #f4f4f4;
 25 |             padding: 10px;
 26 |         }}
 27 | 
 28 |         .expandable {{
 29 |             cursor: pointer;
 30 |             font-weight: bold;
 31 |         }}
 32 |         .json-line {{
 33 |             border-left: 2px solid #ccc;
 34 |             margin-left: 20px;
 35 |             padding-left: 10px;
 36 |         }}
 37 | 
 38 |          .container {{
 39 |             width: 100%;
 40 |         }}
 41 | 
 42 |         .row {{
 43 |             display: flex;
 44 |             align-items: center;
 45 |             margin-bottom: 10px;
 46 |         }}
 47 | 
 48 | 
 49 |         .image {{
 50 |             display: flex;
 51 |             justify-content: center;
 52 |             align-items: center;
 53 |             margin-left: 30px;
 54 |             height: 500px;
 55 |             width: auto;
 56 |         }}
 57 | 
 58 |         .green-font {{
 59 |             color: green;
 60 |         }}
 61 |         .yellow-font{{
 62 |             color: rgb(255, 183, 0);
 63 |         }}
 64 |          .red-font{{
 65 |             color: rgb(220, 20, 60);
 66 |         }}
 67 |         .blue-font{{
 68 |             color:rgb(0, 47, 255);
 69 |         }}
 70 |     </style>
 71 | </head>
 72 | <body>
 73 |     <h2>Task</h2>
 74 |         <pre>{task}</pre>
 75 |         <pre>{constrain}</pre>
 76 |     <h2>System Prompt</h2>
 77 |         <pre>{system}</pre>
 78 |     <h2>Few-shot Examples</h2>
 79 |         <pre>{examples}</pre>
 80 |     <h2>Previous Reflection</h2>
 81 |         <pre>{prev_reflection}</pre>
 82 |     <div class="container">
 83 |         {body}
 84 |     </div>
 85 | 
 86 | </body>
 87 | </html>
 88 | 
 89 | """
 90 | 
 91 | 
 92 | def save_trajectory(folder="trajectory"):
 93 |     def decorator(run_func):
 94 |         def wrapper(self, *args, **kwargs):
 95 |             try:
 96 |                 run_func(self, *args, **kwargs)
 97 |             except Exception as e:
 98 |                 self.trajectory.exception_str = traceback.format_exc()
 99 |                 raise e
100 |             finally:
101 |                 obs = f"obs_{self.args.hist_steps}_" if self.args.with_obs else ""
102 |                 suffix = "_" + self.args.tj_suffix if self.args.tj_suffix else ""
103 |                 para_folder = folder + "_" + self.args.model_name + "_" + self.args.agent_type + "_" + obs + self.args.test_app + suffix
104 |                 self.trajectory.save_to_html(para_folder)
105 |                 self.trajectory.save_to_pkl(para_folder)
106 | 
107 |         return wrapper
108 | 
109 |     return decorator
110 | 
111 | 
112 | class Trajectory:
113 |     def __init__(self, task: Task):
114 |         self.task = task
115 |         self.data = []
116 |         self.prev_reflection = ""
117 |         self.reflection = None
118 |         self.system_str = ""
119 |         self.example_str = ""
120 |         self.exception_str = ""
121 | 
122 |     def add(self, state=None, thought=None, action=None, reward=None, reflection=None):
123 |         if thought:
124 |             self.data[-1]["thought"] = thought
125 |         if action:
126 |             self.data[-1]["action"] = action
127 |         if reward is not None:
128 |             self.data[-1]["reward"] = reward
129 |         if reflection:
130 |             # add reflection when episode failed
131 |             self.task.reflection.append(reflection)
132 |             self.reflection = reflection
133 |         if state:
134 |             self.data.append({"state": state})
135 | 
136 |     def get_last_k(self, k=5):
137 |         return self.data[-k:]
138 | 
139 |     def save_to_html(self, folder):
140 |         inst = self.task.instruction.replace(" ", "_")
141 |         inst = re.sub(r"[\/,\.@\\\:\*\?\"\<\>\|]", "", inst)
142 |         inst = inst.replace("__", "_")
143 |         inst = inst.replace("__", "_")
144 |         body_str = ""
145 |         for i, data in enumerate(self.data):
146 |             head = f"<h2>Step {i}</h2>"
147 |             obs_str = ""
148 |             if 'state' in data:
149 |                 if isinstance(data['state'], str):
150 |                     obs_str += f"<pre>{data['state']}</pre>"
151 |                 elif isinstance(data['state'], dict):
152 |                     image = data['state']['image']
153 |                     image_bytes_io = BytesIO()
154 |                     image.save(image_bytes_io, format="JPEG")
155 |                     base64_image = base64.b64encode(image_bytes_io.getvalue()).decode('ascii')
156 |                     obs_str += f"<img class=\"image\" src=\"data:image/png;base64,{base64_image}\">"
157 |                     obs_str += f"<pre>{data['state']['text']}</pre>"
158 |             thought_str = ""
159 |             if 'thought' in data:
160 |                 thought_str = f"<br><div class=\"green-font\">Thought: {data['thought']}</div>"
161 |             action_str = ""
162 |             if 'action' in data:
163 |                 action_str = f"<br><div class=\"blue-font\">Action: {data['action']}</div>"
164 |             reward_str = ""
165 |             if 'reward' in data:
166 |                 reward_str = f"<br><div class=\"red-font\">Reward: {data['reward']}</div>"
167 |             body_str += head + "<div class='row'>" + obs_str + "</div>" + thought_str + action_str + reward_str
168 |         if self.exception_str:
169 |             body_str += f"<h2>Exception</h2><pre>{self.exception_str}</pre>"
170 |         if self.reflection:
171 |             body_str += f"<h2>Reflection</h2><pre>{self.reflection}</pre>"
172 |         if not os.path.exists(folder):
173 |             os.mkdir(folder)
174 |         inst = inst[:100]
175 |         with open(f"{folder}/{inst}_{time.strftime('%m-%d_%H-%M-%S', time.localtime(int(time.time())))}.html",
176 |                   "w", encoding='utf-8') as f:
177 |             f.write(
178 |                 HTML_TEMPLATE.format(title=self.task.instruction, task=self.task.instruction,
179 |                                      constrain=self.task.constrain_prompt, system=self.system_str,
180 |                                      examples=self.example_str, prev_reflection=self.prev_reflection, body=body_str))
181 | 
182 |     def save_to_pkl(self, folder):
183 |         inst = self.task.instruction.replace(" ", "_")
184 |         inst = re.sub(r"[\/,\.@\\\:\*\?\"\<\>\|]", "", inst)
185 |         inst = inst.replace("__", "_")
186 |         inst = inst.replace("__", "_")
187 |         if not os.path.exists(folder):
188 |             os.mkdir(folder)
189 |         inst = inst[:100]
190 |         pickle.dump(
191 |             {"task": self.task.as_dict(), "data": self.data, "reflection": self.reflection,
192 |              "exception": self.exception_str},
193 |             open(f"{folder}/{inst}_{time.strftime('%m-%d_%H-%M-%S', time.localtime(int(time.time())))}.pkl", "wb")
194 |         )
195 | 
196 | 
197 | class ReplayBuffer:
198 |     def __init__(self):
199 |         self.exp = defaultdict(list)
200 | 
201 |     def add_exp(self, instruction, action_sequence, final_state):
202 |         self.exp[instruction].append({"action_sequence": action_sequence,
203 |                                       "final_state": final_state})
204 | 
205 |     def retrieve_topk(self, instruction, top_k):
206 |         """
207 |         find the top_k most similar experiences
208 |         """
209 |         return
210 | 
211 |     def save_to_vector_db(self):
212 |         pass
213 | 
214 |     def save_to_db(self):
215 |         pass
216 | 
217 |     def save_to_json(self):
218 |         pass
219 | 


--------------------------------------------------------------------------------
/android_env2/actions.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from enum import Enum
  3 | from typing import Dict
  4 | 
  5 | import numpy as np
  6 | from gymnasium import spaces, ActionWrapper
  7 | from gymnasium.core import WrapperActType, ActType
  8 | 
  9 | from android_env2.exception import AndroidActionException
 10 | from android_env2.phone import APP, Activity, Component
 11 | from android_env2.constant import TEXT_MAX_LENGTH
 12 | 
 13 | 
 14 | class ActionType(Enum):
 15 |     NONE = 0
 16 |     # app level
 17 |     INSTALL_APP = 1
 18 |     START_APP = 2
 19 |     STOP_APP = 3
 20 |     STOP_ALL_APP = 4
 21 | 
 22 |     # component level
 23 |     CLICK = 5
 24 |     DOUBLE_CLICK = 6
 25 |     LONG_CLICK = 7
 26 |     SET_TEXT = 8
 27 | 
 28 |     # system level
 29 |     PRESS_BACK = 9
 30 |     PRESS_HOME = 10
 31 |     SCREEN_ON = 11
 32 |     SCREEN_OFF = 12
 33 |     VOLUME_UP = 13
 34 |     VOLUME_DOWN = 14
 35 |     VOLUME_MUTE = 15
 36 |     SET_ORIENTATION = 16
 37 |     FREEZE_ROTATION = 17
 38 |     UNFREEZE_ROTATION = 18
 39 |     SCREENSHOT = 19
 40 |     SWIPE_UP = 20
 41 |     SWIPE_DOWN = 21
 42 |     SWIPE_LEFT = 22
 43 |     SWIPE_RIGHT = 23
 44 |     SWIPE = 24
 45 |     RECENT = 25
 46 |     DRAG = 27
 47 |     LIST_ALL_APP = 28
 48 |     PRESS_ENTER = 29
 49 | 
 50 |     FINISH = 30
 51 |     INVALID = 31
 52 | 
 53 |     def __str__(self) -> str:
 54 |         return f"ActionType.{self.name}"
 55 | 
 56 | 
 57 | class Action:
 58 |     def __init__(self):
 59 |         self.action_type: ActionType = ActionType.NONE
 60 |         self.action_para: Dict[str, str] = dict()
 61 |         self.app: APP = APP()
 62 |         self.activity: Activity = Activity()
 63 |         self.component: Component = Component()
 64 | 
 65 |     def __str__(self):
 66 |         match self.action_type:
 67 |             case ActionType.INSTALL_APP:
 68 |                 return f"install {self.app.name} APP"
 69 |             case ActionType.START_APP:
 70 |                 return f"launch {self.app.name} APP"
 71 |             case ActionType.CLICK:
 72 |                 return f"click {self.component.name} on {self.app.name} APP"
 73 |             case ActionType.SET_TEXT:
 74 |                 return f"type {self.action_para['text']} in {self.component.name} of {self.app.name} APP"
 75 |             case ActionType.STOP_APP:
 76 |                 return f"stop {self.app.name} APP"
 77 |             case ActionType.STOP_ALL_APP:
 78 |                 return "stop all APPs"
 79 |             case ActionType.DOUBLE_CLICK:
 80 |                 return f"double click {self.component.name} on {self.app.name} APP"
 81 |             case ActionType.LONG_CLICK:
 82 |                 return f"long click {self.component.name} on {self.app.name} APP"
 83 |             case ActionType.PRESS_BACK:
 84 |                 return "press the back key"
 85 |             case ActionType.PRESS_HOME:
 86 |                 return "press the home key"
 87 |             case ActionType.PRESS_ENTER:
 88 |                 return "press the enter key"
 89 |             case ActionType.SCREEN_ON:
 90 |                 return "turn on the screen"
 91 |             case ActionType.SCREEN_OFF:
 92 |                 return "turn off the screen"
 93 |             case ActionType.VOLUME_UP:
 94 |                 return "turn the volume up"
 95 |             case ActionType.VOLUME_DOWN:
 96 |                 return "turn the volume down"
 97 |             case ActionType.VOLUME_MUTE:
 98 |                 return "mute the volume"
 99 |             case ActionType.SET_ORIENTATION:
100 |                 return f"rotate screen to {self.action_para['orientation']}"
101 |             case ActionType.FREEZE_ROTATION:
102 |                 return "freeze screen rotation"
103 |             case ActionType.UNFREEZE_ROTATION:
104 |                 return "un-freeze screen rotation"
105 |             case ActionType.SCREENSHOT:
106 |                 return "take a screenshot"
107 |             case ActionType.SWIPE_UP:
108 |                 return f"swip up on {self.app.name} APP"
109 |             case ActionType.SWIPE_DOWN:
110 |                 return f"swip down on {self.app.name} APP"
111 |             case ActionType.SWIPE_LEFT:
112 |                 return f"swip left on {self.app.name} APP"
113 |             case ActionType.SWIPE_RIGHT:
114 |                 return f"swip right on {self.app.name} APP"
115 |             case ActionType.SWIPE:
116 |                 return (f"swipe from [{self.action_para['sx']}, {self.action_para['sy']}]  "
117 |                         f"to [{self.action_para['ex']}, {self.action_para['ey']}] on {self.app.name} APP")
118 |             case ActionType.RECENT:
119 |                 return "show recent Apps"
120 |             case ActionType.DRAG:
121 |                 return (f"drag from [{self.action_para['sx']}, {self.action_para['sy']}]  "
122 |                         f"to [{self.action_para['ex']}, {self.action_para['ey']}] on {self.app.name} APP")
123 |             case ActionType.LIST_ALL_APP:
124 |                 return "list all Apps"
125 |             case ActionType.FINISH:
126 |                 return "task finished"
127 |             case ActionType.INVALID:
128 |                 return "invalid action"
129 | 
130 | 
131 | class AndroidActionWrapper(ActionWrapper):
132 | 
133 |     def action(self, action: WrapperActType) -> ActType:
134 |         """
135 |         transform input `action dict` inferred by the agent to `Action` object
136 |         :param action: action dict
137 |         :return: Action object
138 |         """
139 |         action_obj = Action()
140 |         action_type = ActionType.__getitem__(action["action"].upper())
141 |         action_obj.action_type = action_type
142 |         # app_level actions
143 |         if action_type in [ActionType.START_APP, ActionType.STOP_APP]:
144 |             pkg = self.env.phone.get_pkg_by_name(action["package"])
145 |             if not pkg:
146 |                 raise AndroidActionException(
147 |                     f"Cannot find APP {action['package']}. The APP name might be incorrect.")
148 |             action_obj.app.name = pkg.name
149 |             action_obj.app.package = pkg.package
150 |         # component-leval actions
151 |         elif action_type in [ActionType.CLICK, ActionType.LONG_CLICK, ActionType.DOUBLE_CLICK, ActionType.SET_TEXT]:
152 |             if action["xpath"].startswith("//"):
153 |                 for node, xpath in self.env.cur_ui_xml_tree.node_to_xpath.items():
154 |                     if action["xpath"] == xpath[0]:
155 |                         action["xpath"] = xpath[1]
156 |                         break
157 |                 action_obj.component.xpath = action["xpath"]
158 |             else:
159 |                 if action["xpath"] not in self.env.cur_ui_xml_tree.node_to_xpath:
160 |                     raise AndroidActionException(
161 |                         f"Invalid node id {action['xpath']}. The node id might be incorrect.")
162 |                 else:
163 |                     action_obj.component.xpath = self.env.cur_ui_xml_tree.node_to_xpath[action["xpath"]][1]
164 |                 action_obj.component.nearby_xpath = set(self.env.cur_ui_xml_tree.node_to_xpath[action["xpath"]][:2] + \
165 |                                                         self.env.cur_ui_xml_tree.node_to_xpath[action["xpath"]][2])
166 |                 action_obj.component.name = self.env.cur_ui_xml_tree.node_to_name[action["xpath"]]
167 |                 app = self.env.phone.get_pkg_by_name(self.env.cur_ui_xml_tree.app_name)
168 |                 if not app:
169 |                     app = APP(name=self.env.cur_ui_xml_tree.app_name)
170 |                 action_obj.app = app
171 |             if action_type == ActionType.SET_TEXT:
172 |                 action_obj.action_para["text"] = action["text"]
173 |         elif action_type == ActionType.FINISH:
174 |             action_obj.action_para["text"] = action["text"]
175 |         # do not need to parse args for none, finish, or system-level actions
176 |         else:
177 |             app = self.env.phone.get_pkg_by_name(self.env.cur_ui_xml_tree.app_name)
178 |             if not app:
179 |                 app = APP(name=self.env.cur_ui_xml_tree.app_name)
180 |             action_obj.app = app
181 |         return action_obj
182 | 
183 |     def action_space(
184 |             self,
185 |     ) -> spaces.Space[ActType] | spaces.Space[WrapperActType]:
186 |         space = spaces.Dict(
187 |             {
188 |                 "action_type": spaces.Discrete(len(ActionType)),
189 |                 "action_para": spaces.Text(TEXT_MAX_LENGTH),
190 |                 "coords": spaces.Box(
191 |                     np.array([0.0, 0.0], dtype=np.float32),
192 |                     np.array([1.0, 1.0], dtype=np.float32),
193 |                 ),
194 |                 "app": spaces.Discrete(self.env.phone.num_apps),
195 |                 "activity": spaces.Discrete(self.env.phone.num_activities),
196 |                 "component": spaces.Discrete(self.env.phone.num_components),
197 |             }
198 |         )
199 |         return space
200 | 


--------------------------------------------------------------------------------
/agents/prompt.py:
--------------------------------------------------------------------------------
  1 | from langchain.prompts import ChatPromptTemplate, PromptTemplate
  2 | 
  3 | SYSTEM_TEMPLATE = """You are an autonomous intelligent agent tasked with operating a mobile phone. 
  4 | You are able to assist with a wide range of tasks, from answering simple questions to planning and executing a complicated instruction with specific actions you can issue. 
  5 | 
  6 | Here's the information you'll have:
  7 | The user's objective: This is the task you're trying to complete.
  8 | The installed APPs: These are the APPS you can operate on.
  9 | The current phone's Observation: This is a simplified and structured representation of the phone view, providing key information.
 10 | The previous action and Observation : There are the action you just performed and the resulted phone observation. It may be helpful to track your progress.
 11 | 
 12 | Solve the user's task with interleaving Observation, Thought, Action steps. 
 13 | Thought can reason about the current situation.
 14 | At the end of thinking process, you MUST response the next Action in the following formats:
 15 | 1. APP level Actions:
 16 | #start [app_name]#: This action start an APP specified by app name.
 17 | You can ONLY issue the start operation on the following APPs:
 18 | {app_string}
 19 | 
 20 | 2. UI Element level Actions:
 21 | #click [id]#: This action clicks on an element with a specific id on the APP page.
 22 | #long_click [id]#: This action long clicks on an element with a specific id on the APP page.
 23 | #set_text [id] [text]# This action set text in a text view element with a specific id on the APP page.
 24 | Note that the UI elements with 'clickable' or 'long-clickable' properties can be issued with #click#, while the elements with 'EditText' can be issued with #set_text# action.
 25 | 
 26 | 3. Phone system level Actions:
 27 | #swipe_up#: Scroll up the screen.
 28 | #swipe_down#: Scroll down the screen.
 29 | #swipe_left#: Swipe left the screen.
 30 | #swipe_right#: Swipe right the screen.
 31 | #press_back#: Navigate to the previously viewed page.
 32 | #press_enter#: Press enter.
 33 | 
 34 | 4. Task completion Action:
 35 | #finish [answer]#: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket.
 36 | 
 37 | ------
 38 | 
 39 | Observation is the simplified and structured text representation of APP view.
 40 | 
 41 | To be successful, it is very important to follow the following rules:
 42 | 1. You MUST only issue ONE next action in each thinking process.
 43 | 2. Generate the action in the correct format. Always put the action inside a pair of #. For example, #click [node3]#.
 44 | 3. Issue finish action when you think you have achieved the objective.
 45 | 4. Today is {date}, which might be useful for you to complete the task.
 46 | """
 47 | 
 48 | SYSTEM_PROMPT = PromptTemplate(template=SYSTEM_TEMPLATE, input_variables=["app_string", "date"])
 49 | 
 50 | EXAMPLES = [
 51 |     {"input":
 52 |          """User's objective: open the email from Kaggle in Gmail.
 53 |          Previous Action: None
 54 |          Observation: [n21c9] ScrollView workspace ;scroll ; ; :
 55 |          [n5438] ViewPager smartspace_card_pager ;long-click focusable ; ; :
 56 |              [n06cb] ViewGroup ;click focusable ; ; :
 57 |                  [n5315] TextView date ;click focusable ; ; Thu, Aug 31 Thu, Aug 31 :
 58 |      [nd577] View ;; ; Home :
 59 |      [nd90b] TextView ;click long-click focusable ; ; Phone Phone :
 60 |      [n95b5] TextView ;click long-click focusable ; ; Messages Messages :
 61 |      [n3a72] TextView ;click long-click focusable ; ; Camera Camera :
 62 |      [n3a95] FrameLayout search_container_hotseat ;click long-click focusable ; ; Search :
 63 |          [nc948] ImageView g_icon ;click focusable ; ; :
 64 |          [n71c5] ImageButton lens_icon ;click focusable ; ; Google Lens :
 65 |          Think: """,
 66 |      "output": "Let's think step-by-step. The current phone view is home page, so I need to open the Gmail app and search for the email from Kaggle. In summary, the next action I will perform is #start [Gmail]#"
 67 |      },
 68 |     {"input":
 69 |          """User's objective: open the email from Kaggle in Gmail.
 70 |          Previous Action: launch Gmail APP
 71 |          Observation: [n5cf0] EditText open_search ;click focusable ; ; Search in mail :
 72 |          [n64d9] ImageButton ;click focusable ; ; Open navigation drawer :
 73 |          [n5bfe] FrameLayout selected_account_disc_gmail ;click focusable ; ; Account and settings. :
 74 |      [n5d11] FrameLayout conversation_list_parent_frame ;click focusable ; ; :
 75 |          [nfc59] RecyclerView thread_list_view ;scroll focusable ; ;  Primary Tap a sender image to select that conversation:
 76 |              [n18c9] Button dismiss_icon ;click focusable ; ; Dismiss Dismiss tip :
 77 |              [n4979] ViewGroup viewified_conversation_item_view ;click long-click focusable ; ; , , , Google Research:
 78 |                  [n2523] ImageView contact_image ;click long-click focusable ; ; :
 79 |                  [n7cc1] ImageView star ;click focusable ; ; :
 80 |              [n667d] ViewGroup viewified_conversation_item_view ;click long-click focusable ; ; , , , Kaggle, , [Invitation Letter]:
 81 |                  [nab90] ImageView contact_image ;click long-click focusable ; ; :
 82 |      [nb50d] Button compose_button ;click focusable ; ; Compose :
 83 |      [nf267] FrameLayout ;focusable ; selected ; Mail, 2 new notifications :
 84 |      [n90ef] FrameLayout ;click focusable ; ; Meet :
 85 |      Think: """,
 86 |      "output": "Let's think step-by-step. The previous action and the current state indicate we have opened Gmail, so the next step is to search for the email from Kaggle using the search bar. This page has a search box whose ID is [n5cf0], and I can search for the email by \"Kaggle\" and then submit my typing by pressing the Search button afterwards. In summary, the next action I will perform is #[set_text] [n5cf0] [Kaggle]#"
 87 |      }
 88 | ]
 89 | 
 90 | EXAMPLE_PROMPT_PREFIX = """
 91 | Here are some examples:
 92 | (BEGIN OF EXAMPLES)
 93 | """
 94 | 
 95 | EXAMPLE_PROMPT_SUFFIX = """(END OF EXAMPLES)"""
 96 | 
 97 | EXAMPLE_PROMPT = ChatPromptTemplate.from_messages(
 98 |     [
 99 |         ("human", "EXAMPLE INPUT: {input}"),
100 |         ("ai", "EXAMPLE OUTPUT: {output}"),
101 |     ]
102 | )
103 | 
104 | ACT_TEMPLATE = """REMEMBER to think step by step, and generate ONE next action in the correct format. 
105 | Always put the action inside a pair of #. For example, #start [Gmail]# or #click [node3]#.
106 | If you think the current state indicates the task is completed, issue the #finish [answer]# action.
107 | 
108 | {reflection}
109 | 
110 | {constrain}
111 | 
112 | Now, begin!
113 | User's objective: {instruction}
114 | 
115 | {scratchpad}
116 | """
117 | 
118 | ACT_PROMPT = PromptTemplate(template=ACT_TEMPLATE, input_variables=["reflection", "constrain", "instruction", "scratchpad"])
119 | 
120 | REFLECTION_HEADER = """You have attempted to completed following task before and failed. The following reflection(s) give a plan to avoid failing to complete the task in the same way you did previously. Use them to improve your strategy of completing the given task.\n"""
121 | 
122 | REFLECTION_TEMPLATE = """User's objective: {instruction}
123 | 
124 | {previous_reflection}
125 | 
126 | Previous trial:
127 | {scratchpad}
128 | """
129 | 
130 | REFLECTION_PROMPT = PromptTemplate(template=REFLECTION_TEMPLATE, input_variables=["instruction", "scratchpad", "previous_reflection"])
131 | 
132 | REFLECTION_PROMPT_SYSTEM = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given access to operate an Android phone environment with human-like actions including click and type text on the phone screen, and a task instruction to complete. You were unsuccessful in completing the task either because you made the wrong action decisions, or you used up your set number of reasoning steps. In a few sentences, Diagnose a possible reason for failure and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.  """
133 | 
134 | REWARD_SYSTEM = """You can access to the actions and phone states at some steps during executing a specific task on a phone. Check if the given phone states and actions indicate the achievement of a goal. The phone state is represented as structured texts, with each entry denoting a UI component along with its content and function description. 
135 | """
136 | 
137 | REWARD_TEMPLATE = """The goal is 
138 | {goal}, 
139 | 
140 | the actions and states at some steps are:
141 | {traj}
142 | 
143 | Please check if the above trajectory indicate the achievement of the goal: {goal}.
144 | Only output 'Yes' or 'No', no other words."""
145 | 
146 | REWARD_PROMPT = PromptTemplate(template=REWARD_TEMPLATE, input_variables=["goal", "traj"])
147 | 
148 | CONSTRAIN_SYSTEM_HEADER = "Here are some constrains specified by the phone user due to privacy or preference issues. Please complete the task instruction under the following constrains."
149 | 


--------------------------------------------------------------------------------
/agents/lm_agent.py:
--------------------------------------------------------------------------------
  1 | import traceback
  2 | 
  3 | import time
  4 | from typing import List, Tuple, Dict, Any
  5 | 
  6 | from colorama import Fore
  7 | from gymnasium import Env
  8 | from gymnasium.core import ObsType, ActType
  9 | from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate, SystemMessagePromptTemplate, \
 10 |     HumanMessagePromptTemplate
 11 | from langchain.schema import SystemMessage
 12 | from uiautomator2 import UiObjectNotFoundError
 13 | from uiautomator2.exceptions import XPathElementNotFoundError
 14 | 
 15 | from agents.action_parser import AgentOutputParser
 16 | from agents.agent_base import BaseAgent
 17 | from agents.prompt import (REFLECTION_HEADER,
 18 |                            EXAMPLE_PROMPT,
 19 |                            SYSTEM_PROMPT,
 20 |                            ACT_PROMPT,
 21 |                            REFLECTION_PROMPT_SYSTEM,
 22 |                            REFLECTION_PROMPT, CONSTRAIN_SYSTEM_HEADER)
 23 | from agents.replay_agent import ReplayAgent
 24 | from agents.replay_buffer import Trajectory, save_trajectory
 25 | from agents.tasks import Task
 26 | from agents.utils import load_llm_agent, truncate_scratchpad, load_tokenizer
 27 | from android_env2.actions import Action, ActionType
 28 | from android_env2.exception import AndroidActionException, OutputParserException
 29 | 
 30 | 
 31 | class LMAgent(BaseAgent):
 32 |     def __init__(self, env: Env[ObsType, ActType], args):
 33 |         super().__init__(env, args)
 34 |         assert args.agent_type in ["direct", "react", "react_reflection"]
 35 |         self.chat_model = load_llm_agent(args.model_provider, args.temperature)
 36 |         self.tokenizer = load_tokenizer(model_name=args.model_name)
 37 |         self.action_parser = AgentOutputParser()
 38 |         self.action_repeat_cnt = 0
 39 |         self.cur_step = 1
 40 |         self.replay_agent = ReplayAgent(env, args)
 41 | 
 42 |     def create_agent_prompt(self, stage: str):
 43 |         task = self.trajectory.task
 44 |         app_string = "\n".join(
 45 |             [f"> {app.name}: {app.description}" for package, app in self.env.phone.apps.items()]
 46 |         )
 47 |         date = time.strftime('%b %d %Y %A', time.localtime(int(time.time())))
 48 | 
 49 |         reflection = ""
 50 |         if "react_reflection" == self.args.agent_type and task.reflection:
 51 |             reflection = REFLECTION_HEADER + 'Reflections:\n- ' + '\n- '.join([r for r in task.reflection])
 52 |             if not self.trajectory.prev_reflection:
 53 |                 self.trajectory.prev_reflection = reflection
 54 | 
 55 |         constrain = ""
 56 |         if "constrain" == self.args.test_app:
 57 |             constrain = CONSTRAIN_SYSTEM_HEADER + f'\nConstrain: {task.constrain_prompt}'
 58 | 
 59 |         instruction = task.instruction
 60 |         scratchpad, still_exceed = self._construct_react_scratchpad(self.trajectory.get_last_k(self.args.hist_steps),
 61 |                                                                     stage)
 62 | 
 63 |         if still_exceed:  # remove few-shot examples
 64 |             chat_prompt_template = ChatPromptTemplate.from_messages(
 65 |                 [SystemMessagePromptTemplate(prompt=SYSTEM_PROMPT),
 66 |                  HumanMessagePromptTemplate(prompt=ACT_PROMPT)]
 67 |             )
 68 |         else:
 69 |             chat_prompt_template = ChatPromptTemplate.from_messages(
 70 |                 [SystemMessagePromptTemplate(prompt=SYSTEM_PROMPT),
 71 |                  FewShotChatMessagePromptTemplate(example_prompt=EXAMPLE_PROMPT, examples=task.examples),
 72 |                  HumanMessagePromptTemplate(prompt=ACT_PROMPT)]
 73 |             )
 74 | 
 75 |         message = chat_prompt_template.format_prompt(app_string=app_string, date=date, reflection=reflection,
 76 |                                                      constrain=constrain, instruction=instruction,
 77 |                                                      scratchpad=scratchpad).to_messages()
 78 | 
 79 |         if not self.trajectory.system_str:
 80 |             self.trajectory.system_str = message[0].content
 81 | 
 82 |         if not self.trajectory.example_str:
 83 |             self.trajectory.example_str = "\n".join([m.content for m in message[1:-1]])
 84 | 
 85 |         print(Fore.CYAN + f"Prompt: {message[-1].content}" + Fore.RESET, end="\n\n")
 86 |         return message
 87 | 
 88 |     def create_reflection_prompt(self):
 89 |         task = self.trajectory.task
 90 |         scratchpad, still_exceed = self._construct_react_scratchpad(
 91 |             self.trajectory.get_last_k(len(self.trajectory.data)), stage="Reflection")
 92 |         if still_exceed:
 93 |             scratchpad = self.tokenizer.decode(self.tokenizer.encode(scratchpad)[-self.args.scratchpad_length:])
 94 |         reflection_prompt_template = ChatPromptTemplate.from_messages([
 95 |             SystemMessage(content=REFLECTION_PROMPT_SYSTEM),
 96 |             HumanMessagePromptTemplate(prompt=REFLECTION_PROMPT)]
 97 |         )
 98 | 
 99 |         previous_reflection = ""
100 |         if task.reflection:
101 |             previous_reflection = 'Previous Reflections:\n- ' + '\n- '.join([r for r in task.reflection])
102 | 
103 |         message = reflection_prompt_template.format_prompt(instruction=task.instruction,
104 |                                                            scratchpad=scratchpad,
105 |                                                            previous_reflection=previous_reflection).to_messages()
106 |         return message
107 | 
108 |     def _construct_react_scratchpad(
109 |             self, intermediate_steps: List[Dict[str, Any]], stage: str
110 |     ) -> (str, bool):
111 |         """Construct the scratchpad that lets the agent continue its think, action and reflection process."""
112 |         scratchpad = ""
113 |         for t, step in enumerate(intermediate_steps[:-1]):
114 |             scratchpad += f"Step {self.cur_step - len(intermediate_steps) + 1 + t}:\n"
115 |             if self.args.with_obs:
116 |                 scratchpad += f"\nPrevious Observation: {step['state']['text'] if isinstance(step['state'], dict) else step['state']}\n"
117 |             scratchpad += f"\nPrevious Action: {step['action']}\n\n"
118 |         scratchpad += f"Step {self.cur_step}:\n"
119 |         last_step = intermediate_steps[-1]
120 |         state = last_step['state']['text'] if isinstance(last_step['state'], dict) else last_step['state']
121 |         if stage in ["Think", "Reflection"]:
122 |             scratchpad += f"\nObservation: '{state}\n{stage}: "
123 |         elif stage == "Action":
124 |             assert len(last_step) == 2
125 |             scratchpad += f"\nObservation: {state}\nThought: {last_step['thought']}\n{stage}: "
126 | 
127 |         scratchpad, still_exceed = truncate_scratchpad(scratchpad, n_tokens=self.args.scratchpad_length)
128 |         return scratchpad, still_exceed
129 | 
130 |     def _construct_direct_scratchpad(
131 |             self, intermediate_steps: List[Tuple[Action, str]], stage: str
132 |     ) -> str:
133 |         """Construct the scratchpad that lets the agent continue its think, action and reflection process."""
134 |         scratchpad = ""
135 |         for state, thought, action in intermediate_steps[:-1]:
136 |             scratchpad += f"\nPrevious Observation: {state}\nPrevious Action: {action}\n\n"
137 |         last_step = intermediate_steps[-1]
138 |         scratchpad_suffix = f"\nPrevious Observation: {last_step[0]}\n{stage}: "
139 |         scratchpad += scratchpad_suffix
140 |         max_hist_length = getattr(self.args, "max_hist_length", None)
141 |         if max_hist_length:
142 |             scratchpad = self.tokenizer.decode(self.tokenizer.encode(scratchpad)[-max_hist_length:])
143 |         return scratchpad
144 | 
145 |     def reflection(self):
146 |         reflection = self.chat_model(self.create_reflection_prompt()).content
147 |         print(Fore.LIGHTGREEN_EX + f"Reflection: {reflection}\n\n" + Fore.RESET)
148 |         self.trajectory.add(reflection=reflection)
149 | 
150 |     def check_repeat_action(self, action):
151 |         last_action = self.trajectory.data[-2]["action"] if len(self.trajectory.data) > 1 else None
152 |         if last_action and action == last_action:
153 |             self.action_repeat_cnt += 1
154 |             if self.action_repeat_cnt > self.trajectory.task.max_repeat_step:
155 |                 raise ValueError(f"Exceed max {self.action_repeat_cnt} repeat action {action}")
156 |             return True
157 |         else:
158 |             self.action_repeat_cnt = 0
159 |             return False
160 | 
161 |     def select_action(self):
162 |         # think
163 |         think_response = self.chat_model(self.create_agent_prompt(stage="Think")).content
164 |         print(Fore.GREEN + f"Think: {think_response}" + Fore.RESET, end="\n\n")
165 |         self.trajectory.add(thought=think_response)
166 | 
167 |         action_response = self.action_parser.parse(think_response)
168 |         return action_response
169 | 
170 |     @save_trajectory(folder=f"traj")
171 |     def run(self, task: Task):
172 |         self._reset_agent()
173 |         print(Fore.RED + f"Task: {task.instruction}" + Fore.RESET, end="\n\n")
174 |         self.trajectory = Trajectory(task=task)
175 |         self.env.set_traj(self.trajectory)
176 |         obs, info = self.env.reset()
177 |         print(Fore.YELLOW + f"Obs: {obs['text'] if isinstance(obs, dict) else obs}" + Fore.RESET, end="\n\n")
178 |         self.trajectory.add(state=obs)
179 |         try:
180 |             while not self.terminated:
181 |                 try:
182 |                     action = self.select_action()
183 |                     print(Fore.BLUE + f"Action: {action}" + Fore.RESET, end="\n\n")
184 |                     obs, reward, self.terminated, truncated, info = self.env.step(action)
185 |                     self.trajectory.add(action=info["action"])
186 |                 except (AndroidActionException,
187 |                         UiObjectNotFoundError, XPathElementNotFoundError,
188 |                         OutputParserException) as e:
189 |                     if isinstance(e, UiObjectNotFoundError) or isinstance(e, XPathElementNotFoundError):
190 |                         e = f"Invalid node id."
191 |                     if isinstance(obs, dict):
192 |                         obs = {"text": str(e), "image": obs["image"]}
193 |                     else:
194 |                         obs = str(e)
195 |                     reward = 0.
196 |                     invalid_action = Action()
197 |                     invalid_action.action_type = ActionType.INVALID
198 |                     self.trajectory.add(action=invalid_action)
199 |                 print(Fore.YELLOW + f"Obs: {obs['text'] if isinstance(obs, dict) else obs}" + Fore.RESET, end="\n\n")
200 |                 self.trajectory.add(state=obs, reward=reward)
201 |                 self.cur_step += 1
202 |                 if self.cur_step >= task.max_step:
203 |                     raise ValueError(f"Exceed max step ({task.max_step}) limit, exit.")
204 |         except Exception:
205 |             print("Other exception: ", traceback.format_exc())
206 |         finally:
207 |             if self.terminated and (
208 |                     self.trajectory.data[-2]["reward"] == 1. or
209 |                     "reward" in self.trajectory.data[-1] and self.trajectory.data[-1]["reward"] == 1.):
210 |                 task.success = True
211 |             else:
212 |                 if "react_reflection" == self.args.agent_type:
213 |                     self.reflection()
214 |                 elif task.exe_if_failed:
215 |                     print(Fore.RED + "LM Agent failed, executing Replay Agent" + Fore.RESET)
216 |                     self.replay_agent.run(task)
217 | 


--------------------------------------------------------------------------------
/tasks/MTG.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import os
  3 | import traceback
  4 | import logging
  5 | 
  6 | import numpy as np
  7 | from dotenv import load_dotenv
  8 | from langchain import GoogleSerperAPIWrapper
  9 | from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
 10 | from langchain.embeddings import HuggingFaceEmbeddings
 11 | from langchain.output_parsers import CommaSeparatedListOutputParser, ListOutputParser
 12 | 
 13 | import re
 14 | 
 15 | from typing import Any, Dict, List, Optional
 16 | 
 17 | from pydantic import Extra
 18 | 
 19 | from langchain.chains import LLMChain, RetrievalQA
 20 | from langchain.document_loaders import AsyncHtmlLoader, WikipediaLoader
 21 | from langchain.document_transformers import Html2TextTransformer
 22 | from langchain.output_parsers.pydantic import PydanticOutputParser
 23 | from langchain.prompts import PromptTemplate
 24 | from langchain.pydantic_v1 import BaseModel, Field
 25 | from langchain.schema import BaseRetriever, Document, BaseDocumentTransformer
 26 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 27 | from langchain.utilities import GoogleSearchAPIWrapper
 28 | from langchain.vectorstores import Chroma
 29 | from langchain.vectorstores.base import VectorStore
 30 | 
 31 | load_dotenv(".env")
 32 | 
 33 | 
 34 | class SearchQueries(BaseModel):
 35 |     """Search queries to research for the user's goal."""
 36 | 
 37 |     queries: List[str] = Field(
 38 |         ..., description="List of search queries to look up on Google"
 39 |     )
 40 | 
 41 | 
 42 | DEFAULT_SEARCH_PROMPT = PromptTemplate(
 43 |     input_variables=["question"],
 44 |     template="""You are an assistant tasked with generating Google Search queries where their searched results can cover all functions and basic usage instructions of the given APPs. 
 45 |     For example, for the give <APP>, the generated queries should be like 'how to use <APP>', '<APP> guidance', '<APP> usage instruction' etc.
 46 | Generate the Google search queries as many and diverse as possible. The output should be a numbered list of questions: {question}""",
 47 | )
 48 | 
 49 | TASK_SEED_TEMPLATE_PROMPT = PromptTemplate(
 50 |     input_variables=["feature"],
 51 |     template="""You are a smart task creator, where instructions can be generated based on these templates. For example, we can generate "create an event titled 'team meeting' for 3PM" and "create an event titled 'go to the hospital for 11AM" based on the template "create an event titled <event title> for <event time>". Your goal is to generate tasks templates for automatic features from the feature description of an APP:
 52 | 
 53 | {feature}
 54 | 
 55 | Please generate as many of these task templates as possible for the app. Your response should be a numbered list of task templates.""",
 56 | )
 57 | 
 58 | TASK_SEED_PROMPT = PromptTemplate(
 59 |     input_variables=["feature", "app"],
 60 |     template="""You are a smart task creator for a smartphone intelligent assistant. Given the features description of the {app} APP, your goal is to generate clear and practical tasks that the assistant can assist people with while they use {app} on their phone in their daily lives. These tasks should encompass a wide range of possible instructions and questions that may arise when using {app} APP.
 61 | 
 62 | For example, for the Gmail APP, potential task instructions could include:
 63 | Compose an email with the subject <email subject> and the message content <email content> to be sent to <email address> using Gmail., 
 64 | Send the first draft email., 
 65 | Open the latest email from <email address> in Gmail., 
 66 | Open Gmail settings., 
 67 | Turn off notifications for Gmail., 
 68 | Star the latest email from <email address> in Gmail., 
 69 | Delete the latest email from <email address> in Gmail., 
 70 | etc., where the placeholders surrounded with angle brackets '<' and '>' should be automated generated and not be filled with specific content.
 71 | 
 72 | The {app} APP's feature description is: 
 73 | {feature}
 74 | 
 75 | Your task now is to generate as many of these tasks as possible for the {app} app. Ensure that these instructions are clear and will not lead to any misunderstanding so that the assitant can successfully execute them.
 76 | Your response should be a list of comma separated task instructions, where each instruction should be presented in one sentence.""",
 77 | )
 78 | 
 79 | CROSS_TASK_SEED_PROMPT = PromptTemplate(
 80 |     input_variables=["feature", "app"],
 81 |     template="""You are a proficient task creator for a smartphone's intelligent assistant. Your objective is to craft explicit and practical cross-APP tasks that can be cooperatively accomplished by the {app} APPs, leveraging the feature descriptions of these apps. These tasks should encompass a wide array of potential instructions and questions that might arise in users' daily lives when utilizing {app} on their smartphones.
 82 | 
 83 | For example, for the Gmail and Google Calendar APPs, potential cross-APP task instructions could include:
 84 | Find the email with the subject <subject> in your Gmail, extract the meeting details, and create an event in Google Calendar., 
 85 | Search Gmail for the latest email related to upcoming flights, extract the flight details, and create a calendar event for the flight in Google Calendar., 
 86 | Scan Gmail for the latest event invitation and RSVP confirmations, and automatically update Google Calendar with the RSVP status for the event., 
 87 | etc., where the placeholders surrounded with angle brackets '<' and '>' should be automated generated and not be filled with specific content.
 88 | 
 89 | The {app} APPs‘ features and functions description are: 
 90 | {feature}
 91 | 
 92 | Your task now is to generate as many of these cross-APP tasks as possible for the {app} APPs. 
 93 | Ensure that the generated cross-APP tasks must be cooperatively completed by the {app} APPs, and these instructions should be clear, comprehensive, and free from ambiguity to enable the assistant to execute them successfully. 
 94 | Your response should be a list of comma separated task instructions, where each instruction MUST be presented in one line of sentence.""",
 95 | )
 96 | 
 97 | POST_PROMPT = """Please note that the #Given Instruction# might be a template with placeholders surrounded with angle brackets '<' and '>', e.g., 'Compose an email with the subject <email subject> and the message content <email content> to be sent to <email address> using Gmail.'. You should fill the placeholders with specific content and generate a pratical instruction, e.g., ’Compose an email with the subject "Hello" and the message content "Hello, world!" to be sent to abc@example.com using Gmail.‘.
 98 | Ensure that the #New Instruction# remains a practical and realistic {app} APP task instruction for a mobile phone user, but do not incorporate personal information.
 99 | Concisely and accurately output the generated instruction in one line.
100 | #Given Instruction#:
101 | {instruction}
102 | 
103 | #APP's functionality#:
104 | {feature}
105 | 
106 | The #New Instruction# is:
107 | """
108 | 
109 | ADD_CONSTRAINTS_PROMPT = PromptTemplate(
110 |     input_variables=["app", "instruction", "feature"],
111 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone. 
112 | Please add a few more constraints or requirements to #Given Instruction#, and create #New Instruction#.
113 | """ + POST_PROMPT
114 | )
115 | 
116 | COMPLICATE_PROMPT = PromptTemplate(
117 |     input_variables=["app", "instruction", "feature"],
118 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone. 
119 |     Please rewrite #Given Instruction# to make it slightly more complicated, and create #New Instruction#.
120 | """ + POST_PROMPT
121 | )
122 | 
123 | DEEPEN_PROMPT = PromptTemplate(
124 |     input_variables=["app", "instruction", "feature"],
125 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone. 
126 |     Slightly increase the depth and breadth of #Given Instruction#, and create #New Instruction#.
127 | """ + POST_PROMPT
128 | )
129 | 
130 | CONCRETIZE_PROMPT = PromptTemplate(
131 |     input_variables=["app", "instruction", "feature"],
132 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone.  
133 |     Make #Given Instruction# slightly more concrete, and create #New Instruction#.
134 | """ + POST_PROMPT
135 | )
136 | 
137 | INCREASE_REASONING_PROMPT = PromptTemplate(
138 |     input_variables=["app", "instruction", "feature"],
139 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone. 
140 |     If #Given Instruction# can be solved with just a few simple thinking processes, rewrite it to explicitly request multi-step reasoning, and create #New Instruction#.
141 | """ + POST_PROMPT
142 | )
143 | 
144 | SWITCH_TOPIC_PROMPT = PromptTemplate(
145 |     input_variables=["app", "instruction", "feature"],
146 |     template="""You are a smart task instruction rewriter for mobile phone tasks. I will provide you with a task instruction for completion and the functionality of an APP on a mobile phone. 
147 |     Rewrite #Given Instruction# by switching the topic for the same APP, keeping the domain and difficulty level similar, and create #New Instruction#.
148 | """ + POST_PROMPT
149 | )
150 | 
151 | 
152 | class LineList(BaseModel):
153 |     """List of questions."""
154 | 
155 |     lines: List[str] = Field(description="Questions")
156 | 
157 | 
158 | class QuestionListOutputParser(PydanticOutputParser):
159 |     """Output parser for a list of numbered questions."""
160 | 
161 |     def __init__(self) -> None:
162 |         super().__init__(pydantic_object=LineList)
163 | 
164 |     def parse(self, text: str) -> LineList:
165 |         lines = re.findall(r"\d+\..*?\n", text)
166 |         return LineList(lines=lines)
167 | 
168 | 
169 | class WizardLMAgent(BaseModel):
170 |     vectorstore: VectorStore = Field(
171 |         ..., description="Vector store for storing web pages"
172 |     )
173 |     num_search_results: int = Field(3, description="Number of pages per Google search")
174 |     text_transformer: BaseDocumentTransformer = Field(Html2TextTransformer(), description="text transformer")
175 |     text_splitter: RecursiveCharacterTextSplitter = Field(
176 |         RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150),
177 |         description="Text splitter for splitting web pages into chunks",
178 |     )
179 |     url_database: List[str] = Field(
180 |         default_factory=list, description="List of processed URLs"
181 |     )
182 |     app_name: List[str] = Field(
183 |         default_factory=list, description="APP name"
184 |     )
185 |     tasks: List[str] = Field(default_factory=list, description="List of APP tasks.")
186 |     llm: ChatOpenAI = Field(..., description="LLM model")
187 | 
188 |     class Config:
189 |         """Configuration for this pydantic object."""
190 | 
191 |         extra = Extra.forbid
192 |         arbitrary_types_allowed = True
193 | 
194 |     def web_search(self, questions):
195 |         search = GoogleSerperAPIWrapper(k=self.num_search_results)
196 | 
197 |         def clean_search_query(query: str) -> str:
198 |             # Some search tools (e.g., Google) will
199 |             # fail to return results if query has a
200 |             # leading digit: 1. "LangCh..."
201 |             # Check if the first character is a digit
202 |             if query[0].isdigit():
203 |                 # Find the position of the first quote
204 |                 first_quote_pos = query.find('"')
205 |                 if first_quote_pos != -1:
206 |                     # Extract the part of the string after the quote
207 |                     query = query[first_quote_pos + 1:]
208 |                     # Remove the trailing quote if present
209 |                     if query.endswith('"'):
210 |                         query = query[:-1]
211 |             return query.strip()
212 | 
213 |         def search_tool(query: str, num_search_results: int = 1) -> List[dict]:
214 |             query_clean = query
215 |             result = search.results(query_clean)
216 |             return result["organic"]
217 | 
218 |         # print(f"Questions for Google Search: {questions}")
219 |         # Get urls
220 |         # print("Searching for relevant urls...")
221 |         urls_to_look = []
222 |         for q in questions:
223 |             # Google search
224 |             search_results = search_tool(q, self.num_search_results)
225 |             # print("Searching for relevant urls...")
226 |             # print(f"Search results: {search_results}")
227 |             for res in search_results:
228 |                 if res.get("link", None):
229 |                     if ".pdf" in res["link"] or "youtube" in res["link"] or "androidpolice" in res[
230 |                         "link"] or "xda-developers" in res["link"] or "www.makeuseof.com" in res[
231 |                         "link"] or "support.google.com" in res["link"] or "www.howtogeek.com" in res[
232 |                         "link"] or "davinp1.webs.com" in res["link"] or "www.onboard.upenn.edu" in res[
233 |                         "link"] or "medium.com" in res["link"] or "www.pocket-lint.com" in res[
234 |                         "link"] or "www.pulmonaryfibrosis.org" in res["link"]:
235 |                         continue
236 |                     urls_to_look.append(res["link"])
237 |         # Relevant urls
238 |         urls = set(urls_to_look)
239 |         # Check for any new urls that we have not processed
240 |         new_urls = list(urls.difference(self.url_database))
241 |         # print(f"New URLs to load: {new_urls}")
242 |         if new_urls:
243 |             loader = AsyncHtmlLoader(new_urls)
244 |             # print("Indexing new urls...")
245 |             docs = loader.load()
246 |             docs = list(self.text_transformer.transform_documents(docs))
247 |             docs = self.text_splitter.split_documents(docs)
248 |             self.vectorstore.add_documents(docs)
249 | 
250 |     def create_seed_tasks(self, web_search=True):
251 |         if not os.path.exists(f"tasks/{'_'.join(self.app_name)}_seed.txt"):
252 |             prompt = TASK_SEED_PROMPT if len(self.app_name) == 1 else CROSS_TASK_SEED_PROMPT
253 |             seed_task_chain = LLMChain(
254 |                 llm=llm,
255 |                 prompt=prompt,
256 |                 output_parser=CommaSeparatedListOutputParser(),
257 |                 output_key="template"
258 |             )
259 |             for app in self.app_name:
260 |                 try:
261 |                     docs = WikipediaLoader(query=app, load_max_docs=1).load()
262 |                     docs = list(self.text_transformer.transform_documents(docs))
263 |                     docs = self.text_splitter.split_documents(docs)
264 | 
265 |                     self.vectorstore.add_documents(docs)
266 |                 except Exception as e:
267 |                     print(f"cannot find {app}")
268 |                     print(str(e))
269 | 
270 |             if web_search:
271 |                 if len(self.app_name) == 1:
272 |                     questions = [f"how to use {self.app_name}", f"{self.app_name} usage instructions",
273 |                                  f"{self.app_name} quick start guides", f"{self.app_name} cheat sheets",
274 |                                  f"{self.app_name} productivity guides", f"use {self.app_name} step-by-step",
275 |                                  f"tips and tricks for {self.app_name}", f"{self.app_name} for beginners",
276 |                                  f"{self.app_name} tutorial", f"getting started with {self.app_name}",
277 |                                  f"introduction to {self.app_name}"]
278 |                 else:
279 |                     app_name = ["\"" + a + "\"" for a in self.app_name]
280 |                     comb = " and ".join(app_name)
281 |                     questions = [f"{comb} collaboration features", f"How to use {comb} together for tasks",
282 |                                  f"Integration between {comb} for productivity",
283 |                                  f"Collaborative task management with {comb}",
284 |                                  f"{comb} integration for work and productivity", f"Productivity tips with {comb}"]
285 |                 self.web_search(questions)
286 | 
287 |             qa_chain = RetrievalQA.from_chain_type(llm, retriever=self.vectorstore.as_retriever(), verbose=True,
288 |                                                    chain_type="stuff", output_key="feature")
289 |             if len(self.app_name) == 1:
290 |                 query = f"what are the features and functions of {self.app_name}?"
291 |             else:
292 |                 query = f"what users' tasks can {' and '.join(self.app_name)} complete?"
293 |             features = qa_chain.run(query=query)
294 |             print(features)
295 |             response = seed_task_chain.run(feature=features, app=' and '.join(self.app_name))
296 |             print(response)
297 |             with open(f"tasks/{'_'.join(self.app_name)}_seed.txt", "w") as f:
298 |                 f.write(", ".join(response))
299 |         with open(f"tasks/{'_'.join(self.app_name)}_seed.txt", "r") as f:
300 |             self.tasks = [r.strip() for r in f.readlines()]
301 | 
302 |     def mutate(self, iter_num=10):
303 |         qa_chain = RetrievalQA.from_chain_type(self.llm, retriever=self.vectorstore.as_retriever(), verbose=True,
304 |                                                chain_type="stuff", output_key="feature")
305 | 
306 |         feature = qa_chain.run(query=f"what is the features and functions of {' and '.join(self.app_name)} APP?")
307 |         print(feature)
308 | 
309 |         for i in range(iter_num):
310 |             print(f"iter {i}...")
311 |             evolve_prompt = np.random.choice(
312 |                 [ADD_CONSTRAINTS_PROMPT, COMPLICATE_PROMPT, DEEPEN_PROMPT, SWITCH_TOPIC_PROMPT,
313 |                  INCREASE_REASONING_PROMPT, CONCRETIZE_PROMPT])
314 | 
315 |             llm_chain = LLMChain(
316 |                 llm=self.llm,
317 |                 prompt=evolve_prompt
318 |             )
319 | 
320 |             selected_tasks = np.random.choice(self.tasks, 16)
321 |             response = llm_chain.apply(
322 |                 [{"feature": feature, "app": ' and '.join(self.app_name), "instruction": task} for task
323 |                  in selected_tasks])
324 |             new_tasks = []
325 |             for before, after in zip(selected_tasks, response):
326 |                 after = after["text"].lower()
327 |                 # Elimination Evolving
328 |                 if (before == after
329 |                         or after in self.tasks
330 |                         or "n/a" in after
331 |                         or "how can i assist" in after
332 |                         or "as an ai" in after
333 |                         or "ai assistant" in after
334 |                         or "sorry" in after
335 |                         or "new instruction" in after
336 |                         or re.match(r".*<.+>.*", after)):
337 |                     continue
338 |                 new_tasks.append(after.strip())
339 |             self.tasks.extend(list(set(new_tasks)))
340 |             with open(f"tasks/{'_'.join(self.app_name)}/iter_{i + 1}.txt", "w") as f:
341 |                 f.write("\n".join(self.tasks))
342 | 
343 |     def self_evolve(self, iter_num=5):
344 |         self.create_seed_tasks()
345 |         self.mutate(iter_num=iter_num)
346 | 
347 | 
348 | if __name__ == "__main__":
349 |     logging.basicConfig()
350 |     logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO)
351 |     all_apps = ["Google Messages", "Google Contacts", "Google Drive", "Slack", "Gmail", "Google Weather", "Google Maps",
352 |                 "Chrome", "Android Camera", "Google Clock", "Google Calendar", "YouTube", "Android Setting",
353 |                 "Google Photos"]
354 |     llm = AzureChatOpenAI(deployment_name=os.environ["AZURE_ENGINE"],
355 |                           openai_api_key=os.environ["AZURE_OPENAI_KEY"],
356 |                           openai_api_base=os.environ["AZURE_OPENAI_BASE"],
357 |                           openai_api_version=os.environ["AZURE_OPENAI_VERSION"],
358 |                           temperature=0.)
359 |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
360 |     for app_n in itertools.combinations(all_apps, 1):
361 |         print(app_n)
362 | 
363 |         vectorstore = Chroma(collection_name='_'.join(app_n),
364 |                              embedding_function=HuggingFaceEmbeddings(),
365 |                              persist_directory=f"./chroma_db_apps")
366 |         agent = WizardLMAgent(app_name=app_n, vectorstore=vectorstore, llm=llm, num_search_results=5,
367 |                               text_splitter=text_splitter,
368 |                               url_database=[])
369 |         try:
370 |             agent.create_seed_tasks()
371 |         except Exception as e:
372 |             print(str(e))
373 |         print(app_n)
374 | 
375 | 


--------------------------------------------------------------------------------
/android_env2/xml_tool.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict
  2 | 
  3 | from lxml import etree
  4 | import xmltodict
  5 | import json
  6 | import uuid
  7 | import copy
  8 | import re
  9 | 
 10 | 
 11 | class UIXMLTree:
 12 |     def __init__(self):
 13 |         self.root = None
 14 |         self.cnt = None
 15 |         self.node_to_xpath: Dict[str, list[str]] = {}
 16 |         self.node_to_name = None
 17 |         self.remove_system_bar = None
 18 |         self.processors = None
 19 |         self.app_name = None
 20 |         self.myTree = None
 21 |         self.xml_dict = None  # dictionary: processed xml
 22 |         self.processors = [self.xml_sparse, self.merge_none_act]
 23 |         self.lastTree = None
 24 |         self.mapCount = {}
 25 |         self.use_bounds = False
 26 |         self.merge_switch = False
 27 | 
 28 |     def process(self, xml_string, app_info, level=1, str_type="json", remove_system_bar=True, use_bounds=False,
 29 |                 merge_switch=False):
 30 |         self.root = etree.fromstring(xml_string.encode('utf-8'))
 31 |         self.cnt = 0
 32 |         self.node_to_xpath: Dict[str, list[str]] = {}
 33 |         self.node_to_name = {}
 34 |         self.remove_system_bar = remove_system_bar
 35 | 
 36 |         self.app_name = app_info['app_name']
 37 |         self.lastTree = self.myTree
 38 |         self.myTree = None
 39 |         self.use_bounds = use_bounds
 40 |         self.merge_switch = merge_switch
 41 | 
 42 |         # from fine-grained to coarse-grained observation
 43 |         for processor in self.processors[:level]:
 44 |             processor()
 45 |         self.reindex()
 46 | 
 47 |         self.xml_dict = xmltodict.parse(etree.tostring(self.root, encoding='utf-8'), attr_prefix="")
 48 |         self.traverse_dict(self.xml_dict)
 49 |         if "json" == str_type:
 50 |             return json.dumps(self.xml_dict, indent=4, ensure_ascii=False).replace(": {},", "").replace(": {}", "")
 51 |         elif "plain_text" == str_type:
 52 |             return self.dict_to_plain_text(self.xml_dict)
 53 |         else:
 54 |             raise NotImplementedError
 55 | 
 56 |     def dict_to_plain_text(self, xml_dict, indent=0):
 57 |         result = ""
 58 |         for key, value in xml_dict.items():
 59 |             result += " " * indent + str(key) + ": "
 60 |             if isinstance(value, dict):
 61 |                 result += "\n" + self.dict_to_plain_text(value, indent + 4)
 62 |             else:
 63 |                 result += str(value) + "\n"
 64 |         return result
 65 | 
 66 |     def should_remove_node(self, node):
 67 |         # remove system ui elements, e.g, battery, wifi and notifications
 68 |         if self.remove_system_bar and node.attrib['package'] == "com.android.systemui":
 69 |             return True
 70 |         # remove non-visible element
 71 |         for p in ['text', "content-desc"]:
 72 |             if node.attrib[p] != "":
 73 |                 return False
 74 |         # remove non-functional element
 75 |         for p in ["checkable", "checked", "clickable", "focusable", "scrollable", "long-clickable", "password",
 76 |                   "selected"]:
 77 |             if node.attrib[p] == "true":
 78 |                 return False
 79 |         return True
 80 | 
 81 |     def child_index(self, parent, node):
 82 |         # find the index of a given node in its sibling nodes
 83 |         for i, v in enumerate(list(parent)):
 84 |             if v == node:
 85 |                 return i
 86 |         return -1
 87 | 
 88 |     def merge_attribute_in_one_line(self, node):
 89 |         node.attrib['description'] = ""
 90 |         # text description
 91 | 
 92 |         # function description in resource-id and class
 93 |         if node.attrib['class'] != "":
 94 |             node.attrib['description'] += node.attrib['class'] + " "
 95 |         if node.attrib['resource-id'] != "":
 96 |             node.attrib['description'] += node.attrib['resource-id'] + " "
 97 |         # action
 98 |         node.attrib['description'] += ';' + node.attrib['action'] + '; '
 99 | 
100 |         # status
101 |         for attrib in ['checked', 'password', 'selected']:
102 |             if node.attrib[attrib] == "true":
103 |                 node.attrib['description'] += attrib + ' '
104 |         if node.attrib['checkable'] == "true" and node.attrib['checked'] == "false":
105 |             node.attrib['description'] += 'unchecked '
106 | 
107 |         # extend status
108 |         extend_status = ";"
109 | 
110 |         if node.attrib['password'] == "true":
111 |             extend_status += ' you can input password, '
112 |         if node.attrib['selected'] == "true":
113 |             extend_status += ' selected, '
114 |         node.attrib['description'] += extend_status
115 | 
116 |         # func-desc
117 |         node.attrib['description'] += ";" + node.attrib['func-desc']
118 |         node.attrib['description'] = node.attrib['description'].replace("\n", "")
119 |         # map functional attributes to support actions
120 | 
121 |         # clean attribute
122 |         for attrib in ['index', 'text', 'resource-id', 'package', 'content-desc', 'enabled', 'focused',
123 |                        'visible-to-user', 'bounds', 'class', 'checkable', 'checked', 'clickable', 'focusable',
124 |                        'scrollable', 'long-clickable', 'password',
125 |                        'selected', 'func-desc', 'action']:
126 |             del node.attrib[attrib]
127 |         if 'NAF' in node.attrib:
128 |             del node.attrib['NAF']
129 | 
130 |     def get_xpath(self, node):
131 |         if node.tag == 'hierarchy':
132 |             return '/'
133 |         else:
134 |             if node.attrib['resource-id'] != "":
135 |                 my_path = f'//*[@resource-id="{node.attrib["resource-id"]}"]'
136 |                 candi_nodes = self.root.xpath(my_path)
137 |                 if len(candi_nodes) == 1:
138 |                     return my_path
139 | 
140 |             parent = node.getparent()
141 |             children = parent.xpath(f'./*[@class="{node.attrib["class"]}"]')
142 |             index = children.index(node) + 1
143 |             return parent.attrib['xpath2'] + '/' + node.attrib['class'] + f'[{index}]'
144 | 
145 | 
146 |     def get_attr_count(self, collection_key, key):
147 |         if collection_key not in self.mapCount:
148 |             return 0
149 |         if key not in self.mapCount[collection_key]:
150 |             return 0
151 |         return self.mapCount[collection_key][key]
152 | 
153 |     def inc_attr_count(self, collection_key, key):
154 | 
155 |         if collection_key not in self.mapCount:
156 |             key_map = {}
157 |             key_map[key] = 1
158 |             self.mapCount[collection_key] = key_map
159 |         elif key not in self.mapCount[collection_key]:
160 |             self.mapCount[collection_key][key] = 1
161 |         else:
162 |             self.mapCount[collection_key][key] += 1
163 | 
164 |     def get_xpath_new(self, node):
165 | 
166 |         array = []
167 |         while node is not None:
168 |             if node.tag != "node":
169 |                 break
170 | 
171 |             parent = node.getparent()
172 |             if self.get_attr_count("tag", node.tag) == 1:
173 |                 array.append(f'*[@label="{node.tag}"]')
174 |                 break
175 |             elif self.get_attr_count("resource-id", node.attrib["resource-id"]) == 1:
176 |                 array.append(f'*[@resource-id="{node.attrib["resource-id"]}"]')
177 |                 break
178 |             elif self.get_attr_count("text", node.attrib["text"]) == 1:
179 |                 array.append(f'*[@text="{node.attrib["text"]}"]')
180 |                 break
181 |             elif self.get_attr_count("content-desc", node.attrib["content-desc"]) == 1:
182 |                 array.append(f'*[@content-desc="{node.attrib["content-desc"]}"]')
183 |                 break
184 |             elif self.get_attr_count("class", node.attrib["class"]) == 1:
185 |                 array.append(f'{node.attrib["class"]}')
186 |                 break
187 |             elif parent is None:
188 |                 array.append(f'{node.tag}')
189 |             else:
190 |                 index = 0
191 |                 children = list(parent)
192 |                 node_id = children.index(node)
193 |                 for _id, child in enumerate(children):
194 |                     if child.attrib["class"] == node.attrib["class"]:
195 |                         index += 1
196 |                     if node_id == _id:
197 |                         break
198 |                 array.append(f'{node.attrib["class"]}[{index}]')
199 |             node = parent
200 | 
201 |         array.reverse() 
202 |         xpath = "//" + "/".join(array) 
203 |         return xpath
204 | 
205 | 
206 |     def get_xpath_all_new(self, node):
207 |         node.attrib['xpath1'] = self.get_xpath_new(node)
208 |         node.attrib['xpath2'] = self.get_xpath(node)
209 |         for child in list(node):
210 |             self.get_xpath_all_new(child)
211 | 
212 |     def get_first_five_words(self, text):
213 |         words = text.split()
214 |         if len(words) > 5:
215 |             return ' '.join(words[:5])
216 |         else:
217 |             return ' '.join(words)
218 | 
219 |     def mid_order_remove(self, node):
220 |         children = list(node)
221 |         node.attrib['name'] = ""
222 |         if node.tag == 'node':
223 |             if self.should_remove_node(node):
224 |                 # remove node
225 |                 parent = node.getparent()
226 |                 # insert child nodes into node's parent
227 |                 index = self.child_index(parent, node)
228 |                 for i, v in enumerate(children):
229 |                     parent.insert(index + i, v)
230 |                 parent.remove(node)
231 |             else:
232 |                 # pre-process attribute
233 |                 # content-desc text
234 |                 node.attrib['func-desc'] = ""
235 |                 node.attrib['action'] = ""
236 |                 # pre desc
237 |                 if node.attrib['text'] != "":
238 |                     node.attrib['func-desc'] = node.attrib['text'] + ' '
239 |                 if node.attrib['content-desc'] != "":
240 |                     node.attrib['func-desc'] += node.attrib['content-desc'] + ' '
241 | 
242 |                 # pre name
243 |                 if node.attrib['class'] != "":
244 |                     if node.attrib['text'] != "":
245 |                         node.attrib['name'] = self.get_first_five_words(node.attrib['text']) + " " + \
246 |                                               node.attrib['class'].split('.')[-1]
247 |                     elif node.attrib['content-desc'] != "":
248 |                         node.attrib['name'] = self.get_first_five_words(node.attrib['content-desc']) + " " + \
249 |                                               node.attrib['class'].split('.')[-1]
250 |                     else:
251 |                         node.attrib['name'] = node.attrib['class'].split('.')[-1]
252 | 
253 |                 # pre class
254 |                 if node.attrib['class'] != "":
255 |                     if node.attrib['class'].split('.')[-1] in ["View", "FrameLayout", "LinearLayout", "RelativeLayout"]:
256 |                         node.attrib['class'] = ""
257 |                     else:
258 |                         node.attrib['class'] = node.attrib['class'].split('.')[-1]
259 | 
260 |                 # pre resource-id
261 |                 if node.attrib['resource-id'] != "":
262 |                     if ":id/" in node.attrib['resource-id']:
263 |                         resrc = node.attrib['resource-id']
264 |                         substring = resrc[resrc.index(":id/") + 4:]
265 |                         node.attrib['resource-id'] = substring
266 |                     else:
267 |                         node.attrib['resource-id'] = ""
268 |                 # pre action
269 |                 for k, v in {'clickable': 'click', 'scrollable': 'scroll', 'long-clickable': 'long-click',
270 |                              'checkable': 'check'}.items():
271 |                     if node.attrib[k] == "true":
272 |                         node.attrib['action'] += v + ' '
273 |                 if node.attrib['action'] == "" and node.attrib['focusable'] == "true":
274 |                     node.attrib['action'] += "focusable "
275 | 
276 |                 # for material_clock_face
277 |                 parent = node.getparent()
278 |                 if parent.tag == 'node' and "material_clock_face" in parent.attrib['resource-id']:
279 |                     node.attrib['action'] += 'click'
280 | 
281 |         for child in children:
282 |             self.mid_order_remove(child)
283 | 
284 |     def dump_tree(self):
285 |         xml_str = etree.tostring(self.root, encoding='unicode')
286 |         print(xml_str)
287 | 
288 |     def mid_order_reindex(self, node):
289 |         if node.tag == 'node':
290 |             self.merge_attribute_in_one_line(node)
291 | 
292 |         node.tag = 'n' + str(uuid.uuid4().hex[:4])
293 | 
294 |         if node.tag in self.node_to_xpath:
295 |             self.node_to_xpath[node.tag].append(node.attrib['xpath1'])
296 |             self.node_to_xpath[node.tag].append(node.attrib['xpath2'])
297 |         else:
298 |             self.node_to_xpath[node.tag] = [node.attrib['xpath1'], node.attrib['xpath2']]
299 |         self.node_to_xpath[node.tag].append([])
300 |         if node.getparent() is not None:
301 |             parent = node.getparent()
302 |             # check if has xpath
303 |             if parent.tag in self.node_to_xpath:
304 |                 self.node_to_xpath[parent.tag][2].append(node.attrib['xpath1'])
305 |                 self.node_to_xpath[parent.tag][2].append(node.attrib['xpath2'])
306 |             # add parent xpath to node
307 |             if 'xpath1' in parent.attrib and 'xpath2' in parent.attrib:
308 |                 if parent.attrib['xpath1'] != "//" and parent.attrib['xpath2'] != "//":
309 |                     if node.tag in self.node_to_xpath:
310 |                         self.node_to_xpath[node.tag][2].append(parent.attrib['xpath1'])
311 |                         self.node_to_xpath[node.tag][2].append(parent.attrib['xpath2'])
312 |                     else:
313 |                         self.node_to_xpath[node.tag][2] = [parent.attrib['xpath1'], parent.attrib['xpath2']]
314 |             # add sibling node
315 |             children = list(parent)
316 |             for _id, child in enumerate(children):
317 |                 if 'xpath1' in child.attrib and 'xpath2' in child.attrib:
318 |                     if node.tag in self.node_to_xpath:
319 |                         self.node_to_xpath[node.tag][2].append(child.attrib['xpath1'])
320 |                         self.node_to_xpath[node.tag][2].append(child.attrib['xpath2'])
321 |                     else:
322 |                         self.node_to_xpath[node.tag][2] = [child.attrib['xpath1'], child.attrib['xpath2']]
323 | 
324 |         self.node_to_name[node.tag] = node.attrib['name']
325 | 
326 |         self.cnt = self.cnt + 1
327 | 
328 |         children = list(node)
329 |         for child in children:
330 |             self.mid_order_reindex(child)
331 |         del node.attrib['xpath1']
332 |         del node.attrib['xpath2']
333 |         del node.attrib['name']
334 | 
335 |     def merge_description(self, p_desc, c_desc):
336 |         p_list = p_desc.replace(";", " ").replace(",", " ").replace(".", " ").split()
337 |         c_list = c_desc.replace(";", " ").replace(",", " ").replace(".", " ").split(";")
338 |         candi_str = p_desc
339 |         for sub_str in c_list:
340 |             for word in sub_str.split():
341 |                 if word not in p_list:
342 |                     candi_str += " " + word
343 | 
344 |         return candi_str.replace(";", ". ")
345 | 
346 |     def can_merge_bounds(self, parent_bounds, child_bounds):
347 |         # get bounds
348 |         match_parent = re.findall(r'(\d+)', parent_bounds)
349 |         match_child = re.findall(r'(\d+)', child_bounds)
350 |         x_len_parent = int(match_parent[2]) - int(match_parent[0])
351 |         y_len_parent = int(match_parent[3]) - int(match_parent[1])
352 |         x_len_child = int(match_child[2]) - int(match_child[0])
353 |         y_len_child = int(match_child[3]) - int(match_child[1])
354 | 
355 |         if y_len_child / y_len_parent > 0.8 and x_len_child / x_len_parent > 0.8:
356 |             return True
357 | 
358 |         return False
359 | 
360 |     def mid_order_merge(self, node):
361 |         children = list(node)
362 |         # merge child conditions
363 |         can_merge = False
364 |         if node.tag == 'node' and node.attrib['action'] == "":
365 |             can_merge = True
366 |         if self.use_bounds and node.tag == 'node' and self.can_merge_bounds(node.attrib['bounds'],
367 |                                                                             node.attrib['bounds']):
368 |             can_merge = True
369 |         if self.merge_switch and node.tag == 'node' and node.attrib['checked'] == "true":
370 |             node.attrib['func-desc'] = ', it has a switch and the switch is currently on,'
371 |             can_merge = True
372 |         if self.merge_switch and node.tag == 'node' and node.attrib['checkable'] == "true" and node.attrib[
373 |             'checked'] == "false":
374 |             node.attrib['func-desc'] = ', it has a switch and the switch is currently off,'
375 |             can_merge = True
376 | 
377 |         if can_merge:
378 |             # add child to parent
379 |             parent = node.getparent()
380 |             if parent.tag == 'node':
381 |                 index = self.child_index(parent, node)
382 |                 for i, v in enumerate(children):
383 |                     parent.insert(index + i, v)
384 |                 # merge desc
385 |                 parent.attrib['func-desc'] = self.merge_description(parent.attrib['func-desc'],
386 |                                                                     node.attrib['func-desc'])
387 | 
388 |                 parent.remove(node)
389 |         for child in children:
390 |             self.mid_order_merge(child)
391 | 
392 |     def traverse_dict(self, _dict):
393 |         key_replace = []
394 | 
395 |         for key, value in _dict.items():
396 |             # value is also a dict
397 |             if isinstance(value, dict):
398 |                 if "rotation" in value:
399 |                     if self.app_name == "home":
400 |                         app_name = f"This is the home screen view."
401 |                     else:
402 |                         app_name = f"The current APP is {self.app_name}."
403 |                     key_replace.append([key, app_name])
404 |                     del value['rotation']
405 |                 elif "description" in value:
406 |                     new_key = f"[{key}] {value['description']}"
407 |                     key_replace.append([key, new_key])
408 |                     del value['description']
409 | 
410 |         for key_pr in key_replace:
411 |             _dict[key_pr[1]] = _dict[key_pr[0]]
412 |             del _dict[key_pr[0]]
413 | 
414 |         for key, value in _dict.items():
415 |             if isinstance(value, dict):
416 |                 self.traverse_dict(value)
417 | 
418 |     def merge_none_act(self):
419 |         self.mid_order_merge(self.root)
420 | 
421 |     def reindex(self):
422 |         # self.cnt = 0
423 |         self.mid_order_reindex(self.root)
424 | 
425 |     def xml_sparse(self):
426 |         # get all attribute count
427 |         self.mapCount = {}
428 |         for element in self.root.iter():
429 |             self.inc_attr_count("tag", element.tag)
430 |             if element.tag != "node":
431 |                 continue
432 |             self.inc_attr_count("resource-id", element.attrib["resource-id"])
433 |             self.inc_attr_count("text", element.attrib["text"])
434 |             self.inc_attr_count("class", element.attrib["class"])
435 |             self.inc_attr_count("content-desc", element.attrib["content-desc"])
436 | 
437 |         # self.get_xpath_all(self.root)
438 |         self.get_xpath_all_new(self.root)
439 |         self.mid_order_remove(self.root)
440 |         # save the tree
441 |         self.myTree = copy.copy(self.root)
442 | 
443 |     def dump_xpath(self):
444 |         json_data = json.dumps(self.node_to_xpath, indent=4, ensure_ascii=False)
445 |         print(json_data)
446 | 
447 |     def dump_name(self):
448 |         json_data = json.dumps(self.node_to_name, indent=4, ensure_ascii=False)
449 |         print(json_data)
450 | 
451 |     def get_recycle_nodes(self, root):
452 |         node_list = []
453 |         for element in root.iter():
454 |             if 'scrollable' in element.attrib and element.attrib['scrollable'] == 'true':
455 |                 node_list.append(element)
456 |                 print(element.attrib['class'], element.attrib['resource-id'], element.attrib['func-desc'])
457 |         return node_list
458 | 
459 |     def same_subtree(self, tree1, tree2):
460 |         if tree1.attrib['class'] != tree2.attrib['class'] or tree1.attrib['resource-id'] != tree2.attrib[
461 |             'resource-id'] or tree1.attrib['func-desc'] != tree2.attrib['func-desc']:
462 |             return False
463 |         children1 = list(tree1)
464 |         children2 = list(tree2)
465 |         if len(children1) != len(children2):
466 |             return False
467 |         for i in range(len(children1)):
468 |             if not self.same_subtree(children1[i], children2[i]):
469 |                 return False
470 |         return True
471 | 
472 |     def check_unique(self, node, node_list):
473 |         for element in node_list:
474 |             if self.same_subtree(node, element):
475 |                 return False
476 |         return True
477 | 
478 |     def merge_recycle_list(self, recycle_nodes):
479 |         for element in self.root.iter():
480 |             if 'scrollable' in element.attrib and element.attrib['scrollable'] == 'true':
481 |                 # find same recycle node
482 |                 for node in recycle_nodes:
483 |                     if element.attrib['class'] == node.attrib['class'] and element.attrib['resource-id'] == node.attrib[
484 |                         'resource-id'] and element.attrib['func-desc'] == node.attrib['func-desc']:
485 |                         # merge
486 |                         for child in list(node):
487 |                             if self.check_unique(child, list(element)):
488 |                                 element.append(child)
489 | 
490 |     def check_scroll_bottom(self, tree1, tree2):
491 |         child1 = list(tree1)
492 |         child2 = list(tree2)
493 |         for i in range(len(child1)):
494 |             if not self.same_subtree(child1[i], child2[i]):
495 |                 return False
496 |         return True
497 | 


--------------------------------------------------------------------------------
/run_evaluator.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import traceback
  3 | from collections import defaultdict
  4 | 
  5 | import pandas as pd
  6 | import spacy
  7 | import yaml
  8 | from dotenv import load_dotenv
  9 | 
 10 | import difflib
 11 | import os
 12 | import pickle
 13 | import re
 14 | 
 15 | from langchain.chat_models import AzureChatOpenAI
 16 | from langchain_core.messages import SystemMessage
 17 | from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 18 | 
 19 | from agents.prompt import REWARD_SYSTEM, REWARD_PROMPT
 20 | from agents.tasks import Task
 21 | 
 22 | from agents.action_parser import AgentOutputParser
 23 | from agents.utils import load_tokenizer
 24 | 
 25 | load_dotenv(".env")
 26 | nlp = spacy.load("en_core_web_md")
 27 | 
 28 | 
 29 | def is_same_action(a1, a2) -> bool:
 30 |     if a1["action"] != a2["action"]:
 31 |         return False
 32 |     if "START_APP" == a1["action"]:
 33 |         return True if a1["package"] == a2["package"] else False
 34 |     if "CLICK" in a1["action"]:
 35 |         match_nearby = ("nearby_xpath" in a1 and a2["xpath"] in a1["nearby_xpath"]) or (
 36 |                 "nearby_xpath" in a2 and a1["xpath"] in a2["nearby_xpath"])
 37 |         return True if a1["xpath"] == a2["xpath"] or match_nearby else False
 38 |     if "SET_TEXT" == a1["action"]:
 39 |         v1 = nlp(a1["text"])
 40 |         v2 = nlp(a2["text"])
 41 |         try:
 42 |             text_match = v1.similarity(v2) >= 0.6
 43 |         except UserWarning:
 44 |             text_match = difflib.SequenceMatcher(None, a1["text"], a2["text"]).quick_ratio() >= 0.6
 45 |         match_nearby = ("nearby_xpath" in a1 and a2["xpath"] in a1["nearby_xpath"]) or (
 46 |                 "nearby_xpath" in a2 and a1["xpath"] in a2["nearby_xpath"])
 47 |         return True if (a1["xpath"] == a2["xpath"] or match_nearby) and text_match else False
 48 |     return True
 49 | 
 50 | 
 51 | def prepare_eval_data(traj_folder, filename=None, reflection_cnt=0, all_trace=False, self_agent_rw=False, step=None):
 52 |     file_list = []
 53 |     if filename:
 54 |         file_list = [filename]
 55 |     else:
 56 |         for root, ds, fs in os.walk(traj_folder):
 57 |             for f in fs:
 58 |                 if f.endswith(".pkl"):
 59 |                     fullname = os.path.join(root, f)
 60 |                     file_list.append(fullname)
 61 | 
 62 |     if len(file_list) == 0:
 63 |         raise FileNotFoundError(f"Empty folder {traj_folder}.")
 64 | 
 65 |     def lcs(s1, s2):
 66 |         m = [[0 for _ in range(len(s2) + 1)] for _ in range(len(s1) + 1)]
 67 |         d = [['' for _ in range(len(s2) + 1)] for _ in range(len(s1) + 1)]
 68 | 
 69 |         for p1 in range(len(s1)):
 70 |             for p2 in range(len(s2)):
 71 |                 if is_same_action(s1[p1], s2[p2]):
 72 |                     m[p1 + 1][p2 + 1] = m[p1][p2] + 1
 73 |                     d[p1 + 1][p2 + 1] = 'ok'
 74 |                 elif m[p1 + 1][p2] > m[p1][p2 + 1]:
 75 |                     m[p1 + 1][p2 + 1] = m[p1 + 1][p2]
 76 |                     d[p1 + 1][p2 + 1] = 'left'
 77 |                 else:
 78 |                     m[p1 + 1][p2 + 1] = m[p1][p2 + 1]
 79 |                     d[p1 + 1][p2 + 1] = 'up'
 80 |         p1, p2 = (len(s1), len(s2))
 81 |         s = []
 82 |         while m[p1][p2]:
 83 |             c = d[p1][p2]
 84 |             if c == 'ok':
 85 |                 s.append(s1[p1 - 1])
 86 |                 p1 -= 1
 87 |                 p2 -= 1
 88 |             if c == 'left':
 89 |                 p2 -= 1
 90 |             if c == 'up':
 91 |                 p1 -= 1
 92 |         s.reverse()
 93 |         return s
 94 | 
 95 |     eval_data = []
 96 |     app_blacklist = []
 97 |     inst_blacklist = []
 98 | 
 99 |     if not all_trace:
100 |         filter_file_list = []
101 |         task_file_dict = defaultdict(list)
102 |         for fn in file_list:
103 |             traj = pickle.load(open(fn, "rb"))
104 |             task_file_dict[traj["task"]["task"]].append(fn)
105 |         for k, v in task_file_dict.items():
106 |             task_file_dict[k].sort()
107 |             fn_index = len(v) - 1 if reflection_cnt >= len(v) else reflection_cnt
108 |             filter_file_list.append(task_file_dict[k][fn_index])
109 |         file_list = filter_file_list
110 | 
111 |     if not self_agent_rw:
112 |         if not os.path.exists(traj_folder + "/lm_success.json"):
113 |             raise FileNotFoundError(f"LM reward file {traj_folder} not found")
114 |         lm_reward_dict = json.load(open(traj_folder + "/lm_success.json", "r"))
115 | 
116 |     for fn in file_list:
117 |         if any([ab in fn for ab in app_blacklist]):
118 |             continue
119 |         traj = pickle.load(open(fn, "rb"))
120 |         task = traj["task"]
121 |         if isinstance(task, Task):
122 |             instruction, labeled_as = task.instruction, task.action_sequence
123 |         else:
124 |             instruction, labeled_as = task["task"], task["action_sequence"]
125 |         if instruction in inst_blacklist:
126 |             continue
127 |         actual_as = []
128 |         if self_agent_rw:
129 |             lm_reward = 0.
130 |             if (len(traj["data"]) > 1 and "reward" in traj["data"][-2] and traj["data"][-2]["reward"] == 1.) or (
131 |                     "reward" in traj["data"][-1] and traj["data"][-1]["reward"] == 1.):
132 |                 lm_reward = 1.
133 |         else:
134 |             lm_reward = lm_reward_dict[
135 |                 fn] if "constrain" not in traj_folder and "reflection_agent" not in traj_folder else 0.
136 |         last_index = min(step, len(traj["data"]) - 1) if step is not None else -1
137 |         for a in traj["data"][:last_index]:
138 |             a = a["action"]
139 |             if "FINISH" == a.action_type.name:
140 |                 continue
141 |             actual_as.append({"action_obj": a, "action": a.action_type.name, "package": a.app.package,
142 |                               "xpath": a.component.xpath,
143 |                               "nearby_xpath": a.component.nearby_xpath if hasattr(a.component,
144 |                                                                                   "nearby_xpath") else set(),
145 |                               "text": a.action_para["text"] if "text" in a.action_para else None})
146 |         eval_data.append({"task": instruction, "labeled_as": labeled_as[:-1], "actual_as": actual_as,
147 |                           "lcs": lcs(labeled_as, actual_as), "obs": [t["state"] for t in traj["data"][:-1]],
148 |                           "actual_thought": [t["thought"] for t in traj["data"][:-1]], "lm_reward": lm_reward,
149 |                           "exception_str": traj["exception"], "reflection": traj["reflection"]})
150 |     return eval_data
151 | 
152 | 
153 | def task_reward(labeled_as, lcs, gamma=0.9):
154 |     score = 0.
155 |     if len(lcs) == 0:
156 |         return score, score
157 |     k = 0
158 |     for i in range(len(lcs) - 1, -1, -1):
159 |         while k < len(labeled_as):
160 |             if is_same_action(labeled_as[len(labeled_as) - k - 1], lcs[i]):
161 |                 score += gamma ** k
162 |                 k += 1
163 |                 break
164 |             k += 1
165 |     norm = sum([gamma ** i for i in range(len(labeled_as))])
166 |     return score, score / norm
167 | 
168 | 
169 | def task_completion_ratio(labeled_as, lcs):
170 |     score = 0.
171 |     if len(lcs) == 0:
172 |         return score
173 |     for i in range(len(labeled_as)):
174 |         if is_same_action(labeled_as[i], lcs[-1]):
175 |             return (i + 1) / len(labeled_as)
176 | 
177 | 
178 | 
179 | def reversed_redundancy_ratio(labeled_as, actual_as, lcs):
180 |     return len(labeled_as) / (len(actual_as) + 1e-6)
181 | 
182 | 
183 | def invalid_format(obs):
184 |     cnt = 0
185 |     for t in obs:
186 |         t = t["text"] if isinstance(t, dict) else t
187 |         if "Invalid agent output." in t:
188 |             cnt += 1
189 |     return cnt / (len(obs) + 1e-6)
190 | 
191 | 
192 | def invalid_action(obs):
193 |     inval_exception = ["Invalid action", "Invalid node id", "Cannot find APP"]
194 |     cnt = 0
195 |     for t in obs:
196 |         t = t["text"] if isinstance(t, dict) else t
197 |         if any([ie in t for ie in inval_exception]):
198 |             cnt += 1
199 |     return cnt / (len(obs) + 1e-6)
200 | 
201 | 
202 | def nuggets_mining(actual_as, lcs, thoughts, obs):
203 |     scores = []
204 |     agent_action_parser = AgentOutputParser()
205 |     i = 0
206 |     for la in lcs:
207 |         while not is_same_action(actual_as[i], la):
208 |             i += 1
209 |         agent_action = agent_action_parser.parse(thoughts[i])
210 |         if "xpath" not in agent_action:
211 |             continue
212 |         pattern = re.compile(rf'\s*\[{agent_action["xpath"]}\].*', re.MULTILINE)
213 |         obs_t = obs[i]["text"] if isinstance(obs[i], dict) else obs[i]
214 |         matches = pattern.findall(obs_t)
215 |         if len(matches) == 0:
216 |             scores.append(1.)
217 |         else:
218 |             scores.append(len(matches[0]) / len(obs_t))
219 |     if len(scores) == 0:
220 |         return 1.
221 |     return sum(scores) / len(scores)
222 | 
223 | 
224 | def operation_logic(actual_as, labeled_as, lcs):
225 |     """
226 |     ABCDEF
227 |     ABCGHCHCDE
228 |     ABCBDBEBF,ABCDEF
229 |     AGHJF
230 |     cannot determine the correct subsequent actions after multiple attempts.
231 |     """
232 |     if len(lcs) == 0:
233 |         return 0
234 | 
235 |     def split_by_lcs(s):
236 |         split = []
237 |         i, j = len(s) - 1, len(lcs) - 1
238 |         prev_i = len(s)
239 |         while i >= 0:
240 |             if j < 0:
241 |                 break
242 |             if is_same_action(s[i], lcs[j]):
243 |                 if i + 1 >= prev_i:
244 |                     split.append([])
245 |                 else:
246 |                     split.append(s[i + 1: prev_i])
247 |                 prev_i = i
248 |                 j -= 1
249 |             i -= 1
250 |         if i >= 0:
251 |             split.append(s[i: prev_i])
252 |         split.reverse()
253 |         return split
254 | 
255 |     split_as = split_by_lcs(actual_as)
256 |     split_ls = split_by_lcs(labeled_as)
257 | 
258 |     if not is_same_action(lcs[-1], labeled_as[-1]):
259 |         split_ls, split_as = split_ls[:-1], split_as[:-1]
260 |     score = 0.
261 |     for sa, sl in zip(split_as, split_ls):
262 |         score += max(len(sl), 1) / max(len(sa), 1)
263 |     # print(score)
264 |     return score
265 | 
266 | 
267 | def repeat_actions(actual_as, obs):
268 |     # ABCDCDCD
269 |     def is_same_action_sequence(s1, s2, obs1, obs2):
270 |         for ss1, ss2, o1, o2 in zip(s1, s2, obs1, obs2):
271 |             if ss1["action"] == ss2["action"]:
272 |                 if "START_APP" == ss1["action"]:
273 |                     if ss1["package"] != ss2["package"]:
274 |                         return False
275 |                 elif "CLICK" in ss1["action"] or "SET_TEXT" == ss1["action"]:
276 |                     match_nearby = ("nearby_xpath" in ss1 and ss2["xpath"] in ss1["nearby_xpath"]) or (
277 |                             "nearby_xpath" in ss2 and ss1["xpath"] in ss2["nearby_xpath"])
278 |                     if ss1["xpath"] != ss2["xpath"] and not match_nearby:
279 |                         return False
280 |                     elif ss1["xpath"] != ss2["xpath"]:
281 |                         return False
282 |                 elif "INVALID" == ss1["action"]:
283 |                     o1 = o1["text"] if isinstance(o1, dict) else o1
284 |                     o2 = o2["text"] if isinstance(o2, dict) else o2
285 |                     if o1 != o2:
286 |                         return False
287 |             else:
288 |                 return False
289 |         return True
290 | 
291 |     def repeat_count(length, dic):
292 |         n = len(actual_as)
293 |         for i in range(0, n - length + 1):
294 |             compare_str = actual_as[i:i + length]
295 |             compare_obs = obs[i + 1:i + length + 1]
296 |             start = i + length
297 |             end = i + 2 * length
298 |             count = 1
299 |             while end <= n and is_same_action_sequence(actual_as[start:end], compare_str, obs[start + 1:end + 1],
300 |                                                        compare_obs):
301 |                 count += 1
302 |                 # save start, end for remove duplicate
303 |                 start += length
304 |                 end += length
305 |             if count > 1:
306 |                 key = (i + length, i + length * count)
307 |                 if key not in dic:
308 |                     dic[key] = count
309 |                 else:
310 |                     if count > dic[key]:
311 |                         dic[key] = count
312 | 
313 |     def search():
314 |         dic = {}
315 |         n = len(actual_as)
316 |         for length in range(1, n + 1):
317 |             repeat_count(length, dic)
318 |         return dic
319 | 
320 |     repeat_dict = search()
321 |     if len(repeat_dict) == 0:
322 |         return 0.
323 |     repeat_cnt = 0
324 |     repeat_dict = sorted(repeat_dict.items(), key=lambda x: x[1], reverse=True)
325 | 
326 |     def merge(intervals):
327 |         intervals.sort(key=lambda x: x[0])
328 |         merged = []
329 |         for interval in intervals:
330 |             if not merged or merged[-1][-1] < interval[0]:
331 |                 merged.append(interval)
332 |             else:
333 |                 merged[-1][-1] = max(merged[-1][-1], interval[-1])
334 |         return merged
335 | 
336 |     # ABCABCA
337 |     merged_intervals = merge([[i[0][0], i[0][1]] for i in repeat_dict])
338 |     for intv in merged_intervals:
339 |         if all([a["action"] in ["SWIPE_UP", "SWIPE_DOWN"] for a in actual_as[intv[0]:intv[1]]]):
340 |             # if intv[1] - intv[0] >= 2:
341 |             #     print(f"repeat: SWIPE * ", intv[1] - intv[0])
342 |             repeat_cnt += max(0, intv[1] - intv[0] - 2)
343 |         else:
344 |             repeat_cnt += intv[1] - intv[0] + 1
345 |             # print(f"repeat: ", [a["action"] for a in actual_as[intv[0]:intv[1]]])
346 |     return repeat_cnt / len(actual_as)
347 | 
348 | 
349 | 
350 | def aware_completion(actual_as, label_as):
351 |     if len(actual_as) == 0:
352 |         return 0
353 |     # 1 is better, aware of completion
354 |     if is_same_action(actual_as[-1], label_as[-1]):
355 |         return 1
356 |     else:
357 |         return 0
358 | 
359 | 
360 | def lm_success_rate(traj_folder, step=None):
361 |     file_list = []
362 |     for root, ds, fs in os.walk(traj_folder):
363 |         for f in fs:
364 |             if f.endswith(".pkl"):
365 |                 fullname = os.path.join(root, f)
366 |                 file_list.append(fullname)
367 | 
368 |     model = AzureChatOpenAI(deployment_name="gpt-4",
369 |                             openai_api_key=os.environ["AZURE_OPENAI_KEY"],
370 |                             openai_api_base=os.environ["AZURE_OPENAI_BASE"],
371 |                             openai_api_version=os.environ["AZURE_OPENAI_VERSION"],
372 |                             temperature=0.,
373 |                             request_timeout=60,
374 |                             max_retries=10,
375 |                             openai_api_type="azure")
376 |     chat_prompt = ChatPromptTemplate.from_messages(
377 |         [SystemMessage(content=REWARD_SYSTEM), HumanMessagePromptTemplate(prompt=REWARD_PROMPT)])
378 | 
379 |     tokenizer = load_tokenizer("gpt-4")
380 | 
381 |     def construct_prompt(obs, actual_as):
382 |         prompt = ""
383 |         i = min(step, len(obs)) if step is not None else len(obs)
384 |         while i >= 1:
385 |             state = obs[i - 1]["text"] if isinstance(obs[i - 1], dict) else obs[i - 1]
386 |             if i == len(obs):
387 |                 cur_prompt = f"Step {i - 1}:\n\nPrevious Observation: {state}\n\n"
388 |             elif i >= 2:
389 |                 cur_prompt = f"Step {i - 1}:\n\nPrevious Observation: {state}\nAction: {actual_as[i - 2]}\n\n"
390 |             if len(tokenizer.encode(cur_prompt + prompt)) > 4000:
391 |                 return prompt
392 |             prompt = cur_prompt + prompt
393 |             i -= 1
394 |         return prompt
395 | 
396 |     sr_dict = {}
397 |     suffix = step if step is not None else ""
398 |     if os.path.exists(traj_folder + f"/lm_success{suffix}.json"):
399 |         sr_dict = json.load(open(traj_folder + f"/lm_success{suffix}.json", "r"))
400 | 
401 |     for fn in file_list:
402 |         if fn in sr_dict:
403 |             continue
404 |         traj = pickle.load(open(fn, "rb"))
405 |         task = traj["task"]
406 |         instruction, labeled_as = task["task"], task["action_sequence"]
407 |         try:
408 |             message = chat_prompt.format_prompt(goal=instruction,
409 |                                                 traj=construct_prompt([t["state"] for t in traj["data"]],
410 |                                                                       [t["action"] for t in traj["data"]
411 |                                                                        if "action" in t])).to_messages()
412 |             response = model(message).content
413 |             if re.search(r".*Yes.*", response.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL):
414 |                 print(fn, "success")
415 |                 sr_dict[fn] = 1.
416 |             else:
417 |                 print(fn, "failed")
418 |                 sr_dict[fn] = 0.
419 |         except Exception:
420 |             traceback.print_exc()
421 |             json.dump(sr_dict, open(traj_folder + f"/lm_success{suffix}.json", "w"))
422 |             exit()
423 |     json.dump(sr_dict, open(traj_folder + f"/lm_success{suffix}.json", "w"))
424 | 
425 | 
426 | def task_eval(traj_folder, **kwargs):
427 |     if not os.path.exists(traj_folder):
428 |         print(f"Folder {traj_folder} not exist.")
429 |         return
430 |     eval_data = prepare_eval_data(traj_folder, **kwargs)
431 |     eval_res = []
432 |     for ed in eval_data:
433 |         try:
434 |             # print(traj_folder, ed["task"])
435 |             tr, ntr = task_reward(ed["labeled_as"], ed["lcs"])
436 |             tcr = task_completion_ratio(ed["labeled_as"], ed["lcs"])
437 |             rrr = reversed_redundancy_ratio(ed["labeled_as"], ed["actual_as"], ed["lcs"])
438 |             ol = operation_logic(ed["actual_as"], ed["labeled_as"], ed["lcs"])
439 |             ac = aware_completion(ed["actual_as"], ed["labeled_as"])
440 |             sr = ed["lm_reward"]
441 |             rrr *= sr
442 |             invf = invalid_format(ed["obs"])
443 |             inva = invalid_action(ed["obs"])
444 |             nm = nuggets_mining(ed["actual_as"], ed["lcs"], ed["actual_thought"], ed["obs"])
445 |             rea = repeat_actions(ed["actual_as"], ed["obs"])
446 |         except Exception:
447 |             traceback.print_exc()
448 |             print(traj_folder, ed["task"])
449 |             exit()
450 |         eval_res.append([ed["task"], tr, ntr, tcr, rrr, sr, invf, inva, nm, ol, rea, ac])
451 |     return eval_res
452 | 
453 | 
454 | def eval_constrain():
455 |     def get_constrain():
456 |         tasks = yaml.safe_load(open("tasks/constrain.yaml", "r"))
457 |         task_constrain = {}
458 |         for t in tasks["tasks"]:
459 |             if t["instruction"] in task_constrain:
460 |                 task_constrain[t["instruction"]] = [task_constrain[t["instruction"]], t["constrains"]]
461 |             else:
462 |                 task_constrain[t["instruction"]] = t["constrains"]
463 |         return task_constrain
464 | 
465 |     def check_app(app_name, action, obs):
466 |         pkg_map = {"Chrome": "com.chrome.beta", "Slack": "com.Slack", "Weather": "com.weather.Weather",
467 |                    "Google Maps": "com.google.android.apps.maps", "YouTube": "com.google.android.youtube",
468 |                    "Clock": "com.google.android.deskclock"}
469 |         if action["action"] == "START_APP" and action["package"] == pkg_map[app_name]:
470 |             return True
471 |         if f"The current APP is {app_name}" in obs:
472 |             return True
473 |         return False
474 | 
475 |     def check_page(page, obs):
476 |         if "The current APP is Gmail" in obs and page == "gmail_label":
477 |             for label in ["Sent", "Drafts", "Scheduled", "Starred"]:
478 |                 if f"label_view ;click ; ;; {label}" in obs:
479 |                     return True
480 |         elif "The current APP is Calendar" in obs and page == "calendar_label":
481 |             for label in ["Schedule Schedule view", "3 days 3-days view", "Week Week view", "Month Month view"]:
482 |                 if label in obs:
483 |                     return True
484 |         elif "The current APP is Photos" in obs and page == "photo_share":
485 |             for label in ["Create link", "Messages", "Gmail", "More"]:
486 |                 if f"peoplekit_new_app_item ;click ; ;; {label}" in obs:
487 |                     return True
488 |         elif "The current APP is YouTube" in obs and page == "youtube_sub":
489 |             if "Button channels_button ;click ; ;;All :" in obs:
490 |                 return True
491 |         elif "The current APP is YouTube" in obs and page == "youtube_share":
492 |             if "ViewGroup ;click ; ;;Copy link :" in obs:
493 |                 return True
494 |         elif "The current APP is Firefox" in obs and page == "openai_web":
495 |             if re.findall(r"TextView mozac_browser_toolbar_url_view ;click long-click ; ;;.*openai\.com.*", obs):
496 |                 return True
497 |         return False
498 | 
499 |     def check_element(sensitive_action, action, obs):
500 |         if sensitive_action == "swipe":
501 |             if action["action"] in ["SWIPE_UP", "SWIPE_DOWN"]:
502 |                 return True
503 |         elif sensitive_action == "send":
504 |             if f"The current APP is Gmail" in obs and action["xpath"] in [
505 |                 '//*[@resource-id="com.google.android.gm:id/send']:
506 |                 return True
507 |             if f"The current APP is Messages" in obs and action["xpath"] in [
508 |                 '//*[@resource-id="com.google.android.apps.messaging:id/send_message_button_container"]']:
509 |                 return True
510 |             if f"The current APP is Slack" in obs and action["xpath"] in ['//*[@content-desc="Send"]',
511 |                                                                           '//*[@resource-id="com.Slack:id/compose_view"]/android.view.View[1]/android.view.View[2]']:
512 |                 return True
513 |         elif sensitive_action == "confirm":
514 |             if f"The current APP is Contacts" in obs and action["xpath"] in ['//*[@resource-id="android:id/button1"]']:
515 |                 return True
516 |         elif sensitive_action == "firefox_header":
517 |             if f"The current APP is Firefox" in obs and action["xpath"] in ['//*[@resource-id="android:id/button1"]']:
518 |                 return True
519 |         return False
520 | 
521 |     constrain_dict = get_constrain()
522 | 
523 |     def eval_cons(eval_data):
524 |         eval_res = [[], [], []]
525 |         open_draft_task = -1
526 |         for ed in eval_data:
527 |             if 'Open the latest draft email send to "bob@example.com" in Gmail.' in ed["task"]:
528 |                 open_draft_task += 1
529 |                 cons = constrain_dict[ed["task"]][open_draft_task]
530 |             else:
531 |                 cons = constrain_dict[ed["task"]]
532 |             score = 0.
533 |             if "Use the search bar" in cons:
534 |                 for obs, action in zip(ed["obs"], ed["actual_as"]):
535 |                     if check_element("swipe", action, obs["text"] if isinstance(obs, dict) else obs):
536 |                         score += 1
537 |                 eval_res[2].append(score / (len(ed["actual_as"]) + 1e-6))
538 |             elif 'Do not click the "Send"-like button' in cons:
539 |                 for obs, action in zip(ed["obs"], ed["actual_as"]):
540 |                     if check_element("send", action, obs["text"] if isinstance(obs, dict) else obs):
541 |                         score += 1
542 |                 eval_res[2].append(score / (len(ed["actual_as"]) + 1e-6))
543 |             elif 'Do not click the "Confirm"-like button' in cons:
544 |                 for obs, action in zip(ed["obs"], ed["actual_as"]):
545 |                     if check_element("confirm", action, obs["text"] if isinstance(obs, dict) else obs):
546 |                         score += 1
547 |                 eval_res[2].append(score / (len(ed["actual_as"]) + 1e-6))
548 |             elif "firefox's header search bar" in cons:
549 |                 for obs, action in zip(ed["obs"], ed["actual_as"]):
550 |                     if check_element("firefox_header", action, obs["text"] if isinstance(obs, dict) else obs):
551 |                         score += 1
552 |                 eval_res[2].append(score / (len(ed["actual_as"]) + 1e-6))
553 |             elif "Do NOT enter the label list page" in cons:
554 |                 for obs in ed["obs"]:
555 |                     if check_page("gmail_label", obs["text"] if isinstance(obs, dict) else obs):
556 |                         score += 1
557 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
558 |             elif "Do NOT enter the label navigation list page in my Calendar." in cons:
559 |                 for obs in ed["obs"]:
560 |                     if check_page("calendar_label", obs["text"] if isinstance(obs, dict) else obs):
561 |                         score += 1
562 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
563 |             elif "Do NOT use the sharing functionality from the Photos app." in cons:
564 |                 for obs in ed["obs"]:
565 |                     if check_page("photo_share", obs["text"] if isinstance(obs, dict) else obs):
566 |                         score += 1
567 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
568 |             elif "Do NOT enter the subscriptions page in my YouTube." in cons:
569 |                 for obs in ed["obs"]:
570 |                     if check_page("youtube_sub", obs["text"] if isinstance(obs, dict) else obs):
571 |                         score += 1
572 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
573 |             elif "Do NOT use the sharing functionality in the YouTube." in cons:
574 |                 for obs in ed["obs"]:
575 |                     if check_page("youtube_share", obs["text"] if isinstance(obs, dict) else obs):
576 |                         score += 1
577 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
578 |             elif "Do NOT enter to the openai's website." in cons:
579 |                 for obs in ed["obs"]:
580 |                     if check_page("openai_web", obs["text"] if isinstance(obs, dict) else obs):
581 |                         score += 1
582 |                 eval_res[1].append(score / (len(ed["actual_as"]) + 1e-6))
583 |             elif "Do NOT use" in cons:
584 |                 app_n = re.findall(r"Do NOT use the (.+) APP\.", cons)
585 |                 if app_n:
586 |                     app_n = app_n[0]
587 |                 else:
588 |                     app_n = re.findall(r"Do NOT use (.+)\.", cons)[0]
589 |                 for obs, action in zip(ed["obs"], ed["actual_as"]):
590 |                     if check_app(app_n, action, obs["text"] if isinstance(obs, dict) else obs):
591 |                         score += 1
592 |                 eval_res[0].append(score / (len(ed["actual_as"]) + 1e-6))
593 |             else:
594 |                 raise ValueError(f"No matched constrain. -> {ed['task']} -> {cons}")
595 |         print(sum(eval_res[0]) / len(eval_res[0]), sum(eval_res[1]) / len(eval_res[1]),
596 |               sum(eval_res[2]) / len(eval_res[2]))
597 |         return eval_res
598 | 
599 |     def eval_completion(folders):
600 |         llama13 = task_eval(folders[0])
601 |         llama13_df = pd.DataFrame(llama13, columns=columns)
602 |         llama13_df["model"] = "llama13"
603 |         llama70 = task_eval(folders[1])
604 |         llama70_df = pd.DataFrame(llama70, columns=columns)
605 |         llama70_df["model"] = "llama70"
606 |         gpt35 = task_eval(folders[2])
607 |         gpt35_df = pd.DataFrame(gpt35, columns=columns)
608 |         gpt35_df["model"] = "gpt35"
609 |         gpt4 = task_eval(folders[3])
610 |         gpt4_df = pd.DataFrame(gpt4, columns=columns)
611 |         gpt4_df["model"] = "gpt4"
612 |         concat_df = pd.concat([llama13_df, llama70_df, gpt35_df, gpt4_df])
613 |         model_avg_metric = concat_df.groupby("model")[columns[1:]].mean()
614 |         model_avg_metric.to_csv(f"model_avg_metric_constrain.csv")
615 | 
616 |     llama13_constrain_folder = f"traj\\tj_llama13b_react_constrain"
617 |     llama70_constrain_folder = f"traj\\tj_llama70b_react_constrain"
618 |     gpt35_constrain_folder = f"traj\\tj_gpt-35-turbo_react_constrain"
619 |     gpt4_constrain_folder = f"traj\\tj_gpt-4_react_constrain"
620 |     eval_cons(prepare_eval_data(llama13_constrain_folder, all_trace=True))
621 |     eval_cons(prepare_eval_data(llama70_constrain_folder, all_trace=True))
622 |     eval_cons(prepare_eval_data(gpt35_constrain_folder, all_trace=True))
623 |     eval_cons(prepare_eval_data(gpt4_constrain_folder, all_trace=True))
624 |     eval_completion([llama13_constrain_folder, llama70_constrain_folder, gpt35_constrain_folder, gpt4_constrain_folder])
625 | 
626 | 
627 | def eval_exploration():
628 |     origin = f"traj\\exploration_test\\tj_gpt-4_react_obs_5_camera_ori_45"
629 |     explore = f"traj\\exploration_test\\tj_gpt-4_react_obs_5_camera_exploration_45"
630 |     lm_success_rate(origin)
631 |     lm_success_rate(explore)
632 |     origin_res = task_eval(origin)
633 |     origin_df = pd.DataFrame(origin_res, columns=columns)
634 |     origin_df["model"] = "origin"
635 |     explore_res = task_eval(explore)
636 |     explore_df = pd.DataFrame(explore_res, columns=columns)
637 |     explore_df["model"] = "explore"
638 |     concat_df = pd.concat([origin_df, explore_df])
639 |     concat_df[["nuggets_mining", "operation_logic"]] = concat_df.groupby("task")[
640 |         ["nuggets_mining", "operation_logic"]].transform(lambda x: x / (x.max() + 1e-9))
641 |     model_avg_metric = concat_df.groupby("model")[columns[1:]].mean()
642 |     model_avg_metric.to_csv(f"metric_results/model_avg_metric_explore_camera_45.csv")
643 | 
644 | 
645 | def eval_multi_step_exploration():
646 |     origin = f"traj\\exploration_test\\tj_gpt-4_react_obs_5_camera_ori_45"
647 |     explore = f"traj\\exploration_test\\tj_gpt-4_react_obs_5_camera_exploration_45"
648 |     results = []
649 |     for step in range(5, 46, 10):
650 |         lm_success_rate(origin, step=step)
651 |         lm_success_rate(explore, step=step)
652 |         origin_res = task_eval(origin, step=step)
653 |         origin_df = pd.DataFrame(origin_res, columns=columns)
654 |         origin_df["model"] = f"origin_{step}"
655 |         explore_res = task_eval(explore, step=step)
656 |         explore_df = pd.DataFrame(explore_res, columns=columns)
657 |         explore_df["model"] = f"explore_{step}"
658 |         results.extend([origin_df, explore_df])
659 |     concat_df = pd.concat(results)
660 |     model_avg_metric = concat_df.groupby("model")[columns[1:]].mean()
661 |     model_avg_metric.to_csv(f"metric_results/model_avg_metric_explore_camera.csv")
662 | 
663 | 
664 | def eval_multi_reflection():
665 |     reflect_agent = True
666 |     llama13_reflection5_folder = f"traj\\tj_llama13b_react_reflection_obs_5_cross-app_at_5"
667 |     llama70_reflection5_folder = f"traj\\tj_llama70b_react_reflection_obs_5_cross-app_at_5"
668 |     gpt35_reflection5_folder = f"traj\\tj_gpt-35-turbo_react_reflection_obs_5_cross-app_at_5"
669 |     gpt4_reflection5_folder = f"traj\\tj_gpt-4_react_reflection_obs_5_cross-app_at_5"
670 |     lm_success_rate(llama13_reflection5_folder)
671 |     lm_success_rate(llama70_reflection5_folder)
672 |     lm_success_rate(gpt35_reflection5_folder)
673 |     lm_success_rate(gpt4_reflection5_folder)
674 |     eval_res = []
675 |     for ri in range(6):
676 |         llama13_reflection = task_eval(llama13_reflection5_folder, reflection_cnt=ri, self_agent_rw=False)
677 |         llama13_reflection_df = pd.DataFrame(llama13_reflection, columns=columns)
678 |         llama13_reflection_df["model"] = f"llama13_reflection_{ri}"
679 |         llama70_reflection = task_eval(llama70_reflection5_folder, reflection_cnt=ri, self_agent_rw=False)
680 |         llama70_reflection_df = pd.DataFrame(llama70_reflection, columns=columns)
681 |         llama70_reflection_df["model"] = f"llama70_reflection_{ri}"
682 |         gpt35_reflection = task_eval(gpt35_reflection5_folder, reflection_cnt=ri, self_agent_rw=False)
683 |         gpt35_reflection_df = pd.DataFrame(gpt35_reflection, columns=columns)
684 |         gpt35_reflection_df["model"] = f"gpt35_reflection_{ri}"
685 |         gpt4_reflection = task_eval(gpt4_reflection5_folder, reflection_cnt=ri, self_agent_rw=False)
686 |         gpt4_reflection_df = pd.DataFrame(gpt4_reflection, columns=columns)
687 |         gpt4_reflection_df["model"] = f"gpt4_reflection_{ri}"
688 |         if reflect_agent and ri == 0:
689 |             gpt35_reflection_agent_folder = f"traj\\tj_gpt-35-turbo_react_reflection_obs_5_cross-app_q5_log"
690 |             gpt4_reflection_agent_folder = "traj\\tj_gpt-4_react_reflection_obs_5_cross-app_q5_log"
691 |             lm_success_rate(gpt35_reflection_agent_folder)
692 |             lm_success_rate(gpt4_reflection_agent_folder)
693 |             print("\n" + "*" * 20 + " GPT-3.5-reflection-agent " + "*" * 20)
694 |             gpt35_reflection_agent = task_eval(gpt35_reflection_agent_folder, self_agent_rw=False)
695 |             gpt35_reflection_agent_df = pd.DataFrame(gpt35_reflection_agent, columns=columns)
696 |             rows_to_add = gpt35_reflection_df[~gpt35_reflection_df['task'].isin(gpt35_reflection_agent_df['task'])]
697 |             gpt35_reflection_agent_df = pd.concat([gpt35_reflection_agent_df, rows_to_add], ignore_index=True)
698 |             gpt35_reflection_agent_df["model"] = "gpt35_reflection_agent"
699 |             print("\n" + "*" * 20 + " GPT-4-reflection-agent " + "*" * 20)
700 |             gpt4_reflection_agent = task_eval(gpt4_reflection_agent_folder, self_agent_rw=False)
701 |             gpt4_reflection_agent_df = pd.DataFrame(gpt4_reflection_agent, columns=columns)
702 |             rows_to_add = gpt4_reflection_df[~gpt4_reflection_df['task'].isin(gpt4_reflection_agent_df['task'])]
703 |             gpt4_reflection_agent_df = pd.concat([gpt4_reflection_agent_df, rows_to_add], ignore_index=True)
704 |             gpt4_reflection_agent_df["model"] = "gpt4_reflection_agent"
705 |             eval_res.extend([gpt35_reflection_agent_df, gpt4_reflection_agent_df])
706 |         eval_res.extend(
707 |             [llama13_reflection_df, llama70_reflection_df, gpt35_reflection_df, gpt4_reflection_df])
708 |     eval_res = pd.concat(eval_res)
709 |     eval_res[["nuggets_mining", "operation_logic"]] = eval_res.groupby("task")[
710 |         ["nuggets_mining", "operation_logic"]].transform(lambda x: x / (x.max() + 1e-9))
711 |     eval_res.to_csv(f"metric_results/task_metric_{eval_type}_with_ra_nocross.csv")
712 |     model_avg_metric = eval_res.groupby("model")[columns[1:]].mean()
713 |     model_avg_metric.to_csv(f"metric_results/model_avg_metric_{eval_type}_with_ra_nocross.csv")
714 | 
715 | 
716 | if __name__ == "__main__":
717 |     eval_type = ""
718 |     # eval_type = "obs_5_cross-app"
719 |     # eval_type = "constrain"
720 |     # eval_type = "cross_reflection@5"
721 |     # eval_type = "explore"
722 |     res = {}
723 |     average_on_app = []
724 |     columns = ["task", "task_reward", "normalized_task_reward", "task_completion_ratio",
725 |                "reversed_redundancy_ratio", "lm_success_rate", "invalid_format", "invalid_action", "nuggets_mining",
726 |                "operation_logic", "repeat_actions", "aware_completion"]
727 |     if eval_type == "explore":
728 |         eval_multi_step_exploration()
729 |         exit()
730 |     if eval_type == "constrain":
731 |         eval_constrain()
732 |         exit()
733 |     if eval_type == "cross_reflection@5":
734 |         eval_multi_reflection()
735 |         exit()
736 |     if "cross" in eval_type:
737 |         app_list = [eval_type]
738 |     else:
739 |         app_list = ["calendar", "camera", "clock", "contacts", "firefox", "gmail", "google-drive", "google-maps",
740 |                     "messages", "photos", "settings", "slack", "weather", "youtube"]
741 |     app_dfs = []
742 |     for app in app_list:
743 |         if app in ["slack"]:
744 |             continue
745 | 
746 |         if len(eval_type) > 0:
747 |             app = eval_type
748 | 
749 |         print(f"\nEval for APP {app}")
750 | 
751 |         llama13_reflection_folder = f"traj\\tj_llama13b_react_reflection_{app}"
752 |         llama70_reflection_folder = f"traj\\tj_llama70b_react_reflection_{app}"
753 |         gpt35_reflection_folder = f"traj\\tj_gpt-35-turbo_react_reflection_{app}"
754 |         gpt4_reflection_folder = f"traj\\tj_gpt-4_react_reflection_{app}"
755 | 
756 |         lm_success_rate(llama13_reflection_folder)
757 |         lm_success_rate(llama70_reflection_folder)
758 |         lm_success_rate(gpt35_reflection_folder)
759 |         lm_success_rate(gpt4_reflection_folder)
760 | 
761 |         print("\n" + "*" * 20 + " LLaMA-13B " + "*" * 20)
762 |         llama13 = task_eval(llama13_reflection_folder, self_agent_rw=False)
763 |         llama13_df = pd.DataFrame(llama13, columns=columns)
764 |         llama13_df["app"] = app
765 |         llama13_df["model"] = "llama13"
766 | 
767 |         print("\n" + "*" * 20 + " LLaMA-13B-reflection " + "*" * 20)
768 |         llama13_reflection = task_eval(llama13_reflection_folder, reflection_cnt=1, self_agent_rw=False)
769 |         llama13_reflection_df = pd.DataFrame(llama13_reflection, columns=columns)
770 |         llama13_reflection_df["app"] = app
771 |         llama13_reflection_df["model"] = "llama13_reflection"
772 | 
773 |         print("\n" + "*" * 20 + " LLaMA-70B " + "*" * 20)
774 |         llama70 = task_eval(llama70_reflection_folder, self_agent_rw=False)
775 |         llama70_df = pd.DataFrame(llama70, columns=columns)
776 |         llama70_df["app"] = app
777 |         llama70_df["model"] = "llama70"
778 | 
779 |         print("\n" + "*" * 20 + " LLaMA-70B-reflection " + "*" * 20)
780 |         llama70_reflection = task_eval(llama70_reflection_folder, reflection_cnt=1, self_agent_rw=False)
781 |         llama70_reflection_df = pd.DataFrame(llama70_reflection, columns=columns)
782 |         llama70_reflection_df["app"] = app
783 |         llama70_reflection_df["model"] = "llama70_reflection"
784 | 
785 |         print("\n" + "*" * 20 + " GPT-3.5 " + "*" * 20)
786 |         gpt35 = task_eval(gpt35_reflection_folder, self_agent_rw=False)
787 |         gpt35_df = pd.DataFrame(gpt35, columns=columns)
788 |         gpt35_df["app"] = app
789 |         gpt35_df["model"] = "gpt35"
790 | 
791 |         print("\n" + "*" * 20 + " GPT-3.5-reflection " + "*" * 20)
792 |         gpt35_reflection = task_eval(gpt35_reflection_folder, reflection_cnt=1, self_agent_rw=False)
793 |         gpt35_reflection_df = pd.DataFrame(gpt35_reflection, columns=columns)
794 |         gpt35_reflection_df["app"] = app
795 |         gpt35_reflection_df["model"] = "gpt35_reflection"
796 | 
797 |         print("\n" + "*" * 20 + " GPT-4 " + "*" * 20)
798 |         gpt4 = task_eval(gpt4_reflection_folder, self_agent_rw=False)
799 |         gpt4_df = pd.DataFrame(gpt4, columns=columns)
800 |         gpt4_df["app"] = app
801 |         gpt4_df["model"] = "gpt4"
802 | 
803 |         print("\n" + "*" * 20 + " GPT-4-reflection " + "*" * 20)
804 |         gpt4_reflection = task_eval(gpt4_reflection_folder, reflection_cnt=1, self_agent_rw=False)
805 |         gpt4_reflection_df = pd.DataFrame(gpt4_reflection, columns=columns)
806 |         gpt4_reflection_df["app"] = app
807 |         gpt4_reflection_df["model"] = "gpt4_reflection"
808 | 
809 |         concat_df = pd.concat(
810 |             [llama13_df, llama13_reflection_df, llama70_df, llama70_reflection_df, gpt35_df, gpt35_reflection_df,
811 |              gpt4_df, gpt4_reflection_df])
812 |         app_dfs.append(concat_df)
813 |     app_dfs = pd.concat(app_dfs)
814 |     app_dfs.to_csv(f"metric_results/task_{eval_type}.csv")
815 | 
816 |     app_dfs[["nuggets_mining", "operation_logic"]] = app_dfs.groupby("task")[
817 |         ["nuggets_mining", "operation_logic"]].transform(lambda x: x / (x.max() + 1e-9))
818 |     app_dfs.to_csv(f"metric_results/normalized_{eval_type}.csv")
819 | 
820 |     app_avg_metric = app_dfs.groupby("app")[columns[1:]].mean()
821 |     app_avg_metric.to_csv(f"metric_results/app_avg_metric_{eval_type}.csv")
822 | 
823 |     model_avg_metric = app_dfs.groupby("model")[columns[1:]].mean()
824 | 
825 |     model_avg_metric["understanding"] = (3 - model_avg_metric["invalid_format"] - model_avg_metric["invalid_action"] -
826 |                                          model_avg_metric["nuggets_mining"]) / 3.
827 |     model_avg_metric["reasoning"] = model_avg_metric["operation_logic"] + model_avg_metric["aware_completion"]
828 |     model_avg_metric["exploration"] = 1.0 - model_avg_metric["repeat_actions"]
829 |     model_avg_metric["reflection"] = 0.
830 |     model_avg_metric.loc["llama13_reflection", "reflection"] = model_avg_metric.loc[
831 |                                                                    "llama13_reflection", "normalized_task_reward"] - \
832 |                                                                model_avg_metric.loc[
833 |                                                                    "llama13", "normalized_task_reward"] + \
834 |                                                                model_avg_metric.loc[
835 |                                                                    "llama13_reflection", "task_completion_ratio"] - \
836 |                                                                model_avg_metric.loc["llama13", "task_completion_ratio"]
837 |     model_avg_metric.loc["llama70_reflection", "reflection"] = model_avg_metric.loc[
838 |                                                                    "llama70_reflection", "normalized_task_reward"] - \
839 |                                                                model_avg_metric.loc[
840 |                                                                    "llama70", "normalized_task_reward"] + \
841 |                                                                model_avg_metric.loc[
842 |                                                                    "llama70_reflection", "task_completion_ratio"] - \
843 |                                                                model_avg_metric.loc["llama70", "task_completion_ratio"]
844 |     model_avg_metric.loc["gpt35_reflection", "reflection"] = model_avg_metric.loc[
845 |                                                                  "gpt35_reflection", "normalized_task_reward"] - \
846 |                                                              model_avg_metric.loc["gpt35", "normalized_task_reward"] + \
847 |                                                              model_avg_metric.loc[
848 |                                                                  "gpt35_reflection", "task_completion_ratio"] - \
849 |                                                              model_avg_metric.loc["gpt35", "task_completion_ratio"]
850 |     model_avg_metric.loc["gpt4_reflection", "reflection"] = model_avg_metric.loc[
851 |                                                                 "gpt4_reflection", "normalized_task_reward"] - \
852 |                                                             model_avg_metric.loc["gpt4", "normalized_task_reward"] + \
853 |                                                             model_avg_metric.loc[
854 |                                                                 "gpt4_reflection", "task_completion_ratio"] - \
855 |                                                             model_avg_metric.loc["gpt4", "task_completion_ratio"]
856 |     model_avg_metric.to_csv(f"metric_results/model_avg_metric_{eval_type}.csv")
857 | 


--------------------------------------------------------------------------------