├── .github
│   ├── assets
│   │   ├── dark_logo.png
│   │   ├── mobile_adapt_example.mp4
│   │   └── set_of_mark.png
│   └── workflows
│       └── package.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── cognisim
│   ├── __init__.py
│   ├── device
│   │   ├── android
│   │   │   ├── __init__.py
│   │   │   ├── android_device.py
│   │   │   ├── android_ui.py
│   │   │   └── android_view_hierarchy.py
│   │   ├── device.py
│   │   ├── device_factory.py
│   │   ├── ios
│   │   │   ├── __init__.py
│   │   │   ├── ios_device.py
│   │   │   └── ios_view_hierarchy.py
│   │   └── ios_device.py
│   └── utils
│       └── constants.py
├── cookbook
│   ├── agentic_example.py
│   ├── examplescript2.py
│   ├── smoke_example_android.py.py
│   └── smoke_example_ios.py
├── deploy
│   └── run.sh
├── poetry.lock
├── pyproject.toml
├── requirements.txt
├── scripts
│   ├── format.sh
│   └── setup.sh
└── setup.py
/.github/assets/dark_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/dark_logo.png
--------------------------------------------------------------------------------
/.github/assets/mobile_adapt_example.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/mobile_adapt_example.mp4
--------------------------------------------------------------------------------
/.github/assets/set_of_mark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/set_of_mark.png
--------------------------------------------------------------------------------
/.github/workflows/package.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 | on:
3 |   push:
4 |     branches:
5 |       - main
6 |     paths:
7 |       - 'cognisim/**'
8 | jobs:
9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v3
13 | 
14 |       - uses: actions/setup-python@v3
15 |         with:
16 |           python-version: '3.12'
17 | 
18 |       - run: |
19 |           pip install poetry
20 |           poetry build
21 | 
22 |       - uses: actions/upload-artifact@v3
23 |         with:
24 |           path: ./dist
25 | 
26 |   pypi-publish:
27 |     needs: ['build']
28 |     environment: 'publish'
29 |     name: upload release to PyPI
30 |     runs-on: ubuntu-latest
31 |     permissions:
32 |       # IMPORTANT: this permission is mandatory for trusted publishing
33 |       id-token: write
34 |     steps:
35 |       - uses: actions/download-artifact@v3
36 | 
37 |       - name: Publish package distributions to PyPI
38 |         uses: pypa/gh-action-pypi-publish@release/v1
39 |         with:
40 |           packages-dir: artifact/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Xcode
2 | *.pbxuser
3 | !default.pbxuser
4 | *.mode1v3
5 | !default.mode1v3
6 | *.mode2v3
7 | !default.mode2v3
8 | *.perspectivev3
9 | !default.perspectivev3
10 | xcuserdata/
11 | *.xccheckout
12 | *.moved-aside
13 | DerivedData
14 | *.hmap
15 | *.ipa
16 | *.xcuserstate
17 | project.xcworkspace
18 | *.xml
19 | # CocoaPods
20 | Pods/
21 |
22 | # Carthage
23 | Carthage/Build
24 |
25 | # fastlane
26 | fastlane/report.xml
27 | fastlane/Preview.html
28 | fastlane/screenshots
29 | fastlane/test_output
30 |
31 | # Code Injection
32 | iOSInjectionProject/
33 |
34 | # Android/IntelliJ
35 | build/
36 | .idea
37 | .gradle
38 | local.properties
39 | *.iml
40 |
41 | # Node
42 | node_modules/
43 | npm-debug.log
44 | yarn-error.log
45 |
46 | # BUCK
47 | buck-out/
48 | \.buckd/
49 | *.keystore
50 |
51 | # Bundle artifact
52 | *.jsbundle
53 |
54 | # Ruby / CocoaPods
55 | /ios/Pods/
56 |
57 | # Temporary files
58 | *.swp
59 | *.swo
60 | *~
61 |
62 | # OS generated files
63 | .DS_Store
64 | .DS_Store?
65 | ._*
66 | .Spotlight-V100
67 | .Trashes
68 | ehthumbs.db
69 | Thumbs.db
70 |
71 | # Python
72 | __pycache__/
73 | *.py[cod]
74 | *$py.class
75 |
76 | # Virtual Environment
77 | venv/
78 | env/
79 | .venv/
80 | .env/
81 |
82 | # Distribution / packaging
83 | .Python
84 | develop-eggs/
85 | dist/
86 | downloads/
87 | eggs/
88 | .eggs/
89 | lib/
90 | lib64/
91 | parts/
92 | sdist/
93 | var/
94 | wheels/
95 | *.egg-info/
96 | .installed.cfg
97 | *.egg
98 |
99 | # Jupyter Notebook
100 | .ipynb_checkpoints
101 |
102 | # pyenv
103 | .python-version
104 |
105 | # celery beat schedule file
106 | celerybeat-schedule
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 |
133 | # Pyre type checker
134 | .pyre/
135 |
136 |
137 | constants.cpython-37.pyc
138 |
139 | # Ignore XML files in mobileadapt/device directory
140 | mobileadapt/device/*.xml
141 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to mobileadapt
2 |
3 | We're excited that you're interested in contributing to mobileadapt! This document outlines the process for contributing to this project.
4 |
5 | ## Getting Started
6 |
7 | 1. Fork the repository on GitHub.
8 | 2. Clone your fork locally:
9 | ```
10 | git clone https://github.com/your-username/mobileadapt.git
11 | ```
12 | 3. Create a new branch for your feature or bug fix:
13 | ```
14 | git checkout -b feature/your-feature-name
15 | ```
16 |
17 | ## Setting Up the Development Environment
18 |
19 | 1. Ensure you have Python 3.7+ installed.
20 | 2. Install Poetry (if not already installed):
21 | ```
22 | curl -sSL https://install.python-poetry.org | python3 -
23 | ```
24 | 3. Install the project dependencies:
25 | ```
26 | poetry install
27 | ```
28 | 4. Activate the virtual environment:
29 | ```
30 | poetry shell
31 | ```
32 | 5. Set up Appium and the necessary mobile SDKs as described in the project's README.
33 |
34 | ## Making Changes
35 |
36 | 1. Make your changes in your feature branch.
37 | 2. Add or update tests as necessary.
38 | 3. Ensure your code follows the project's coding style (we use PEP 8 for Python).
39 | 4. Run the test suite to make sure all tests pass:
40 | ```
41 | poetry run python -m unittest discover tests
42 | ```
43 |
44 | ## Updating Documentation
45 |
46 | 1. Any changes that affect the project's functionality, API, or usage should be reflected in the documentation.
47 | 2. The documentation for this project is maintained in a separate repository: [adaptdocs](https://github.com/RevylAI/adaptdocs).
48 | 3. Clone the documentation repository:
49 | ```
50 | git clone https://github.com/RevylAI/adaptdocs.git
51 | ```
52 | 4. Make the necessary updates to the relevant documentation files.
53 | 5. Submit a separate pull request to the adaptdocs repository with your documentation changes.
54 |
55 | ## Submitting Changes
56 |
57 | 1. Commit your changes:
58 | ```
59 | git commit -am "Add a brief description of your changes"
60 | ```
61 | 2. Push to your fork:
62 | ```
63 | git push origin feature/your-feature-name
64 | ```
65 | 3. Submit a pull request through the GitHub website.
66 | 4. If you've made documentation changes, submit a separate pull request to the adaptdocs repository.
67 |
68 | ## Pull Request Guidelines
69 |
70 | - Provide a clear title and description of your changes.
71 | - Include any relevant issue numbers in the PR description.
72 | - Ensure all tests pass and there are no linting errors.
73 | - Add or update documentation as necessary.
74 | - If your changes require documentation updates, mention the related PR in the adaptdocs repository.
75 |
76 | ## Reporting Bugs
77 |
78 | - Use the GitHub issue tracker to report bugs.
79 | - Describe the bug in detail, including steps to reproduce.
80 | - Include information about your environment (OS, Python version, etc.).
81 |
82 | ## Requesting Features
83 |
84 | - Use the GitHub issue tracker to suggest new features.
85 | - Clearly describe the feature and its potential benefits.
86 | - Be open to discussion about the feature's implementation.
87 |
88 | ## Code Review Process
89 |
90 | The core team will review your pull request. We may suggest changes, improvements, or alternatives.
91 |
92 | ## Coding Conventions
93 |
94 | - Follow PEP 8 style guide for Python code.
95 | - Use meaningful variable and function names.
96 | - Comment your code where necessary, especially for complex logic.
97 | - Write docstrings for all functions, classes, and modules.
98 |
99 | ## License
100 |
101 | By contributing to mobileadapt, you agree that your contributions will be licensed under the project's MIT license.
102 |
103 | Thank you for contributing to mobileadapt!
104 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 RevylAI
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cognisim
2 |
3 | [Company Website](https://revyl.ai) | [Twitter](https://x.com/tryrevyl) |
4 |
5 | ![CogniSim logo](.github/assets/dark_logo.png)
6 |
7 |
8 |
9 |
10 |
11 | ### Interaction utilities for cross-platform interaction agents
12 |
13 | **LLM Control Library for iOS and Android**
14 |
15 | Have you ever wanted to test your mobile app or control iOS and Android devices with an LLM? You've probably run into context problems: the accessibility view is too long to fit in the prompt, and sending a screenshot alone gives the LLM limited accuracy.
16 |
17 |
18 | [Example video](.github/assets/mobile_adapt_example.mp4): using CogniSim to control an Android device in the Arc'teryx app.
20 |
21 |
22 | **Our Solution**
23 |
24 | We combine the accessibility tree with set-of-mark prompting to provide a readable state for the LLM.
25 |
26 | **Real-World Application**
27 |
28 | At Revyl, we use this approach to test mobile apps with LLMs. Our platform integrates resilient end-to-end tests using agentic LLMs with open telemetry tracing, offering proactive observability into your mobile app.
29 |
30 | If you are interested in putting your testing on autopilot and catching bugs before your users do,
31 |
32 |
33 |
34 | [book a demo with us](https://cal.com/landseer-enga/book-a-demo)
35 |
36 |
37 | #### [Revyl AI](https://revyl.ai)
38 |
39 | ### Prerequisites
40 |
41 | - Android Virtual Device (for Android adaptation)
42 | - iOS Simulator and Xcode (for iOS adaptation)
43 | - macOS or Linux (recommended)
44 |
45 |
46 | ## Quick Start
47 |
48 |
49 | Create an iOS simulator or Android emulator, and make sure you have Appium installed.
50 |
51 |
52 | For macOS, install Appium using Homebrew:
53 | ```bash
54 | brew install appium
55 | ```
56 |
57 | For all other operating systems, install Appium using npm:
58 | ```bash
59 | npm i -g appium
60 | ```
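Then boot a device and start the Appium server. A minimal sketch for Android (the AVD name `Pixel_7` is a placeholder for one of your own):

```bash
# List available Android Virtual Devices, then boot one in the background
emulator -list-avds
emulator -avd Pixel_7 &

# Start the Appium server on its default port (4723), which cognisim connects to
appium
```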
61 |
62 |
63 | To install the cognisim package:
64 |
65 |
66 | ```bash
67 | poetry add cognisim
68 | ```
69 | or if you have pip installed:
70 |
71 | ```bash
72 | pip install cognisim
73 | ```
74 |
75 | For detailed instructions on getting started with Mobileadapt, please refer to our [Quickstart Guide](https://mobileadapt.revyl.ai/quickstart).
76 |
77 |
78 |
79 | # Usage
80 | ### Android Basic Example
81 |
82 | ```python
83 | import asyncio
84 | from cognisim import mobileadapt
85 |
86 | async def main():
87 | # Initialize and start Android device
88 | android_device = mobileadapt(platform="android")
89 | await android_device.start_device()
90 |
91 | # Get initial state and perform tap
92 | _, _, _ = await android_device.get_state()
93 | await android_device.tap(100, 100)
94 |
95 | # Get state after tap
96 | new_encoded_ui, _, _ = await android_device.get_state()
97 | print("State after tap:", new_encoded_ui)
98 |
99 | if __name__ == "__main__":
100 | asyncio.run(main())
101 | ```
102 |
103 | ### iOS Basic Example
104 |
105 | ```python
106 | import asyncio
107 | from cognisim import mobileadapt
108 |
109 | async def main():
110 | # Initialize and start iOS device
111 | ios_device = mobileadapt(platform="ios")
112 | await ios_device.start_device()
113 |
114 | # Get device state
115 | encoded_ui, _, _ = await ios_device.get_state()
116 | print("Current state:", encoded_ui)
117 |
118 | if __name__ == "__main__":
119 | asyncio.run(main())
120 | ```
121 |
122 | ### Go to [Documentation](https://mobileadapt.revyl.ai) or the cookbook folder for more examples and usage.
123 |
124 |
125 |
126 |
127 | ## Documentation
128 |
129 | For full documentation, visit [mobileadapt.revyl.ai](https://mobileadapt.revyl.ai).
130 |
131 |
132 | ## Key Features
133 |
134 | - **Android Support**: Works seamlessly with Android devices and emulators.
135 | - **iOS Support**: Works seamlessly with iOS devices and simulators.
136 | - **Appium Integration**: Leverages the power of Appium for reliable mobile automation.
137 | - **LLM Agent Compatibility**: Designed to work seamlessly with language model agents.
140 |
141 |
142 |
143 |
144 | ### Local Development
145 |
146 | 1. Clone the repository:
147 | ```bash
148 | git clone https://github.com/RevylAI/Mobileadapt/ && cd Mobileadapt
149 | ```
150 |
151 | 2. Run the setup script:
152 | ```bash
153 | ./scripts/setup.sh
154 | ```
155 |
156 | ## Roadmap
157 | - [x] iOS Support
158 | - [ ] Abstract to drivers other than Appium
159 | - [ ] Recording interactions
160 | - [ ] Screen sharing via websocket to host recording
161 |
162 |
163 |
164 |
165 | ## Contributing
166 |
167 | We welcome contributions to the Mobileadapt project! If you'd like to contribute, please check our [Contribution Guidelines](https://github.com/RevylAI/Mobileadapt/blob/main/CONTRIBUTING.md).
168 |
169 | ## License
170 |
171 | Mobileadapt is released under the MIT License. See the [LICENSE](https://github.com/RevylAI/Mobileadapt/blob/main/LICENSE) file for more details.
172 |
173 |
174 |
175 | # Credits
176 |
177 | ```bibtex
178 | @inproceedings{shvoEtAl2021appbuddy,
179 |   title={AppBuddy: Learning to Accomplish Tasks in Mobile Apps via Reinforcement Learning},
180 |   author={Maayan Shvo and Zhiming Hu and Rodrigo Toro Icarte and Iqbal Mohomed and Allan D. Jepson and Sheila A. McIlraith},
181 |   booktitle={Canadian Conference on Artificial Intelligence},
182 |   year={2021}
183 | }
184 | 
185 | @misc{google-research,
186 |   title={Google Research},
187 |   author={Google},
188 |   year={2021},
189 |   howpublished={\url{https://github.com/Berrylcm/google-research}}
190 | }
191 | ```
195 |
196 |
197 |
198 |
199 | ## How does it work?
200 |
201 | We use Appium under the hood to control the device and collect the UI. We then use a custom UI parser to convert the UI to a string that can be used by the LLM.
202 |
203 |
204 | The UI is parsed with a UI parser, a set-of-mark image is generated from the screenshot, and both are sent to the LLM. For example, the parsed UI might look like this:
207 |
208 | ```html
209 | <html>
210 |   <p id=0>revyl.ai</p>
211 |   <img id=1 alt="None" />
212 |   <button id=2>Revyl is in private beta →</button>
213 |   <img id=3 alt="None" />
214 |   <p id=4>Revyl</p>
215 |   <img id=5 alt="None" />
216 |   <p id=6>AI Native Proactive Observability</p>
217 |   <p id=7>Catch bugs before they happen using agentic E2E testing and OpenTelemetry's Tracing. Book a demo now!</p>
218 |   <button id=8>Book a demo</button>
219 |   <p id=9>TRUSTED AND BUILT BY ENGINEERS AT</p>
220 |   <img id=10 alt="VendorPM" />
221 | </html>
222 | ```
242 |
243 | This structured representation of the UI elements is then used by the LLM to understand and interact with the mobile interface.
244 |
245 | Each of the ids is mapped to an element in the UI, so an agent can act on whichever element the LLM selects, as sketched below.
246 |
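A minimal sketch of acting on an LLM-selected element, assuming a started `device` (as in the examples above) and the `ui` object returned by `get_state()`; the element id here is hypothetical:

```python
# The LLM answers with an element id from the encoded UI, e.g. id 8
# ("Book a demo" in the sample state above). Look up its bounding box
# and tap its center.
element = ui.elements[8]
box = element.bounding_box
await device.tap((box.x1 + box.x2) // 2, (box.y1 + box.y2) // 2)
```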
247 | We also generate a set-of-mark image for the given state, as shown below.
248 |
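A sketch of producing that image with the library, again reusing `device`, `ui`, and the raw `screenshot` bytes from `get_state()` (`generate_set_of_mark` is a regular, non-async method that returns PNG bytes):

```python
# Overlay element ids on the screenshot (set-of-mark image)
marked_png = device.generate_set_of_mark(ui, screenshot)
with open("set_of_mark.png", "wb") as f:
    f.write(marked_png)
```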
249 |
250 |
251 | Here's an example of a set-of-mark image generated for the UI state:
252 | 
253 | ![Set-of-mark example](.github/assets/set_of_mark.png)
254 | 
255 | This image shows the UI elements with their corresponding IDs overlaid on the screenshot. This visual representation helps the LLM understand the layout and structure of the interface, making it easier to interact with specific elements.
256 |
257 | ## Citations
258 |
259 | ```bibtex
261 | @misc{revylai2024mobileadapt,
262 | title = {Cognisim},
263 |   author = {Anam Hira and Landseer Enga and Aarib Sarker and Wasif Sarker and Hanzel Hira and Sushan Leel},
264 | year = {2024},
265 | howpublished = {GitHub},
266 | url = {https://github.com/RevylAI/Mobileadapt}
267 | }
268 | ```
--------------------------------------------------------------------------------
/cognisim/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .device.device_factory import DeviceFactory
3 |
4 |
5 | def mobileadapt(
6 |     platform: str,
7 |     app_url: str = None,
8 |     state_representation="aria",
9 |     download_directory="default",
10 |     session_id=None,
11 | ):
12 |     """Create a device instance for the given platform ("android" or "ios")."""
13 |     return DeviceFactory.create_device(
14 |         platform, app_url, state_representation, download_directory, session_id
15 |     )
15 |
16 |
17 | __all__ = ["mobileadapt"]
18 |
--------------------------------------------------------------------------------
/cognisim/device/android/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/cognisim/device/android/__init__.py
--------------------------------------------------------------------------------
/cognisim/device/android/android_device.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from datetime import datetime
3 | from cognisim.device.device import Device
4 | from appium import webdriver
5 | from appium.options.android import UiAutomator2Options
6 | from cognisim.device.android.android_view_hierarchy import ViewHierarchy
7 | import cv2
8 | from loguru import logger
9 | import numpy as np
10 | import os
11 | # Android Emulator Config
12 | SCREEN_WIDTH = 1080
13 | SCREEN_HEIGHT = 1920
14 | SCREEN_CHANNEL = 4
15 | SCREEN_TOP_HEAD = 63
16 | SCREEN_BOTTOM_HEAD = 126
17 | # screen config
18 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3
19 | NORM_VERTICAL_NEIGHBOR_MARGIN = 0.01
20 | NORM_HORIZONTAL_NEIGHBOR_MARGIN = 0.01
21 | INPUT_ACTION_UPSAMPLE_RATIO = 1
22 | # XML screen config
23 | XML_SCREEN_WIDTH = 1440
24 | XML_SCREEN_HEIGHT = 2960
25 | # Get state implementation
26 |
27 |
28 | def sortchildrenby_viewhierarchy(view, attr="bounds"):
29 | if attr == 'bounds':
30 | bounds = [(ele.uiobject.bounding_box.x1, ele.uiobject.bounding_box.y1,
31 | ele.uiobject.bounding_box.x2, ele.uiobject.bounding_box.y2)
32 | for ele in view]
33 | sorted_bound_index = [
34 | bounds.index(i) for i in sorted(
35 | bounds, key=lambda x: (
36 | x[1], x[0]))]
37 |
38 | sort_children = [view[i] for i in sorted_bound_index]
39 | view[:] = sort_children
40 |
41 |
42 | CLASS_MAPPING = {
43 | 'TEXTVIEW': 'p',
44 | 'BUTTON': 'button',
45 | 'IMAGEBUTTON': 'button',
46 | 'IMAGEVIEW': 'img',
47 | 'EDITTEXT': 'input',
48 | 'CHECKBOX': 'input',
49 | 'CHECKEDTEXTVIEW': 'input',
50 | 'TOGGLEBUTTON': 'button',
51 | 'RADIOBUTTON': 'input',
52 | 'SPINNER': 'select',
53 | 'SWITCH': 'input',
54 | 'SLIDINGDRAWER': 'input',
55 | 'TABWIDGET': 'div',
56 | 'VIDEOVIEW': 'video',
57 | 'SEARCHVIEW': 'div'
58 | }
59 |
60 |
61 | class UI():
62 | def __init__(self, xml_file):
63 | self.xml_file = xml_file
64 | self.elements = {}
65 |
66 | def encoding(self):
67 | logger.info('reading hierarchy tree from {} ...'.format(
68 | self.xml_file.split('/')[-1]))
69 | with open(self.xml_file, 'r', encoding='utf-8') as f:
70 | vh_data = f.read().encode()
71 |
72 | vh = ViewHierarchy(
73 | screen_width=XML_SCREEN_WIDTH,
74 | screen_height=XML_SCREEN_HEIGHT)
75 | vh.load_xml(vh_data)
76 | view_hierarchy_leaf_nodes = vh.get_leaf_nodes()
77 | sortchildrenby_viewhierarchy(view_hierarchy_leaf_nodes, 'bounds')
78 |
79 | logger.debug('encoding the ui elements in hierarchy tree...')
80 | codes = ''
81 | # logger.info(view_hierarchy_leaf_nodes)
82 | for _id, ele in enumerate(view_hierarchy_leaf_nodes):
83 | obj_type = ele.uiobject.obj_type.name
84 | text = ele.uiobject.text
85 | text = text.replace('\n', ' ')
86 | resource_id = ele.uiobject.resource_id if ele.uiobject.resource_id is not None else ''
87 | content_desc = ele.uiobject.content_desc
88 | html_code = self.element_encoding(
89 | _id, obj_type, text, content_desc, resource_id)
90 | codes += html_code
91 | self.elements[_id] = ele.uiobject
92 |         codes = '<html>\n' + codes + '</html>'
93 | return codes
94 |
95 | def element_encoding(
96 | self,
97 | _id,
98 | _obj_type,
99 | _text,
100 | _content_desc,
101 | _resource_id):
102 |
103 | _class = _resource_id.split('id/')[-1].strip()
104 | _text = _text.strip()
105 |         assert _obj_type in CLASS_MAPPING, f'unknown obj_type: {_obj_type}'
106 |         tag = CLASS_MAPPING[_obj_type]
107 | 
108 |         if _obj_type in ['CHECKBOX', 'CHECKEDTEXTVIEW', 'SWITCH']:
109 |             code = f'    <input id={_id} type="checkbox" class="{_class}">\n'
110 |             code += f'    <label for={_id}>{_text}</label>\n'
111 |         elif _obj_type == 'RADIOBUTTON':
112 |             code = f'    <input id={_id} type="radio" class="{_class}">\n'
113 |             code += f'    <label for={_id}>{_text}</label>\n'
114 |         elif _obj_type == 'SPINNER':
115 |             code = f'    <label for={_id}>{_text}</label>\n'
116 |             code += f'    <select id={_id} class="{_class}"></select>\n'
117 |         elif _obj_type == 'IMAGEVIEW':
118 |             if _class == "":
119 |                 code = f'    <img id={_id} alt="{_content_desc}" />\n'
120 |             else:
121 |                 code = f'    <img id={_id} class="{_class}" alt="{_content_desc}" />\n'
122 |         else:
123 |             _text = _content_desc if _text == "" else _text
124 |             if _class == "":
125 |                 code = f'    <{tag} id={_id}>{_text}</{tag}>\n'
126 |             else:
127 |                 code = f'    <{tag} id={_id} class="{_class}">{_text}</{tag}>\n'
128 |         return code
130 |
131 |
132 | class AndroidDevice(Device):
133 | def __init__(self, app_package, download_directory='default', session_id=None):
134 | super().__init__(app_package)
135 | self.download_directory = download_directory
136 | self.session_id = session_id
137 | self.desired_caps = {
138 | 'deviceName': 'Android Device',
139 | 'automationName': 'UiAutomator2',
140 |             'autoGrantPermissions': True,
141 | 'newCommandTimeout': 600,
142 | 'mjpegScreenshotUrl': 'http://localhost:4723/stream.mjpeg',
143 |
144 | }
145 | self.options = UiAutomator2Options().load_capabilities(self.desired_caps)
146 |
147 | async def get_state(self):
148 | raw_appium_state = self.driver.page_source
149 |
150 | file_path = os.path.join(os.path.dirname(__file__), 'android_view_hierarchy.xml')
151 |         with open(file_path, 'w') as xml_file:
152 |             xml_file.write(raw_appium_state)
154 |
155 | ui = UI(file_path)
156 | encoded_ui: str = ui.encoding()
157 | logger.info(f"Encoded UI: {encoded_ui}")
158 | # Take screenshot and encode as base64
159 | screenshot: bytes = self.driver.get_screenshot_as_png()
160 |
161 | # Return encoded UI and screenshot
162 | return encoded_ui, screenshot, ui
163 |
164 | async def navigate(self, package_name):
165 | """
166 | Opens the specified package using Appium with UiAutomator2.
167 |
168 | :param package_name: The package name of the app to open
169 | """
170 | try:
171 | self.driver.activate_app(package_name)
172 | logger.info(f"Successfully opened package: {package_name}")
173 | except Exception as e:
174 | logger.error(f"Failed to open package {package_name}. Error: {str(e)}")
175 | raise
176 |
177 | async def tap(self, x, y):
178 | self.driver.tap([(x, y)], 1)
179 |
180 | async def input(self, x, y, text):
181 | await self.tap(x, y)
182 | self.driver.execute_script('mobile: type', {'text': text})
183 |
184 | async def drag(self, startX, startY, endX, endY):
185 | self.driver.swipe(startX, startY, endX, endY, duration=1000)
186 |
187 | async def scroll(self, direction):
188 | direction_map = {
189 | 'up': 'UP',
190 | 'down': 'DOWN',
191 | 'left': 'LEFT',
192 | 'right': 'RIGHT'
193 | }
194 | self.driver.execute_script('mobile: scroll', {'direction': direction_map[direction]})
195 |
196 |     async def swipe(self, direction):
197 |         window_size = self.driver.get_window_size()
198 |         left = window_size["width"] * 0.2
199 |         top = window_size["height"] * 0.2
200 |         width = window_size["width"] * 0.6
201 |         height = window_size["height"] * 0.6
201 | self.driver.execute_script("mobile: swipeGesture", {
202 | "left": left,
203 | "top": top,
204 | "width": width,
205 | "height": height,
206 | "direction": direction,
207 | "percent": 1.0
208 | })
209 |
210 | async def start_recording(self):
211 | """
212 | Starts screen recording on the Android device.
213 |
214 | Returns:
215 | None
216 | """
217 | try:
218 | self.driver.start_recording_screen()
219 | logger.info("Screen recording started successfully")
220 | except Exception as e:
221 | logger.error(f"Failed to start screen recording. Error: {str(e)}")
222 | raise
223 |
224 | async def stop_recording(self, save_path=None):
225 | """
226 | Stops screen recording on the Android device and saves the video.
227 |
228 | Args:
229 | save_path (str, optional): Path to save the video file. If not provided, a default path will be used.
230 |
231 | Returns:
232 | str: Path to the saved video file
233 | """
234 | video_base64 = self.driver.stop_recording_screen()
235 |
236 | if save_path is None:
237 | # Create a unique filename using timestamp
238 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
239 | filename = f"screen_recording_{timestamp}.mp4"
240 |
241 | # Define the default save path
242 | save_dir = os.path.join(os.getcwd(), "recordings")
243 | os.makedirs(save_dir, exist_ok=True)
244 | save_path = os.path.join(save_dir, filename)
245 |
246 | # Decode and save the video
247 | with open(save_path, "wb") as video_file:
248 | video_file.write(base64.b64decode(video_base64))
249 |
250 | logger.info(f"Screen recording saved to: {save_path}")
251 | return save_path
252 |
253 | async def stop_device(self):
254 | '''
255 | Stops a test
256 | '''
257 | pass
258 |
259 | async def capture_screenshot_with_bounding_box(self, bounds: dict, image_state: bytes = None) -> bytes:
260 | """
261 | Capture a screenshot with a bounding box drawn around a specified element.
262 |
263 | Args:
264 |             bounds: Bounding box coordinates as a sequence [x1, y1, x2, y2],
265 |                 all integers.
266 | image_state (bytes, optional): The current screenshot if available.
267 |
268 | Returns:
269 | bytes: The screenshot image with bounding box as bytes.
270 | """
271 | logger.info("Creating tagged image")
272 |         screenshot = image_state if image_state is not None else self.driver.get_screenshot_as_png()
273 | if screenshot is None:
274 | logger.info("Screenshot failed")
275 | return None
276 |
277 | # Convert the screenshot to a NumPy array
278 | image_np = np.frombuffer(screenshot, dtype=np.uint8)
279 | image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
280 |
281 | # Extract bounding box coordinates
282 | x1 = int(bounds[0])
283 | y1 = int(bounds[1])
284 | x2 = int(bounds[2])
285 | y2 = int(bounds[3])
286 |
287 | # Calculate width and height
288 | # width = x2 - x1
289 | # height = y2 - y1
290 |
291 |         bright_color = (128, 0, 128)  # Purple (BGR)
292 | # Draw the bounding box on the image
293 | cv2.rectangle(image, (x1, y1), (x2, y2), bright_color, 5)
294 |
295 | # Convert the image back to bytes
296 | _, encoded_image = cv2.imencode('.png', image)
297 | screenshot_with_bounding_box = encoded_image.tobytes()
298 |
299 | return screenshot_with_bounding_box
300 |
301 | def generate_set_of_mark(self,
302 | ui,
303 | image: bytes,
304 | position='top-left') -> bytes:
305 |         '''
306 |         Generate a set-of-mark image for a given screenshot and UI state.
307 |         ui: UI object
308 |         image: bytes of the image
309 |         position: position of the annotation, defaults to 'top-left', can also be 'center'
310 |         '''
312 | # Convert image bytes to numpy array
313 | nparr = np.frombuffer(image, np.uint8)
314 | img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
315 | height, width, _ = img.shape
316 |
317 | # Define the minimum area
318 | k = 3000
319 |
320 | for element_id in ui.elements:
321 | bounds = [
322 | ui.elements[element_id].bounding_box.x1,
323 | ui.elements[element_id].bounding_box.y1,
324 | ui.elements[element_id].bounding_box.x2,
325 | ui.elements[element_id].bounding_box.y2
326 | ]
327 |
328 | # Calculate the area of the bounding box
329 | area = (bounds[2] - bounds[0]) * (bounds[3] - bounds[1])
330 |
331 | # Only label elements with an area over k
332 | if area > k:
333 | # Draw a rectangle around the element
334 | cv2.rectangle(
335 | img, (int(bounds[0]), int(bounds[1])),
336 | (int(bounds[2]), int(bounds[3])), (0, 0, 255), 5)
337 |
338 | text = str(element_id)
339 | text_size = 2 # Fixed text size
340 | font = cv2.FONT_HERSHEY_SIMPLEX
341 |
342 | # Calculate the width and height of the text
343 | text_width, text_height = cv2.getTextSize(
344 | text, font, text_size, 2)[0]
345 |
346 | # Calculate the position of the text
347 | if position == 'top-left':
348 | text_x = int(bounds[0])
349 | text_y = int(bounds[1]) + text_height
350 | else: # Default to center
351 | text_x = (int(bounds[0]) + int(bounds[2])) // 2 - text_width // 2
352 | text_y = (int(bounds[1]) + int(bounds[3])) // 2 + text_height // 2
353 |
354 | # Draw a black rectangle behind the text
355 | cv2.rectangle(img, (text_x, text_y - text_height),
356 | (text_x + text_width, text_y), (0, 0, 0), thickness=cv2.FILLED)
357 |
358 | # Draw the text in white
359 | cv2.putText(img, text, (text_x, text_y), font,
360 | text_size, (255, 255, 255), 4)
361 |
362 | # Convert the image to bytes
363 | _, img_encoded = cv2.imencode('.png', img)
364 | img_bytes = img_encoded.tobytes()
365 |
366 | return img_bytes
367 |
368 | async def start_device(self):
369 | '''
370 | Start the Android device and connect to the appium server
371 | '''
372 |         try:
373 |             self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
374 |         except Exception:
375 |             # Retry without the MJPEG screenshot capability if the first connection fails
376 |             self.desired_caps.pop('mjpegScreenshotUrl')
377 |             self.options = UiAutomator2Options().load_capabilities(self.desired_caps)
378 |             self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
378 |
379 | # self.driver.start_recording_screen()
380 | self.driver.update_settings({'waitForIdleTimeout': 0, 'shouldWaitForQuiescence': False, 'maxTypingFrequency': 60})
381 | # self.driver.get_screenshot_as_base64()
382 | # self.driver.execute_script('mobile: startScreenStreaming', {
383 | # 'width': 1080,
384 | # 'height': 1920,
385 | # 'considerRotation': True,
386 | # 'quality': 45,
387 | # 'bitRate': 500000,
388 | # })
389 |
390 |
391 | if __name__ == "__main__":
392 | ui = UI(os.path.join(os.path.dirname(__file__), 'android_view_hierarchy.xml'))
393 | encoded_ui = ui.encoding()
394 | logger.info(f"Encoded UI: {encoded_ui}")
395 |
--------------------------------------------------------------------------------
/cognisim/device/android/android_ui.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | 
3 | from cognisim.device.android.android_view_hierarchy import ViewHierarchy
4 | from cognisim.utils.constants import XML_SCREEN_HEIGHT, XML_SCREEN_WIDTH
5 | 
6 | CLASS_MAPPING = {
7 |     "TEXTVIEW": "p",
8 |     "BUTTON": "button",
9 |     "IMAGEBUTTON": "button",
10 |     "IMAGEVIEW": "img",
11 |     "EDITTEXT": "input",
12 |     "CHECKBOX": "input",
13 |     "CHECKEDTEXTVIEW": "input",
14 |     "TOGGLEBUTTON": "button",
15 |     "RADIOBUTTON": "input",
16 |     "SPINNER": "select",
17 |     "SWITCH": "input",
18 |     "SLIDINGDRAWER": "input",
19 |     "TABWIDGET": "div",
20 |     "VIDEOVIEW": "video",
21 |     "SEARCHVIEW": "div",
22 | }
23 |
24 |
25 | def sortchildrenby_viewhierarchy(view, attr="bounds"):
26 | if attr == "bounds":
27 | bounds = [
28 | (
29 | ele.uiobject.bounding_box.x1,
30 | ele.uiobject.bounding_box.y1,
31 | ele.uiobject.bounding_box.x2,
32 | ele.uiobject.bounding_box.y2,
33 | )
34 | for ele in view
35 | ]
36 | sorted_bound_index = [
37 | bounds.index(i) for i in sorted(bounds, key=lambda x: (x[1], x[0]))
38 | ]
39 |
40 | sort_children = [view[i] for i in sorted_bound_index]
41 | view[:] = sort_children
42 |
43 |
44 | class UI:
45 | def __init__(self, xml_file):
46 | self.xml_file = xml_file
47 | self.elements = {}
48 |
49 | def encoding(self):
50 | logger.info(
51 | "reading hierarchy tree from {} ...".format(self.xml_file.split("/")[-1])
52 | )
53 | with open(self.xml_file, "r", encoding="utf-8") as f:
54 | vh_data = f.read().encode()
55 |
56 | vh = ViewHierarchy(
57 | screen_width=XML_SCREEN_WIDTH, screen_height=XML_SCREEN_HEIGHT
58 | )
59 | vh.load_xml(vh_data)
60 | view_hierarchy_leaf_nodes = vh.get_leaf_nodes()
61 | sortchildrenby_viewhierarchy(view_hierarchy_leaf_nodes)
62 |
63 | # logger.debug("encoding the ui elements in hierarchy tree...")
64 | codes = ""
65 | # logger.info(view_hierarchy_leaf_nodes)
66 | for _id, ele in enumerate(view_hierarchy_leaf_nodes):
67 | obj_type = ele.uiobject.obj_type.name
68 | text = ele.uiobject.text
69 | text = text.replace("\n", " ")
70 | resource_id = (
71 | ele.uiobject.resource_id if ele.uiobject.resource_id is not None else ""
72 | )
73 | content_desc = ele.uiobject.content_desc
74 | html_code = self.element_encoding(
75 | _id, obj_type, text, content_desc, resource_id
76 | )
77 | codes += html_code
78 | self.elements[_id] = ele.uiobject
79 |         codes = "<html>\n" + codes + "</html>"
80 |
81 | # logger.info('Encoded UI\n' + codes)
82 | return codes
83 |
84 | def element_encoding(self, _id, _obj_type, _text, _content_desc, _resource_id):
85 |
86 | _class = _resource_id.split("id/")[-1].strip()
87 | _text = _text.strip()
88 |         assert _obj_type in CLASS_MAPPING, f"unknown obj_type: {_obj_type}"
89 |         tag = CLASS_MAPPING[_obj_type]
90 | 
91 |         if _obj_type in ["CHECKBOX", "CHECKEDTEXTVIEW", "SWITCH"]:
92 |             code = f'    <input id={_id} type="checkbox" class="{_class}">\n'
93 |             code += f"    <label for={_id}>{_text}</label>\n"
94 |         elif _obj_type == "RADIOBUTTON":
95 |             code = f'    <input id={_id} type="radio" class="{_class}">\n'
96 |             code += f"    <label for={_id}>{_text}</label>\n"
97 |         elif _obj_type == "SPINNER":
98 |             code = f"    <label for={_id}>{_text}</label>\n"
99 |             code += f'    <select id={_id} class="{_class}"></select>\n'
100 |         elif _obj_type == "IMAGEVIEW":
101 |             if _class == "":
102 |                 code = f'    <img id={_id} alt="{_content_desc}" />\n'
103 |             else:
104 |                 code = f'    <img id={_id} class="{_class}" alt="{_content_desc}" />\n'
105 |         else:
106 |             _text = _content_desc if _text == "" else _text
107 |             if _class == "":
108 |                 code = f"    <{tag} id={_id}>{_text}</{tag}>\n"
109 |             else:
110 |                 code = f'    <{tag} id={_id} class="{_class}">{_text}</{tag}>\n'
111 |         return code
113 |
--------------------------------------------------------------------------------
/cognisim/device/android/android_view_hierarchy.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import json
7 | import re
8 | import numpy as np
9 | import attr
10 | from str2bool import str2bool as strtobool
11 | from enum import Enum
12 | from lxml import etree
13 | import cognisim.utils.constants as config
14 |
15 |
16 | class UIObjectType(Enum):
17 | """Types of the different UI objects."""
18 | UNKNOWN = 0
19 | BUTTON = 1
20 | CHECKBOX = 2
21 | CHECKEDTEXTVIEW = 3
22 | EDITTEXT = 4
23 | IMAGEBUTTON = 5
24 | IMAGEVIEW = 6
25 | RADIOBUTTON = 7
26 | SLIDINGDRAWER = 8
27 | SPINNER = 9
28 | SWITCH = 10
29 | TABWIDGET = 11
30 | TEXTVIEW = 12
31 | TOGGLEBUTTON = 13
32 | VIDEOVIEW = 14
33 | SEARCHVIEW = 15
34 |
35 |
36 | class UIObjectGridLocation(Enum):
37 | """The on-screen grid location (3x3 grid) of an UI object."""
38 | TOP_LEFT = 0
39 | TOP_CENTER = 1
40 | TOP_RIGHT = 2
41 | LEFT = 3
42 | CENTER = 4
43 | RIGHT = 5
44 | BOTTOM_LEFT = 6
45 | BOTTOM_CENTER = 7
46 | BOTTOM_RIGHT = 8
47 |
48 |
49 | @attr.s
50 | class BoundingBox(object):
51 | """The bounding box with horizontal/vertical coordinates of an UI object."""
52 | x1 = attr.ib()
53 | y1 = attr.ib()
54 | x2 = attr.ib()
55 | y2 = attr.ib()
56 |
57 |
58 | @attr.s
59 | class UIObject(object):
60 | """Represents an UI object from the leaf node in the view hierarchy."""
61 | obj_type = attr.ib()
62 | obj_name = attr.ib()
63 | word_sequence = attr.ib()
64 | text = attr.ib()
65 | resource_id = attr.ib()
66 | android_class = attr.ib()
67 | android_package = attr.ib()
68 | content_desc = attr.ib()
69 | clickable = attr.ib()
70 | visible = attr.ib()
71 | enabled = attr.ib()
72 | focusable = attr.ib()
73 | focused = attr.ib()
74 | scrollable = attr.ib()
75 | long_clickable = attr.ib()
76 | selected = attr.ib()
77 | bounding_box = attr.ib()
78 | grid_location = attr.ib()
79 | dom_location = attr.ib()
80 | pointer = attr.ib()
81 | neighbors = attr.ib()
82 |
83 |
84 | def _build_word_sequence(text, content_desc, resource_id):
85 | """Returns a sequence of word tokens based on certain attributes.
86 |
87 | Args:
88 | text: `text` attribute of an element.
89 | content_desc: `content_desc` attribute of an element.
90 | resource_id: `resource_id` attribute of an element.
91 |
92 | Returns:
93 | A sequence of word tokens.
94 | """
95 | if text or content_desc:
96 | return re.findall(r"[\w']+|[?.!/,;:]", text if text else content_desc)
97 | else:
98 | # logger.info(f"Resource ID: {resource_id}")
99 | if resource_id is not None:
100 | name = resource_id.split('/')[-1]
101 | return filter(None, name.split('_'))
102 | else:
103 | return []
104 |
105 |
106 | def _build_object_type(android_class):
107 | """Returns the object type based on `class` attribute.
108 |
109 | Args:
110 | android_class: `class` attribute of an element (Android class).
111 |
112 | Returns:
113 | The UIObjectType enum.
114 | """
115 | if android_class.startswith('android.widget'):
116 | widget_type = android_class.split('.')[2]
117 | for obj_type in UIObjectType:
118 | if obj_type.name == widget_type.upper():
119 | return obj_type
120 | widget_type = android_class.split('.')[-1]
121 | for obj_type in UIObjectType:
122 | if obj_type.name in widget_type.upper():
123 | return obj_type
124 | return UIObjectType.BUTTON
125 |
126 |
127 | def _build_object_name(text, content_desc):
128 | """Returns the object name based on `text` or `content_desc` attribute.
129 |
130 | Args:
131 | text: The `text` attribute.
132 | content_desc: The `content_desc` attribute.
133 |
134 | Returns:
135 | The object name string.
136 | """
137 | return text if text else content_desc
138 |
139 |
140 | def _build_bounding_box(bounds):
141 | """Returns the object bounding box based on `bounds` attribute.
142 |
143 | Args:
144 | bounds: The `bounds` attribute.
145 |
146 | Returns:
147 | The BoundingBox object.
148 | """
149 | match = re.compile(r'\[(\d+),(\d+)\]\[(\d+),(\d+)\]').match(bounds)
150 | assert match, f"Invalid bounds format: {bounds}"
151 |
152 | x1, y1, x2, y2 = map(int, match.groups())
153 | # logger.info(type(x1))
154 | return BoundingBox(x1=x1, y1=y1, x2=x2, y2=y2)
155 |
156 |
157 | def _build_clickable(element, tree_child_as_clickable=True):
158 | """Returns whether the element is clickable or one of its ancestors is.
159 |
160 | Args:
161 | element: The etree.Element object.
162 | tree_child_as_clickable: treat all tree children as clickable
163 |
164 | Returns:
165 | A boolean to indicate whether the element is clickable or one of its
166 | ancestors is.
167 | """
168 | clickable = element.get('clickable')
169 | if clickable == 'false':
170 | for node in element.iterancestors():
171 | if node.get('clickable') == 'true':
172 | clickable = 'true'
173 | break
174 |
175 |     # The code below tries to fix cases where a target UI element has
176 |     # clickable == 'false' but is actually clickable by a human.
177 | 
178 |     # Checkable elements should also be treated as clickable:
179 |     # some menu items have clickable == 'false' but checkable == 'true'.
180 | if element.get('checkable') == 'true':
181 | clickable = 'true'
182 | if tree_child_as_clickable:
183 | p = element.getparent()
184 | while p is not None:
185 | if p.get('class') == 'android.widget.ListView':
186 | clickable = 'true'
187 | break
188 | p = p.getparent()
189 |
190 | return strtobool(clickable)
191 |
192 |
193 | def _pixel_distance(a_x1, a_x2, b_x1, b_x2):
194 | """Calculates the pixel distance between bounding box a and b.
195 |
196 | Args:
197 | a_x1: The x1 coordinate of box a.
198 | a_x2: The x2 coordinate of box a.
199 | b_x1: The x1 coordinate of box b.
200 | b_x2: The x2 coordinate of box b.
201 |
202 | Returns:
203 | The pixel distance between box a and b on the x axis. The distance
204 | on the y axis can be calculated in the same way. The distance can be
205 | positive number (b is right/bottom to a) and negative number
206 | (b is left or top to a).
207 | """
208 |     # If a and b are close enough, we set their distance to 1, because
209 |     # there are typically padding spaces inside an object's bounding box.
211 | if b_x1 <= a_x2 and a_x2 - b_x1 <= config.ADJACENT_BOUNDING_BOX_THRESHOLD:
212 | return 1
213 | if a_x1 <= b_x2 and b_x2 - a_x1 <= config.ADJACENT_BOUNDING_BOX_THRESHOLD:
214 | return -1
215 | # overlap
216 | if (a_x1 <= b_x1 <= a_x2) or (a_x1 <= b_x2 <= a_x2) or (
217 | b_x1 <= a_x1 <= b_x2) or (b_x1 <= a_x2 <= b_x2):
218 | return 0
219 | elif b_x1 > a_x2:
220 | return b_x1 - a_x2
221 | else:
222 | return b_x2 - a_x1
223 |
224 |
225 | def _grid_coordinate(x, width):
226 | """Calculates the 3x3 grid coordinate on the x axis.
227 |
228 | The grid coordinate on the y axis is calculated in the same way.
229 |
230 | Args:
231 | x: The x coordinate: [0, width).
232 | width: The screen width.
233 |
234 | Returns:
235 | The grid coordinate: [0, 2].
236 | Note that the screen is divided into 3x3 grid, so the grid coordinate
237 | uses the number from 0, 1, 2.
238 | """
239 | assert 0 <= x <= width
240 | grid_x_0 = width / 3
241 | grid_x_1 = 2 * grid_x_0
242 | if 0 <= x < grid_x_0:
243 | grid_coordinate_x = 0
244 | elif grid_x_0 <= x < grid_x_1:
245 | grid_coordinate_x = 1
246 | else:
247 | grid_coordinate_x = 2
248 | return grid_coordinate_x
249 |
250 |
251 | def _grid_location(bbox, screen_width, screen_height):
252 | """Calculates the grid number of the UI object's bounding box.
253 |
254 | The screen can be divided into 3x3 grid:
255 | (0, 0) (0, 1) (0, 2) 0 1 2
256 | (1, 0) (1, 1) (1, 2) ---> 3 4 5
257 | (2, 0) (2, 1) (2, 2) 6 7 8
258 |
259 | Args:
260 | bbox: The bounding box of the UI object.
261 | screen_width: The width of the screen associated with the hierarchy.
262 | screen_height: The height of the screen associated with the hierarchy.
263 |
264 | Returns:
265 | The grid location number.
266 | """
267 | bbox_center_x = (bbox.x1 + bbox.x2) / 2
268 | bbox_center_y = (bbox.y1 + bbox.y2) / 2
269 | bbox_grid_x = _grid_coordinate(bbox_center_x, screen_width)
270 | bbox_grid_y = _grid_coordinate(bbox_center_y, screen_height)
271 | return UIObjectGridLocation(bbox_grid_y * 3 + bbox_grid_x)
272 |
273 |
274 | def get_view_hierarchy_leaf_relation(objects, _screen_width, _screen_height):
275 | """Calculates adjacency relation from list of view hierarchy leaf nodes.
276 | Args:
277 | objects: a list of objects.
278 | _screen_width, _screen_height: Screen width and height.
279 | Returns:
280 | An un-padded feature dictionary as follow:
281 | 'v_distance': 2d numpy array of ui object vertical adjacency relation.
282 | 'h_distance': 2d numpy array of ui object horizontal adjacency relation.
283 | 'dom_distance': 2d numpy array of ui object dom adjacency relation.
284 | """
285 | vh_node_num = len(objects)
286 | vertical_adjacency = np.zeros((vh_node_num, vh_node_num))
287 | horizontal_adjacency = np.zeros((vh_node_num, vh_node_num))
288 | for row in range(len(objects)):
289 | for column in range(len(objects)):
290 | if row == column:
291 | h_dist = v_dist = 0
292 | else:
293 | node1 = objects[row]
294 | node2 = objects[column]
295 | h_dist, v_dist = normalized_pixel_distance(
296 | node1, node2, _screen_width, _screen_height)
297 | # print(node1.text, node2.text, v_dist)
298 | vertical_adjacency[row][column] = v_dist
299 | horizontal_adjacency[row][column] = h_dist
300 | return {
301 | 'v_distance': vertical_adjacency,
302 | 'h_distance': horizontal_adjacency
303 | }
304 |
305 |
306 | def _get_single_direction_neighbors(object_idx, ui_v_dist, ui_h_dist):
307 | """Gets four 'single direction neighbors' for one target ui_object.
308 | If B is A's bottom/top 'single direction neighbor', it means B is the
309 | vertical closest neighbor among all object whose horizontal distance to A is
310 | smaller than margin threshold. Same with left/right direction neighbor.
311 | Args:
312 | object_idx: index number of target ui_object in ui_object_list
313 | ui_v_dist: ui objects' vertical distances. shape=[num_ui_obj, num_ui_obj]
314 | ui_h_dist: ui objects' horizontal distances. shape=[num_ui_obj, num_ui_obj]
315 | Returns:
316 | a dictionary, keys are NeighborContextDesc Instance, values are neighbor
317 | object index.
318 | """
319 | neighbor_dict = {}
320 | vertical_dist = ui_v_dist[object_idx]
321 | horizontal_dist = ui_h_dist[object_idx]
322 | bottom_neighbors = np.array([
323 | idx for idx in range(len(vertical_dist)) if vertical_dist[idx] > 0 and
324 | abs(horizontal_dist[idx]) < config.NORM_HORIZONTAL_NEIGHBOR_MARGIN
325 | ])
326 | top_neighbors = np.array([
327 | idx for idx in range(len(vertical_dist)) if vertical_dist[idx] < 0 and
328 | abs(horizontal_dist[idx]) < config.NORM_HORIZONTAL_NEIGHBOR_MARGIN
329 | ])
330 | right_neighbors = np.array([
331 | idx for idx in range(len(horizontal_dist)) if horizontal_dist[idx] > 0 and
332 | abs(vertical_dist[idx]) < config.NORM_VERTICAL_NEIGHBOR_MARGIN
333 | ])
334 | left_neighbors = np.array([
335 | idx for idx in range(len(horizontal_dist)) if horizontal_dist[idx] < 0 and
336 | abs(vertical_dist[idx]) < config.NORM_VERTICAL_NEIGHBOR_MARGIN
337 | ])
338 |
339 | if bottom_neighbors.size:
340 | neighbor_dict['top'] = bottom_neighbors[np.argmin(
341 | vertical_dist[bottom_neighbors])]
342 | if top_neighbors.size:
343 | neighbor_dict['bottom'] = top_neighbors[np.argmax(
344 | vertical_dist[top_neighbors])]
345 | if right_neighbors.size:
346 | neighbor_dict['left'] = right_neighbors[np.argmin(
347 | horizontal_dist[right_neighbors])]
348 | if left_neighbors.size:
349 | neighbor_dict['right'] = left_neighbors[np.argmax(
350 | horizontal_dist[left_neighbors])]
351 |
352 | return neighbor_dict
353 |
354 |
355 | def normalized_pixel_distance(node1, node2, _screen_width, _screen_height):
356 | """Calculates normalized pixel distance between this node and other node.
357 |
358 | Args:
359 | node1, node2: Another object.
360 | _screen_width, _screen_height: Screen width and height.
361 |
362 | Returns:
363 | Normalized pixel distance on both horizontal and vertical direction.
364 | """
365 | h_distance = _pixel_distance(_build_bounding_box(node1.get('bounds')).x1,
366 | _build_bounding_box(node1.get('bounds')).x2,
367 | _build_bounding_box(node2.get('bounds')).x1,
368 | _build_bounding_box(node2.get('bounds')).x2)
369 | v_distance = _pixel_distance(_build_bounding_box(node1.get('bounds')).y1,
370 | _build_bounding_box(node1.get('bounds')).y2,
371 | _build_bounding_box(node2.get('bounds')).y1,
372 | _build_bounding_box(node2.get('bounds')).y2)
373 |
374 | return float(h_distance) / _screen_width, float(
375 | v_distance) / _screen_height
376 |
377 |
378 | def _build_neighbors(
379 | node,
380 | view_hierarchy_leaf_nodes,
381 | _screen_width,
382 | _screen_height):
383 | """Builds the neighbours from view_hierarchy.
384 |
385 | Args:
386 | node: The current etree root node.
387 | view_hierarchy_leaf_nodes: All of the etree nodes.
388 | _screen_width, _screen_height: Screen width and height.
389 |
390 | Returns:
391 | Neighbour directions and object pointers.
392 | """
393 | if view_hierarchy_leaf_nodes is None:
394 | return None
395 | vh_relation = get_view_hierarchy_leaf_relation(
396 | view_hierarchy_leaf_nodes, _screen_width, _screen_height)
397 | _neighbor = _get_single_direction_neighbors(
398 | view_hierarchy_leaf_nodes.index(node),
399 | vh_relation['v_distance'],
400 | vh_relation['h_distance'])
401 | for k, v in _neighbor.items():
402 | _neighbor[k] = view_hierarchy_leaf_nodes[v].get('pointer')
403 | return _neighbor
404 |
405 |
406 | def _build_etree_from_json(root, json_dict):
407 | """Builds the element tree from json_dict.
408 |
409 | Args:
410 | root: The current etree root node.
411 | json_dict: The current json_dict corresponding to the etree root node.
412 | """
413 | # set node attributes
414 | if root is None or json_dict is None:
415 | return
416 | x1, y1, x2, y2 = json_dict.get('bounds', [0, 0, 0, 0])
417 | root.set('bounds', '[%d,%d][%d,%d]' % (x1, y1, x2, y2))
418 | root.set('class', json_dict.get('class', ''))
419 | # XML element cannot contain NULL bytes.
420 | root.set('text', json_dict.get('text', '').replace('\x00', ''))
421 | root.set('resource-id', json_dict.get('resource-id', ''))
422 | content_desc = json_dict.get('content-desc', [None])
423 | root.set(
424 | 'content-desc',
425 | '' if content_desc[0] is None else content_desc[0].replace('\x00', ''))
426 | root.set('package', json_dict.get('package', ''))
427 | root.set('visible', str(json_dict.get('visible-to-user', True)))
428 | root.set('enabled', str(json_dict.get('enabled', False)))
429 | root.set('focusable', str(json_dict.get('focusable', False)))
430 | root.set('focused', str(json_dict.get('focused', False)))
431 | root.set(
432 | 'scrollable',
433 | str(
434 | json_dict.get('scrollable-horizontal', False) or
435 | json_dict.get('scrollable-vertical', False)))
436 | root.set('clickable', str(json_dict.get('clickable', False)))
437 | root.set('long-clickable', str(json_dict.get('long-clickable', False)))
438 | root.set('selected', str(json_dict.get('selected', False)))
439 | root.set('pointer', str(json_dict.get('pointer', '')))
440 | if 'children' not in json_dict: # leaf node
441 | return
442 | for child in json_dict['children']:
443 | # some json file has 'null' as one of the children.
444 | if child:
445 | child_node = etree.Element('node')
446 | root.append(child_node)
447 | _build_etree_from_json(child_node, child)
448 |
449 |
450 | class LeafNode(object):
451 | """Represents a leaf node in the view hierarchy data from xml."""
452 |
453 | def __init__(self,
454 | element,
455 | all_elements=None,
456 | dom_location=None,
457 | screen_width=config.SCREEN_WIDTH,
458 | screen_height=config.SCREEN_HEIGHT):
459 | """Constructor.
460 |
461 | Args:
462 | element: The etree.Element object.
463 | all_elements: All the etree.Element objects in the view hierarchy.
464 | dom_location: [depth, preorder-index, postorder-index] of element.
465 | screen_width: The width of the screen associated with the element.
466 | screen_height: The height of the screen associated with the element.
467 | """
468 | assert not element.findall('.//node')
469 | self.element = element
470 | self._screen_width = screen_width
471 | self._screen_height = screen_height
472 | # logger.info(f"element: {element}")
473 | bbox = _build_bounding_box(element.get('bounds'))
474 | self.uiobject = UIObject(
475 | obj_type=_build_object_type(element.get('class')),
476 | obj_name=_build_object_name(
477 | element.get('text'), element.get('content-desc')),
478 | word_sequence=_build_word_sequence(
479 | element.get('text'), element.get('content-desc'),
480 | element.get('resource-id')),
481 | text=element.get('text'),
482 | resource_id=element.get('resource-id'),
483 | android_class=element.get('class'),
484 | android_package=element.get('package'),
485 | content_desc=element.get('content-desc'),
486 | clickable=_build_clickable(element),
487 | visible=strtobool(element.get('visible', default='true')),
488 | enabled=strtobool(element.get('enabled')),
489 | focusable=strtobool(element.get('focusable')),
490 | focused=strtobool(element.get('focused')),
491 | scrollable=strtobool(element.get('scrollable')),
492 | long_clickable=strtobool(element.get('long-clickable')),
493 | selected=strtobool(element.get('selected')),
494 | bounding_box=bbox,
495 | grid_location=_grid_location(bbox, self._screen_width,
496 | self._screen_height),
497 | dom_location=dom_location,
498 | pointer=element.get('pointer'),
499 | neighbors=_build_neighbors(
500 | element, all_elements,
501 | self._screen_width, self._screen_height))
502 |
503 | def dom_distance(self, other_node):
504 | """Calculates dom distance between this node and other node.
505 |
506 | Args:
507 | other_node: Another LeafNode object.
508 |
509 | Returns:
510 | The dom distance in between two leaf nodes: defined as the number of
511 | nodes on the path from one leaf node to the other on the tree.
512 | """
513 | intersection = [
514 | node for node in self.element.iterancestors()
515 | if node in other_node.element.iterancestors()
516 | ]
517 | assert intersection
518 | ancestor_list = list(self.element.iterancestors())
519 | other_ancestor_list = list(other_node.element.iterancestors())
520 | return ancestor_list.index(
521 | intersection[0]) + other_ancestor_list.index(intersection[0]) + 1
522 |
523 |
524 | class DomLocationKey(Enum):
525 | """Keys of dom location info."""
526 | DEPTH = 0
527 | PREORDER_INDEX = 1
528 | POSTORDER_INDEX = 2
529 |
530 |
531 | class ViewHierarchy(object):
532 | """Represents the view hierarchy data from UIAutomator dump."""
533 |
534 | def __init__(self,
535 | screen_width=config.SCREEN_WIDTH,
536 | screen_height=config.SCREEN_HEIGHT):
537 | """Constructor.
538 |
539 | Args:
540 | screen_width: The pixel width of the screen for the view hierarchy.
541 | screen_height: The pixel height of the screen for the view hierarchy.
542 | """
543 | self._root = None
544 | self._root_element = None
545 | self._all_visible_leaves = []
546 | self._dom_location_dict = None
547 | self._preorder_index = 0
548 | self._postorder_index = 0
549 | self._screen_width = screen_width
550 | self._screen_height = screen_height
551 |
552 | def load_xml(self, xml_content):
553 | """Builds the etree from xml content.
554 |
555 | Args:
556 | xml_content: The string containing xml content.
557 | """
558 | self._root = etree.XML(xml_content)
559 | self._root_element = self._root[0]
560 |
561 | self._all_visible_leaves = self._get_visible_leaves()
562 |
563 | # dom_location_dict:
564 | # dict of {id(element): [depth, preorder-index, postorder-index]}
565 | # Note: for leaves of any tree, the following equation is always true:
566 | #
567 | # depth == preorder-index - postorder-index (depth is # of ancestors)
568 | #
569 | self._dom_location_dict = self._calculate_dom_location()
570 |
571 | def load_json(self, json_content):
572 | """Builds the etree from json content.
573 |
574 | Args:
575 | json_content: The string containing json content.
576 | """
577 | json_dict = json.loads(json_content)
578 | if json_dict is None:
579 | raise ValueError('empty json file.')
580 | self._root = etree.Element('hierarchy', rotation='0')
581 | self._root_element = etree.Element('node')
582 | self._root.append(self._root_element)
583 | _build_etree_from_json(
584 | self._root_element,
585 | json_dict['activity']['root'])
586 |
587 | self._all_visible_leaves = self._get_visible_leaves()
588 | self._dom_location_dict = self._calculate_dom_location()
589 |
590 | def get_leaf_nodes(self):
591 | """Returns a list of all the leaf Nodes."""
592 | return [
593 | LeafNode(element, self._all_visible_leaves,
594 | self._dom_location_dict[id(element)],
595 | self._screen_width, self._screen_height)
596 | for element in self._all_visible_leaves
597 | ]
598 |
599 | def get_ui_objects(self):
600 | """Returns a list of all ui objects represented by leaf nodes."""
601 | return [
602 | LeafNode(element, self._all_visible_leaves,
603 | self._dom_location_dict[id(element)],
604 | self._screen_width, self._screen_height).uiobject
605 | for element in self._all_visible_leaves
606 | ]
607 |
608 | def dedup(self, click_x_and_y):
609 | """Dedup UI objects with same text or content_desc.
610 |
611 | Args:
612 |       click_x_and_y: the event (x, y), e.g. the click position on the screen.
613 | """
614 | click_x, click_y = click_x_and_y
615 |
616 | # Map of {'name': [list of UI objects with this name]}
617 | name_element_map = collections.defaultdict(list)
618 | for element in self._all_visible_leaves:
619 | name = _build_object_name(element.get('text'),
620 | element.get('content_desc'))
621 | name_element_map[name].append(element)
622 |
623 | def delete_element(element):
624 | element.getparent().remove(element)
625 |
626 | for name, elements in name_element_map.items():
627 | if not name:
628 | continue
629 | # Search if the event (x, y) happens in one of these objects
630 | target_index = None
631 | for index, element in enumerate(elements):
632 | box = _build_bounding_box(element.get('bounds'))
633 | if (box.x1 <= click_x <= box.x2 and box.y1 <= click_y <= box.y2):
634 | target_index = index
635 |
636 |       if target_index is None:  # target UI obj is not among these elements
637 | for ele in elements[1:]:
638 | delete_element(ele)
639 |       else:  # if the target UI obj is one of them, delete the rest of the UI objs
640 | for ele in elements[:target_index] + \
641 | elements[target_index + 1:]:
642 | delete_element(ele)
643 |
644 | print('Dedup: %d -> %d' % (len(self._all_visible_leaves),
645 | len(self._get_visible_leaves())))
646 |
647 | self._all_visible_leaves = self._get_visible_leaves()
648 | self._dom_location_dict = self._calculate_dom_location()
649 |
650 | def _get_visible_leaves(self):
651 | """Gets all the visible leaves from view hierarchy.
652 |
653 | Returns:
654 | all_visible_leaves: The list of all the visible leaf elements.
655 | """
656 |
657 | all_elements = [element for element in self._root.iter('*')]
658 | # View the attributes of each element
659 | # for element in all_elements:
660 | # logger.info(element.attrib)
661 | # logger.info(element.attrib.get('bounds'))
662 | # logger.info(element.attrib.get('displayed'))
663 |
664 | all_visible_leaves = [
665 | element for element in all_elements if self._is_leaf(element) and
666 | strtobool(element.attrib.get('displayed', default='true')) and
667 | self._is_within_screen_bound(element)
668 | ]
669 | return all_visible_leaves
670 |
671 | def _calculate_dom_location(self):
672 | """Calculate [depth, preorder-index, postorder-index] of all leaf nodes.
673 |
674 |     This method is NOT thread safe if multiple threads call this method on the
675 |     same ViewHierarchy object: it keeps updating self._preorder_index
676 |     and self._postorder_index when calling the pre/post traversal methods recursively.
677 |
678 |     All leaf elements are filtered and cached in self._all_visible_leaves.
679 |     This is necessary because dom_location_dict uses id(element) as keys; if
680 |     _root.iter('*') were called every time, id(element) would not be a fixed value
681 |     even for the same element in the XML.
682 |
683 | Returns:
684 | dom_location_dict, dict of
685 | {id(element): [depth, preorder-index, postorder-index]}
686 | """
687 | dom_location_dict = collections.defaultdict(lambda: [None, None, None])
688 | # Calculate the depth of all leaf nodes.
689 | for element in self._all_visible_leaves:
690 | ancestors = [node for node in element.iterancestors()]
691 | dom_location_dict[id(element)][DomLocationKey.DEPTH.value] = len(
692 | ancestors)
693 |
694 | # Calculate the pre/post index by calling pre/post iteration
695 | # recursively.
696 | self._preorder_index = 0
697 | self._pre_order_iterate(self._root, dom_location_dict)
698 | self._postorder_index = 0
699 | self._post_order_iterate(self._root, dom_location_dict)
700 | return dom_location_dict
701 |
702 | def _pre_order_iterate(self, element, dom_location_dict):
703 | """Preorder travel on hierarchy tree.
704 |
705 | Args:
706 | element: etree element which will be visited now.
707 | dom_location_dict: dict of
708 | {id(element): [depth, preorder-index, postorder-index]}
709 | """
710 | if self._is_leaf(element):
711 | dom_location_dict[id(element)][DomLocationKey.PREORDER_INDEX
712 | .value] = self._preorder_index
713 | self._preorder_index += 1
714 |
715 | for child in element:
716 | if child.getparent() == element:
717 | self._pre_order_iterate(child, dom_location_dict)
718 |
719 | def _post_order_iterate(self, element, dom_location_dict):
720 | """Postorder travel on hierarchy tree.
721 |
722 | Args:
723 | element: etree element which will be visited now.
724 | dom_location_dict: dict of
725 | {id(element): [depth, preorder-index, postorder-index]}
726 | """
727 | for child in element:
728 | if child.getparent() == element:
729 | self._post_order_iterate(child, dom_location_dict)
730 |
731 | if self._is_leaf(element):
732 | dom_location_dict[id(element)][DomLocationKey.POSTORDER_INDEX
733 | .value] = self._postorder_index
734 | self._postorder_index += 1
735 |
736 | def _is_leaf(self, element):
737 | """Whether an etree element is leaf in hierachy tree."""
738 |
739 | return not element.findall('.//*')
740 |
741 | def _is_within_screen_bound(self, element):
742 | """Whether an etree element's bounding box is within screen boundary."""
743 | bbox = _build_bounding_box(element.attrib.get('bounds'))
744 | in_x = (0 <= bbox.x1 <= self._screen_width) and (0 <= bbox.x2 <=
745 | self._screen_width)
746 | in_y = (0 <= bbox.y1 <= self._screen_height) and (0 <= bbox.y2 <=
747 | self._screen_height)
748 | x1_less_than_x2 = bbox.x1 < bbox.x2
749 | y1_less_than_y2 = bbox.y1 < bbox.y2
750 | return in_x and in_y and x1_less_than_x2 and y1_less_than_y2
751 |
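
A minimal usage sketch to make the traversal and distance machinery above concrete. The XML, screen size, and attribute set are hypothetical; 'displayed' and 'bounds' are the attributes _get_visible_leaves filters on, but the Android LeafNode may require further attributes (text, clickable, etc.) on each node:

    from cognisim.device.android.android_view_hierarchy import ViewHierarchy

    # Two sibling leaves under one container (hypothetical dump).
    XML = b'''<hierarchy rotation="0">
      <node bounds="[0,0][1080,1920]" class="android.widget.FrameLayout" displayed="true">
        <node bounds="[0,100][540,200]" class="android.widget.TextView" text="Hello" displayed="true"/>
        <node bounds="[540,100][1080,200]" class="android.widget.Button" text="OK" displayed="true"/>
      </node>
    </hierarchy>'''

    vh = ViewHierarchy(screen_width=1080, screen_height=1920)
    vh.load_xml(XML)
    a, b = vh.get_leaf_nodes()
    # Siblings share their parent as the nearest common ancestor, so the
    # dom distance is 0 + 0 + 1 == 1.
    print(a.dom_distance(b))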
--------------------------------------------------------------------------------
/cognisim/device/device.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class Device(ABC):
5 | def __init__(self, app_package):
6 | self.app_package = app_package
7 |
8 | @abstractmethod
9 | def start_device(self):
10 | '''
11 | Function to start device
12 | '''
13 | pass
14 |
15 | @abstractmethod
16 | def stop_device(self):
17 | '''
18 | Function to stop device
19 | '''
20 | pass
21 |
22 | @abstractmethod
23 | def get_state(self):
24 | pass
25 |
26 | @abstractmethod
27 | def tap(self, x, y):
28 | pass
29 |
30 | @abstractmethod
31 | def input(self, x, y, text):
32 | pass
33 |
34 | @abstractmethod
35 | def swipe(self, x, y, direction):
36 | pass
37 |
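
Since Device is an ABC, a backend must override every abstractmethod before it can be instantiated. A minimal sketch of a hypothetical concrete subclass (all behavior here is illustrative only):

    from cognisim.device.device import Device

    class EchoDevice(Device):
        # Toy backend that just logs calls; not part of the library.
        def start_device(self):
            print(f"starting {self.app_package}")

        def stop_device(self):
            print("stopping")

        def get_state(self):
            return ""  # placeholder for an encoded UI string

        def tap(self, x, y):
            print(f"tap at ({x}, {y})")

        def input(self, x, y, text):
            print(f"input {text!r} at ({x}, {y})")

        def swipe(self, x, y, direction):
            print(f"swipe {direction} from ({x}, {y})")

    EchoDevice("com.example.app").tap(10, 20)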
--------------------------------------------------------------------------------
/cognisim/device/device_factory.py:
--------------------------------------------------------------------------------
1 | # device/device_factory.py
2 | # from .device import Device
3 | from cognisim.device.android.android_device import AndroidDevice
4 | from cognisim.device.ios.ios_device import IOSDevice
5 | from loguru import logger
6 |
7 |
8 | class DeviceFactory:
9 | @staticmethod
10 | def create_device(
11 | platform: str,
12 | app_url: str,
13 | state_representation='aria',
14 | download_directory='default',
15 | session_id=None,
16 | tracing=False,
17 | tracingconfig=None
18 | ):
19 | if platform == 'android':
20 | return AndroidDevice(
21 | app_package=app_url,
22 | download_directory=download_directory,
23 | session_id=session_id
24 | )
25 | elif platform == 'ios':
26 | return IOSDevice(app_url)
27 |
28 | elif platform == 'web':
29 | logger.info("Creating web device")
30 | raise NotImplementedError("Web support is not yet implemented")
31 | else:
32 |             raise ValueError(
33 |                 "Invalid platform. Expected one of: 'android', 'ios', 'web'.")
34 |
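
A hedged usage sketch of the factory (the package name is hypothetical; the concrete device methods are async, matching the cookbook scripts later in this repo):

    import asyncio
    from cognisim.device.device_factory import DeviceFactory

    async def main():
        device = DeviceFactory.create_device(
            platform='android',
            app_url='com.example.app',  # hypothetical package name
        )
        await device.start_device()
        encoded_ui, screenshot, ui = await device.get_state()
        print(encoded_ui[:200])

    asyncio.run(main())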
--------------------------------------------------------------------------------
/cognisim/device/ios/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/cognisim/device/ios/__init__.py
--------------------------------------------------------------------------------
/cognisim/device/ios/ios_device.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from datetime import datetime
3 | from appium.webdriver.common.appiumby import AppiumBy
4 | from cognisim.device.device import Device
5 | from appium.options.ios import XCUITestOptions
6 | from appium import webdriver
7 | from cognisim.device.ios.ios_view_hierarchy import UI
8 | from cognisim.device.ios.ios_view_hierarchy_maestro import get_formatted_hierarchy as get_formatted_hierarchy_maestro
9 | from loguru import logger
10 | import os
11 | import cv2
12 | import numpy as np
13 | import asyncio
14 | import json
15 | SCREEN_WIDTH = 430
16 | SCREEN_HEIGHT = 932
17 |
18 | SCREEN_CHANNEL = 4
19 |
20 |
21 | class IOSDevice(Device):
22 | def __init__(self, app_package=None, download_directory='default', session_id=None):
23 | super().__init__(app_package)
24 | self.download_directory = download_directory
25 | self.app_package = app_package
26 | self.session_id = session_id
27 | self.desired_caps = {
28 | 'deviceName': 'iPhone 14',
29 | 'automationName': 'XCUITest',
30 | 'autoGrantPermission': True,
31 | 'newCommandTimeout': 600,
32 | 'mjpegScreenshotUrl': 'http://localhost:4723/stream.mjpeg',
33 | 'platformVersion': '16.4',
34 | 'snapshotMaxDepth': 30,
35 | 'customSnapshotTimeout': 250,
36 | }
37 |
38 | self.options = XCUITestOptions().load_capabilities(self.desired_caps)
39 | self.use_maestro = True
40 |
41 | async def start_device(self):
42 | '''
43 | Start the IOS device and connect to the appium server
44 | '''
45 | try:
46 | self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
47 | except BaseException:
48 | self.desired_caps.pop('mjpegScreenshotUrl')
49 | self.options = XCUITestOptions().load_capabilities(self.desired_caps)
50 | self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
51 |
52 | self.driver.update_settings({'waitForIdleTimeout': 0, 'shouldWaitForQuiescence': False, 'maxTypingFrequency': 60})
53 |
54 | async def mobile_get_source(self, format='json'):
55 | return self.driver.execute_script('mobile: source', {'format': format, 'excludedAttributes': 'visible'})
56 |
57 | async def start_recording(self):
58 | '''
59 | Start recording screen on the IOS device
60 | returns: None
61 | '''
62 | try:
63 | self.driver.start_recording_screen()
64 | except Exception as e:
65 | logger.error(f"Failed to start screen recording. Error: {str(e)}")
66 | raise
67 |
68 | async def stop_recording(self, save_path=None):
69 | '''
70 | Stops screen recording on the IOS device and saves the video
71 | Args:
72 | save_path (str, optional): Path to save the video file. If not provided, a default path will be used.
73 |
74 | Returns:
75 | str: Path to the saved video file
76 |
77 | '''
78 | video_base64 = self.driver.stop_recording_screen()
79 | if save_path is None:
80 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
81 | filename = f"screen_recording_{timestamp}.mp4"
82 | save_dir = os.path.join(os.getcwd(), "recordings")
83 | os.makedirs(save_dir, exist_ok=True)
84 | save_path = os.path.join(save_dir, filename)
85 |
86 | with open(save_path, "wb") as video_file:
87 | video_file.write(base64.b64decode(video_base64))
88 |
89 | logger.info(f"Screen recording saved to: {save_path}")
90 | return save_path
91 |
92 | async def get_state(self, use_maestro=True):
93 | try:
94 | if use_maestro:
95 | encoded_ui, ui = await self.get_state_maestro()
96 | logger.info(f"Maestro hierarchy: {encoded_ui}")
97 | else:
98 | raw_appium_state = self.driver.page_source
99 |
100 | file_path = os.path.join(os.path.dirname(__file__), 'ios_view_hierarchy.xml')
101 | xml_file = open(file_path, 'w')
102 | xml_file.write(raw_appium_state)
103 | xml_file.close()
104 |
105 | ui = UI(file_path)
106 | self.ui = ui
107 | encoded_ui: str = ui.encoding()
108 | logger.info(f"Encoded UI: {encoded_ui}")
109 | # logger.info(f"Raw Appium State: {raw_appium_state}")
110 | except Exception as e:
111 | logger.info(f"Error getting page source: {e}")
112 |             encoded_ui, ui = "", None  # avoid a NameError in the return below
113 |
114 | screenshot: bytes = self.driver.get_screenshot_as_png()
115 | return encoded_ui, screenshot, ui
116 |
117 | async def get_state_maestro(self):
118 | '''
119 | Use Maestro to get the view hierarchy
120 | '''
121 | try:
122 | # Run maestro hierarchy command and capture output
123 | process = await asyncio.create_subprocess_exec(
124 | 'maestro', 'hierarchy',
125 | stdout=asyncio.subprocess.PIPE,
126 | stderr=asyncio.subprocess.PIPE
127 | )
128 | stdout, stderr = await process.communicate()
129 |
130 | if process.returncode != 0:
131 | logger.error(f"Error getting Maestro hierarchy: {stderr.decode()}")
132 | return None
133 | # Parse JSON output
134 | stdout = stdout.decode().strip()
135 | # Parse until first opening brace
136 | stdout = stdout[stdout.find('{'):]
137 | hierarchy = json.loads(stdout)
138 | # logger.info(f"Hierarchy length: {len(hierarchy)}")
139 | # Format hierarchy
140 | formatted_html, ui_objects = get_formatted_hierarchy_maestro(hierarchy)
141 | return formatted_html, ui_objects
142 |
143 | except Exception as e:
144 | logger.error(f"Error in get_state_maestro: {e}")
145 | return None
146 |
147 | def generate_set_of_mark(self,
148 | ui,
149 | image: bytes,
150 | position='top-left') -> bytes:
151 | '''
152 | Code to generate a set of mark for a given image and UI state
153 | ui: UI object
154 | image: bytes of the image
155 |         position: position of the annotation; defaults to 'top-left',
156 |         can also be 'center'
157 | '''
158 | nparr = np.frombuffer(image, np.uint8)
159 | img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
160 | height, width, _ = img.shape
161 | k = 3000
162 |
163 | for element_id in ui.elements:
164 | bounds = [
165 | ui.elements[element_id].bounding_box.x1,
166 | ui.elements[element_id].bounding_box.y1,
167 | ui.elements[element_id].bounding_box.x2,
168 | ui.elements[element_id].bounding_box.y2
169 | ]
170 | # Calculate the area of the bounding box
171 | area = (bounds[2] - bounds[0]) * (bounds[3] - bounds[1])
172 |
173 | # Only label elements with area over k
174 | if area > k:
175 | # Draw a rectangle around the element
176 | cv2.rectangle(
177 | img, (int(bounds[0]), int(bounds[1])),
178 | (int(bounds[2]), int(bounds[3])), (0, 0, 255), 5)
179 |
180 | text = str(element_id)
181 | text_size = 2 # Fixed text size
182 | font = cv2.FONT_HERSHEY_SIMPLEX
183 |
184 | # Calculate the width and height of the text
185 | text_width, text_height = cv2.getTextSize(text, font, text_size, 2)[0]
186 |
187 | if position == 'top-left':
188 | text_x = int(bounds[0])
189 | text_y = int(bounds[1]) + text_height
190 | else:
191 | text_x = (int(bounds[0]) + int(bounds[2])) // 2 - text_width // 2
192 | text_y = (int(bounds[1]) + int(bounds[3])) // 2 + text_height // 2
193 |
194 | # Draw a black rectangle behind the text
195 | cv2.rectangle(img, (text_x, text_y - text_height),
196 | (text_x + text_width, text_y), (0, 0, 0), thickness=cv2.FILLED)
197 |
198 | # Draw the text in white
199 | cv2.putText(img, text, (text_x, text_y), font,
200 | text_size, (255, 255, 255), 4)
201 |
202 | _, img_encoded = cv2.imencode('.png', img)
203 | img_bytes = img_encoded.tobytes()
204 |
205 | return img_bytes
206 |
207 | async def tap(self, x, y):
208 | self.driver.execute_script('mobile: tap', {'x': x, 'y': y})
209 |
210 | async def input(self, x, y, text):
211 | self.driver.execute_script('mobile: tap', {'x': x, 'y': y})
212 | self.driver.find_element(AppiumBy.IOS_PREDICATE, "type == 'XCUIElementTypeApplication'").send_keys(text)
213 | # self.driver.execute_script('mobile: type', {'text': text})
214 |
215 | async def swipe(self, initial_x, initial_y, end_x, end_y, duration=1):
216 | """
217 | Performs a swipe gesture on the iOS device
218 |
219 | Args:
220 | initial_x (int): Starting x coordinate of the swipe
221 | initial_y (int): Starting y coordinate of the swipe
222 | end_x (int): Ending x coordinate of the swipe
223 | end_y (int): Ending y coordinate of the swipe
224 | duration (int, optional): Duration of the swipe in seconds. Defaults to 1.
225 | """
226 | self.driver.execute_script('mobile: dragFromToForDuration', {'fromX': initial_x, 'fromY': initial_y, 'toX': end_x, 'toY': end_y, 'duration': duration})
227 |
228 | async def scroll(self, direction):
229 | direction_map = {
230 | 'up': 'UP',
231 | 'down': 'DOWN',
232 | 'left': 'LEFT',
233 | 'right': 'RIGHT'
234 | }
235 |         self.driver.execute_script('mobile: scroll', {'direction': direction_map[direction]})
236 |
237 | async def get_screenshot(self) -> bytes:
238 | '''
239 | Get Screenshot as bytes
240 | '''
241 | screenshot: bytes = self.driver.get_screenshot_as_png()
242 | return screenshot
243 |
244 | async def navigate(self, package_name: str):
245 | self.driver.activate_app(package_name)
246 |
247 | async def capture_screenshot_with_bounding_box(self, bounds: dict, image_state: bytes = None) -> bytes:
248 | """
249 | Capture a screenshot with a bounding box drawn around a specified element.
250 |
251 | Args:
252 |             bounds (sequence): The bounding box coordinates, indexable in the
253 |                 order x1, y1, x2, y2, all of which are integers.
254 | image_state (bytes, optional): The current screenshot if available.
255 |
256 | Returns:
257 | bytes: The screenshot image with bounding box as bytes.
258 | """
259 | logger.info("Creating tagged image")
260 |         screenshot = image_state if image_state is not None else await self.get_screenshot()
261 | if screenshot is None:
262 | logger.info("Screenshot failed")
263 | return None
264 |
265 | # Convert the screenshot to a NumPy array
266 | image_np = np.frombuffer(screenshot, dtype=np.uint8)
267 | image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
268 |
269 | # Extract bounding box coordinates
270 | x1 = int(bounds[0])
271 | y1 = int(bounds[1])
272 | x2 = int(bounds[2])
273 | y2 = int(bounds[3])
274 |
275 | # Calculate width and height
276 | # width = x2 - x1
277 | # height = y2 - y1
278 |
279 |         bright_color = (128, 0, 128)  # Purple (BGR)
280 | # Draw the bounding box on the image
281 | cv2.rectangle(image, (x1, y1), (x2, y2), bright_color, 5)
282 |
283 | # Convert the image back to bytes
284 | _, encoded_image = cv2.imencode('.png', image)
285 | screenshot_with_bounding_box = encoded_image.tobytes()
286 |
287 | return screenshot_with_bounding_box
288 |
289 | async def stop_device(self):
290 | '''
291 | Stops the device
292 | '''
293 | pass
294 |
295 |
296 | if __name__ == "__main__":
297 | ui = UI(os.path.join(os.path.dirname(__file__), 'ios_view_hierarchy.xml'))
298 | encoded_ui = ui.encoding()
299 |
300 | logger.info(f"Encoded UI: {encoded_ui}")
301 |
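
A hedged end-to-end sketch of IOSDevice. It assumes an Appium server on localhost:4723 and a booted simulator matching the capabilities above; the bundle identifier is hypothetical:

    import asyncio
    from cognisim.device.ios.ios_device import IOSDevice

    async def main():
        device = IOSDevice(app_package='com.example.MyApp')  # hypothetical bundle id
        await device.start_device()
        # use_maestro=False exercises the Appium page-source path above
        encoded_ui, screenshot, ui = await device.get_state(use_maestro=False)
        marked = device.generate_set_of_mark(ui, screenshot, position='center')
        with open('set_of_mark.png', 'wb') as f:
            f.write(marked)
        await device.tap(100, 200)

    asyncio.run(main())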
--------------------------------------------------------------------------------
/cognisim/device/ios/ios_view_hierarchy.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | from enum import Enum
3 | from distutils.util import strtobool
4 | import attr
5 | import numpy as np
6 | import re
7 | import json
8 | import collections
9 | from loguru import logger
10 | SCREEN_WIDTH = 430
11 | SCREEN_HEIGHT = 932
12 |
13 | SCREEN_CHANNEL = 4
14 |
15 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3
16 | NORM_VERTICAL_NEIGHBOR_MARGIN = 0.01
17 | NORM_HORIZONTAL_NEIGHBOR_MARGIN = 0.01
18 | INPUT_ACTION_UPSAMPLE_RATIO = 1
19 | XML_SCREEN_WIDTH = 430
20 | XML_SCREEN_HEIGHT = 932
21 | CLASS_MAPPING = {
22 | "STATICTEXT": 'p',
23 | "BUTTON": 'button',
24 | "IMAGE": 'img',
25 | "SWITCH": 'input',
26 | "CELL": 'div',
27 | "TABLE": 'table',
28 | "NAVIGATIONBAR": 'nav',
29 | "APPLICATION": "div",
30 | "TEXTFIELD": "input",
31 | "SECURETEXTFIELD": "input",
32 | "DatePicker:": "input",
33 | "PICKER": "input",
34 | "PICKERWHEEL": "input",
35 | "PAGEINDICATOR": "div",
36 | "KEY": "button",
37 | "KEYBOARD": "div",
38 | "LINK": "a",
39 | "SEARCHFIELD:": "input",
40 | "TEXTVIEW": "textarea",
41 | "WEBVIEW": "iframe",
42 | "BUTTON": "button",
43 | "OTHER": "div"
44 | }
45 |
46 |
47 | class DomLocationKey(Enum):
48 | '''
49 | Keys of dom location info
50 | '''
51 | DEPTH = 0
52 | PREORDER_INDEX = 1
53 | POSTORDER_INDEX = 2
54 |
55 |
56 | class UIObjectType(Enum):
57 | """
58 |     Types of the different UI objects
59 | """
60 | UNKNOWN = 0
61 | BUTTON = 1
62 | IMAGE = 2
63 | SWITCH = 3
64 | CELL = 4
65 | OTHER = 5
66 | TABLE = 6
67 | NAVIGATIONBAR = 7
68 | APPLICATION = 8
69 | WINDOW = 9
70 | STATICTEXT = 10
71 | SLIDER = 11
72 | TEXTFIELD = 12
73 | SECURETEXTFIELD = 13
74 | DATEPICKER = 14
75 | PICKER = 15
76 | PICKERWHEEL = 16
77 | PAGEINDICATOR = 17
78 | KEY = 18
79 | KEYBOARD = 19
80 | LINK = 20
81 | SEARCHFIELD = 21
82 | TEXTVIEW = 22
83 | WEBVIEW = 23
84 |
85 |
86 | class UIObjectGridLocation(Enum):
87 | '''
88 | The on-screen grid location (3x3 grid) of an UI object
89 | '''
90 | TOP_LEFT = 0
91 | TOP_CENTER = 1
92 | TOP_RIGHT = 2
93 | LEFT = 3
94 | CENTER = 4
95 | RIGHT = 5
96 | BOTTOM_LEFT = 6
97 | BOTTOM_CENTER = 7
98 | BOTTOM_RIGHT = 8
99 |
100 |
101 | @attr.s
102 | class BoundingBox(object):
103 | '''
104 | The bounding box with horizontal/vertical coordinates of a ui object
105 | '''
106 | x1 = attr.ib()
107 | y1 = attr.ib()
108 | x2 = attr.ib()
109 | y2 = attr.ib()
110 |
111 |
112 | @attr.s
113 | class UiObject(object):
114 | '''
115 |     Represents a UI object from the leaf node in the view hierarchy
116 | '''
117 | # type
118 | obj_type = attr.ib()
119 | # name
120 | obj_name = attr.ib()
121 |
122 | word_sequence = attr.ib()
123 | # text
124 | text = attr.ib()
125 | # accessibility label
126 | accesible = attr.ib()
127 |
128 | # ios_Type
129 | ios_class = attr.ib()
130 |
131 | # name
132 | content_desc = attr.ib()
133 | #
134 |
135 | visible = attr.ib()
136 | enabled = attr.ib()
137 |
138 | bounding_box = attr.ib()
139 |
140 | grid_location = attr.ib()
141 |
142 | dom_location = attr.ib()
143 |
144 | pointer = attr.ib()
145 |
146 | neighbors = attr.ib()
147 |
148 |
149 | def _build_word_sequence(text, content_desc, resource_id):
150 | '''
151 |     Returns a sequence of word tokens based on certain attributes
152 |
153 | Args:
154 | text: the text attribute of an element
155 | content_desc: the content-desc attribute of an element
156 | resource_id: `resource_id` attribute of an element
157 | Priority of the attributes: text > content_desc > resource_id
158 | Returns:
159 | A sequence of word tokens
160 | '''
161 | if text or content_desc:
162 | return re.findall(r"[\w']+|[.,!?;]", text if text else content_desc)
163 | else:
164 | name = resource_id.split('/')[-1]
165 | return filter(None, name.split('_'))
166 |
167 |
168 | def _build_object_type(ios_class: str):
169 | '''
170 | Returns the object type based on `class` attribute
171 |
172 | Args:
173 | ios_class: the `class` attribute of an element
174 | Returns:
175 | The UIObjectType of the element
176 |
177 | '''
178 | if ios_class.startswith("XCUIElementType"):
179 | widget_type = ios_class.split("XCUIElementType")[1]
180 | for obj_type in UIObjectType:
181 | if obj_type.name == widget_type.upper():
182 | # logger.info(f"obj_type: {obj_type}")
183 | return obj_type
184 | return UIObjectType.BUTTON
185 |
186 |
187 | def _build_object_name(text, content_desc):
188 | '''
189 |     Returns the object name based on the 'text' or 'content_desc' attribute
190 | Args:
191 | text: the `text` attribute of an element
192 | content_desc: the `content_desc` attribute of an element
193 | Returns:
194 | The object name
195 | '''
196 | return text if text else content_desc
197 |
198 |
199 | def _build_bounding_box(bounds):
200 | '''
201 | Returns the object bounding box based on `bounds` attribute
202 |
203 | Args:
204 |         bounds: the `bounds` attribute of an element
205 |
206 |     Returns:
207 | The BoundingBox Object
208 | '''
209 | match = re.compile(
210 | r'\[\'(\d+)\', \'(\d+)\'\]\[\'(\d+)\', \'(\d+)\'\]').match(bounds)
211 |
212 | assert match
213 | x1, y1, x2, y2 = map(int, match.groups())
214 | return BoundingBox(x1, y1, x2, y2)
215 |
216 |
217 | def _build_clickable(element, tree_child_as_clickable=True):
218 |     '''
219 |     Returns whether the element is clickable based on certain attributes
220 |     Args:
221 |         element: The etree.element object
222 |         tree_child_as_clickable: Whether to consider the tree child as clickable
223 |
224 |     Returns:
225 |         A boolean indicating whether the element or one of its ancestors is
226 |         clickable; basically, given an element, check if it counts as clickable
227 |         for the purposes of this HTML representation
228 |     '''
229 |     clickable = element.get('accessible', 'false')
230 |     if clickable == 'false':
231 |         for node in element.iterancestors():
232 |             if node.get('accessible') == 'true':
233 |                 clickable = 'true'
234 |                 break
235 |     if element.get('accessible') == 'true':
236 |         clickable = 'true'
237 |     if tree_child_as_clickable:
238 |         p = element.getparent()
239 |         while p is not None:
240 |             if p.get('class') == 'android.widget.ListView':
241 |                 clickable = 'false'
242 |                 break
243 |             p = p.getparent()
244 |
245 |     return strtobool(clickable)
246 |
247 |
248 | def _pixel_distance(a_x1, a_x2, b_x1, b_x2):
249 | '''
250 | Calculates the pixel distance between bounding box a and b
251 |
252 | Args:
253 | a_x1: The x_1 coordinate of box a
254 | a_x2: The x_2 coordinate of box a
255 | b_x1: The x_1 coordinate of box b
256 | b_x2: The x_2 coordinate of box b
257 |
258 | Returns:
259 |         The pixel distance between box a and b on the x axis. The distance
260 |         on the y axis can be calculated in the same way. The distance can be a
261 |         positive number (b is right of/below a) or negative
262 |         (b is left of/above a)
263 |
264 |     The _pixel_distance function calculates the pixel distance between two bounding boxes (a and b) along the x-axis
265 |
266 |     Here's a breakdown:
267 |
268 |     1. If box b is adjacent to box a on the right side (the gap is less than or equal to a threshold), it returns 1.
269 |
270 |     2. If box b is adjacent to box a on the left side (the gap is less than or equal to a threshold), it returns -1.
271 |
272 |     3. If box a and box b overlap on the x-axis, it returns 0.
273 |
274 |     4. If box b is to the right of box a (b_x1 > a_x2), it returns the distance from the right side of box a to the left side of box b (b_x1 - a_x2).
275 |
276 |     5. If none of the above conditions are met, box b is to the left of box a, and it returns the (negative) distance from the right side of box b to the left side of box a (b_x2 - a_x1).
277 |
278 |     The function assumes that the x1 coordinate is the left side of a box and the x2 coordinate is the right side.
279 |     Tldr: the function measures the distance between two bounding boxes along the x-axis, returning 1 or -1 when they are close enough to count as adjacent.
280 | '''
281 |
282 | if b_x1 <= a_x2 and a_x2 - b_x1 <= ADJACENT_BOUNDING_BOX_THRESHOLD:
283 | return 1
284 | if a_x1 <= b_x2 and b_x2 - a_x1 <= ADJACENT_BOUNDING_BOX_THRESHOLD:
285 | return -1
286 |
287 | # overlap
288 | if (a_x1 <= b_x1 <= a_x2) or (a_x1 <= b_x2 <= a_x2) or (b_x1 <= a_x1 <= b_x2) or (b_x1 <= a_x2 <= b_x2):
289 | return 0
290 | elif b_x1 > a_x2:
291 | return b_x1 - a_x2
292 | else:
293 | return b_x2 - a_x1
294 |
295 |
296 | def _grid_coordinate(x, width):
297 | """Calculates the 3x3 grid coordinate on the x axis.
298 |
299 | The grid coordinate on the y axis is calculated in the same way.
300 |
301 | Args:
302 | x: The x coordinate: [0, width).
303 | width: The screen width.
304 |
305 | Returns:
306 | The grid coordinate: [0, 2].
307 | Note that the screen is divided into 3x3 grid, so the grid coordinate
308 | uses the number from 0, 1, 2.
309 | """
310 | logger.info(f"x: {x}, width: {width}")
311 | # assert 0 <= x <= width
312 | grid_x_0 = width / 3
313 | grid_x_1 = 2 * grid_x_0
314 | if 0 <= x < grid_x_0:
315 | grid_coordinate_x = 0
316 | elif grid_x_0 <= x < grid_x_1:
317 | grid_coordinate_x = 1
318 | else:
319 | grid_coordinate_x = 2
320 | return grid_coordinate_x
321 |
322 |
323 | def _grid_location(bbox, screen_width, screen_height):
324 | '''
325 |     Calculates the grid number of the UI bounding box
326 |
327 |     Args:
328 |         bbox: The bounding box of the UI object
329 | screen_width: The width of the screen
330 | screen_height: The height of the screen
331 |
332 | Returns:
333 | The grid location number
334 | '''
335 | bbox_center_x = (bbox.x1 + bbox.x2) / 2
336 | bbox_center_y = (bbox.y1 + bbox.y2) / 2
337 | bbox_grid_x = _grid_coordinate(bbox_center_x, screen_width)
338 | bbox_grid_y = _grid_coordinate(bbox_center_y, screen_height)
339 | return UIObjectGridLocation(bbox_grid_y * 3 + bbox_grid_x)
340 |
341 |
342 | def get_view_hierarchy_leaf_relation(objects, _screen_width, _screen_height):
343 |     '''
344 |     Calculates the adjacency relation from a list of view hierarchy leaf nodes
345 |     Args:
346 |         objects: The list of view hierarchy leaf nodes
347 |         _screen_width, _screen_height: Screen width and height
348 |
349 |     Returns:
350 |         An un-padded feature dictionary as follows:
351 |         'v_distance': 2d numpy array of ui object vertical adjacency relation
352 |         'h_distance': 2d numpy array of ui object horizontal adjacency relation
353 |         (note: the dom adjacency relation is not computed by this function)
354 |
355 |
356 |     Adjacency matrices for the vertical and horizontal relations
357 |
358 |     '''
359 |
360 | vh_node_num = len(objects)
361 | vertical_adjacency = np.zeros((vh_node_num, vh_node_num))
362 | horizontal_adjacency = np.zeros((vh_node_num, vh_node_num))
363 |
364 | for row in range(len(objects)):
365 | for column in range(len(objects)):
366 | if row == column:
367 | h_dist = v_dist = 0
368 | else:
369 | node1 = objects[row]
370 | node2 = objects[column]
371 | h_dist, v_dist = normalized_pixel_distance(
372 | node1, node2, _screen_width, _screen_height)
373 |
374 | vertical_adjacency[row][column] = v_dist
375 | horizontal_adjacency[row][column] = h_dist
376 | return {
377 | 'v_distance': vertical_adjacency,
378 | 'h_distance': horizontal_adjacency
379 | }
380 |
381 |
382 | def normalized_pixel_distance(node1, node2, _screen_width, _screen_height):
383 | '''
384 |     Calculates the normalized pixel distance between two nodes
385 |
386 |     Args:
387 |         node1, node2: The two etree element objects
388 |
389 |     Returns:
390 | Normalized pixel distance on both horizontal and vertical direction
391 | '''
392 | node1_x_1 = int(node1.get('x'))
393 |
394 | node1_x_2 = node1_x_1 + int(node1.get('width'))
395 |
396 | node1_y_1 = int(node1.get('y'))
397 | node1_y_2 = node1_y_1 + int(node1.get('height'))
398 | node2_x_1 = int(node2.get('x'))
399 |
400 | node2_x_2 = node2_x_1 + int(node2.get('width'))
401 |
402 | node2_y_1 = int(node2.get('y'))
403 |
404 | node2_y_2 = node2_y_1 + int(node2.get('height'))
405 |
406 | h_distance = _pixel_distance(node1_x_1, node1_x_2, node2_x_1, node2_x_2)
407 |
408 | v_distance = _pixel_distance(node1_y_1, node1_y_2, node2_y_1, node2_y_2)
409 |
410 | return float(h_distance) / _screen_width, float(v_distance) / _screen_height
411 |
412 |
413 | def _build_neighbors(node, view_hierarchy_leaf_nodes,
414 | _screen_width, _screen_height):
415 | '''
416 | Builds the neighbors of a node based on the view hierarchy leaf nodes
417 |
418 | Args:
419 | node: The etree element object
420 | view_hierarchy_leaf_nodes: The list of view hierarchy leaf nodes
421 | _screen_width: The screen width
422 | _screen_height: The screen height
423 |
424 | Returns:
425 | A list of neighbors of the node
426 | '''
427 | if view_hierarchy_leaf_nodes is None:
428 | return None
429 |
430 |     vh_relation = get_view_hierarchy_leaf_relation(
431 |         view_hierarchy_leaf_nodes, _screen_width, _screen_height)
432 |     _neighbor = _get_single_direction_neighbors(
433 |         view_hierarchy_leaf_nodes.index(node),
434 |         vh_relation['v_distance'],
435 |         vh_relation['h_distance'],
436 |     )
437 | for k, v in _neighbor.items():
438 | _neighbor[k] = view_hierarchy_leaf_nodes[v].get('pointer')
439 | return _neighbor
440 |
441 |
442 | def _get_single_direction_neighbors(object_idx, ui_v_dist, ui_h_dist):
443 | '''
444 | Gets four single direction neighbor for one target ui_object
445 |
446 | Args:
447 | object_idx: The index of the target ui_object
448 | ui_v_dist: The vertical adjacency matrix
449 | ui_h_dist: The horizontal adjacency matrix
450 |
451 | Returns:
452 | A dictionary of the four single direction neighbors
453 |
454 | '''
455 | neighbor_dict = {}
456 | vertical_distance = ui_v_dist[object_idx]
457 | horizontal_distance = ui_h_dist[object_idx]
458 | bottom_neighbor = np.array([
459 | idx for idx in range(len(vertical_distance)) if vertical_distance[idx] > 0 and
460 |         abs(horizontal_distance[idx]) < NORM_HORIZONTAL_NEIGHBOR_MARGIN
461 | ])
462 | top_neighbor = np.array([
463 | idx for idx in range(len(vertical_distance)) if vertical_distance[idx] < 0 and
464 |         abs(horizontal_distance[idx]) < NORM_HORIZONTAL_NEIGHBOR_MARGIN
465 | ])
466 | right_neighbor = np.array([
467 | idx for idx in range(len(horizontal_distance)) if horizontal_distance[idx] > 0 and
468 |         abs(vertical_distance[idx]) < NORM_VERTICAL_NEIGHBOR_MARGIN
469 | ])
470 | left_neighbor = np.array([
471 | idx for idx in range(len(horizontal_distance)) if horizontal_distance[idx] < 0 and
472 |         abs(vertical_distance[idx]) < NORM_VERTICAL_NEIGHBOR_MARGIN
473 | ])
474 |
475 | if bottom_neighbor.size:
476 | neighbor_dict['top'] = bottom_neighbor[
477 | np.argmin(vertical_distance[bottom_neighbor])]
478 | if top_neighbor.size:
479 | neighbor_dict['bottom'] = top_neighbor[np.argmax(
480 | vertical_distance[top_neighbor])]
481 | if right_neighbor.size:
482 | neighbor_dict['left'] = right_neighbor[np.argmax(
483 | horizontal_distance[right_neighbor])]
484 | if left_neighbor.size:
485 | neighbor_dict['right'] = left_neighbor[np.argmin(
486 | horizontal_distance[left_neighbor])]
487 |
488 | return neighbor_dict
489 |
490 |
491 | def _build_etree_from_json(root, json_dict):
492 | '''
493 |     Builds the element tree from json_dict
494 |
495 |     Args:
496 |         root: The current etree root node
497 |         json_dict: The current json_dict corresponding to the etree root node
498 |
499 |
500 | '''
501 |
502 | if root is None or json_dict is None:
503 | return
504 | x1, y1, x2, y2 = json_dict.get('bounds', [0, 0, 0, 0])
505 | root.set('bounds', '[%d, %d, %d, %d]' % (x1, y1, x2, y2))
506 | root.set('class', json_dict.get('class', ''))
507 | root.set('type', json_dict.get('type', ''))
508 |
509 | root.set('text', json_dict.get('text', '').replace('\x00', ''))
510 |
511 | root.set('resource-id', json_dict.get('resource-id', ''))
512 |
513 |     root.set('content-desc', json_dict.get('content-desc', '') or '')
514 | root.set('package', json_dict.get('package', ''))
515 | root.set('visible', str(json_dict.get('displayed', True)))
516 |     root.set('enabled', str(json_dict.get('enabled', False)))
517 | root.set('focusable', str(json_dict.get('focusable', False)))
518 | root.set('focused', str(json_dict.get('focused', False)))
519 |
520 | root.set('scrollable',
521 | str(
522 | json_dict.get('scrollable-horizontal', False) or
523 | json_dict.get('scrollable-vertical', False)
524 | ))
525 | root.set('clickable', str(json_dict.get('clickable', False)))
526 | root.set('long-clickable', str(json_dict.get('long-clickable', False)))
527 |
528 | root.set('selected', str(json_dict.get('selected', False)))
529 |
530 | root.set('pointer', json_dict.get('pointer', ''))
531 |
532 | if 'children' in json_dict:
533 | for child in json_dict['children']:
534 | child_element = etree.Element('node')
535 | root.append(child_element)
536 | _build_etree_from_json(child_element, child)
537 |
538 |
539 | class LeafNode(object):
540 | '''
541 |     Represents a leaf node in the view hierarchy
542 | '''
543 |
544 | def __init__(
545 | self,
546 | element,
547 | all_elements=None,
548 | dom_location=None,
549 | screen_width=SCREEN_WIDTH,
550 | screen_height=SCREEN_HEIGHT,
551 | ):
552 | '''
553 | Constructor.
554 |
555 | Args:
556 |
557 | element: the etree.Element object
558 |     all_elements: All the etree.Element objects in the view hierarchy
559 | dom_location: [depth, preorder-index, postorder-index] of element
560 | screen_width: The width of the screen associated with the element
561 | screen_height: The height of the screen associated with the element
562 | '''
563 |
564 | assert not len(element)
565 | self.element = element
566 |
567 | self._screen_width = screen_width
568 |
569 | self._screen_height = screen_height
570 |
571 | x_1 = str(max(0, int(element.get('x'))))
572 | y_1 = str(max(0, int(element.get('y'))))
573 | x_2 = str(int(x_1) + int(element.get('width')))
574 | y_2 = str(int(y_1) + int(element.get('height')))
575 |
576 | inits = str([x_1, y_1])
577 | ends = str([x_2, y_2])
578 | bounds = str(inits) + str(ends)
579 |
580 | bbox = _build_bounding_box(bounds)
581 |
582 | self.uiobject = UiObject(
583 | obj_type=_build_object_type(element.get('type')),
584 | content_desc=element.get('content-desc', default='').split('.')[-1]
585 | if '.' in element.get('name', default='') else element.get('name', default=''),
586 | obj_name=_build_object_name(
587 | text=element.get('name', default=''),
588 | content_desc=element.get('content-desc', default='')
589 | ),
590 | word_sequence=_build_word_sequence(
591 | text=element.get(
592 | 'text', default=''
593 | ),
594 | content_desc=element.get(
595 | 'content-desc', default=''
596 | ),
597 | resource_id=element.get('resource-id', default='')
598 |
599 | ),
600 | text=element.get('label', default=''),
601 | accesible=element.get('accessible', default='true'),
602 |
603 | ios_class=element.get('type', default=''),
604 | visible=strtobool(element.get('visible', default='true')),
605 | enabled=strtobool(element.get('enabled', default='true')),
606 | bounding_box=bbox,
607 | grid_location=_grid_location(bbox, self._screen_width, self._screen_height),
608 | dom_location=dom_location,
609 | pointer=element.get('pointer', default=''),
610 | neighbors=_build_neighbors(element, all_elements, self._screen_width, self._screen_height),
611 |
612 | )
613 |
614 | def dom_distance(self, other_node):
615 | '''
616 | Calculate the dom distance between two nodes
617 | Args:
618 | other_node: Another LeafNode
619 |     Returns: The dom distance between the two leaf nodes
620 | '''
621 | intersection = [
622 | node for node in self.element.iterancestors()
623 | if node in other_node.element.iterancestors()
624 | ]
625 | assert intersection
626 | ancestor_list = list(self.element.iterancestors())
627 |
628 | other_ancestor_list = list(other_node.element.iterancestors())
629 |
630 | return ancestor_list.index(
631 | intersection[0]) + other_ancestor_list.index(intersection[0]) + 1
632 |
633 |
634 | class ViewHierarchy(object):
635 | '''
636 |     Represents the view hierarchy from an XCUITest dump
637 | '''
638 |
639 | def __init__(self, screen_width=SCREEN_WIDTH, screen_height=SCREEN_HEIGHT):
640 | '''
641 | Constructor
642 |
643 | Args:
644 |
645 | screen_width: The pixel width of the screen
646 | screen_height: The pixel height of the screen
647 | '''
648 |
649 | self._root = None
650 | self._root_element = None
651 |
652 | self._all_visible_leaves = []
653 |
654 | self._dom_location_dict = None
655 | self._preorder_index = 0
656 | self._postorder_index = 0
657 |
658 | self._screen_width = screen_width
659 | self._screen_height = screen_height
660 |
661 | def load_xml(self, xml_content):
662 | '''
663 | Builds the etree from xml content
664 | Args:
665 | xml_content: The string containing xml content
666 | '''
667 | self._root = etree.XML(xml_content)
668 |
669 | self._root_element = self._root[0]
670 | self._all_visible_leaves = self._get_visible_leaves()
671 |
672 | self._dom_location_dict = self._calculate_dom_location()
673 |
674 | def load_json(self, json_content):
675 | '''
676 | Builds the etree from json content
677 | args:
678 | json_content: The string containing json content
679 | '''
680 | json_dict = json.loads(json_content)
681 |         if not json_dict:
682 |             raise ValueError('The json content is empty')
683 |
684 | self._root = etree.Element('hierarchy', rotation='0')
685 | self._root_element = etree.Element('node')
686 | self._root.append(self._root_element)
687 | _build_etree_from_json(self._root_element, json_dict['activity']['root'])
688 |
689 |         self._all_visible_leaves = self._get_visible_leaves()
690 |
691 |         self._dom_location_dict = self._calculate_dom_location()
692 |
693 | def get_leaf_nodes(self):
694 | '''
695 | Returns all the leaf nodes in the view hierarchy
696 |
697 | '''
698 | return [
699 |
700 | LeafNode(
701 | element,
702 | self._all_visible_leaves,
703 | self._dom_location_dict[id(element)],
704 | self._screen_width,
705 | self._screen_height
706 | )
707 | for element in self._all_visible_leaves
708 | ]
709 |
710 | def get_ui_objects(self):
711 | '''
712 | Returns a list of all UI objects represented by leaf nodes
713 | '''
714 | return [
715 | LeafNode(element, self._all_visible_leaves, self._dom_location_dict[id(element)], self._screen_width, self._screen_height).uiobject
716 | for element in self._all_visible_leaves
717 | ]
718 |
719 | def dedup(self, click_x_and_y):
720 | '''
721 | Dedup UI objects with same text or content_desc
722 | Args
723 | click_x_and_y: The click x and y coordinates
724 | '''
725 | click_x, click_y = click_x_and_y
726 |
727 | name_element_map = collections.defaultdict(list)
728 |
729 | for element in self._all_visible_leaves:
730 | name = _build_object_name(
731 | element.get('text'),
732 | element.get('content-desc')
733 | )
734 | name_element_map[name].append(element)
735 |
736 | def delete_element(element):
737 | element.getparent().remove(element)
738 |
739 | for name, elements in name_element_map.items():
740 | if not name:
741 | continue
742 | target_index = None
743 | for index, element in enumerate(elements):
744 | box = _build_bounding_box(element.get('bounds'))
745 | if (box.x1 <= click_x <= box.x2) and (box.y1 <= click_y <= box.y2):
746 | target_index = index
747 | break
748 |
749 | if target_index is None:
750 | for ele in elements[1:]:
751 | delete_element(ele)
752 | else:
753 | for ele in elements[:target_index] + elements[target_index + 1:]:
754 | delete_element(ele)
755 |
756 | print('Dedup %d elements' % (len(elements) - 1))
757 |
758 |         self._all_visible_leaves = self._get_visible_leaves()
759 |         self._dom_location_dict = self._calculate_dom_location()
760 |
761 | def _get_visible_leaves(self):
762 | '''
763 |         Gets all visible leaves from the view hierarchy
764 |         Returns: all_visible_leaves, the list of the visible leaf elements
765 | '''
766 | all_elements = [element for element in self._root.iter('*')]
767 | button_elements = [element for element in all_elements if element.get('type') == 'XCUIElementTypeButton']
768 |
769 | for button in button_elements:
770 | self._make_button_a_leaf(button)
771 |
772 | all_visible_leaves = [
773 |
774 | element for element in all_elements if self._is_leaf(element) and
775 | strtobool(element.get('visible', default='true')) and
776 | self._is_within_screen_bound(element)
777 | ]
778 |
779 | return all_visible_leaves
780 |
781 | def _make_button_a_leaf(self, element):
782 | '''
783 |         If an element is a button, remove its children
784 | '''
785 | if element.get('type') == 'XCUIElementTypeButton':
786 | for child in element.findall('*'):
787 | element.remove(child)
788 |
789 | def _calculate_dom_location(self):
790 | '''
791 |         Calculate [depth, preorder-index, postorder-index] for each element
792 |
793 |         This method is not thread safe if multiple threads call it on the same ViewHierarchy object
794 |
795 | Returns:
796 | dom_location_dict, dict of
797 | {
798 | id(element): [depth, preorder-index, postorder-index]
799 | }
800 | '''
801 | dom_location_dict = collections.defaultdict(lambda: [None, None, None])
802 | for element in self._all_visible_leaves:
803 | ancestors = [node for node in element.iterancestors()]
804 | dom_location_dict[id(element)][DomLocationKey.DEPTH.value] = len(ancestors)
805 |
806 |         self._preorder_index = 0
807 | self._preorder_iterate(self._root, dom_location_dict)
808 | self._postorder_index = 0
809 | self._postorder_iterate(self._root, dom_location_dict)
810 | return dom_location_dict
811 |
812 | def _preorder_iterate(self, element, dom_location_dict):
813 | '''
814 | Preorder traversal on the view hierarchy tree
815 |         Args:
816 | element: The current etree element
817 | dom_location_dict: The dict of dom location info
818 |
819 | '''
820 | if self._is_leaf(element):
821 | dom_location_dict[id(element)][DomLocationKey.PREORDER_INDEX.value] = self._preorder_index
822 | self._preorder_index += 1
823 | for child in element:
824 | if child.getparent() == element:
825 | self._preorder_iterate(child, dom_location_dict)
826 |
827 | def _postorder_iterate(self, element, dom_location_dict):
828 | '''
829 | Postorder traversal on the view hierarchy tree
830 | Args:
831 | element: The current etree element
832 | dom_location_dict: The dict of dom location info
833 | '''
834 | for child in element:
835 | if child.getparent() == element:
836 | self._postorder_iterate(child, dom_location_dict)
837 | if self._is_leaf(element):
838 | dom_location_dict[id(element)][DomLocationKey.POSTORDER_INDEX.value] = self._postorder_index
839 | self._postorder_index += 1
840 |
841 | def _is_leaf(self, element):
842 | return not element.findall('.//*')
843 |
844 | def _is_within_screen_bound(self, element):
845 | '''
846 | Checks if the element is within the screen bound
847 | Args:
848 | element: The etree element object
849 | Returns:
850 | A boolean to indicate whether the element is within the screen bound
851 | '''
852 | x_1 = str(max(0, int(element.get('x'))))
853 |
854 | y_1 = str(max(0, int(element.get('y'))))
855 |
856 | x_2 = str(int(x_1) + int(element.get('width')))
857 |
858 | y_2 = str(int(y_1) + int(element.get('height')))
859 | # logger.info(x_1)
860 | inits = str([x_1, y_1])
861 |
862 | ends = str([x_2, y_2])
863 |
864 | bbox = _build_bounding_box(inits + ends)
865 |
866 | in_x = (0 <= bbox.x1 <= self._screen_width) or (0 <= bbox.x2 <= self._screen_width)
867 |
868 | in_y = (0 <= bbox.y1 <= self._screen_height) or (0 <= bbox.y2 <= self._screen_height)
869 |
870 | x1_less_than_x2 = bbox.x1 < bbox.x2
871 |
872 | y1_less_than_y2 = bbox.y1 < bbox.y2
873 |
874 | return in_x and in_y and x1_less_than_x2 and y1_less_than_y2
875 |
876 |
877 | class UI:
878 | def __init__(self, xml_file):
879 | self.xml_file = xml_file
880 | self.elements = {
881 | }
882 |
883 | def sortchildrenby_viewhierarchy(self, view, attr="bounds"):
884 | if attr == "bounds":
885 | bounds = [
886 | (ele.uiobject.bounding_box.x1, ele.uiobject.bounding_box.y1, ele.uiobject.bounding_box.x2, ele.uiobject.bounding_box.y2)
887 | for ele in view
888 | ]
889 | sorted_bounds_index = [
890 | bounds.index(i) for i in sorted(
891 | bounds, key=lambda x: (x[1], x[0])
892 | )
893 | ]
894 | sort_children = [view[i] for i in sorted_bounds_index]
895 | view[:] = sort_children
896 |
897 | def encoding(self):
898 | '''
899 | Encodes the UI into a string representation
900 |
901 | Returns:
902 | the string representation of the UI
903 | '''
904 | with open(self.xml_file, 'r', encoding='utf-8') as f:
905 | xml_content = f.read().encode()
906 |
907 | vh = ViewHierarchy(
908 | screen_width=XML_SCREEN_WIDTH,
909 | screen_height=XML_SCREEN_HEIGHT
910 | )
911 | vh.load_xml(xml_content)
912 | view_hierarchy_leaf_nodes = vh.get_leaf_nodes()
913 | # logger.info(view_hierarchy_leaf_nodes)
914 | self.sortchildrenby_viewhierarchy(
915 | view_hierarchy_leaf_nodes,
916 | attr="bounds")
917 |
918 | codes = ''
919 | for _id, ele in enumerate(view_hierarchy_leaf_nodes):
920 | obj_type_str = ele.uiobject.obj_type.name
921 | text = ele.uiobject.text
922 | text = text.replace('\n', ' ')
923 |
924 | resource_id = ele.uiobject.obj_name
925 |
926 | content_desc = ele.uiobject.content_desc
927 | # logger.info(resource_id)
928 |             # logger.info(content_desc)
929 |
930 | html_code = self.element_encoding(
931 | _id=_id,
932 | _obj_type=obj_type_str,
933 | _text=text,
934 | _content_desc=content_desc,
935 | _resource_id=resource_id
936 | )
937 |
938 | codes += html_code if html_code else ''
939 | self.elements[_id] = ele.uiobject
940 |
941 | codes = "\n" + codes + ""
942 |
943 | return codes
944 |
945 | def action_encoding(self):
946 | '''
947 | Get Heuristic of possible actions output
948 | {action_type: type, encoding}
949 | '''
950 | pass
951 |
952 | def element_encoding(self,
953 | _id,
954 | _obj_type,
955 | _text,
956 | _content_desc,
957 | _resource_id):
958 | '''
959 | Encodes the element into a string representation
960 |
961 | Args:
962 | _id: The id of the element
963 | _obj_type: The type of the element
964 | _text: The text of the element
965 | _content_desc: The content description of the element
966 | _resource_id: The resource id of the element
967 |
968 | Returns:
969 | The string representation of the element
970 | '''
971 | _class = _resource_id.split('.')[-1] if '.' in _resource_id else _resource_id
972 | _text = _text.strip()
973 | # logger.info(_id)
974 | # logger.info(_obj_type)
975 |
976 | assert _obj_type in CLASS_MAPPING.keys()
977 |
978 | tag = CLASS_MAPPING[_obj_type]
979 |
980 | if _obj_type == 'None':
981 | tag = ''
982 | code = ''
983 | if _obj_type == "XCUIElementTypeSwitch":
984 | code = f'\n'
985 | code += f'\n'
986 |
987 | elif _obj_type == "XCUIElementTypeImage":
988 | if _class == "":
989 | code = f'
\n'
990 | else:
991 | code = f'
\n'
992 | else:
993 | _text = _content_desc if _text == "" else _text
994 | if _class == "":
995 | code = f'<{tag} id="{_id}">{_text}{tag}>\n'
996 | else:
997 | code = f'<{tag} id="{_id}" class="{_class}">{_text}{tag}>\n'
998 | return code
999 |
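
The grid and distance helpers above are pure functions, so their behavior can be checked without a device. A small worked example (values chosen for the 430x932 screen constants above):

    from cognisim.device.ios.ios_view_hierarchy import (
        BoundingBox, _grid_location, _pixel_distance)

    # Center of this box is (215, 100): middle third on x, top third on y,
    # so it falls in the TOP_CENTER cell of the 3x3 grid.
    box = BoundingBox(x1=195, y1=80, x2=235, y2=120)
    print(_grid_location(box, 430, 932))  # UIObjectGridLocation.TOP_CENTER

    # b starts 10px right of a's right edge: positive gap of 10.
    print(_pixel_distance(0, 100, 110, 200))  # 10
    # b starts 1px inside a's right edge: the adjacency branch returns 1.
    print(_pixel_distance(0, 100, 99, 200))   # 1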
--------------------------------------------------------------------------------
/cognisim/device/ios_device.py:
--------------------------------------------------------------------------------
1 | from cognisim.device.device import Device
2 |
3 |
4 | class IOSDevice(Device):
5 | def __init__(self, app_start_url=""):
6 | pass
7 |
8 | def get_state(self):
9 | # TODO: Implement get_state for iOS device
10 | pass
11 |
12 | def tap(self, x, y):
13 | # TODO: Implement tap for iOS device
14 | pass
15 |
16 | def input(self, x, y, text):
17 | # TODO: Implement input for iOS device
18 | pass
19 |
20 | def swipe(self, x, y, direction):
21 | # TODO: Implement swipe for iOS device
22 | pass
23 |
--------------------------------------------------------------------------------
/cognisim/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Android Emulator Config
2 | SCREEN_WIDTH = 1080
3 | SCREEN_HEIGHT = 1920
4 | SCREEN_CHANNEL = 4
5 | SCREEN_TOP_HEAD = 63
6 | SCREEN_BOTTOM_HEAD = 126
7 | # screen config
8 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3
9 | NORM_VERTICAL_NEIGHBOR_MARGIN = 0.01
10 | NORM_HORIZONTAL_NEIGHBOR_MARGIN = 0.01
11 | INPUT_ACTION_UPSAMPLE_RATIO = 1
12 | # XML screen config
13 | XML_SCREEN_WIDTH = 1440
14 | XML_SCREEN_HEIGHT = 2960
15 |
16 | # Max number of reflections before going to next step
17 |
18 | MAX_REFLECTIONS = 5
19 |
20 | # PLAYWRIGHT TIMEOUTS
21 | BOUNDING_BOX_TIMEOUT = 3000
22 |
--------------------------------------------------------------------------------
/cookbook/agentic_example.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import base64
3 | import io
4 | import json
5 | import os
6 | from datetime import datetime
7 | from typing import Any, Dict
8 |
9 | import openai
10 | from loguru import logger
11 | from openai import OpenAI
12 | from PIL import Image
13 |
14 | from cognisim import mobileadapt
15 |
16 | openai.api_key = ""
17 |
18 |
19 | def llm_call(html_state: str, image: bytes, nlp_task: str):
20 | client = OpenAI()
21 |
22 | function_call_instruction_guided_replay = {
23 | "name": "run_step",
24 | "description": "Based on the current step and the current state, return the next action to take",
25 | "parameters": {
26 | "type": "object",
27 | "properties": {
28 | "reasoning": {
29 | "type": "string",
30 | "description": "The reasoning for the action to be performed in the current step",
31 | },
32 | "action_type": {
33 | "type": "string",
34 | "description": "The type of action to be performed",
35 | "enum": ["tap", "input", "swipe", "validate" "scroll"],
36 | },
37 | "action_id": {
38 | "type": "integer",
39 | "description": "The id of the action to be performed in the current step based on the current state",
40 | },
41 | "value": {
42 | "type": "string",
43 | "description": "The value to be inputted if action_type is input or the text to be validated if action_type is validate",
44 | },
45 | "direction": {
46 | "type": "string",
47 | "description": "The direction to be swiped if action_type is swipe",
48 | "enum": ["up", "down", "left", "right"],
49 | },
50 | },
51 | "required": ["action_type", "action_id", "reasoning"],
52 | },
53 | }
54 |
55 | response = client.chat.completions.create(
56 | model="gpt-4o-2024-08-06",
57 | messages=[
58 | {
59 | "role": "system",
60 | "content": "You are an AI assistant that helps with mobile app testing.",
61 | },
62 | {
63 | "role": "user",
64 | "content": [
65 | {
66 | "type": "text",
67 | "text": f"Given the following task: {nlp_task}\n\nAnd the current state of the app:\n\nHTML: {html_state}",
68 | },
69 | {
70 | "type": "image_url",
71 | "image_url": {
72 | "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}"
73 | },
74 | },
75 | ],
76 | },
77 | ],
78 | functions=[function_call_instruction_guided_replay],
79 | function_call={"name": "run_step"},
80 | )
81 |
82 | return json.loads(response.choices[0].message.function_call.arguments)
83 |
84 |
85 | async def main():
86 |
87 | android_device = mobileadapt(platform="android")
88 | # Start device
89 | await android_device.start_device()
90 |
91 | encoded_ui, screenshot, ui = await android_device.get_state()
92 |
93 | # Open the app (Flexify - https://f-droid.org/en/packages/com.presley.flexify/)
94 | await android_device.navigate("com.presley.flexify")
95 |
96 | # Press the button with the text 'Add a new task'
97 |
98 | encoded_ui, screenshot, ui = await android_device.get_state()
99 |
100 | # Create set of mark screenshot
101 | set_of_mark: bytes = android_device.generate_set_of_mark(ui, screenshot)
102 |
103 | action_grounded: Dict[str, Any] = llm_call(
104 | html_state=encoded_ui,
105 | image=set_of_mark,
106 | nlp_task="Press the buttom with the text 'Add a new task'",
107 | )
108 |
109 | await android_device.perform_action(action_grounded)
110 |
111 | encoded_ui, screenshot, ui = await android_device.get_state()
112 |
113 | # save set of mark screens
114 |
115 | await android_device.stop_device()
116 | await android_device.start_device()
117 |
118 |
119 | if __name__ == "__main__":
120 | asyncio.run(main())
121 |
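
For reference, the function-calling schema above constrains the model to return a payload shaped like the following (values hypothetical):

    # Hypothetical decoded `run_step` arguments returned by llm_call:
    action_grounded = {
        "reasoning": "Element 12 is labeled 'Add a new task' in the marked screenshot.",
        "action_type": "tap",
        "action_id": 12,
    }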
--------------------------------------------------------------------------------
/cookbook/examplescript2.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import io
3 | import os
4 | from datetime import datetime
5 |
6 | from PIL import Image
7 |
8 | from cognisim import mobileadapt
9 |
10 |
11 | async def save_screenshot(screenshot_data, filename):
12 | image = Image.open(io.BytesIO(screenshot_data))
13 | image.save(filename)
14 |
15 |
16 | async def perform_actions(device):
17 | # Tap actions
18 | await device.tap(200, 300)
19 | print("Tapped at (200, 300)")
20 | await device.tap(100, 400)
21 | print("Tapped at (100, 400)")
22 |
23 | # Swipe actions
24 | await device.swipe("up")
25 | print("Swiped up")
26 | await device.swipe("down")
27 | print("Swiped down")
28 | await device.swipe("left")
29 | print("Swiped left")
30 | await device.swipe("right")
31 | print("Swiped right")
32 |
33 | # Input text
34 | await device.input(150, 500, "Hello, MobileAdapt!")
35 | print("Input text at (150, 500)")
36 |
37 |
38 | async def main():
39 | android_device = mobileadapt(platform="android")
40 | await android_device.start_device()
41 |
42 | # Perform initial state capture
43 | encoded_ui, screenshot, ui = await android_device.get_state()
44 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
45 | filename = os.path.join(
46 | os.path.dirname(__file__), f"screenshot_initial_{timestamp}.png"
47 | )
48 | await save_screenshot(screenshot, filename)
49 | print(f"Initial screenshot saved as {filename}")
50 | print("Initial UI state:", encoded_ui)
51 |
52 | # Perform a series of actions and capture states
53 | for i in range(3):
54 | print(f"\nPerforming action set {i+1}")
55 | await perform_actions(android_device)
56 |
57 | encoded_ui, screenshot, ui = await android_device.get_state()
58 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
59 | filename = os.path.join(
60 | os.path.dirname(__file__), f"screenshot_action{i+1}_{timestamp}.png"
61 | )
62 | await save_screenshot(screenshot, filename)
63 | print(f"Screenshot after action set {i+1} saved as {filename}")
64 | print(f"UI state after action set {i+1}:", encoded_ui)
65 |
66 | # Additional complex interaction
67 | print("\nPerforming additional complex interaction")
68 | await android_device.tap(300, 300)
69 | await android_device.swipe("up")
70 | await android_device.input(200, 600, "Complex interaction")
71 | await android_device.swipe("left")
72 | await android_device.tap(150, 450)
73 |
74 | # Capture final state
75 | encoded_ui, screenshot, ui = await android_device.get_state()
76 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
77 | filename = os.path.join(
78 | os.path.dirname(__file__), f"screenshot_final_{timestamp}.png"
79 | )
80 | await save_screenshot(screenshot, filename)
81 | print(f"Final screenshot saved as {filename}")
82 | print("Final UI state:", encoded_ui)
83 |
84 |
85 | if __name__ == "__main__":
86 | asyncio.run(main())
87 |
--------------------------------------------------------------------------------
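The script above repeats the same timestamp/filename/save block after every action set. One way to collapse that boilerplate is a small capture helper; the sketch below reuses the script's own `save_screenshot`, and `capture_state` with its `label` parameter is an illustrative name rather than library API:

import os
from datetime import datetime


async def capture_state(device, label: str) -> str:
    # Capture the device state, save a timestamped screenshot next to this
    # script, and return the encoded UI so the caller can log or diff it.
    # `device` is any started mobileadapt device; `save_screenshot` is the
    # helper defined at the top of examplescript2.py.
    encoded_ui, screenshot, ui = await device.get_state()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(
        os.path.dirname(__file__), f"screenshot_{label}_{timestamp}.png"
    )
    await save_screenshot(screenshot, filename)
    print(f"Screenshot '{label}' saved as {filename}")
    return encoded_ui

With it, each capture in `main()` reduces to a single line, e.g. `encoded_ui = await capture_state(android_device, f"action{i+1}")`.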
/cookbook/smoke_example_android.py.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import io
3 | import os
4 | from datetime import datetime
5 |
6 | from loguru import logger
7 | from PIL import Image
8 |
9 | from cognisim import mobileadapt
10 |
11 |
12 | """ From the root directory use the following command to start the script:
13 | python example-scripts/examplescript.py
14 | """
15 |
16 |
17 | async def save_screenshot(screenshot_data, filename):
18 | # Open the screenshot data as an image and save it
19 | image = Image.open(io.BytesIO(screenshot_data))
20 | image.save(filename)
21 |
22 |
23 | async def main():
24 | # Create an Android device instance
25 | android_device = mobileadapt(platform="android")
26 |
27 | # Initialize the device (starts the Appium session)
28 | await android_device.start_device()
29 |
30 | # Get the current state of the device
31 | encoded_ui, screenshot, ui = await android_device.get_state()
32 | logger.info(f"Current state: {encoded_ui}")
33 |
34 | # Save the first screenshot
35 |     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
36 |     filename1 = os.path.join(os.path.dirname(__file__), f"screenshot_before_{timestamp}.png")
37 |     await save_screenshot(screenshot, filename1)
38 |     print(f"Screenshot saved as {filename1}")
39 |
40 | # Perform a tap action at coordinates (100, 100)
41 | await android_device.tap(100, 100)
42 |
43 | # Get the state again after the tap action
44 | new_encoded_ui, new_screenshot, new_ui = await android_device.get_state()
45 | print("New state after tap:", new_encoded_ui)
46 |
47 | # Save the second screenshot
48 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
49 | filename2 = os.path.join(
50 | os.path.dirname(__file__), f"screenshot_after_{timestamp}.png"
51 | )
52 | await save_screenshot(new_screenshot, filename2)
53 | print(f"Screenshot saved as {filename2}")
54 |
55 |
56 | if __name__ == "__main__":
57 | # Run the main function asynchronously
58 | asyncio.run(main())
59 |
--------------------------------------------------------------------------------
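Because `get_state()` returns the encoded UI as plain text (it is printed and logged directly in the smoke test above), a unified diff of the before/after states shows exactly what the tap at (100, 100) changed. A standard-library sketch; `diff_ui_states` is an illustrative helper, not part of cognisim:

import difflib


def diff_ui_states(before: str, after: str) -> str:
    # Produce a unified diff of two encoded UI states so the elements an
    # action added, removed, or relabeled stand out at a glance.
    return "\n".join(
        difflib.unified_diff(
            before.splitlines(),
            after.splitlines(),
            fromfile="before_tap",
            tofile="after_tap",
            lineterm="",
        )
    )

In the smoke test, `print(diff_ui_states(encoded_ui, new_encoded_ui))` after the tap would surface only the changed elements instead of two full dumps.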
/cookbook/smoke_example_ios.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from loguru import logger
4 |
5 | from cognisim import mobileadapt
6 |
7 |
8 | async def main():
9 |     # Create an iOS device instance and start the Appium session
10 |     ios_device = mobileadapt(platform="ios")
11 |     await ios_device.start_device()
12 |
13 |     # Get the current state of the device
14 |     encoded_ui, screenshot, ui = await ios_device.get_state()
15 |     logger.info(f"Current state: {encoded_ui}")
16 |
17 |
18 | if __name__ == "__main__":
19 |     asyncio.run(main())
20 |
--------------------------------------------------------------------------------
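The iOS smoke test stops after logging the encoded UI. The sketch below extends it to also persist a screenshot, assuming `get_state()` returns encoded screenshot bytes on iOS just as the Android examples rely on; the filename pattern is illustrative:

import asyncio
import io
import os
from datetime import datetime

from loguru import logger
from PIL import Image

from cognisim import mobileadapt


async def main():
    ios_device = mobileadapt(platform="ios")
    await ios_device.start_device()

    encoded_ui, screenshot, ui = await ios_device.get_state()
    logger.info(f"Current state: {encoded_ui}")

    # Save the screenshot next to this script, mirroring the Android examples.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(os.path.dirname(__file__), f"ios_screenshot_{timestamp}.png")
    Image.open(io.BytesIO(screenshot)).save(filename)
    logger.info(f"Screenshot saved as {filename}")


if __name__ == "__main__":
    asyncio.run(main())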
/deploy/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o errexit
4 |
5 | # Regular Colors
6 | Green='\033[0;32m'
7 | Yellow='\033[0;33m'
8 | Red='\033[0;31m'
9 | NC='\033[0m' # No Color
10 |
11 | # Change to the script's directory
12 | cd "$(dirname "$0")"
13 |
14 | is_command_present() {
15 | type "$1" >/dev/null 2>&1
16 | }
17 |
18 | check_os() {
19 | if [[ "$OSTYPE" == "darwin"* ]]; then
20 | echo "macOS detected"
21 | package_manager="brew"
22 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
23 | echo "Linux detected"
24 | if is_command_present apt-get; then
25 | package_manager="apt-get"
26 | elif is_command_present yum; then
27 | package_manager="yum"
28 | else
29 | echo "Unsupported package manager. Please install Python3, pip3, and npm manually."
30 | exit 1
31 | fi
32 | else
33 | echo "Unsupported OS"
34 | exit 1
35 | fi
36 | }
37 |
38 | request_sudo() {
39 | if [[ $EUID != 0 ]]; then
40 | sudo_cmd="sudo"
41 | echo "We need sudo access to complete the installation."
42 | sudo -v
43 | fi
44 | }
45 |
46 | install_dependencies() {
47 | echo "Installing dependencies..."
48 | if [[ $package_manager == "brew" ]]; then
49 | brew install python node
50 | elif [[ $package_manager == "apt-get" ]]; then
51 | $sudo_cmd $package_manager update
52 | $sudo_cmd $package_manager install -y python3 python3-pip nodejs npm
53 | else
54 | $sudo_cmd $package_manager install -y python3 python3-pip nodejs npm
55 | fi
56 | }
57 |
58 | install_python_dependencies() {
59 | echo "Setting up Python virtual environment..."
60 | python3 -m venv venv
61 | source venv/bin/activate
62 |
63 | echo "Upgrading pip..."
64 | python3 -m pip install --upgrade pip
65 |
66 | echo "Installing Python dependencies..."
67 | python3 -m pip install -r ../requirements.txt
68 | }
69 |
70 | install_appium() {
71 | echo "Installing Appium..."
72 | $sudo_cmd npm install -g appium
73 | }
74 |
75 | start_appium() {
76 | echo "Starting Appium server..."
77 | appium &
78 | APPIUM_PID=$!
79 | echo "Appium server started with PID: $APPIUM_PID"
80 | sleep 5 # Give Appium some time to start up
81 | }
82 |
83 | # Main script execution
84 | echo -e "${Green}Setting up the mobile adapter environment...${NC}"
85 |
86 | check_os
87 | request_sudo
88 | install_dependencies
89 | install_python_dependencies
90 | install_appium
91 | start_appium
92 |
93 | echo -e "${Green}Mobile adapter setup complete.${NC}"
94 | echo -e "${Yellow}Activating the virtual environment...${NC}"
95 | source venv/bin/activate  # the script already cd'd into its own directory above
96 | echo -e "${Green}Virtual environment activated. You can now use mobileadapt.${NC}"
97 | echo -e "${Yellow}To deactivate the virtual environment when you're done, type 'deactivate'.${NC}"
98 | echo -e "${Yellow}To stop Appium server, run: kill $APPIUM_PID${NC}"
99 |
100 | # Keep the script running to maintain the Appium server
101 | wait $APPIUM_PID
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["poetry-core>=1.0.0"]
3 | build-backend = "poetry.core.masonry.api"
4 |
5 | [tool.poetry]
6 | name = "cognisim"
7 | version = "0.1.0"
8 | description = "A package for mobile app adaptation and testing"
9 | authors = ["Revyl AI <anam@revyl.ai>"]
10 | license = "MIT"
11 | readme = "README.md"
12 | repository = "https://github.com/RevylAI/Mobileadapt"
13 | homepage = "https://mobileadapt.revyl.ai"
14 | packages = [{include = "cognisim"}]
15 | classifiers = [
16 | "Development Status :: 3 - Alpha",
17 | "Intended Audience :: Developers",
18 | "License :: OSI Approved :: MIT License",
19 | "Operating System :: OS Independent",
20 | "Programming Language :: Python :: 3",
21 |     "Programming Language :: Python :: 3.12",
22 |     "Programming Language :: Python :: 3 :: Only",
23 |     "Topic :: Software Development :: Testing",
24 | ]
25 |
26 | [tool.poetry.dependencies]
27 | python = "^3.12"
28 | appium-python-client = "*"
29 | loguru = "*"
30 | lxml = "*"
31 | numpy = "*"
32 | attrs = "*"
33 | str2bool = "^1.1"
34 | pillow = "^10.4.0"
35 | opencv-python = "^4.10.0.84"
36 | retrying = "^1.3.4"
37 | openai = "^1.43.0"
38 | setuptools = "^75.2.0"
39 |
40 | [tool.poetry.group.dev.dependencies]
41 | pytest = "^6.2"
42 | black = "^24.8.0"
43 | isort = "^5.13.2"
44 | mypy = "^1.11.2"
45 |
46 | [tool.poetry.urls]
47 | "Bug Tracker" = "https://github.com/RevylAI/Mobileadapt/issues"
48 |
49 |
50 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appium-python-client # Core Appium client for Python, essential for mobile automation
2 | loguru # Advanced logging library for better debug information
3 | lxml # Efficient XML and HTML processing, used for parsing view hierarchies
4 | numpy # Numerical computing library, useful for data manipulation and analysis
5 | attrs # Reduces boilerplate for Python classes, used in defining UI objects
6 | str2bool # Converts string flags to booleans
7 | pillow # Imaging library (PIL), used for screenshot handling in the cookbook
8 | opencv-python # Computer vision library (cv2)
9 | retrying # General-purpose retry decorator
10 | openai # OpenAI client, used by the cookbook agentic example
--------------------------------------------------------------------------------
/scripts/format.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")" || exit 1
3 | cd ..
4 |
5 |
6 | printf "\nFormatting Python 🧹\n"
7 | poetry run black .
8 |
9 | printf "\nSorting imports 🧹\n"
10 | poetry run isort .
11 |
12 |
--------------------------------------------------------------------------------
/scripts/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o errexit
4 |
5 | # Change to the script's directory
6 | cd "$(dirname "$0")"
7 |
8 | check_os() {
9 | if [[ "$OSTYPE" == "darwin"* ]]; then
10 | echo "macOS detected"
11 | package_manager="brew"
12 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
13 | echo "Linux detected"
14 | if type apt-get >/dev/null 2>&1; then
15 | package_manager="apt-get"
16 | elif type yum >/dev/null 2>&1; then
17 | package_manager="yum"
18 | else
19 | echo "Unsupported package manager. Please install Python3, pip3, and npm manually."
20 | exit 1
21 | fi
22 | else
23 | echo "Unsupported OS"
24 | exit 1
25 | fi
26 | }
27 |
28 | request_sudo() {
29 | if [[ $EUID != 0 ]]; then
30 | sudo_cmd="sudo"
31 | echo "We need sudo access to complete the installation."
32 | sudo -v
33 | fi
34 | }
35 |
36 | install_dependencies() {
37 | echo "Installing dependencies..."
38 | if [[ $package_manager == "brew" ]]; then
39 | brew install node
40 | elif [[ $package_manager == "apt-get" ]]; then
41 | $sudo_cmd $package_manager update
42 | $sudo_cmd $package_manager install -y nodejs npm
43 | else
44 | $sudo_cmd $package_manager install -y nodejs npm
45 | fi
46 | }
47 |
48 | install_appium() {
49 | echo "Installing Appium..."
50 | $sudo_cmd npm install -g appium
51 | }
52 |
53 | # Run the setup steps
54 | check_os
55 | request_sudo
56 | install_dependencies
57 | install_appium
58 |
59 | cd ..
60 | poetry install
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | setup(
7 | name="cognisim",
8 | version="0.1.0",
9 | author="Revyl AI",
10 | author_email="anam@revyl.ai",
11 |     description="A package for cross-platform LLM agentic testing",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/revyl-ai/mobileadapt",
15 | packages=find_packages(),
16 | classifiers=[
17 | "Development Status :: 3 - Alpha",
18 | "Intended Audience :: Developers",
19 | "License :: OSI Approved :: MIT License",
20 | "Operating System :: OS Independent",
21 | "Programming Language :: Python :: 3",
22 |         "Programming Language :: Python :: 3.12",
23 |         "Programming Language :: Python :: 3 :: Only",
24 |         "Topic :: Software Development :: Testing",
25 | ],
26 |     python_requires=">=3.12",
27 | install_requires=[
28 | "appium-python-client",
29 | "loguru",
30 | "lxml",
31 |         "numpy",
32 |         "attrs",
33 |         "str2bool",
34 |         "pillow",
35 |         "opencv-python",
36 |         "retrying",
37 |         "openai",
38 |     ],
39 | )
40 |
--------------------------------------------------------------------------------