├── .github ├── assets │ ├── dark_logo.png │ ├── mobile_adapt_example.mp4 │ └── set_of_mark.png └── workflows │ └── package.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── cognisim ├── __init__.py ├── device │ ├── android │ │ ├── __init__.py │ │ ├── android_device.py │ │ ├── android_ui.py │ │ └── android_view_hierarchy.py │ ├── device.py │ ├── device_factory.py │ ├── ios │ │ ├── __init__.py │ │ ├── ios_device.py │ │ └── ios_view_hierarchy.py │ └── ios_device.py └── utils │ └── constants.py ├── cookbook ├── agentic_example.py ├── examplescript2.py ├── smoke_example_android.py.py └── smoke_example_ios.py ├── deploy └── run.sh ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── scripts ├── format.sh └── setup.sh └── setup.py /.github/assets/dark_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/dark_logo.png -------------------------------------------------------------------------------- /.github/assets/mobile_adapt_example.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/mobile_adapt_example.mp4 -------------------------------------------------------------------------------- /.github/assets/set_of_mark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/.github/assets/set_of_mark.png -------------------------------------------------------------------------------- /.github/workflows/package.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - 'cognisim/**' 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - uses: actions/setup-python@v3 15 | with: 16 | python-version: 3.12 17 | 18 | - run: | 19 | pip install poetry 20 | poetry build 21 | 22 | - uses: actions/upload-artifact@v3 23 | with: 24 | path: ./dist 25 | 26 | pypi-publish: 27 | needs: ['build'] 28 | environment: 'publish' 29 | 30 | name: upload release to PyPI 31 | runs-on: ubuntu-latest 32 | permissions: 33 | # IMPORTANT: this permission is mandatory for trusted publishing 34 | id-token: write 35 | steps: 36 | - uses: actions/download-artifact@v3 37 | 38 | - name: Publish package distributions to PyPI 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | with: 41 | packages-dir: artifact/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | *.pbxuser 3 | !default.pbxuser 4 | *.mode1v3 5 | !default.mode1v3 6 | *.mode2v3 7 | !default.mode2v3 8 | *.perspectivev3 9 | !default.perspectivev3 10 | xcuserdata/ 11 | *.xccheckout 12 | *.moved-aside 13 | DerivedData 14 | *.hmap 15 | *.ipa 16 | *.xcuserstate 17 | project.xcworkspace 18 | *.xml 19 | # CocoaPods 20 | Pods/ 21 | 22 | # Carthage 23 | Carthage/Build 24 | 25 | # fastlane 26 | fastlane/report.xml 27 | fastlane/Preview.html 28 | fastlane/screenshots 29 | fastlane/test_output 30 | 31 | # Code Injection 32 | iOSInjectionProject/ 33 | 34 | # Android/IntelliJ 35 | build/ 36 | .idea 37 | .gradle 38 | local.properties 39 | *.iml 40 | 41 | # Node 42 | 
node_modules/ 43 | npm-debug.log 44 | yarn-error.log 45 | 46 | # BUCK 47 | buck-out/ 48 | \.buckd/ 49 | *.keystore 50 | 51 | # Bundle artifact 52 | *.jsbundle 53 | 54 | # Ruby / CocoaPods 55 | /ios/Pods/ 56 | 57 | # Temporary files 58 | *.swp 59 | *.swo 60 | *~ 61 | 62 | # OS generated files 63 | .DS_Store 64 | .DS_Store? 65 | ._* 66 | .Spotlight-V100 67 | .Trashes 68 | ehthumbs.db 69 | Thumbs.db 70 | 71 | # Python 72 | __pycache__/ 73 | *.py[cod] 74 | *$py.class 75 | 76 | # Virtual Environment 77 | venv/ 78 | env/ 79 | .venv/ 80 | .env/ 81 | 82 | # Distribution / packaging 83 | .Python 84 | develop-eggs/ 85 | dist/ 86 | downloads/ 87 | eggs/ 88 | .eggs/ 89 | lib/ 90 | lib64/ 91 | parts/ 92 | sdist/ 93 | var/ 94 | wheels/ 95 | *.egg-info/ 96 | .installed.cfg 97 | *.egg 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | 137 | constants.cpython-37.pyc 138 | 139 | # Ignore XML files in mobileadapt/device directory 140 | mobileadapt/device/*.xml 141 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to mobileadapt 2 | 3 | We're excited that you're interested in contributing to mobileadapt! This document outlines the process for contributing to this project. 4 | 5 | ## Getting Started 6 | 7 | 1. Fork the repository on GitHub. 8 | 2. Clone your fork locally: 9 | ``` 10 | git clone https://github.com/your-username/mobileadapt.git 11 | ``` 12 | 3. Create a new branch for your feature or bug fix: 13 | ``` 14 | git checkout -b feature/your-feature-name 15 | ``` 16 | 17 | ## Setting Up the Development Environment 18 | 19 | 1. Ensure you have Python 3.7+ installed. 20 | 2. Install Poetry (if not already installed): 21 | ``` 22 | curl -sSL https://install.python-poetry.org | python3 - 23 | ``` 24 | 3. Install the project dependencies: 25 | ``` 26 | poetry install 27 | ``` 28 | 4. Activate the virtual environment: 29 | ``` 30 | poetry shell 31 | ``` 32 | 5. Set up Appium and the necessary mobile SDKs as described in the project's README. 33 | 34 | ## Making Changes 35 | 36 | 1. Make your changes in your feature branch. 37 | 2. Add or update tests as necessary. 38 | 3. Ensure your code follows the project's coding style (we use PEP 8 for Python). 39 | 4. Run the test suite to make sure all tests pass: 40 | ``` 41 | poetry run python -m unittest discover tests 42 | ``` 43 | 44 | ## Updating Documentation 45 | 46 | 1. Any changes that affect the project's functionality, API, or usage should be reflected in the documentation. 47 | 2. The documentation for this project is maintained in a separate repository: [adaptdocs](https://github.com/RevylAI/adaptdocs). 48 | 3. Clone the documentation repository: 49 | ``` 50 | git clone https://github.com/RevylAI/adaptdocs.git 51 | ``` 52 | 4. Make the necessary updates to the relevant documentation files. 53 | 5. 
Submit a separate pull request to the adaptdocs repository with your documentation changes.
54 |
55 | ## Submitting Changes
56 |
57 | 1. Commit your changes:
58 | ```
59 | git commit -am "Add a brief description of your changes"
60 | ```
61 | 2. Push to your fork:
62 | ```
63 | git push origin feature/your-feature-name
64 | ```
65 | 3. Submit a pull request through the GitHub website.
66 | 4. If you've made documentation changes, submit a separate pull request to the adaptdocs repository.
67 |
68 | ## Pull Request Guidelines
69 |
70 | - Provide a clear title and description of your changes.
71 | - Include any relevant issue numbers in the PR description.
72 | - Ensure all tests pass and there are no linting errors.
73 | - Add or update documentation as necessary.
74 | - If your changes require documentation updates, mention the related PR in the adaptdocs repository.
75 |
76 | ## Reporting Bugs
77 |
78 | - Use the GitHub issue tracker to report bugs.
79 | - Describe the bug in detail, including steps to reproduce.
80 | - Include information about your environment (OS, Python version, etc.).
81 |
82 | ## Requesting Features
83 |
84 | - Use the GitHub issue tracker to suggest new features.
85 | - Clearly describe the feature and its potential benefits.
86 | - Be open to discussion about the feature's implementation.
87 |
88 | ## Code Review Process
89 |
90 | The core team will review your pull request. We may suggest changes, improvements, or alternatives.
91 |
92 | ## Coding Conventions
93 |
94 | - Follow the PEP 8 style guide for Python code.
95 | - Use meaningful variable and function names.
96 | - Comment your code where necessary, especially for complex logic.
97 | - Write docstrings for all functions, classes, and modules.
98 |
99 | ## License
100 |
101 | By contributing to mobileadapt, you agree that your contributions will be licensed under the project's MIT license.
102 |
103 | Thank you for contributing to mobileadapt!
104 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RevylAI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cognisim
2 |
3 | [Company Website](https://revyl.ai) | [Twitter](https://x.com/tryrevyl)
4 |
5 | ![Revyl AI Logo](.github/assets/dark_logo.png)
6 |
7 |
8 |
9 |
10 |
11 | ### Interaction utilities for cross-platform interaction agents
12 |
13 | **LLM Control Library for iOS and Android**
14 |
15 | Have you ever wanted to test your mobile app or control iOS and Android devices with an LLM? You have probably run into context problems because the accessibility view is too long, or settled for sending a single screenshot to the LLM, which provides limited accuracy.
16 |
17 | [Watch the demo video](.github/assets/mobile_adapt_example.mp4)
18 |
19 | (Example of using CogniSim to control an Android device in the Arc'teryx app.)
20 |
21 |
22 | **Our Solution**
23 |
24 | We combine the accessibility tree with set-of-mark prompting to provide a readable state for the LLM.
25 |
26 | **Real-World Application**
27 |
28 | At Revyl, we use this approach to test mobile apps with LLMs. Our platform integrates resilient end-to-end tests using agentic LLMs with OpenTelemetry tracing, offering proactive observability into your mobile app.
29 |
30 | If you are interested in putting your testing on autopilot and catching bugs before your users do,
31 |
32 |
33 |
34 | [book a demo with us](https://cal.com/landseer-enga/book-a-demo).
35 |
36 |
37 | #### [Revyl AI](https://revyl.ai)
38 |
39 | ### Prerequisites
40 |
41 | - Android Virtual Device (for Android adaptation)
42 | - iOS Simulator and Xcode (for iOS adaptation)
43 | - macOS or Linux (recommended)
44 |
45 |
46 | ## Quick Start
47 |
48 |
49 | Create an iOS simulator or Android emulator and make sure you have Appium installed.
50 |
51 |
52 | For macOS, install Appium using Homebrew:
53 | ```bash
54 | brew install appium
55 | ```
56 |
57 | For all other operating systems, install Appium using npm:
58 | ```bash
59 | npm i -g appium
60 | ```
61 |
62 |
63 | To install the cognisim package:
64 |
65 |
66 | ```bash
67 | poetry add cognisim
68 | ```
69 |
70 | or, if you have pip installed:
71 |
72 | ```bash
73 | pip install cognisim
74 | ```
75 |
76 | For detailed instructions on getting started with Mobileadapt, please refer to our [Quickstart Guide](https://mobileadapt.revyl.ai/quickstart).
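Putting it all together, a typical control loop reads the encoded UI state, asks a model to pick an element id, and resolves that id back to screen coordinates. Here is a minimal, hypothetical sketch — `pick_element_id` is a stub standing in for your own LLM call; everything else uses the public API shown in the Usage section below:

```python
import asyncio
from cognisim import mobileadapt

def pick_element_id(encoded_ui: str) -> int:
    # Stub: send `encoded_ui` (and optionally the set-of-mark image) to
    # your model and parse an element id out of its reply. Hard-coded
    # here only to keep the sketch self-contained.
    return 0

async def main():
    device = mobileadapt(platform="android")
    await device.start_device()

    # encoded_ui is the pseudo-HTML state, screenshot is raw PNG bytes,
    # and ui maps each numeric id back to its parsed UI element.
    encoded_ui, screenshot, ui = await device.get_state()
    marked_png = device.generate_set_of_mark(ui, screenshot)  # annotated screenshot

    element_id = pick_element_id(encoded_ui)
    box = ui.elements[element_id].bounding_box
    # Tap the center of the element the model chose.
    await device.tap((box.x1 + box.x2) // 2, (box.y1 + box.y2) // 2)

if __name__ == "__main__":
    asyncio.run(main())
```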
77 |
78 |
79 | # Usage
80 | ### Android Basic Example
81 |
82 | ```python
83 | import asyncio
84 | from cognisim import mobileadapt
85 |
86 | async def main():
87 |     # Initialize and start Android device
88 |     android_device = mobileadapt(platform="android")
89 |     await android_device.start_device()
90 |
91 |     # Get initial state and perform tap
92 |     _, _, _ = await android_device.get_state()
93 |     await android_device.tap(100, 100)
94 |
95 |     # Get state after tap
96 |     new_encoded_ui, _, _ = await android_device.get_state()
97 |     print("State after tap:", new_encoded_ui)
98 |
99 | if __name__ == "__main__":
100 |     asyncio.run(main())
101 | ```
102 |
103 | ### iOS Basic Example
104 |
105 | ```python
106 | import asyncio
107 | from cognisim import mobileadapt
108 |
109 | async def main():
110 |     # Initialize and start iOS device
111 |     ios_device = mobileadapt(platform="ios")
112 |     await ios_device.start_device()
113 |
114 |     # Get device state
115 |     encoded_ui, _, _ = await ios_device.get_state()
116 |     print("Current state:", encoded_ui)
117 |
118 | if __name__ == "__main__":
119 |     asyncio.run(main())
120 | ```
121 |
122 | ### See the [Documentation](https://mobileadapt.revyl.ai) or the cookbook folder for more examples and usage.
123 |
124 |
125 |
126 |
127 | ## Documentation
128 |
129 | For full documentation, visit [mobileadapt.revyl.ai](https://mobileadapt.revyl.ai).
130 |
131 |
132 | ## Key Features
133 |
134 | - **Android Support**: Works seamlessly with Android devices and emulators.
135 | - **iOS Support**: Works seamlessly with iOS devices and simulators.
136 | - **Appium Integration**: Leverages the power of Appium for reliable mobile automation.
137 | - **LLM Agent Compatibility**: Designed to work seamlessly with language model agents.
138 |
139 |
140 |
141 |
142 | ### Local Development
143 |
144 | 1. Clone the repository:
145 | ```bash
146 | git clone https://github.com/RevylAI/Mobileadapt.git && cd Mobileadapt
147 | ```
148 |
149 | 2. Start the server:
150 | ```bash
151 | ./scripts/setup.sh
152 | ```
153 |
154 | ## Roadmap
155 | - [x] iOS Support
156 | - [ ] Abstract to drivers other than Appium
157 | - [ ] Recording interactions
158 | - [ ] Screen sharing via websocket to host recording
159 |
160 |
161 |
162 |
163 | ## Contributing
164 |
165 | We welcome contributions to the Mobileadapt project! If you'd like to contribute, please check our [Contribution Guidelines](https://github.com/RevylAI/Mobileadapt/blob/main/CONTRIBUTING.md).
166 |
167 | ## License
168 |
169 | Mobileadapt is released under the MIT License. See the [LICENSE](https://github.com/RevylAI/Mobileadapt/blob/main/LICENSE) file for more details.
170 |
171 |
172 |
173 | # Credits
174 |
175 | ```bibtex
176 | @inproceedings{shvoEtAl2021appbuddy,
177 |   title={AppBuddy: Learning to Accomplish Tasks in Mobile Apps via Reinforcement Learning},
178 |   author={Maayan Shvo and Zhiming Hu and Rodrigo Toro Icarte and Iqbal Mohomed and Allan D. Jepson and Sheila A. McIlraith},
179 |   booktitle={Canadian Conference on Artificial Intelligence},
180 |   year={2021}
181 | }
182 |
183 | @misc{google-research,
184 |   title={Google Research},
185 |   author={Google},
186 |   year={2021},
187 |   howpublished={\url{https://github.com/Berrylcm/google-research}}
188 | }
189 | ```
190 |
191 |
192 |
193 | ## How does it work?
194 |
195 | We use Appium under the hood to control the device and collect the UI.
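Under the hood, `get_state` dumps the UIAutomator2 view hierarchy through Appium, parses it into leaf nodes, and encodes each node as a pseudo-HTML tag with a numeric id. A condensed sketch of the Android path, simplified from `android_device.py` (it assumes a running Appium server on the default port with an Android device attached; the file name is arbitrary):

```python
from appium import webdriver
from appium.options.android import UiAutomator2Options
from cognisim.device.android.android_device import UI

driver = webdriver.Remote("http://localhost:4723",
                          options=UiAutomator2Options())

# 1. Dump the current view hierarchy as XML.
with open("hierarchy.xml", "w") as f:
    f.write(driver.page_source)

# 2. Parse it into leaf nodes and encode them as pseudo-HTML with ids.
ui = UI("hierarchy.xml")
encoded_ui = ui.encoding()  # "<html>...</html>" string sent to the LLM

# 3. Pair the encoding with a screenshot for set-of-mark annotation.
screenshot = driver.get_screenshot_as_png()
```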
We then use a custom UI parser to convert the UI to a string that can be used by the LLM.
196 |
197 | The UI is parsed, a set of marks is generated for the screenshot, and both are sent to the LLM. For example, the parsed UI might look like this:
198 |
199 | ```html
200 | <html>
201 | <p id=0>revyl.ai</p>
202 | <p id=1>Connection is secure</p>
203 | <img id=2>None</img>
204 | <img id=3>None</img>
205 | <button id=4>Revyl is in private beta →</button>
206 | <img id=5>None</img>
207 | <p id=6>Revyl</p>
208 | <img id=7>None</img>
209 | <p id=8>AI Native Proactive Observability</p>
210 | <p id=9>Catch bugs</p>
211 | <p id=10>they happen using agentic E2E testing and OpenTelemetry's Tracing. Book a demo</p>
212 | <p id=11>before</p>
213 | <p id=12>now</p>
214 | <p id=13>!</p>
215 | <button id=14>Book a demo</button>
216 | <p id=15>TRUSTED AND BUILT BY ENGINEERS AT</p>
217 | <p id=16>VendorPM</p>
218 | </html>
219 | ```
220 |
221 | This structured representation of the UI elements is then used by the LLM to understand and interact with the mobile interface.
222 |
223 | Each id is mapped to an element in the UI.
224 |
225 | We also generate a set-of-mark annotation of the given state.
226 |
227 | Here's an example of a set-of-mark image generated for the UI state:
228 |
229 | ![Set-of-mark example](.github/assets/set_of_mark.png)
230 |
231 | This image shows the UI elements with their corresponding IDs overlaid on the screenshot. This visual representation helps the LLM understand the layout and structure of the interface, making it easier to interact with specific elements.
232 |
233 | ## Citations
234 |
235 | ```bibtex
236 | @misc{revylai2024mobileadapt,
237 |   title = {Cognisim},
238 |   author = {Anam Hira and Landseer Enga and Aarib Sarker and Wasif Sarker and Hanzel Hira and Sushan Leel},
239 |   year = {2024},
240 |   howpublished = {GitHub},
241 |   url = {https://github.com/RevylAI/Mobileadapt}
242 | }
243 | ```
--------------------------------------------------------------------------------
/cognisim/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .device.device_factory import DeviceFactory
3 |
4 |
5 | def mobileadapt(
6 |     platform: str,
7 |     app_url: str = None,
8 |     state_representation="aria",
9 |     download_directory="default",
10 |     session_id=None,
11 | ):
12 |     return DeviceFactory.create_device(
13 |         platform, app_url, state_representation, download_directory, session_id
14 |     )
15 |
16 |
17 | __all__ = ["mobileadapt"]
18 |
--------------------------------------------------------------------------------
/cognisim/device/android/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/cognisim/device/android/__init__.py
--------------------------------------------------------------------------------
/cognisim/device/android/android_device.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from datetime import datetime
3 | from cognisim.device.device import Device
4 | from appium import webdriver
5 | from appium.options.android import UiAutomator2Options
6 | from cognisim.device.android.android_view_hierarchy import ViewHierarchy
7 | import cv2
8 | from loguru import logger
9 | import numpy as np
10 | import os
11 | # Android Emulator Config
12 | SCREEN_WIDTH = 1080
13 | SCREEN_HEIGHT = 1920
14 | SCREEN_CHANNEL = 4
15 | SCREEN_TOP_HEAD = 63
16 | SCREEN_BOTTOM_HEAD = 126
17 | # screen config
18 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3
19 | NORM_VERTICAL_NEIGHBOR_MARGIN = 0.01
20 | NORM_HORIZONTAL_NEIGHBOR_MARGIN = 0.01
21 | INPUT_ACTION_UPSAMPLE_RATIO = 1
22 | # XML screen config
23 | XML_SCREEN_WIDTH = 1440
24 | XML_SCREEN_HEIGHT = 2960
25 | # Get state implementation
26 |
27 |
28 | def sortchildrenby_viewhierarchy(view, attr="bounds"):
29 |     if attr == 'bounds':
30 |         bounds = [(ele.uiobject.bounding_box.x1, ele.uiobject.bounding_box.y1,
31 |                    ele.uiobject.bounding_box.x2, ele.uiobject.bounding_box.y2)
32 |                   for ele in view]
33 |         sorted_bound_index = [
34 |             bounds.index(i) for i in sorted(
35 |                 bounds, key=lambda x: (
36 |                     x[1], x[0]))]
37 |
38 |         sort_children = [view[i] for i in sorted_bound_index]
39 |         view[:] = sort_children
40 |
41 |
42 | CLASS_MAPPING = {
43 |     'TEXTVIEW': 'p',
44 |     'BUTTON': 'button',
45 |     'IMAGEBUTTON': 'button',
46 |     'IMAGEVIEW': 'img',
47 |     'EDITTEXT': 'input',
48 |     'CHECKBOX': 'input',
49 |     'CHECKEDTEXTVIEW': 'input',
50 |     'TOGGLEBUTTON': 'button',
51 |     'RADIOBUTTON': 'input',
52 |     'SPINNER': 'select',
53 |     'SWITCH': 'input',
54 |     'SLIDINGDRAWER': 'input',
55 |     'TABWIDGET': 'div',
56 |     'VIDEOVIEW': 'video',
57 |     'SEARCHVIEW': 'div'
58 | }
59 |
60 |
61 | class UI():
62 |     def __init__(self, xml_file):
63 |         self.xml_file = xml_file
64 |         self.elements = {}
65 |
66 |     def encoding(self):
67 |         logger.info('reading hierarchy tree from {} ...'.format(
68 |             self.xml_file.split('/')[-1]))
69 |         with open(self.xml_file, 'r', encoding='utf-8') as f:
70 |             vh_data = f.read().encode()
71 |
72 |         vh = ViewHierarchy(
73 |             screen_width=XML_SCREEN_WIDTH,
74 |             screen_height=XML_SCREEN_HEIGHT)
75 |         vh.load_xml(vh_data)
76 |         view_hierarchy_leaf_nodes = vh.get_leaf_nodes()
77 |         sortchildrenby_viewhierarchy(view_hierarchy_leaf_nodes, 'bounds')
78 |
79 |         logger.debug('encoding the ui elements in hierarchy tree...')
80 |         codes = ''
81 |         # logger.info(view_hierarchy_leaf_nodes)
82 |         for _id, ele in enumerate(view_hierarchy_leaf_nodes):
83 |             obj_type = ele.uiobject.obj_type.name
84 |             text = ele.uiobject.text
85 |             text = text.replace('\n', ' ')
86 |             resource_id = ele.uiobject.resource_id if ele.uiobject.resource_id is not None else ''
87 |             content_desc = ele.uiobject.content_desc
88 |             html_code = self.element_encoding(
89 |                 _id, obj_type, text, content_desc, resource_id)
90 |             codes += html_code
91 |             self.elements[_id] = ele.uiobject
92 |         codes = '<html>\n' + codes + '</html>'
93 |         return codes
94 |
95 |     def element_encoding(
96 |             self,
97 |             _id,
98 |             _obj_type,
99 |             _text,
100 |             _content_desc,
101 |             _resource_id):
102 |
103 |         _class = _resource_id.split('id/')[-1].strip()
104 |         _text = _text.strip()
105 |         assert _obj_type in CLASS_MAPPING, f'unknown object type: {_obj_type}'
106 |         tag = CLASS_MAPPING[_obj_type]
107 |
108 |         if _obj_type in ['CHECKBOX', 'CHECKEDTEXTVIEW', 'SWITCH']:
109 |             code = f' <input id={_id} type="checkbox" class="{_class}">\n'
110 |             code += f' <label for={_id}>{_text}</label>\n'
111 |         elif _obj_type == 'RADIOBUTTON':
112 |             code = f' <input id={_id} type="radio" class="{_class}">\n'
113 |             code += f' <label for={_id}>{_text}</label>\n'
114 |         elif _obj_type == 'SPINNER':
115 |             code = f' <label for={_id}>{_text}</label>\n'
116 |             code += f' <select id={_id} class="{_class}"></select>\n'
117 |         elif _obj_type == 'IMAGEVIEW':
118 |             if _class == "":
119 |                 code = f' <img id={_id}>{_content_desc}</img>\n'
120 |             else:
121 |                 code = f' <img id={_id} class="{_class}">{_content_desc}</img>\n'
122 |         else:
123 |             if _class == "":
124 |                 _text = _content_desc if _text == "" else _text
125 |                 code = f' <{tag} id={_id}>{_text}</{tag}>\n'
126 |             else:
127 |                 _text = _content_desc if _text == "" else _text
128 |                 code = f' <{tag} id={_id} class="{_class}">{_text}</{tag}>\n'
129 |         return code
130 |
131 |
132 | class AndroidDevice(Device):
133 |     def __init__(self, app_package, download_directory='default', session_id=None):
134 |         super().__init__(app_package)
135 |         self.download_directory = download_directory
136 |         self.session_id = session_id
137 |         self.desired_caps = {
138 |             'deviceName': 'Android Device',
139 |             'automationName': 'UiAutomator2',
140 |             'autoGrantPermissions': True,
141 |             'newCommandTimeout': 600,
142 |             'mjpegScreenshotUrl': 'http://localhost:4723/stream.mjpeg',
143 |
144 |         }
145 |         self.options = UiAutomator2Options().load_capabilities(self.desired_caps)
146 |
147 |     async def get_state(self):
148 |         raw_appium_state = self.driver.page_source
149 |
150 |         file_path = os.path.join(os.path.dirname(__file__), 'android_view_hierarchy.xml')
151 |         with open(file_path, 'w') as xml_file:
152 |             xml_file.write(raw_appium_state)
153 |
154 |         ui = UI(file_path)
155 |         encoded_ui: str = ui.encoding()
156 |         logger.info(f"Encoded UI: {encoded_ui}")
157 |
158 |         # Take screenshot as PNG bytes
159 | 
screenshot: bytes = self.driver.get_screenshot_as_png() 160 | 161 | # Return encoded UI and screenshot 162 | return encoded_ui, screenshot, ui 163 | 164 | async def navigate(self, package_name): 165 | """ 166 | Opens the specified package using Appium with UiAutomator2. 167 | 168 | :param package_name: The package name of the app to open 169 | """ 170 | try: 171 | self.driver.activate_app(package_name) 172 | logger.info(f"Successfully opened package: {package_name}") 173 | except Exception as e: 174 | logger.error(f"Failed to open package {package_name}. Error: {str(e)}") 175 | raise 176 | 177 | async def tap(self, x, y): 178 | self.driver.tap([(x, y)], 1) 179 | 180 | async def input(self, x, y, text): 181 | await self.tap(x, y) 182 | self.driver.execute_script('mobile: type', {'text': text}) 183 | 184 | async def drag(self, startX, startY, endX, endY): 185 | self.driver.swipe(startX, startY, endX, endY, duration=1000) 186 | 187 | async def scroll(self, direction): 188 | direction_map = { 189 | 'up': 'UP', 190 | 'down': 'DOWN', 191 | 'left': 'LEFT', 192 | 'right': 'RIGHT' 193 | } 194 | self.driver.execute_script('mobile: scroll', {'direction': direction_map[direction]}) 195 | 196 | async def swipe(self, direction): 197 | left = self.window_size["width"] * 0.2 198 | top = self.window_size["height"] * 0.2 199 | width = self.window_size["width"] * 0.6 200 | height = self.window_size["height"] * 0.6 201 | self.driver.execute_script("mobile: swipeGesture", { 202 | "left": left, 203 | "top": top, 204 | "width": width, 205 | "height": height, 206 | "direction": direction, 207 | "percent": 1.0 208 | }) 209 | 210 | async def start_recording(self): 211 | """ 212 | Starts screen recording on the Android device. 213 | 214 | Returns: 215 | None 216 | """ 217 | try: 218 | self.driver.start_recording_screen() 219 | logger.info("Screen recording started successfully") 220 | except Exception as e: 221 | logger.error(f"Failed to start screen recording. Error: {str(e)}") 222 | raise 223 | 224 | async def stop_recording(self, save_path=None): 225 | """ 226 | Stops screen recording on the Android device and saves the video. 227 | 228 | Args: 229 | save_path (str, optional): Path to save the video file. If not provided, a default path will be used. 230 | 231 | Returns: 232 | str: Path to the saved video file 233 | """ 234 | video_base64 = self.driver.stop_recording_screen() 235 | 236 | if save_path is None: 237 | # Create a unique filename using timestamp 238 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 239 | filename = f"screen_recording_{timestamp}.mp4" 240 | 241 | # Define the default save path 242 | save_dir = os.path.join(os.getcwd(), "recordings") 243 | os.makedirs(save_dir, exist_ok=True) 244 | save_path = os.path.join(save_dir, filename) 245 | 246 | # Decode and save the video 247 | with open(save_path, "wb") as video_file: 248 | video_file.write(base64.b64decode(video_base64)) 249 | 250 | logger.info(f"Screen recording saved to: {save_path}") 251 | return save_path 252 | 253 | async def stop_device(self): 254 | ''' 255 | Stops a test 256 | ''' 257 | pass 258 | 259 | async def capture_screenshot_with_bounding_box(self, bounds: dict, image_state: bytes = None) -> bytes: 260 | """ 261 | Capture a screenshot with a bounding box drawn around a specified element. 262 | 263 | Args: 264 | bounds (dict): A dictionary containing the bounding box coordinates. 265 | Expected keys are x1, y1, x2, y2, all of which are integers. 266 | image_state (bytes, optional): The current screenshot if available. 
267 | 268 | Returns: 269 | bytes: The screenshot image with bounding box as bytes. 270 | """ 271 | logger.info("Creating tagged image") 272 | screenshot = image_state if image_state is not None else await self.device.screenshot() 273 | if screenshot is None: 274 | logger.info("Screenshot failed") 275 | return None 276 | 277 | # Convert the screenshot to a NumPy array 278 | image_np = np.frombuffer(screenshot, dtype=np.uint8) 279 | image = cv2.imdecode(image_np, cv2.IMREAD_COLOR) 280 | 281 | # Extract bounding box coordinates 282 | x1 = int(bounds[0]) 283 | y1 = int(bounds[1]) 284 | x2 = int(bounds[2]) 285 | y2 = int(bounds[3]) 286 | 287 | # Calculate width and height 288 | # width = x2 - x1 289 | # height = y2 - y1 290 | 291 | bright_color = (128, 0, 128) # Pink color 292 | # Draw the bounding box on the image 293 | cv2.rectangle(image, (x1, y1), (x2, y2), bright_color, 5) 294 | 295 | # Convert the image back to bytes 296 | _, encoded_image = cv2.imencode('.png', image) 297 | screenshot_with_bounding_box = encoded_image.tobytes() 298 | 299 | return screenshot_with_bounding_box 300 | 301 | def generate_set_of_mark(self, 302 | ui, 303 | image: bytes, 304 | position='top-left') -> bytes: 305 | ''' 306 | Code to generate a set of mark for a given image and UI state 307 | ui: UI object 308 | image: bytes of the image 309 | step_id: step ids 310 | position: position of the annotation, defaults to 'top-lefts', can also be 'center' 311 | ''' 312 | # Convert image bytes to numpy array 313 | nparr = np.frombuffer(image, np.uint8) 314 | img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 315 | height, width, _ = img.shape 316 | 317 | # Define the minimum area 318 | k = 3000 319 | 320 | for element_id in ui.elements: 321 | bounds = [ 322 | ui.elements[element_id].bounding_box.x1, 323 | ui.elements[element_id].bounding_box.y1, 324 | ui.elements[element_id].bounding_box.x2, 325 | ui.elements[element_id].bounding_box.y2 326 | ] 327 | 328 | # Calculate the area of the bounding box 329 | area = (bounds[2] - bounds[0]) * (bounds[3] - bounds[1]) 330 | 331 | # Only label elements with an area over k 332 | if area > k: 333 | # Draw a rectangle around the element 334 | cv2.rectangle( 335 | img, (int(bounds[0]), int(bounds[1])), 336 | (int(bounds[2]), int(bounds[3])), (0, 0, 255), 5) 337 | 338 | text = str(element_id) 339 | text_size = 2 # Fixed text size 340 | font = cv2.FONT_HERSHEY_SIMPLEX 341 | 342 | # Calculate the width and height of the text 343 | text_width, text_height = cv2.getTextSize( 344 | text, font, text_size, 2)[0] 345 | 346 | # Calculate the position of the text 347 | if position == 'top-left': 348 | text_x = int(bounds[0]) 349 | text_y = int(bounds[1]) + text_height 350 | else: # Default to center 351 | text_x = (int(bounds[0]) + int(bounds[2])) // 2 - text_width // 2 352 | text_y = (int(bounds[1]) + int(bounds[3])) // 2 + text_height // 2 353 | 354 | # Draw a black rectangle behind the text 355 | cv2.rectangle(img, (text_x, text_y - text_height), 356 | (text_x + text_width, text_y), (0, 0, 0), thickness=cv2.FILLED) 357 | 358 | # Draw the text in white 359 | cv2.putText(img, text, (text_x, text_y), font, 360 | text_size, (255, 255, 255), 4) 361 | 362 | # Convert the image to bytes 363 | _, img_encoded = cv2.imencode('.png', img) 364 | img_bytes = img_encoded.tobytes() 365 | 366 | return img_bytes 367 | 368 | async def start_device(self): 369 | ''' 370 | Start the Android device and connect to the appium server 371 | ''' 372 | try: 373 | self.driver = webdriver.Remote('http://localhost:4723', 
options=self.options)
374 |         except BaseException:
375 |             self.desired_caps.pop('mjpegScreenshotUrl')
376 |             self.options = UiAutomator2Options().load_capabilities(self.desired_caps)
377 |             self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
378 |
379 |         # self.driver.start_recording_screen()
380 |         self.driver.update_settings({'waitForIdleTimeout': 0, 'shouldWaitForQuiescence': False, 'maxTypingFrequency': 60})
381 |         # self.driver.get_screenshot_as_base64()
382 |         # self.driver.execute_script('mobile: startScreenStreaming', {
383 |         #     'width': 1080,
384 |         #     'height': 1920,
385 |         #     'considerRotation': True,
386 |         #     'quality': 45,
387 |         #     'bitRate': 500000,
388 |         # })
389 |
390 |
391 | if __name__ == "__main__":
392 |     ui = UI(os.path.join(os.path.dirname(__file__), 'android_view_hierarchy.xml'))
393 |     encoded_ui = ui.encoding()
394 |     logger.info(f"Encoded UI: {encoded_ui}")
395 |
--------------------------------------------------------------------------------
/cognisim/device/android/android_ui.py:
--------------------------------------------------------------------------------
1 | CLASS_MAPPING = {
2 |     "TEXTVIEW": "p",
3 |     "BUTTON": "button",
4 |     "IMAGEBUTTON": "button",
5 |     "IMAGEVIEW": "img",
6 |     "EDITTEXT": "input",
7 |     "CHECKBOX": "input",
8 |     "CHECKEDTEXTVIEW": "input",
9 |     "TOGGLEBUTTON": "button",
10 |     "RADIOBUTTON": "input",
11 |     "SPINNER": "select",
12 |     "SWITCH": "input",
13 |     "SLIDINGDRAWER": "input",
14 |     "TABWIDGET": "div",
15 |     "VIDEOVIEW": "video",
16 |     "SEARCHVIEW": "div",
17 | }
18 |
19 | from loguru import logger
20 |
21 | from cognisim.device.android.android_view_hierarchy import ViewHierarchy
22 | from cognisim.utils.constants import XML_SCREEN_HEIGHT, XML_SCREEN_WIDTH
23 |
24 |
25 | def sortchildrenby_viewhierarchy(view, attr="bounds"):
26 |     if attr == "bounds":
27 |         bounds = [
28 |             (
29 |                 ele.uiobject.bounding_box.x1,
30 |                 ele.uiobject.bounding_box.y1,
31 |                 ele.uiobject.bounding_box.x2,
32 |                 ele.uiobject.bounding_box.y2,
33 |             )
34 |             for ele in view
35 |         ]
36 |         sorted_bound_index = [
37 |             bounds.index(i) for i in sorted(bounds, key=lambda x: (x[1], x[0]))
38 |         ]
39 |
40 |         sort_children = [view[i] for i in sorted_bound_index]
41 |         view[:] = sort_children
42 |
43 |
44 | class UI:
45 |     def __init__(self, xml_file):
46 |         self.xml_file = xml_file
47 |         self.elements = {}
48 |
49 |     def encoding(self):
50 |         logger.info(
51 |             "reading hierarchy tree from {} ...".format(self.xml_file.split("/")[-1])
52 |         )
53 |         with open(self.xml_file, "r", encoding="utf-8") as f:
54 |             vh_data = f.read().encode()
55 |
56 |         vh = ViewHierarchy(
57 |             screen_width=XML_SCREEN_WIDTH, screen_height=XML_SCREEN_HEIGHT
58 |         )
59 |         vh.load_xml(vh_data)
60 |         view_hierarchy_leaf_nodes = vh.get_leaf_nodes()
61 |         sortchildrenby_viewhierarchy(view_hierarchy_leaf_nodes)
62 |
63 |         # logger.debug("encoding the ui elements in hierarchy tree...")
64 |         codes = ""
65 |         # logger.info(view_hierarchy_leaf_nodes)
66 |         for _id, ele in enumerate(view_hierarchy_leaf_nodes):
67 |             obj_type = ele.uiobject.obj_type.name
68 |             text = ele.uiobject.text
69 |             text = text.replace("\n", " ")
70 |             resource_id = (
71 |                 ele.uiobject.resource_id if ele.uiobject.resource_id is not None else ""
72 |             )
73 |             content_desc = ele.uiobject.content_desc
74 |             html_code = self.element_encoding(
75 |                 _id, obj_type, text, content_desc, resource_id
76 |             )
77 |             codes += html_code
78 |             self.elements[_id] = ele.uiobject
79 |         codes = "<html>\n" + codes + "</html>"
80 |
81 |         # logger.info('Encoded UI\n' + codes)
82 |         return codes
83 |
84 |     def element_encoding(self, _id, _obj_type, _text, _content_desc, _resource_id):
85 |
86 |         _class = _resource_id.split("id/")[-1].strip()
87 |         _text = _text.strip()
88 |         assert _obj_type in CLASS_MAPPING, f"unknown object type: {_obj_type}"
89 |         tag = CLASS_MAPPING[_obj_type]
90 |
91 |         if _obj_type in ["CHECKBOX", "CHECKEDTEXTVIEW", "SWITCH"]:
92 |             code = f' <input id={_id} type="checkbox" class="{_class}">\n'
93 |             code += f" <label for={_id}>{_text}</label>\n"
94 |         elif _obj_type == "RADIOBUTTON":
95 |             code = f' <input id={_id} type="radio" class="{_class}">\n'
96 |             code += f" <label for={_id}>{_text}</label>\n"
97 |         elif _obj_type == "SPINNER":
98 |             code = f" <label for={_id}>{_text}</label>\n"
99 |             code += f' <select id={_id} class="{_class}"></select>\n'
100 |         elif _obj_type == "IMAGEVIEW":
101 |             if _class == "":
102 |                 code = f' <img id={_id}>{_content_desc}</img>\n'
103 |             else:
104 |                 code = f' <img id={_id} class="{_class}">{_content_desc}</img>\n'
105 |         else:
106 |             if _class == "":
107 |                 _text = _content_desc if _text == "" else _text
108 |                 code = f' <{tag} id={_id}>{_text}</{tag}>\n'
109 |             else:
110 |                 _text = _content_desc if _text == "" else _text
111 |                 code = f' <{tag} id={_id} class="{_class}">{_text}</{tag}>\n'
112 |         return code
113 |
--------------------------------------------------------------------------------
/cognisim/device/android/android_view_hierarchy.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import json
7 | import re
8 | import numpy as np
9 | import attr
10 | from str2bool import str2bool as strtobool
11 | from enum import Enum
12 | from lxml import etree
13 | import cognisim.utils.constants as config
14 |
15 |
16 | class UIObjectType(Enum):
17 |     """Types of the different UI objects."""
18 |     UNKNOWN = 0
19 |     BUTTON = 1
20 |     CHECKBOX = 2
21 |     CHECKEDTEXTVIEW = 3
22 |     EDITTEXT = 4
23 |     IMAGEBUTTON = 5
24 |     IMAGEVIEW = 6
25 |     RADIOBUTTON = 7
26 |     SLIDINGDRAWER = 8
27 |     SPINNER = 9
28 |     SWITCH = 10
29 |     TABWIDGET = 11
30 |     TEXTVIEW = 12
31 |     TOGGLEBUTTON = 13
32 |     VIDEOVIEW = 14
33 |     SEARCHVIEW = 15
34 |
35 |
36 | class UIObjectGridLocation(Enum):
37 |     """The on-screen grid location (3x3 grid) of an UI object."""
38 |     TOP_LEFT = 0
39 |     TOP_CENTER = 1
40 |     TOP_RIGHT = 2
41 |     LEFT = 3
42 |     CENTER = 4
43 |     RIGHT = 5
44 |     BOTTOM_LEFT = 6
45 |     BOTTOM_CENTER = 7
46 |     BOTTOM_RIGHT = 8
47 |
48 |
49 | @attr.s
50 | class BoundingBox(object):
51 |     """The bounding box with horizontal/vertical coordinates of an UI object."""
52 |     x1 = attr.ib()
53 |     y1 = attr.ib()
54 |     x2 = attr.ib()
55 |     y2 = attr.ib()
56 |
57 |
58 | @attr.s
59 | class UIObject(object):
60 |     """Represents an UI object from the leaf node in the view hierarchy."""
61 |     obj_type = attr.ib()
62 |     obj_name = attr.ib()
63 |     word_sequence = attr.ib()
64 |     text = attr.ib()
65 |     resource_id = attr.ib()
66 |     android_class = attr.ib()
67 |     android_package = attr.ib()
68 |     content_desc = attr.ib()
69 |     clickable = attr.ib()
70 |     visible = attr.ib()
71 |     enabled = attr.ib()
72 |     focusable = attr.ib()
73 |     focused = attr.ib()
74 |     scrollable = attr.ib()
75 |     long_clickable = attr.ib()
76 |     selected = attr.ib()
77 |     bounding_box = attr.ib()
78 |     grid_location = attr.ib()
79 |     dom_location = attr.ib()
80 |     pointer = attr.ib()
81 |     neighbors = attr.ib()
82 |
83 |
84 | def _build_word_sequence(text, content_desc, resource_id):
85 |     """Returns a sequence of word tokens based on certain attributes.
86 |
87 |     Args:
88 |         text: `text` attribute of an element.
89 |         content_desc: `content_desc` attribute of an element.
90 |         resource_id: `resource_id` attribute of an element.
91 |
92 |     Returns:
93 |         A sequence of word tokens.
94 | """ 95 | if text or content_desc: 96 | return re.findall(r"[\w']+|[?.!/,;:]", text if text else content_desc) 97 | else: 98 | # logger.info(f"Resource ID: {resource_id}") 99 | if resource_id is not None: 100 | name = resource_id.split('/')[-1] 101 | return filter(None, name.split('_')) 102 | else: 103 | return [] 104 | 105 | 106 | def _build_object_type(android_class): 107 | """Returns the object type based on `class` attribute. 108 | 109 | Args: 110 | android_class: `class` attribute of an element (Android class). 111 | 112 | Returns: 113 | The UIObjectType enum. 114 | """ 115 | if android_class.startswith('android.widget'): 116 | widget_type = android_class.split('.')[2] 117 | for obj_type in UIObjectType: 118 | if obj_type.name == widget_type.upper(): 119 | return obj_type 120 | widget_type = android_class.split('.')[-1] 121 | for obj_type in UIObjectType: 122 | if obj_type.name in widget_type.upper(): 123 | return obj_type 124 | return UIObjectType.BUTTON 125 | 126 | 127 | def _build_object_name(text, content_desc): 128 | """Returns the object name based on `text` or `content_desc` attribute. 129 | 130 | Args: 131 | text: The `text` attribute. 132 | content_desc: The `content_desc` attribute. 133 | 134 | Returns: 135 | The object name string. 136 | """ 137 | return text if text else content_desc 138 | 139 | 140 | def _build_bounding_box(bounds): 141 | """Returns the object bounding box based on `bounds` attribute. 142 | 143 | Args: 144 | bounds: The `bounds` attribute. 145 | 146 | Returns: 147 | The BoundingBox object. 148 | """ 149 | match = re.compile(r'\[(\d+),(\d+)\]\[(\d+),(\d+)\]').match(bounds) 150 | assert match, f"Invalid bounds format: {bounds}" 151 | 152 | x1, y1, x2, y2 = map(int, match.groups()) 153 | # logger.info(type(x1)) 154 | return BoundingBox(x1=x1, y1=y1, x2=x2, y2=y2) 155 | 156 | 157 | def _build_clickable(element, tree_child_as_clickable=True): 158 | """Returns whether the element is clickable or one of its ancestors is. 159 | 160 | Args: 161 | element: The etree.Element object. 162 | tree_child_as_clickable: treat all tree children as clickable 163 | 164 | Returns: 165 | A boolean to indicate whether the element is clickable or one of its 166 | ancestors is. 167 | """ 168 | clickable = element.get('clickable') 169 | if clickable == 'false': 170 | for node in element.iterancestors(): 171 | if node.get('clickable') == 'true': 172 | clickable = 'true' 173 | break 174 | 175 | # Below code is try to fix that: some target UI have 'clickable==False' 176 | # but it's clickable by human actually 177 | 178 | # Checkable elemnts should also be treated as clickable 179 | # Some menu items may have clickable==False but checkable==True 180 | if element.get('checkable') == 'true': 181 | clickable = 'true' 182 | if tree_child_as_clickable: 183 | p = element.getparent() 184 | while p is not None: 185 | if p.get('class') == 'android.widget.ListView': 186 | clickable = 'true' 187 | break 188 | p = p.getparent() 189 | 190 | return strtobool(clickable) 191 | 192 | 193 | def _pixel_distance(a_x1, a_x2, b_x1, b_x2): 194 | """Calculates the pixel distance between bounding box a and b. 195 | 196 | Args: 197 | a_x1: The x1 coordinate of box a. 198 | a_x2: The x2 coordinate of box a. 199 | b_x1: The x1 coordinate of box b. 200 | b_x2: The x2 coordinate of box b. 201 | 202 | Returns: 203 | The pixel distance between box a and b on the x axis. The distance 204 | on the y axis can be calculated in the same way. 
The distance can be 205 | positive number (b is right/bottom to a) and negative number 206 | (b is left or top to a). 207 | """ 208 | # if a and b are close enough, then we set the their distance to be 1 209 | # because there are typically padding spaces inside an object's bounding 210 | # box 211 | if b_x1 <= a_x2 and a_x2 - b_x1 <= config.ADJACENT_BOUNDING_BOX_THRESHOLD: 212 | return 1 213 | if a_x1 <= b_x2 and b_x2 - a_x1 <= config.ADJACENT_BOUNDING_BOX_THRESHOLD: 214 | return -1 215 | # overlap 216 | if (a_x1 <= b_x1 <= a_x2) or (a_x1 <= b_x2 <= a_x2) or ( 217 | b_x1 <= a_x1 <= b_x2) or (b_x1 <= a_x2 <= b_x2): 218 | return 0 219 | elif b_x1 > a_x2: 220 | return b_x1 - a_x2 221 | else: 222 | return b_x2 - a_x1 223 | 224 | 225 | def _grid_coordinate(x, width): 226 | """Calculates the 3x3 grid coordinate on the x axis. 227 | 228 | The grid coordinate on the y axis is calculated in the same way. 229 | 230 | Args: 231 | x: The x coordinate: [0, width). 232 | width: The screen width. 233 | 234 | Returns: 235 | The grid coordinate: [0, 2]. 236 | Note that the screen is divided into 3x3 grid, so the grid coordinate 237 | uses the number from 0, 1, 2. 238 | """ 239 | assert 0 <= x <= width 240 | grid_x_0 = width / 3 241 | grid_x_1 = 2 * grid_x_0 242 | if 0 <= x < grid_x_0: 243 | grid_coordinate_x = 0 244 | elif grid_x_0 <= x < grid_x_1: 245 | grid_coordinate_x = 1 246 | else: 247 | grid_coordinate_x = 2 248 | return grid_coordinate_x 249 | 250 | 251 | def _grid_location(bbox, screen_width, screen_height): 252 | """Calculates the grid number of the UI object's bounding box. 253 | 254 | The screen can be divided into 3x3 grid: 255 | (0, 0) (0, 1) (0, 2) 0 1 2 256 | (1, 0) (1, 1) (1, 2) ---> 3 4 5 257 | (2, 0) (2, 1) (2, 2) 6 7 8 258 | 259 | Args: 260 | bbox: The bounding box of the UI object. 261 | screen_width: The width of the screen associated with the hierarchy. 262 | screen_height: The height of the screen associated with the hierarchy. 263 | 264 | Returns: 265 | The grid location number. 266 | """ 267 | bbox_center_x = (bbox.x1 + bbox.x2) / 2 268 | bbox_center_y = (bbox.y1 + bbox.y2) / 2 269 | bbox_grid_x = _grid_coordinate(bbox_center_x, screen_width) 270 | bbox_grid_y = _grid_coordinate(bbox_center_y, screen_height) 271 | return UIObjectGridLocation(bbox_grid_y * 3 + bbox_grid_x) 272 | 273 | 274 | def get_view_hierarchy_leaf_relation(objects, _screen_width, _screen_height): 275 | """Calculates adjacency relation from list of view hierarchy leaf nodes. 276 | Args: 277 | objects: a list of objects. 278 | _screen_width, _screen_height: Screen width and height. 279 | Returns: 280 | An un-padded feature dictionary as follow: 281 | 'v_distance': 2d numpy array of ui object vertical adjacency relation. 282 | 'h_distance': 2d numpy array of ui object horizontal adjacency relation. 283 | 'dom_distance': 2d numpy array of ui object dom adjacency relation. 
284 | """ 285 | vh_node_num = len(objects) 286 | vertical_adjacency = np.zeros((vh_node_num, vh_node_num)) 287 | horizontal_adjacency = np.zeros((vh_node_num, vh_node_num)) 288 | for row in range(len(objects)): 289 | for column in range(len(objects)): 290 | if row == column: 291 | h_dist = v_dist = 0 292 | else: 293 | node1 = objects[row] 294 | node2 = objects[column] 295 | h_dist, v_dist = normalized_pixel_distance( 296 | node1, node2, _screen_width, _screen_height) 297 | # print(node1.text, node2.text, v_dist) 298 | vertical_adjacency[row][column] = v_dist 299 | horizontal_adjacency[row][column] = h_dist 300 | return { 301 | 'v_distance': vertical_adjacency, 302 | 'h_distance': horizontal_adjacency 303 | } 304 | 305 | 306 | def _get_single_direction_neighbors(object_idx, ui_v_dist, ui_h_dist): 307 | """Gets four 'single direction neighbors' for one target ui_object. 308 | If B is A's bottom/top 'single direction neighbor', it means B is the 309 | vertical closest neighbor among all object whose horizontal distance to A is 310 | smaller than margin threshold. Same with left/right direction neighbor. 311 | Args: 312 | object_idx: index number of target ui_object in ui_object_list 313 | ui_v_dist: ui objects' vertical distances. shape=[num_ui_obj, num_ui_obj] 314 | ui_h_dist: ui objects' horizontal distances. shape=[num_ui_obj, num_ui_obj] 315 | Returns: 316 | a dictionary, keys are NeighborContextDesc Instance, values are neighbor 317 | object index. 318 | """ 319 | neighbor_dict = {} 320 | vertical_dist = ui_v_dist[object_idx] 321 | horizontal_dist = ui_h_dist[object_idx] 322 | bottom_neighbors = np.array([ 323 | idx for idx in range(len(vertical_dist)) if vertical_dist[idx] > 0 and 324 | abs(horizontal_dist[idx]) < config.NORM_HORIZONTAL_NEIGHBOR_MARGIN 325 | ]) 326 | top_neighbors = np.array([ 327 | idx for idx in range(len(vertical_dist)) if vertical_dist[idx] < 0 and 328 | abs(horizontal_dist[idx]) < config.NORM_HORIZONTAL_NEIGHBOR_MARGIN 329 | ]) 330 | right_neighbors = np.array([ 331 | idx for idx in range(len(horizontal_dist)) if horizontal_dist[idx] > 0 and 332 | abs(vertical_dist[idx]) < config.NORM_VERTICAL_NEIGHBOR_MARGIN 333 | ]) 334 | left_neighbors = np.array([ 335 | idx for idx in range(len(horizontal_dist)) if horizontal_dist[idx] < 0 and 336 | abs(vertical_dist[idx]) < config.NORM_VERTICAL_NEIGHBOR_MARGIN 337 | ]) 338 | 339 | if bottom_neighbors.size: 340 | neighbor_dict['top'] = bottom_neighbors[np.argmin( 341 | vertical_dist[bottom_neighbors])] 342 | if top_neighbors.size: 343 | neighbor_dict['bottom'] = top_neighbors[np.argmax( 344 | vertical_dist[top_neighbors])] 345 | if right_neighbors.size: 346 | neighbor_dict['left'] = right_neighbors[np.argmin( 347 | horizontal_dist[right_neighbors])] 348 | if left_neighbors.size: 349 | neighbor_dict['right'] = left_neighbors[np.argmax( 350 | horizontal_dist[left_neighbors])] 351 | 352 | return neighbor_dict 353 | 354 | 355 | def normalized_pixel_distance(node1, node2, _screen_width, _screen_height): 356 | """Calculates normalized pixel distance between this node and other node. 357 | 358 | Args: 359 | node1, node2: Another object. 360 | _screen_width, _screen_height: Screen width and height. 361 | 362 | Returns: 363 | Normalized pixel distance on both horizontal and vertical direction. 
364 | """ 365 | h_distance = _pixel_distance(_build_bounding_box(node1.get('bounds')).x1, 366 | _build_bounding_box(node1.get('bounds')).x2, 367 | _build_bounding_box(node2.get('bounds')).x1, 368 | _build_bounding_box(node2.get('bounds')).x2) 369 | v_distance = _pixel_distance(_build_bounding_box(node1.get('bounds')).y1, 370 | _build_bounding_box(node1.get('bounds')).y2, 371 | _build_bounding_box(node2.get('bounds')).y1, 372 | _build_bounding_box(node2.get('bounds')).y2) 373 | 374 | return float(h_distance) / _screen_width, float( 375 | v_distance) / _screen_height 376 | 377 | 378 | def _build_neighbors( 379 | node, 380 | view_hierarchy_leaf_nodes, 381 | _screen_width, 382 | _screen_height): 383 | """Builds the neighbours from view_hierarchy. 384 | 385 | Args: 386 | node: The current etree root node. 387 | view_hierarchy_leaf_nodes: All of the etree nodes. 388 | _screen_width, _screen_height: Screen width and height. 389 | 390 | Returns: 391 | Neighbour directions and object pointers. 392 | """ 393 | if view_hierarchy_leaf_nodes is None: 394 | return None 395 | vh_relation = get_view_hierarchy_leaf_relation( 396 | view_hierarchy_leaf_nodes, _screen_width, _screen_height) 397 | _neighbor = _get_single_direction_neighbors( 398 | view_hierarchy_leaf_nodes.index(node), 399 | vh_relation['v_distance'], 400 | vh_relation['h_distance']) 401 | for k, v in _neighbor.items(): 402 | _neighbor[k] = view_hierarchy_leaf_nodes[v].get('pointer') 403 | return _neighbor 404 | 405 | 406 | def _build_etree_from_json(root, json_dict): 407 | """Builds the element tree from json_dict. 408 | 409 | Args: 410 | root: The current etree root node. 411 | json_dict: The current json_dict corresponding to the etree root node. 412 | """ 413 | # set node attributes 414 | if root is None or json_dict is None: 415 | return 416 | x1, y1, x2, y2 = json_dict.get('bounds', [0, 0, 0, 0]) 417 | root.set('bounds', '[%d,%d][%d,%d]' % (x1, y1, x2, y2)) 418 | root.set('class', json_dict.get('class', '')) 419 | # XML element cannot contain NULL bytes. 420 | root.set('text', json_dict.get('text', '').replace('\x00', '')) 421 | root.set('resource-id', json_dict.get('resource-id', '')) 422 | content_desc = json_dict.get('content-desc', [None]) 423 | root.set( 424 | 'content-desc', 425 | '' if content_desc[0] is None else content_desc[0].replace('\x00', '')) 426 | root.set('package', json_dict.get('package', '')) 427 | root.set('visible', str(json_dict.get('visible-to-user', True))) 428 | root.set('enabled', str(json_dict.get('enabled', False))) 429 | root.set('focusable', str(json_dict.get('focusable', False))) 430 | root.set('focused', str(json_dict.get('focused', False))) 431 | root.set( 432 | 'scrollable', 433 | str( 434 | json_dict.get('scrollable-horizontal', False) or 435 | json_dict.get('scrollable-vertical', False))) 436 | root.set('clickable', str(json_dict.get('clickable', False))) 437 | root.set('long-clickable', str(json_dict.get('long-clickable', False))) 438 | root.set('selected', str(json_dict.get('selected', False))) 439 | root.set('pointer', str(json_dict.get('pointer', ''))) 440 | if 'children' not in json_dict: # leaf node 441 | return 442 | for child in json_dict['children']: 443 | # some json file has 'null' as one of the children. 
444 | if child: 445 | child_node = etree.Element('node') 446 | root.append(child_node) 447 | _build_etree_from_json(child_node, child) 448 | 449 | 450 | class LeafNode(object): 451 | """Represents a leaf node in the view hierarchy data from xml.""" 452 | 453 | def __init__(self, 454 | element, 455 | all_elements=None, 456 | dom_location=None, 457 | screen_width=config.SCREEN_WIDTH, 458 | screen_height=config.SCREEN_HEIGHT): 459 | """Constructor. 460 | 461 | Args: 462 | element: The etree.Element object. 463 | all_elements: All the etree.Element objects in the view hierarchy. 464 | dom_location: [depth, preorder-index, postorder-index] of element. 465 | screen_width: The width of the screen associated with the element. 466 | screen_height: The height of the screen associated with the element. 467 | """ 468 | assert not element.findall('.//node') 469 | self.element = element 470 | self._screen_width = screen_width 471 | self._screen_height = screen_height 472 | # logger.info(f"element: {element}") 473 | bbox = _build_bounding_box(element.get('bounds')) 474 | self.uiobject = UIObject( 475 | obj_type=_build_object_type(element.get('class')), 476 | obj_name=_build_object_name( 477 | element.get('text'), element.get('content-desc')), 478 | word_sequence=_build_word_sequence( 479 | element.get('text'), element.get('content-desc'), 480 | element.get('resource-id')), 481 | text=element.get('text'), 482 | resource_id=element.get('resource-id'), 483 | android_class=element.get('class'), 484 | android_package=element.get('package'), 485 | content_desc=element.get('content-desc'), 486 | clickable=_build_clickable(element), 487 | visible=strtobool(element.get('visible', default='true')), 488 | enabled=strtobool(element.get('enabled')), 489 | focusable=strtobool(element.get('focusable')), 490 | focused=strtobool(element.get('focused')), 491 | scrollable=strtobool(element.get('scrollable')), 492 | long_clickable=strtobool(element.get('long-clickable')), 493 | selected=strtobool(element.get('selected')), 494 | bounding_box=bbox, 495 | grid_location=_grid_location(bbox, self._screen_width, 496 | self._screen_height), 497 | dom_location=dom_location, 498 | pointer=element.get('pointer'), 499 | neighbors=_build_neighbors( 500 | element, all_elements, 501 | self._screen_width, self._screen_height)) 502 | 503 | def dom_distance(self, other_node): 504 | """Calculates dom distance between this node and other node. 505 | 506 | Args: 507 | other_node: Another LeafNode object. 508 | 509 | Returns: 510 | The dom distance in between two leaf nodes: defined as the number of 511 | nodes on the path from one leaf node to the other on the tree. 512 | """ 513 | intersection = [ 514 | node for node in self.element.iterancestors() 515 | if node in other_node.element.iterancestors() 516 | ] 517 | assert intersection 518 | ancestor_list = list(self.element.iterancestors()) 519 | other_ancestor_list = list(other_node.element.iterancestors()) 520 | return ancestor_list.index( 521 | intersection[0]) + other_ancestor_list.index(intersection[0]) + 1 522 | 523 | 524 | class DomLocationKey(Enum): 525 | """Keys of dom location info.""" 526 | DEPTH = 0 527 | PREORDER_INDEX = 1 528 | POSTORDER_INDEX = 2 529 | 530 | 531 | class ViewHierarchy(object): 532 | """Represents the view hierarchy data from UIAutomator dump.""" 533 | 534 | def __init__(self, 535 | screen_width=config.SCREEN_WIDTH, 536 | screen_height=config.SCREEN_HEIGHT): 537 | """Constructor. 
538 | 539 | Args: 540 | screen_width: The pixel width of the screen for the view hierarchy. 541 | screen_height: The pixel height of the screen for the view hierarchy. 542 | """ 543 | self._root = None 544 | self._root_element = None 545 | self._all_visible_leaves = [] 546 | self._dom_location_dict = None 547 | self._preorder_index = 0 548 | self._postorder_index = 0 549 | self._screen_width = screen_width 550 | self._screen_height = screen_height 551 | 552 | def load_xml(self, xml_content): 553 | """Builds the etree from xml content. 554 | 555 | Args: 556 | xml_content: The string containing xml content. 557 | """ 558 | self._root = etree.XML(xml_content) 559 | self._root_element = self._root[0] 560 | 561 | self._all_visible_leaves = self._get_visible_leaves() 562 | 563 | # dom_location_dict: 564 | # dict of {id(element): [depth, preorder-index, postorder-index]} 565 | # Note: for leaves of any tree, the following equation is always true: 566 | # 567 | # depth == preorder-index - postorder-index (depth is # of ancestors) 568 | # 569 | self._dom_location_dict = self._calculate_dom_location() 570 | 571 | def load_json(self, json_content): 572 | """Builds the etree from json content. 573 | 574 | Args: 575 | json_content: The string containing json content. 576 | """ 577 | json_dict = json.loads(json_content) 578 | if json_dict is None: 579 | raise ValueError('empty json file.') 580 | self._root = etree.Element('hierarchy', rotation='0') 581 | self._root_element = etree.Element('node') 582 | self._root.append(self._root_element) 583 | _build_etree_from_json( 584 | self._root_element, 585 | json_dict['activity']['root']) 586 | 587 | self._all_visible_leaves = self._get_visible_leaves() 588 | self._dom_location_dict = self._calculate_dom_location() 589 | 590 | def get_leaf_nodes(self): 591 | """Returns a list of all the leaf Nodes.""" 592 | return [ 593 | LeafNode(element, self._all_visible_leaves, 594 | self._dom_location_dict[id(element)], 595 | self._screen_width, self._screen_height) 596 | for element in self._all_visible_leaves 597 | ] 598 | 599 | def get_ui_objects(self): 600 | """Returns a list of all ui objects represented by leaf nodes.""" 601 | return [ 602 | LeafNode(element, self._all_visible_leaves, 603 | self._dom_location_dict[id(element)], 604 | self._screen_width, self._screen_height).uiobject 605 | for element in self._all_visible_leaves 606 | ] 607 | 608 | def dedup(self, click_x_and_y): 609 | """Dedup UI objects with same text or content_desc. 
610 | 
611 |     Args:
612 |       click_x_and_y: the event x and y (e.g. the click position on screen).
613 |     """
614 |     click_x, click_y = click_x_and_y
615 | 
616 |     # Map of {'name': [list of UI objects with this name]}
617 |     name_element_map = collections.defaultdict(list)
618 |     for element in self._all_visible_leaves:
619 |       name = _build_object_name(element.get('text'),
620 |                                 element.get('content-desc'))
621 |       name_element_map[name].append(element)
622 | 
623 |     def delete_element(element):
624 |       element.getparent().remove(element)
625 | 
626 |     for name, elements in name_element_map.items():
627 |       if not name:
628 |         continue
629 |       # Search if the event (x, y) happens in one of these objects
630 |       target_index = None
631 |       for index, element in enumerate(elements):
632 |         box = _build_bounding_box(element.get('bounds'))
633 |         if (box.x1 <= click_x <= box.x2 and box.y1 <= click_y <= box.y2):
634 |           target_index = index
635 | 
636 |       if target_index is None:  # the target UI obj is not among these elements
637 |         for ele in elements[1:]:
638 |           delete_element(ele)
639 |       else:  # if the target UI obj is one of them, delete the rest of the UI objs
640 |         for ele in elements[:target_index] + \
641 |                 elements[target_index + 1:]:
642 |           delete_element(ele)
643 | 
644 |     print('Dedup: %d -> %d' % (len(self._all_visible_leaves),
645 |                                len(self._get_visible_leaves())))
646 | 
647 |     self._all_visible_leaves = self._get_visible_leaves()
648 |     self._dom_location_dict = self._calculate_dom_location()
649 | 
650 |   def _get_visible_leaves(self):
651 |     """Gets all the visible leaves from the view hierarchy.
652 | 
653 |     Returns:
654 |       all_visible_leaves: The list of all the visible leaf elements.
655 |     """
656 | 
657 |     all_elements = [element for element in self._root.iter('*')]
658 |     # View the attributes of each element
659 |     # for element in all_elements:
660 |     #   logger.info(element.attrib)
661 |     #   logger.info(element.attrib.get('bounds'))
662 |     #   logger.info(element.attrib.get('displayed'))
663 | 
664 |     all_visible_leaves = [
665 |         element for element in all_elements if self._is_leaf(element) and
666 |         strtobool(element.attrib.get('displayed', default='true')) and
667 |         self._is_within_screen_bound(element)
668 |     ]
669 |     return all_visible_leaves
670 | 
671 |   def _calculate_dom_location(self):
672 |     """Calculates [depth, preorder-index, postorder-index] of all leaf nodes.
673 | 
674 |     This method is NOT thread safe if multiple threads call it on the same
675 |     ViewHierarchy object: it keeps mutating self._preorder_index and
676 |     self._postorder_index while the pre/post traversal methods recurse.
677 | 
678 |     All leaf elements are filtered and cached in self._all_visible_leaves.
679 |     This is necessary because dom_location_dict uses id(element) as keys; if
680 |     _root.iter('*') were called every time, id(element) would not be a stable
681 |     value even for the same element in the XML.
682 | 
683 |     Returns:
684 |       dom_location_dict, dict of
685 |       {id(element): [depth, preorder-index, postorder-index]}
686 |     """
687 |     dom_location_dict = collections.defaultdict(lambda: [None, None, None])
688 |     # Calculate the depth of all leaf nodes.
689 |     for element in self._all_visible_leaves:
690 |       ancestors = [node for node in element.iterancestors()]
691 |       dom_location_dict[id(element)][DomLocationKey.DEPTH.value] = len(
692 |           ancestors)
693 | 
694 |     # Calculate the pre/post index by calling the pre/post iteration methods
695 |     # recursively.
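    # A worked micro-example of the depth == preorder - postorder note in
    # load_xml, assuming the upstream convention that the indices advance on
    # every node visited (not only on leaves). For the tree root -> (a, b),
    # b -> (c), with leaves a and c:
    #   preorder visit order:  root(0), a(1), b(2), c(3)
    #   postorder finish order: a(0), c(1), b(2), root(3)
    #   leaf a: depth 1 == preorder 1 - postorder 0
    #   leaf c: depth 2 == preorder 3 - postorder 1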
696 |     self._preorder_index = 0
697 |     self._pre_order_iterate(self._root, dom_location_dict)
698 |     self._postorder_index = 0
699 |     self._post_order_iterate(self._root, dom_location_dict)
700 |     return dom_location_dict
701 | 
702 |   def _pre_order_iterate(self, element, dom_location_dict):
703 |     """Preorder traversal of the hierarchy tree.
704 | 
705 |     Args:
706 |       element: etree element which will be visited now.
707 |       dom_location_dict: dict of
708 |         {id(element): [depth, preorder-index, postorder-index]}
709 |     """
710 |     if self._is_leaf(element):
711 |       dom_location_dict[id(element)][DomLocationKey.PREORDER_INDEX
712 |                                      .value] = self._preorder_index
713 |     self._preorder_index += 1
714 | 
715 |     for child in element:
716 |       if child.getparent() == element:
717 |         self._pre_order_iterate(child, dom_location_dict)
718 | 
719 |   def _post_order_iterate(self, element, dom_location_dict):
720 |     """Postorder traversal of the hierarchy tree.
721 | 
722 |     Args:
723 |       element: etree element which will be visited now.
724 |       dom_location_dict: dict of
725 |         {id(element): [depth, preorder-index, postorder-index]}
726 |     """
727 |     for child in element:
728 |       if child.getparent() == element:
729 |         self._post_order_iterate(child, dom_location_dict)
730 | 
731 |     if self._is_leaf(element):
732 |       dom_location_dict[id(element)][DomLocationKey.POSTORDER_INDEX
733 |                                      .value] = self._postorder_index
734 |     self._postorder_index += 1
735 | 
736 |   def _is_leaf(self, element):
737 |     """Whether an etree element is a leaf in the hierarchy tree."""
738 | 
739 |     return not element.findall('.//*')
740 | 
741 |   def _is_within_screen_bound(self, element):
742 |     """Whether an etree element's bounding box is within the screen boundary."""
743 |     bbox = _build_bounding_box(element.attrib.get('bounds'))
744 |     in_x = (0 <= bbox.x1 <= self._screen_width) and (0 <= bbox.x2 <=
745 |                                                      self._screen_width)
746 |     in_y = (0 <= bbox.y1 <= self._screen_height) and (0 <= bbox.y2 <=
747 |                                                       self._screen_height)
748 |     x1_less_than_x2 = bbox.x1 < bbox.x2
749 |     y1_less_than_y2 = bbox.y1 < bbox.y2
750 |     return in_x and in_y and x1_less_than_x2 and y1_less_than_y2
751 | 
--------------------------------------------------------------------------------
/cognisim/device/device.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | 
4 | class Device(ABC):
5 |     def __init__(self, app_package):
6 |         self.app_package = app_package
7 | 
8 |     @abstractmethod
9 |     def start_device(self):
10 |         '''
11 |         Function to start the device
12 |         '''
13 |         pass
14 | 
15 |     @abstractmethod
16 |     def stop_device(self):
17 |         '''
18 |         Function to stop the device
19 |         '''
20 |         pass
21 | 
22 |     @abstractmethod
23 |     def get_state(self):
24 |         pass
25 | 
26 |     @abstractmethod
27 |     def tap(self, x, y):
28 |         pass
29 | 
30 |     @abstractmethod
31 |     def input(self, x, y, text):
32 |         pass
33 | 
34 |     @abstractmethod
35 |     def swipe(self, x, y, direction):
36 |         pass
37 | 
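# A minimal sketch of how a new platform backend could satisfy the Device ABC
# above. `EchoDevice` and its print-based behaviour are illustrative only and
# are not a file in this repository:
#
#   class EchoDevice(Device):
#       def start_device(self):
#           print(f"starting {self.app_package}")
#       def stop_device(self):
#           print("stopping")
#       def get_state(self):
#           return "", b"", None   # (encoded_ui, screenshot, ui) shape used elsewhere
#       def tap(self, x, y):
#           print(f"tap at ({x}, {y})")
#       def input(self, x, y, text):
#           print(f"input {text!r} at ({x}, {y})")
#       def swipe(self, x, y, direction):
#           print(f"swipe {direction} from ({x}, {y})")
#
#   device = EchoDevice("com.example.app")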
--------------------------------------------------------------------------------
/cognisim/device/device_factory.py:
--------------------------------------------------------------------------------
1 | # device/device_factory.py
2 | # from .device import Device
3 | from cognisim.device.android.android_device import AndroidDevice
4 | from cognisim.device.ios.ios_device import IOSDevice
5 | from loguru import logger
6 | 
7 | 
8 | class DeviceFactory:
9 |     @staticmethod
10 |     def create_device(
11 |             platform: str,
12 |             app_url: str,
13 |             state_representation='aria',
14 |             download_directory='default',
15 |             session_id=None,
16 |             tracing=False,
17 |             tracingconfig=None
18 |     ):
19 |         if platform == 'android':
20 |             return AndroidDevice(
21 |                 app_package=app_url,
22 |                 download_directory=download_directory,
23 |                 session_id=session_id
24 |             )
25 |         elif platform == 'ios':
26 |             return IOSDevice(app_url)
27 | 
28 |         elif platform == 'web':
29 |             logger.info("Creating web device")
30 |             raise NotImplementedError("Web support is not yet implemented")
31 |         else:
32 |             raise ValueError(
33 |                 "Invalid platform. Expected one of: 'android', 'ios', 'web'.")
--------------------------------------------------------------------------------
/cognisim/device/ios/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevylAI/CogniSim/3d8902e011981b93cb0ebfafba1794eab93b053e/cognisim/device/ios/__init__.py
--------------------------------------------------------------------------------
/cognisim/device/ios/ios_device.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from datetime import datetime
3 | from appium.webdriver.common.appiumby import AppiumBy
4 | from cognisim.device.device import Device
5 | from appium.options.ios import XCUITestOptions
6 | from appium import webdriver
7 | from cognisim.device.ios.ios_view_hierarchy import UI
8 | from cognisim.device.ios.ios_view_hierarchy_maestro import get_formatted_hierarchy as get_formatted_hierarchy_maestro
9 | from loguru import logger
10 | import os
11 | import cv2
12 | import numpy as np
13 | import asyncio
14 | import json
15 | SCREEN_WIDTH = 430
16 | SCREEN_HEIGHT = 932
17 | 
18 | SCREEN_CHANNEL = 4
19 | 
20 | 
21 | class IOSDevice(Device):
22 |     def __init__(self, app_package=None, download_directory='default', session_id=None):
23 |         super().__init__(app_package)
24 |         self.download_directory = download_directory
25 |         self.app_package = app_package
26 |         self.session_id = session_id
27 |         self.desired_caps = {
28 |             'deviceName': 'iPhone 14',
29 |             'automationName': 'XCUITest',
30 |             'autoGrantPermission': True,
31 |             'newCommandTimeout': 600,
32 |             'mjpegScreenshotUrl': 'http://localhost:4723/stream.mjpeg',
33 |             'platformVersion': '16.4',
34 |             'snapshotMaxDepth': 30,
35 |             'customSnapshotTimeout': 250,
36 |         }
37 | 
38 |         self.options = XCUITestOptions().load_capabilities(self.desired_caps)
39 |         self.use_maestro = True
40 | 
41 |     async def start_device(self):
42 |         '''
43 |         Start the iOS device and connect to the Appium server
44 |         '''
45 |         try:
46 |             self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
47 |         except BaseException:
48 |             # Retry without the MJPEG stream capability if the first attempt fails
49 |             self.desired_caps.pop('mjpegScreenshotUrl')
50 |             self.options = XCUITestOptions().load_capabilities(self.desired_caps)
51 |             self.driver = webdriver.Remote('http://localhost:4723', options=self.options)
52 | 
53 |         self.driver.update_settings({'waitForIdleTimeout': 0, 'shouldWaitForQuiescence': False, 'maxTypingFrequency': 60})
54 | 
55 |     async def mobile_get_source(self, format='json'):
56 |         return self.driver.execute_script('mobile: source', {'format': format, 'excludedAttributes': 'visible'})
57 | 
58 |     async def start_recording(self):
59 |         '''
60 |         Start recording the screen on the iOS device
61 |         returns: None
62 |         '''
63 |         try:
64 |             self.driver.start_recording_screen()
65 |         except Exception as e:
66 |             logger.error(f"Failed to start screen recording. 
Error: {str(e)}") 66 | raise 67 | 68 | async def stop_recording(self, save_path=None): 69 | ''' 70 | Stops screen recording on the IOS device and saves the video 71 | Args: 72 | save_path (str, optional): Path to save the video file. If not provided, a default path will be used. 73 | 74 | Returns: 75 | str: Path to the saved video file 76 | 77 | ''' 78 | video_base64 = self.driver.stop_recording_screen() 79 | if save_path is None: 80 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 81 | filename = f"screen_recording_{timestamp}.mp4" 82 | save_dir = os.path.join(os.getcwd(), "recordings") 83 | os.makedirs(save_dir, exist_ok=True) 84 | save_path = os.path.join(save_dir, filename) 85 | 86 | with open(save_path, "wb") as video_file: 87 | video_file.write(base64.b64decode(video_base64)) 88 | 89 | logger.info(f"Screen recording saved to: {save_path}") 90 | return save_path 91 | 92 | async def get_state(self, use_maestro=True): 93 | try: 94 | if use_maestro: 95 | encoded_ui, ui = await self.get_state_maestro() 96 | logger.info(f"Maestro hierarchy: {encoded_ui}") 97 | else: 98 | raw_appium_state = self.driver.page_source 99 | 100 | file_path = os.path.join(os.path.dirname(__file__), 'ios_view_hierarchy.xml') 101 | xml_file = open(file_path, 'w') 102 | xml_file.write(raw_appium_state) 103 | xml_file.close() 104 | 105 | ui = UI(file_path) 106 | self.ui = ui 107 | encoded_ui: str = ui.encoding() 108 | logger.info(f"Encoded UI: {encoded_ui}") 109 | # logger.info(f"Raw Appium State: {raw_appium_state}") 110 | except Exception as e: 111 | logger.info(f"Error getting page source: {e}") 112 | raw_appium_state = "" 113 | 114 | screenshot: bytes = self.driver.get_screenshot_as_png() 115 | return encoded_ui, screenshot, ui 116 | 117 | async def get_state_maestro(self): 118 | ''' 119 | Use Maestro to get the view hierarchy 120 | ''' 121 | try: 122 | # Run maestro hierarchy command and capture output 123 | process = await asyncio.create_subprocess_exec( 124 | 'maestro', 'hierarchy', 125 | stdout=asyncio.subprocess.PIPE, 126 | stderr=asyncio.subprocess.PIPE 127 | ) 128 | stdout, stderr = await process.communicate() 129 | 130 | if process.returncode != 0: 131 | logger.error(f"Error getting Maestro hierarchy: {stderr.decode()}") 132 | return None 133 | # Parse JSON output 134 | stdout = stdout.decode().strip() 135 | # Parse until first opening brace 136 | stdout = stdout[stdout.find('{'):] 137 | hierarchy = json.loads(stdout) 138 | # logger.info(f"Hierarchy length: {len(hierarchy)}") 139 | # Format hierarchy 140 | formatted_html, ui_objects = get_formatted_hierarchy_maestro(hierarchy) 141 | return formatted_html, ui_objects 142 | 143 | except Exception as e: 144 | logger.error(f"Error in get_state_maestro: {e}") 145 | return None 146 | 147 | def generate_set_of_mark(self, 148 | ui, 149 | image: bytes, 150 | position='top-left') -> bytes: 151 | ''' 152 | Code to generate a set of mark for a given image and UI state 153 | ui: UI object 154 | image: bytes of the image 155 | step_i: step number 156 | position: position of the annotation, defaults to 'top-lefts, can also be 'center 157 | ''' 158 | nparr = np.frombuffer(image, np.uint8) 159 | img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 160 | height, width, _ = img.shape 161 | k = 3000 162 | 163 | for element_id in ui.elements: 164 | bounds = [ 165 | ui.elements[element_id].bounding_box.x1, 166 | ui.elements[element_id].bounding_box.y1, 167 | ui.elements[element_id].bounding_box.x2, 168 | ui.elements[element_id].bounding_box.y2 169 | ] 170 | # Calculate the area 
of the bounding box 171 | area = (bounds[2] - bounds[0]) * (bounds[3] - bounds[1]) 172 | 173 | # Only label elements with area over k 174 | if area > k: 175 | # Draw a rectangle around the element 176 | cv2.rectangle( 177 | img, (int(bounds[0]), int(bounds[1])), 178 | (int(bounds[2]), int(bounds[3])), (0, 0, 255), 5) 179 | 180 | text = str(element_id) 181 | text_size = 2 # Fixed text size 182 | font = cv2.FONT_HERSHEY_SIMPLEX 183 | 184 | # Calculate the width and height of the text 185 | text_width, text_height = cv2.getTextSize(text, font, text_size, 2)[0] 186 | 187 | if position == 'top-left': 188 | text_x = int(bounds[0]) 189 | text_y = int(bounds[1]) + text_height 190 | else: 191 | text_x = (int(bounds[0]) + int(bounds[2])) // 2 - text_width // 2 192 | text_y = (int(bounds[1]) + int(bounds[3])) // 2 + text_height // 2 193 | 194 | # Draw a black rectangle behind the text 195 | cv2.rectangle(img, (text_x, text_y - text_height), 196 | (text_x + text_width, text_y), (0, 0, 0), thickness=cv2.FILLED) 197 | 198 | # Draw the text in white 199 | cv2.putText(img, text, (text_x, text_y), font, 200 | text_size, (255, 255, 255), 4) 201 | 202 | _, img_encoded = cv2.imencode('.png', img) 203 | img_bytes = img_encoded.tobytes() 204 | 205 | return img_bytes 206 | 207 | async def tap(self, x, y): 208 | self.driver.execute_script('mobile: tap', {'x': x, 'y': y}) 209 | 210 | async def input(self, x, y, text): 211 | self.driver.execute_script('mobile: tap', {'x': x, 'y': y}) 212 | self.driver.find_element(AppiumBy.IOS_PREDICATE, "type == 'XCUIElementTypeApplication'").send_keys(text) 213 | # self.driver.execute_script('mobile: type', {'text': text}) 214 | 215 | async def swipe(self, initial_x, initial_y, end_x, end_y, duration=1): 216 | """ 217 | Performs a swipe gesture on the iOS device 218 | 219 | Args: 220 | initial_x (int): Starting x coordinate of the swipe 221 | initial_y (int): Starting y coordinate of the swipe 222 | end_x (int): Ending x coordinate of the swipe 223 | end_y (int): Ending y coordinate of the swipe 224 | duration (int, optional): Duration of the swipe in seconds. Defaults to 1. 225 | """ 226 | self.driver.execute_script('mobile: dragFromToForDuration', {'fromX': initial_x, 'fromY': initial_y, 'toX': end_x, 'toY': end_y, 'duration': duration}) 227 | 228 | async def scroll(self, direction): 229 | direction_map = { 230 | 'up': 'UP', 231 | 'down': 'DOWN', 232 | 'left': 'LEFT', 233 | 'right': 'RIGHT' 234 | } 235 | await self.driver.execute_script('mobile: scroll', {'direction': direction_map[direction]}) 236 | 237 | async def get_screenshot(self) -> bytes: 238 | ''' 239 | Get Screenshot as bytes 240 | ''' 241 | screenshot: bytes = self.driver.get_screenshot_as_png() 242 | return screenshot 243 | 244 | async def navigate(self, package_name: str): 245 | self.driver.activate_app(package_name) 246 | 247 | async def capture_screenshot_with_bounding_box(self, bounds: dict, image_state: bytes = None) -> bytes: 248 | """ 249 | Capture a screenshot with a bounding box drawn around a specified element. 250 | 251 | Args: 252 | bounds (dict): A dictionary containing the bounding box coordinates. 253 | Expected keys are x1, y1, x2, y2, all of which are integers. 254 | image_state (bytes, optional): The current screenshot if available. 255 | 256 | Returns: 257 | bytes: The screenshot image with bounding box as bytes. 
258 | """ 259 | logger.info("Creating tagged image") 260 | screenshot = image_state if image_state is not None else await self.device.screenshot() 261 | if screenshot is None: 262 | logger.info("Screenshot failed") 263 | return None 264 | 265 | # Convert the screenshot to a NumPy array 266 | image_np = np.frombuffer(screenshot, dtype=np.uint8) 267 | image = cv2.imdecode(image_np, cv2.IMREAD_COLOR) 268 | 269 | # Extract bounding box coordinates 270 | x1 = int(bounds[0]) 271 | y1 = int(bounds[1]) 272 | x2 = int(bounds[2]) 273 | y2 = int(bounds[3]) 274 | 275 | # Calculate width and height 276 | # width = x2 - x1 277 | # height = y2 - y1 278 | 279 | bright_color = (128, 0, 128) # Pink color 280 | # Draw the bounding box on the image 281 | cv2.rectangle(image, (x1, y1), (x2, y2), bright_color, 5) 282 | 283 | # Convert the image back to bytes 284 | _, encoded_image = cv2.imencode('.png', image) 285 | screenshot_with_bounding_box = encoded_image.tobytes() 286 | 287 | return screenshot_with_bounding_box 288 | 289 | async def stop_device(self): 290 | ''' 291 | Stops the device 292 | ''' 293 | pass 294 | 295 | 296 | if __name__ == "__main__": 297 | ui = UI(os.path.join(os.path.dirname(__file__), 'ios_view_hierarchy.xml')) 298 | encoded_ui = ui.encoding() 299 | 300 | logger.info(f"Encoded UI: {encoded_ui}") 301 | -------------------------------------------------------------------------------- /cognisim/device/ios/ios_view_hierarchy.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | from enum import Enum 3 | from distutils.util import strtobool 4 | import attr 5 | import numpy as np 6 | import re 7 | import json 8 | import collections 9 | from loguru import logger 10 | SCREEN_WIDTH = 430 11 | SCREEN_HEIGHT = 932 12 | 13 | SCREEN_CHANNEL = 4 14 | 15 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3 16 | NORM_VERTICAL_NEIGHTBOR_MARGIN = 0.01 17 | NORM_HORIZONTAL_NEIGHTBOR_MARGIN = 0.01 18 | INPUT_ACTION_UPSAMPLE_RATIO = 1 19 | XML_SCREEN_WIDTH = 430 20 | XML_SCREEN_HEIGHT = 932 21 | CLASS_MAPPING = { 22 | "STATICTEXT": 'p', 23 | "BUTTON": 'button', 24 | "IMAGE": 'img', 25 | "SWITCH": 'input', 26 | "CELL": 'div', 27 | "TABLE": 'table', 28 | "NAVIGATIONBAR": 'nav', 29 | "APPLICATION": "div", 30 | "TEXTFIELD": "input", 31 | "SECURETEXTFIELD": "input", 32 | "DatePicker:": "input", 33 | "PICKER": "input", 34 | "PICKERWHEEL": "input", 35 | "PAGEINDICATOR": "div", 36 | "KEY": "button", 37 | "KEYBOARD": "div", 38 | "LINK": "a", 39 | "SEARCHFIELD:": "input", 40 | "TEXTVIEW": "textarea", 41 | "WEBVIEW": "iframe", 42 | "BUTTON": "button", 43 | "OTHER": "div" 44 | } 45 | 46 | 47 | class DomLocationKey(Enum): 48 | ''' 49 | Keys of dom location info 50 | ''' 51 | DEPTH = 0 52 | PREORDER_INDEX = 1 53 | POSTORDER_INDEX = 2 54 | 55 | 56 | class UIObjectType(Enum): 57 | """ 58 | Typoes of the different UI objects 59 | """ 60 | UNKNOWN = 0 61 | BUTTON = 1 62 | IMAGE = 2 63 | SWITCH = 3 64 | CELL = 4 65 | OTHER = 5 66 | TABLE = 6 67 | NAVIGATIONBAR = 7 68 | APPLICATION = 8 69 | WINDOW = 9 70 | STATICTEXT = 10 71 | SLIDER = 11 72 | TEXTFIELD = 12 73 | SECURETEXTFIELD = 13 74 | DATEPICKER = 14 75 | PICKER = 15 76 | PICKERWHEEL = 16 77 | PAGEINDICATOR = 17 78 | KEY = 18 79 | KEYBOARD = 19 80 | LINK = 20 81 | SEARCHFIELD = 21 82 | TEXTVIEW = 22 83 | WEBVIEW = 23 84 | 85 | 86 | class UIObjectGridLocation(Enum): 87 | ''' 88 | The on-screen grid location (3x3 grid) of an UI object 89 | ''' 90 | TOP_LEFT = 0 91 | TOP_CENTER = 1 92 | TOP_RIGHT = 2 93 | LEFT = 3 94 | CENTER = 4 95 
| RIGHT = 5 96 | BOTTOM_LEFT = 6 97 | BOTTOM_CENTER = 7 98 | BOTTOM_RIGHT = 8 99 | 100 | 101 | @attr.s 102 | class BoundingBox(object): 103 | ''' 104 | The bounding box with horizontal/vertical coordinates of a ui object 105 | ''' 106 | x1 = attr.ib() 107 | y1 = attr.ib() 108 | x2 = attr.ib() 109 | y2 = attr.ib() 110 | 111 | 112 | @attr.s 113 | class UiObject(object): 114 | ''' 115 | Represents a UI object form the leaf node in the view hierarchy 116 | ''' 117 | # type 118 | obj_type = attr.ib() 119 | # name 120 | obj_name = attr.ib() 121 | 122 | word_sequence = attr.ib() 123 | # text 124 | text = attr.ib() 125 | # accessibility label 126 | accesible = attr.ib() 127 | 128 | # ios_Type 129 | ios_class = attr.ib() 130 | 131 | # name 132 | content_desc = attr.ib() 133 | # 134 | 135 | visible = attr.ib() 136 | enabled = attr.ib() 137 | 138 | bounding_box = attr.ib() 139 | 140 | grid_location = attr.ib() 141 | 142 | dom_location = attr.ib() 143 | 144 | pointer = attr.ib() 145 | 146 | neighbors = attr.ib() 147 | 148 | 149 | def _build_word_sequence(text, content_desc, resource_id): 150 | ''' 151 | Returns a sequence of word toekns based on certain attributes 152 | 153 | Args: 154 | text: the text attribute of an element 155 | content_desc: the content-desc attribute of an element 156 | resource_id: `resource_id` attribute of an element 157 | Priority of the attributes: text > content_desc > resource_id 158 | Returns: 159 | A sequence of word tokens 160 | ''' 161 | if text or content_desc: 162 | return re.findall(r"[\w']+|[.,!?;]", text if text else content_desc) 163 | else: 164 | name = resource_id.split('/')[-1] 165 | return filter(None, name.split('_')) 166 | 167 | 168 | def _build_object_type(ios_class: str): 169 | ''' 170 | Returns the object type based on `class` attribute 171 | 172 | Args: 173 | ios_class: the `class` attribute of an element 174 | Returns: 175 | The UIObjectType of the element 176 | 177 | ''' 178 | if ios_class.startswith("XCUIElementType"): 179 | widget_type = ios_class.split("XCUIElementType")[1] 180 | for obj_type in UIObjectType: 181 | if obj_type.name == widget_type.upper(): 182 | # logger.info(f"obj_type: {obj_type}") 183 | return obj_type 184 | return UIObjectType.BUTTON 185 | 186 | 187 | def _build_object_name(text, content_desc): 188 | ''' 189 | Returns the object name based on 'text' or 'context_desc' attribute 190 | Args: 191 | text: the `text` attribute of an element 192 | content_desc: the `content_desc` attribute of an element 193 | Returns: 194 | The object name 195 | ''' 196 | return text if text else content_desc 197 | 198 | 199 | def _build_bounding_box(bounds): 200 | ''' 201 | Returns the object bounding box based on `bounds` attribute 202 | 203 | Args: 204 | bounds the `b_ounds` attribute of an element 205 | 206 | Return: 207 | The BoundingBox Object 208 | ''' 209 | match = re.compile( 210 | r'\[\'(\d+)\', \'(\d+)\'\]\[\'(\d+)\', \'(\d+)\'\]').match(bounds) 211 | 212 | assert match 213 | x1, y1, x2, y2 = map(int, match.groups()) 214 | return BoundingBox(x1, y1, x2, y2) 215 | 216 | 217 | def _build_clickable(element, tree_child_as_clickable=True): 218 | '''' 219 | Returns whether the element is clickable based on certain attributes 220 | Args: 221 | element: The etree.element object 222 | tree_child_as_clickable: Whether to consider the tree child as clickable 223 | 224 | Returns: 225 | A boolean to indicate whether the element is clickable or one of its ancesors is 226 | basicallty given an element check if it is clickable or for the purposeo of this 
227 |     html representation.
228 |     '''
229 |     clickable = element.get('accessible')
230 |     if clickable == 'false':
231 |         for node in element.iterancestors():
232 |             if node.get('accessible') == 'true':
233 |                 clickable = 'true'  # keep the string form so strtobool() below works
234 |                 break
235 |     if element.get('accessible') == 'true':
236 |         clickable = 'true'
237 |     if tree_child_as_clickable:
238 |         p = element.getparent()
239 |         while p is not None:
240 |             if p.get('class') == 'android.widget.ListView':  # Android leftover; never matches in an XCUITest tree
241 |                 clickable = 'false'
242 |                 break
243 |             p = p.getparent()
244 | 
245 |     return strtobool(clickable)
246 | 
247 | 
248 | def _pixel_distance(a_x1, a_x2, b_x1, b_x2):
249 |     '''
250 |     Calculates the pixel distance between bounding boxes a and b
251 | 
252 |     Args:
253 |         a_x1: The x_1 coordinate of box a
254 |         a_x2: The x_2 coordinate of box a
255 |         b_x1: The x_1 coordinate of box b
256 |         b_x2: The x_2 coordinate of box b
257 | 
258 |     Returns:
259 |         The pixel distance between box a and b on the x axis. The distance
260 |         on the y axis can be calculated in the same way. The distance can be
261 |         a positive number (b is right of/below a) or negative
262 |         (b is left of/above a).
263 | 
264 |     The _pixel_distance function calculates the pixel distance between two bounding boxes (a and b) along the x-axis.
265 | 
266 |     Here's a breakdown:
267 | 
268 |     1. If box b is close enough to box a on the right side (distance is less than or equal to a threshold), it returns 1.
269 | 
270 |     2. If box b is close enough to box a on the left side (distance is less than or equal to a threshold), it returns -1.
271 | 
272 |     3. If box a and box b overlap on the x-axis, it returns 0.
273 | 
274 |     4. If box b is to the right of box a (b_x1 > a_x2), it returns the distance from the right side of box a to the left side of box b (b_x1 - a_x2).
275 | 
276 |     5. If none of the above conditions are met, box b is to the left of box a, and it returns the distance from the right side of box b to the left side of box a (b_x2 - a_x1).
277 | 
278 |     The function assumes that the x1 coordinate is the left side of a box and the x2 coordinate is the right side.
279 |     TL;DR: the function returns the distance between two bounding boxes along the x-axis, and if they are close enough it returns 1 or -1.
280 |     '''
281 | 
282 |     if b_x1 <= a_x2 and a_x2 - b_x1 <= ADJACENT_BOUNDING_BOX_THRESHOLD:
283 |         return 1
284 |     if a_x1 <= b_x2 and b_x2 - a_x1 <= ADJACENT_BOUNDING_BOX_THRESHOLD:
285 |         return -1
286 | 
287 |     # overlap
288 |     if (a_x1 <= b_x1 <= a_x2) or (a_x1 <= b_x2 <= a_x2) or (b_x1 <= a_x1 <= b_x2) or (b_x1 <= a_x2 <= b_x2):
289 |         return 0
290 |     elif b_x1 > a_x2:
291 |         return b_x1 - a_x2
292 |     else:
293 |         return b_x2 - a_x1
294 | 
295 | 
296 | def _grid_coordinate(x, width):
297 |     """Calculates the 3x3 grid coordinate on the x axis.
298 | 
299 |     The grid coordinate on the y axis is calculated in the same way.
300 | 
301 |     Args:
302 |         x: The x coordinate: [0, width).
303 |         width: The screen width.
304 | 
305 |     Returns:
306 |         The grid coordinate: [0, 2].
307 |         Note that the screen is divided into a 3x3 grid, so the grid coordinate
308 |         uses the numbers 0, 1, 2.
309 | """ 310 | logger.info(f"x: {x}, width: {width}") 311 | # assert 0 <= x <= width 312 | grid_x_0 = width / 3 313 | grid_x_1 = 2 * grid_x_0 314 | if 0 <= x < grid_x_0: 315 | grid_coordinate_x = 0 316 | elif grid_x_0 <= x < grid_x_1: 317 | grid_coordinate_x = 1 318 | else: 319 | grid_coordinate_x = 2 320 | return grid_coordinate_x 321 | 322 | 323 | def _grid_location(bbox, screen_width, screen_height): 324 | ''' 325 | Calculates teh grid number of the UI bounding box 326 | 327 | Args: 328 | bbox: The bounding box of the UI OBject 329 | screen_width: The width of the screen 330 | screen_height: The height of the screen 331 | 332 | Returns: 333 | The grid location number 334 | ''' 335 | bbox_center_x = (bbox.x1 + bbox.x2) / 2 336 | bbox_center_y = (bbox.y1 + bbox.y2) / 2 337 | bbox_grid_x = _grid_coordinate(bbox_center_x, screen_width) 338 | bbox_grid_y = _grid_coordinate(bbox_center_y, screen_height) 339 | return UIObjectGridLocation(bbox_grid_y * 3 + bbox_grid_x) 340 | 341 | 342 | def get_view_hiearchy_leaf_relation(objects, _screen_width, _screen_height): 343 | ''' 344 | Calculates teh adjacency relatio from list of view hierarchy leaf nodes 345 | Args: 346 | object: The list of view hierarchy leaf nodes 347 | _screen_width, _screen_width: Screen width and height 348 | 349 | Returns: 350 | An un-padded feature dictionary as follow: 351 | 'v_distance' 2d numpy array of ui object vertical adjancency relation 352 | 'h_distance' 2d numpy array of ui object horizontal adjacency relation 353 | 'dom_distance": 2d numpy array of ui object dom adjacency relation 354 | 355 | 356 | Adjacency matrix for vertical, horizontal, and dom relation 357 | 358 | ''' 359 | 360 | vh_node_num = len(objects) 361 | vertical_adjacency = np.zeros((vh_node_num, vh_node_num)) 362 | horizontal_adjacency = np.zeros((vh_node_num, vh_node_num)) 363 | 364 | for row in range(len(objects)): 365 | for column in range(len(objects)): 366 | if row == column: 367 | h_dist = v_dist = 0 368 | else: 369 | node1 = objects[row] 370 | node2 = objects[column] 371 | h_dist, v_dist = normalized_pixel_distance( 372 | node1, node2, _screen_width, _screen_height) 373 | 374 | vertical_adjacency[row][column] = v_dist 375 | horizontal_adjacency[row][column] = h_dist 376 | return { 377 | 'v_distance': vertical_adjacency, 378 | 'h_distance': horizontal_adjacency 379 | } 380 | 381 | 382 | def normalized_pixel_distance(node1, node2, _screen_width, _screen_height): 383 | ''' 384 | Caclulates teh normalized 385 | 386 | Args: 387 | node1, node2: Another object 388 | 389 | Reutrns: 390 | Normalized pixel distance on both horizontal and vertical direction 391 | ''' 392 | node1_x_1 = int(node1.get('x')) 393 | 394 | node1_x_2 = node1_x_1 + int(node1.get('width')) 395 | 396 | node1_y_1 = int(node1.get('y')) 397 | node1_y_2 = node1_y_1 + int(node1.get('height')) 398 | node2_x_1 = int(node2.get('x')) 399 | 400 | node2_x_2 = node2_x_1 + int(node2.get('width')) 401 | 402 | node2_y_1 = int(node2.get('y')) 403 | 404 | node2_y_2 = node2_y_1 + int(node2.get('height')) 405 | 406 | h_distance = _pixel_distance(node1_x_1, node1_x_2, node2_x_1, node2_x_2) 407 | 408 | v_distance = _pixel_distance(node1_y_1, node1_y_2, node2_y_1, node2_y_2) 409 | 410 | return float(h_distance) / _screen_width, float(v_distance) / _screen_height 411 | 412 | 413 | def _build_neighbors(node, view_hierarchy_leaf_nodes, 414 | _screen_width, _screen_height): 415 | ''' 416 | Builds the neighbors of a node based on the view hierarchy leaf nodes 417 | 418 | Args: 419 | node: The etree element 
object 420 | view_hierarchy_leaf_nodes: The list of view hierarchy leaf nodes 421 | _screen_width: The screen width 422 | _screen_height: The screen height 423 | 424 | Returns: 425 | A list of neighbors of the node 426 | ''' 427 | if view_hierarchy_leaf_nodes is None: 428 | return None 429 | 430 | vh_relation = get_view_hiearchy_leaf_relation( 431 | view_hierarchy_leaf_nodes, _screen_width, _screen_height) 432 | _neighbor = _get_single_direction_neighbors( 433 | view_hierarchy_leaf_nodes, 434 | vh_relation['v_distance'], 435 | vh_relation['h_distance'], 436 | ) 437 | for k, v in _neighbor.items(): 438 | _neighbor[k] = view_hierarchy_leaf_nodes[v].get('pointer') 439 | return _neighbor 440 | 441 | 442 | def _get_single_direction_neighbors(object_idx, ui_v_dist, ui_h_dist): 443 | ''' 444 | Gets four single direction neighbor for one target ui_object 445 | 446 | Args: 447 | object_idx: The index of the target ui_object 448 | ui_v_dist: The vertical adjacency matrix 449 | ui_h_dist: The horizontal adjacency matrix 450 | 451 | Returns: 452 | A dictionary of the four single direction neighbors 453 | 454 | ''' 455 | neighbor_dict = {} 456 | vertical_distance = ui_v_dist[object_idx] 457 | horizontal_distance = ui_h_dist[object_idx] 458 | bottom_neighbor = np.array([ 459 | idx for idx in range(len(vertical_distance)) if vertical_distance[idx] > 0 and 460 | abs(horizontal_distance[idx]) < NORM_HORIZONTAL_NEIGHTBOR_MARGIN 461 | ]) 462 | top_neighbor = np.array([ 463 | idx for idx in range(len(vertical_distance)) if vertical_distance[idx] < 0 and 464 | abs(horizontal_distance[idx]) < NORM_HORIZONTAL_NEIGHTBOR_MARGIN 465 | ]) 466 | right_neighbor = np.array([ 467 | idx for idx in range(len(horizontal_distance)) if horizontal_distance[idx] > 0 and 468 | abs(vertical_distance[idx]) < NORM_VERTICAL_NEIGHTBOR_MARGIN 469 | ]) 470 | left_neighbor = np.array([ 471 | idx for idx in range(len(horizontal_distance)) if horizontal_distance[idx] < 0 and 472 | abs(vertical_distance[idx]) < NORM_VERTICAL_NEIGHTBOR_MARGIN 473 | ]) 474 | 475 | if bottom_neighbor.size: 476 | neighbor_dict['top'] = bottom_neighbor[ 477 | np.argmin(vertical_distance[bottom_neighbor])] 478 | if top_neighbor.size: 479 | neighbor_dict['bottom'] = top_neighbor[np.argmax( 480 | vertical_distance[top_neighbor])] 481 | if right_neighbor.size: 482 | neighbor_dict['left'] = right_neighbor[np.argmax( 483 | horizontal_distance[right_neighbor])] 484 | if left_neighbor.size: 485 | neighbor_dict['right'] = left_neighbor[np.argmin( 486 | horizontal_distance[left_neighbor])] 487 | 488 | return neighbor_dict 489 | 490 | 491 | def _build_etree_from_json(root, json_dict): 492 | ''' 493 | Builds teh element tree from json_dict 494 | 495 | Args: 496 | root: The current etree root node 497 | json_dict: The current json_dict corresponding ot the etree root node 498 | 499 | 500 | ''' 501 | 502 | if root is None or json_dict is None: 503 | return 504 | x1, y1, x2, y2 = json_dict.get('bounds', [0, 0, 0, 0]) 505 | root.set('bounds', '[%d, %d, %d, %d]' % (x1, y1, x2, y2)) 506 | root.set('class', json_dict.get('class', '')) 507 | root.set('type', json_dict.get('type', '')) 508 | 509 | root.set('text', json_dict.get('text', '').replace('\x00', '')) 510 | 511 | root.set('resource-id', json_dict.get('resource-id', '')) 512 | 513 | root.set('content-desc', json_dict.get('content-desc', [None])) 514 | root.set('package', json_dict.get('package', '')) 515 | root.set('visible', str(json_dict.get('displayed', True))) 516 | root.set('enable', str(json_dict.get('enabled', 
False))) 517 | root.set('focusable', str(json_dict.get('focusable', False))) 518 | root.set('focused', str(json_dict.get('focused', False))) 519 | 520 | root.set('scrollable', 521 | str( 522 | json_dict.get('scrollable-horizontal', False) or 523 | json_dict.get('scrollable-vertical', False) 524 | )) 525 | root.set('clickable', str(json_dict.get('clickable', False))) 526 | root.set('long-clickable', str(json_dict.get('long-clickable', False))) 527 | 528 | root.set('selected', str(json_dict.get('selected', False))) 529 | 530 | root.set('pointer', json_dict.get('pointer', '')) 531 | 532 | if 'children' in json_dict: 533 | for child in json_dict['children']: 534 | child_element = etree.Element('node') 535 | root.append(child_element) 536 | _build_etree_from_json(child_element, child) 537 | 538 | 539 | class LeafNode(object): 540 | ''' 541 | Represent a leaf node in the view hierachy 542 | ''' 543 | 544 | def __init__( 545 | self, 546 | element, 547 | all_elements=None, 548 | dom_location=None, 549 | screen_width=SCREEN_WIDTH, 550 | screen_height=SCREEN_HEIGHT, 551 | ): 552 | ''' 553 | Constructor. 554 | 555 | Args: 556 | 557 | element: the etree.Element object 558 | all_element: All the etree.Element objects in the view hierarchy 559 | dom_location: [depth, preorder-index, postorder-index] of element 560 | screen_width: The width of the screen associated with the element 561 | screen_height: The height of the screen associated with the element 562 | ''' 563 | 564 | assert not len(element) 565 | self.element = element 566 | 567 | self._screen_width = screen_width 568 | 569 | self._screen_height = screen_height 570 | 571 | x_1 = str(max(0, int(element.get('x')))) 572 | y_1 = str(max(0, int(element.get('y')))) 573 | x_2 = str(int(x_1) + int(element.get('width'))) 574 | y_2 = str(int(y_1) + int(element.get('height'))) 575 | 576 | inits = str([x_1, y_1]) 577 | ends = str([x_2, y_2]) 578 | bounds = str(inits) + str(ends) 579 | 580 | bbox = _build_bounding_box(bounds) 581 | 582 | self.uiobject = UiObject( 583 | obj_type=_build_object_type(element.get('type')), 584 | content_desc=element.get('content-desc', default='').split('.')[-1] 585 | if '.' 
in element.get('name', default='') else element.get('name', default=''),
586 |             obj_name=_build_object_name(
587 |                 text=element.get('name', default=''),
588 |                 content_desc=element.get('content-desc', default='')
589 |             ),
590 |             word_sequence=_build_word_sequence(
591 |                 text=element.get(
592 |                     'text', default=''
593 |                 ),
594 |                 content_desc=element.get(
595 |                     'content-desc', default=''
596 |                 ),
597 |                 resource_id=element.get('resource-id', default='')
598 | 
599 |             ),
600 |             text=element.get('label', default=''),
601 |             accesible=element.get('accessible', default='true'),
602 | 
603 |             ios_class=element.get('type', default=''),
604 |             visible=strtobool(element.get('visible', default='true')),
605 |             enabled=strtobool(element.get('enabled', default='true')),
606 |             bounding_box=bbox,
607 |             grid_location=_grid_location(bbox, self._screen_width, self._screen_height),
608 |             dom_location=dom_location,
609 |             pointer=element.get('pointer', default=''),
610 |             neighbors=_build_neighbors(element, all_elements, self._screen_width, self._screen_height),
611 | 
612 |         )
613 | 
614 |     def dom_distance(self, other_node):
615 |         '''
616 |         Calculates the dom distance between two nodes
617 |         Args:
618 |             other_node: Another LeafNode
619 |         Returns: the dom distance
620 |         '''
621 |         intersection = [
622 |             node for node in self.element.iterancestors()
623 |             if node in other_node.element.iterancestors()
624 |         ]
625 |         assert intersection
626 |         ancestor_list = list(self.element.iterancestors())
627 | 
628 |         other_ancestor_list = list(other_node.element.iterancestors())
629 | 
630 |         return ancestor_list.index(
631 |             intersection[0]) + other_ancestor_list.index(intersection[0]) + 1
632 | 
633 | 
634 | class ViewHierarchy(object):
635 |     '''
636 |     Represents the view hierarchy from XCUITest
637 |     '''
638 | 
639 |     def __init__(self, screen_width=SCREEN_WIDTH, screen_height=SCREEN_HEIGHT):
640 |         '''
641 |         Constructor
642 | 
643 |         Args:
644 | 
645 |             screen_width: The pixel width of the screen
646 |             screen_height: The pixel height of the screen
647 |         '''
648 | 
649 |         self._root = None
650 |         self._root_element = None
651 | 
652 |         self._all_visible_leaves = []
653 | 
654 |         self._dom_location_dict = None
655 |         self._preorder_index = 0
656 |         self._postorder_index = 0
657 | 
658 |         self._screen_width = screen_width
659 |         self._screen_height = screen_height
660 | 
661 |     def load_xml(self, xml_content):
662 |         '''
663 |         Builds the etree from xml content
664 |         Args:
665 |             xml_content: The string containing xml content
666 |         '''
667 |         self._root = etree.XML(xml_content)
668 | 
669 |         self._root_element = self._root[0]
670 |         self._all_visible_leaves = self._get_visible_leaves()
671 | 
672 |         self._dom_location_dict = self._calculate_dom_location()
673 | 
674 |     def load_json(self, json_content):
675 |         '''
676 |         Builds the etree from json content
677 |         Args:
678 |             json_content: The string containing json content
679 |         '''
680 |         json_dict = json.loads(json_content)
681 |         if not json_dict:
682 |             raise ValueError('The json content is empty')
683 | 
684 |         self._root = etree.Element('hierarchy', rotation='0')
685 |         self._root_element = etree.Element('node')
686 |         self._root.append(self._root_element)
687 |         _build_etree_from_json(self._root_element, json_dict['activity']['root'])
688 | 
689 |         self._all_visible_leaves = self._get_visible_leaves()
690 | 
691 |         self._dom_location_dict = self._calculate_dom_location()
692 | 
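    # Usage sketch (illustrative; the XML path below is hypothetical, and the
    # screen size matches the XML_SCREEN_* constants at the top of this file):
    #
    #   vh = ViewHierarchy(screen_width=430, screen_height=932)
    #   with open('page_source.xml', 'rb') as f:
    #       vh.load_xml(f.read())
    #   for obj in vh.get_ui_objects():
    #       print(obj.obj_type.name, obj.bounding_box)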
693 |     def get_leaf_nodes(self):
694 |         '''
695 |         Returns all the leaf nodes in the view hierarchy
696 | 
697 |         '''
698 |         return [
699 | 
700 |             LeafNode(
701 |                 element,
702 |                 self._all_visible_leaves,
703 |                 self._dom_location_dict[id(element)],
704 |                 self._screen_width,
705 |                 self._screen_height
706 |             )
707 |             for element in self._all_visible_leaves
708 |         ]
709 | 
710 |     def get_ui_objects(self):
711 |         '''
712 |         Returns a list of all UI objects represented by leaf nodes
713 |         '''
714 |         return [
715 |             LeafNode(element, self._all_visible_leaves, self._dom_location_dict[id(element)], self._screen_width, self._screen_height).uiobject
716 |             for element in self._all_visible_leaves
717 |         ]
718 | 
719 |     def dedup(self, click_x_and_y):
720 |         '''
721 |         Dedup UI objects with the same text or content_desc
722 |         Args:
723 |             click_x_and_y: The click x and y coordinates
724 |         '''
725 |         click_x, click_y = click_x_and_y
726 | 
727 |         name_element_map = collections.defaultdict(list)
728 | 
729 |         for element in self._all_visible_leaves:
730 |             name = _build_object_name(
731 |                 element.get('text'),
732 |                 element.get('content-desc')
733 |             )
734 |             name_element_map[name].append(element)
735 | 
736 |         def delete_element(element):
737 |             element.getparent().remove(element)
738 | 
739 |         for name, elements in name_element_map.items():
740 |             if not name:
741 |                 continue
742 |             target_index = None
743 |             for index, element in enumerate(elements):
744 |                 box = _build_bounding_box(element.get('bounds'))
745 |                 if (box.x1 <= click_x <= box.x2) and (box.y1 <= click_y <= box.y2):
746 |                     target_index = index
747 |                     break
748 | 
749 |             if target_index is None:
750 |                 for ele in elements[1:]:
751 |                     delete_element(ele)
752 |             else:
753 |                 for ele in elements[:target_index] + elements[target_index + 1:]:
754 |                     delete_element(ele)
755 | 
756 |             print('Dedup %d elements' % (len(elements) - 1))
757 | 
758 |         self._all_visible_leaves = self._get_visible_leaves()
759 |         self._dom_location_dict = self._calculate_dom_location()
760 | 
761 |     def _get_visible_leaves(self):
762 |         '''
763 |         Gets all visible leaves from the view hierarchy
764 |         Returns:
765 |             all_visible_leaves: The list of the visible leaf elements
766 |         '''
767 |         all_elements = [element for element in self._root.iter('*')]
768 |         button_elements = [element for element in all_elements if element.get('type') == 'XCUIElementTypeButton']
769 | 
770 |         for button in button_elements:
771 |             self._make_button_a_leaf(button)
772 |         all_elements = [element for element in self._root.iter('*')]  # re-walk after pruning so detached children are excluded
773 |         all_visible_leaves = [
774 | 
775 |             element for element in all_elements if self._is_leaf(element) and
776 |             strtobool(element.get('visible', default='true')) and
777 |             self._is_within_screen_bound(element)
778 |         ]
779 | 
780 |         return all_visible_leaves
781 | 
782 |     def _make_button_a_leaf(self, element):
783 |         '''
784 |         If an element is a button, remove its children
785 |         '''
786 |         if element.get('type') == 'XCUIElementTypeButton':
787 |             for child in element.findall('*'):
788 |                 element.remove(child)
789 | 
790 |     def _calculate_dom_location(self):
791 |         '''
792 |         Calculates [depth, preorder-index, postorder-index] for each element.
793 | 
794 |         This method is not thread safe if multiple threads call it on the same ViewHierarchy object.
795 | 
796 |         Returns:
797 |             dom_location_dict, dict of
798 |             {
799 |                 id(element): [depth, preorder-index, postorder-index]
800 |             }
801 |         '''
802 |         dom_location_dict = collections.defaultdict(lambda: [None, None, None])
803 |         for element in self._all_visible_leaves:
804 |             ancestors = [node for node in element.iterancestors()]
805 |             dom_location_dict[id(element)][DomLocationKey.DEPTH.value] = len(ancestors)
806 | 
807 |         self._preorder_index = 0
808 |         self._preorder_iterate(self._root, dom_location_dict)
809 |         self._postorder_index = 0
809 | self._postorder_iterate(self._root, dom_location_dict) 810 | return dom_location_dict 811 | 812 | def _preorder_iterate(self, element, dom_location_dict): 813 | ''' 814 | Preorder traversal on the view hierarchy tree 815 | ARGS: 816 | element: The current etree element 817 | dom_location_dict: The dict of dom location info 818 | 819 | ''' 820 | if self._is_leaf(element): 821 | dom_location_dict[id(element)][DomLocationKey.PREORDER_INDEX.value] = self._preorder_index 822 | self._preorder_index += 1 823 | for child in element: 824 | if child.getparent() == element: 825 | self._preorder_iterate(child, dom_location_dict) 826 | 827 | def _postorder_iterate(self, element, dom_location_dict): 828 | ''' 829 | Postorder traversal on the view hierarchy tree 830 | Args: 831 | element: The current etree element 832 | dom_location_dict: The dict of dom location info 833 | ''' 834 | for child in element: 835 | if child.getparent() == element: 836 | self._postorder_iterate(child, dom_location_dict) 837 | if self._is_leaf(element): 838 | dom_location_dict[id(element)][DomLocationKey.POSTORDER_INDEX.value] = self._postorder_index 839 | self._postorder_index += 1 840 | 841 | def _is_leaf(self, element): 842 | return not element.findall('.//*') 843 | 844 | def _is_within_screen_bound(self, element): 845 | ''' 846 | Checks if the element is within the screen bound 847 | Args: 848 | element: The etree element object 849 | Returns: 850 | A boolean to indicate whether the element is within the screen bound 851 | ''' 852 | x_1 = str(max(0, int(element.get('x')))) 853 | 854 | y_1 = str(max(0, int(element.get('y')))) 855 | 856 | x_2 = str(int(x_1) + int(element.get('width'))) 857 | 858 | y_2 = str(int(y_1) + int(element.get('height'))) 859 | # logger.info(x_1) 860 | inits = str([x_1, y_1]) 861 | 862 | ends = str([x_2, y_2]) 863 | 864 | bbox = _build_bounding_box(inits + ends) 865 | 866 | in_x = (0 <= bbox.x1 <= self._screen_width) or (0 <= bbox.x2 <= self._screen_width) 867 | 868 | in_y = (0 <= bbox.y1 <= self._screen_height) or (0 <= bbox.y2 <= self._screen_height) 869 | 870 | x1_less_than_x2 = bbox.x1 < bbox.x2 871 | 872 | y1_less_than_y2 = bbox.y1 < bbox.y2 873 | 874 | return in_x and in_y and x1_less_than_x2 and y1_less_than_y2 875 | 876 | 877 | class UI: 878 | def __init__(self, xml_file): 879 | self.xml_file = xml_file 880 | self.elements = { 881 | } 882 | 883 | def sortchildrenby_viewhierarchy(self, view, attr="bounds"): 884 | if attr == "bounds": 885 | bounds = [ 886 | (ele.uiobject.bounding_box.x1, ele.uiobject.bounding_box.y1, ele.uiobject.bounding_box.x2, ele.uiobject.bounding_box.y2) 887 | for ele in view 888 | ] 889 | sorted_bounds_index = [ 890 | bounds.index(i) for i in sorted( 891 | bounds, key=lambda x: (x[1], x[0]) 892 | ) 893 | ] 894 | sort_children = [view[i] for i in sorted_bounds_index] 895 | view[:] = sort_children 896 | 897 | def encoding(self): 898 | ''' 899 | Encodes the UI into a string representation 900 | 901 | Returns: 902 | the string representation of the UI 903 | ''' 904 | with open(self.xml_file, 'r', encoding='utf-8') as f: 905 | xml_content = f.read().encode() 906 | 907 | vh = ViewHierarchy( 908 | screen_width=XML_SCREEN_WIDTH, 909 | screen_height=XML_SCREEN_HEIGHT 910 | ) 911 | vh.load_xml(xml_content) 912 | view_hierarchy_leaf_nodes = vh.get_leaf_nodes() 913 | # logger.info(view_hierarchy_leaf_nodes) 914 | self.sortchildrenby_viewhierarchy( 915 | view_hierarchy_leaf_nodes, 916 | attr="bounds") 917 | 918 | codes = '' 919 | for _id, ele in 
enumerate(view_hierarchy_leaf_nodes):
920 |             obj_type_str = ele.uiobject.obj_type.name
921 |             text = ele.uiobject.text
922 |             text = text.replace('\n', ' ')
923 | 
924 |             resource_id = ele.uiobject.obj_name
925 | 
926 |             content_desc = ele.uiobject.content_desc
927 |             # logger.info(resource_id)
928 |             # logger.info(content_desc)
929 | 
930 |             html_code = self.element_encoding(
931 |                 _id=_id,
932 |                 _obj_type=obj_type_str,
933 |                 _text=text,
934 |                 _content_desc=content_desc,
935 |                 _resource_id=resource_id
936 |             )
937 | 
938 |             codes += html_code if html_code else ''
939 |             self.elements[_id] = ele.uiobject
940 | 
941 |         codes = "<html>\n" + codes + "</html>"
942 | 
943 |         return codes
944 | 
945 |     def action_encoding(self):
946 |         '''
947 |         Get a heuristic of the possible actions output
948 |         {action_type: type, encoding}
949 |         '''
950 |         pass
951 | 
952 |     def element_encoding(self,
953 |                          _id,
954 |                          _obj_type,
955 |                          _text,
956 |                          _content_desc,
957 |                          _resource_id):
958 |         '''
959 |         Encodes the element into a string representation
960 | 
961 |         Args:
962 |             _id: The id of the element
963 |             _obj_type: The type of the element
964 |             _text: The text of the element
965 |             _content_desc: The content description of the element
966 |             _resource_id: The resource id of the element
967 | 
968 |         Returns:
969 |             The string representation of the element
970 |         '''
971 |         _class = _resource_id.split('.')[-1] if '.' in _resource_id else _resource_id
972 |         _text = _text.strip()
973 |         # logger.info(_id)
974 |         # logger.info(_obj_type)
975 | 
976 |         assert _obj_type in CLASS_MAPPING.keys()
977 | 
978 |         tag = CLASS_MAPPING[_obj_type]
979 | 
980 |         if _obj_type == 'None':
981 |             tag = ''
982 |         code = ''
983 |         if _obj_type == "SWITCH":  # _obj_type is the enum *name* (e.g. "SWITCH"), not "XCUIElementTypeSwitch"
984 |             code = f'<input id="{_id}" type="checkbox">\n'
985 |             code += f'<label for="{_id}">{_text}</label>\n'
986 | 
987 |         elif _obj_type == "IMAGE":
988 |             if _class == "":
989 |                 code = f'<img id="{_id}" alt="{_text}">\n'
990 |             else:
991 |                 code = f'<img id="{_id}" class="{_class}" alt="{_text}">\n'
992 |         else:
993 |             _text = _content_desc if _text == "" else _text
994 |             if _class == "":
995 |                 code = f'<{tag} id="{_id}">{_text}</{tag}>\n'
996 |             else:
997 |                 code = f'<{tag} id="{_id}" class="{_class}">{_text}</{tag}>\n'
998 |         return code
999 | 
--------------------------------------------------------------------------------
/cognisim/device/ios_device.py:
--------------------------------------------------------------------------------
1 | from cognisim.device.device import Device
2 | 
3 | 
4 | class IOSDevice(Device):
5 |     def __init__(self, app_start_url=""):
6 |         pass
7 | 
8 |     def get_state(self):
9 |         # TODO: Implement get_state for iOS device
10 |         pass
11 | 
12 |     def tap(self, x, y):
13 |         # TODO: Implement tap for iOS device
14 |         pass
15 | 
16 |     def input(self, x, y, text):
17 |         # TODO: Implement input for iOS device
18 |         pass
19 | 
20 |     def swipe(self, x, y, direction):
21 |         # TODO: Implement swipe for iOS device
22 |         pass
23 | 
--------------------------------------------------------------------------------
/cognisim/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Android Emulator Config
2 | SCREEN_WIDTH = 1080
3 | SCREEN_HEIGHT = 1920
4 | SCREEN_CHANNEL = 4
5 | SCREEN_TOP_HEAD = 63
6 | SCREEN_BOTTOM_HEAD = 126
7 | # screen config
8 | ADJACENT_BOUNDING_BOX_THRESHOLD = 3
9 | NORM_VERTICAL_NEIGHBOR_MARGIN = 0.01
10 | NORM_HORIZONTAL_NEIGHBOR_MARGIN = 0.01
11 | INPUT_ACTION_UPSAMPLE_RATIO = 1
12 | # XML screen config
13 | XML_SCREEN_WIDTH = 1440
14 | XML_SCREEN_HEIGHT = 2960
15 | 
16 | # Max number of reflections before going to next step
17 | 
18 | MAX_REFLECTIONS = 5
19 | 
20 | # PLAYWRIGHT TIMEOUTS
21 | BOUNDING_BOX_TIMEOUT = 3000
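# Sketch of how the two resolutions above relate (illustrative; no such helper
# exists in this repo): bounds parsed from a 1440x2960 XML dump would map onto
# the 1080x1920 emulator screen with a linear rescale.
#
#   def xml_to_screen(x, y):
#       return (int(x * SCREEN_WIDTH / XML_SCREEN_WIDTH),
#               int(y * SCREEN_HEIGHT / XML_SCREEN_HEIGHT))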
22 | 
--------------------------------------------------------------------------------
/cookbook/agentic_example.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import base64
3 | import io
4 | import json
5 | import os
6 | from datetime import datetime
7 | from typing import Any, Dict
8 | 
9 | import openai
10 | from loguru import logger
11 | from openai import OpenAI
12 | from PIL import Image
13 | 
14 | from cognisim import mobileadapt
15 | 
16 | openai.api_key = ""
17 | 
18 | 
19 | def llm_call(html_state: str, image: bytes, nlp_task: str):
20 |     client = OpenAI()
21 | 
22 |     function_call_instruction_guided_replay = {
23 |         "name": "run_step",
24 |         "description": "Based on the current step and the current state, return the next action to take",
25 |         "parameters": {
26 |             "type": "object",
27 |             "properties": {
28 |                 "reasoning": {
29 |                     "type": "string",
30 |                     "description": "The reasoning for the action to be performed in the current step",
31 |                 },
32 |                 "action_type": {
33 |                     "type": "string",
34 |                     "description": "The type of action to be performed",
35 |                     "enum": ["tap", "input", "swipe", "validate", "scroll"],
36 |                 },
37 |                 "action_id": {
38 |                     "type": "integer",
39 |                     "description": "The id of the action to be performed in the current step based on the current state",
40 |                 },
41 |                 "value": {
42 |                     "type": "string",
43 |                     "description": "The value to be inputted if action_type is input or the text to be validated if action_type is validate",
44 |                 },
45 |                 "direction": {
46 |                     "type": "string",
47 |                     "description": "The direction to be swiped if action_type is swipe",
48 |                     "enum": ["up", "down", "left", "right"],
49 |                 },
50 |             },
51 |             "required": ["action_type", "action_id", "reasoning"],
52 |         },
53 |     }
54 | 
55 |     response = client.chat.completions.create(
56 |         model="gpt-4o-2024-08-06",
57 |         messages=[
58 |             {
59 |                 "role": "system",
60 |                 "content": "You are an AI assistant that helps with mobile app testing.",
61 |             },
62 |             {
63 |                 "role": "user",
64 |                 "content": [
65 |                     {
66 |                         "type": "text",
67 |                         "text": f"Given the following task: {nlp_task}\n\nAnd the current state of the app:\n\nHTML: {html_state}",
68 |                     },
69 |                     {
70 |                         "type": "image_url",
71 |                         "image_url": {
72 |                             "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}"
73 |                         },
74 |                     },
75 |                 ],
76 |             },
77 |         ],
78 |         functions=[function_call_instruction_guided_replay],
79 |         function_call={"name": "run_step"},
80 |     )
81 | 
82 |     return json.loads(response.choices[0].message.function_call.arguments)
83 | 
84 | 
85 | async def main():
86 | 
87 |     android_device = mobileadapt(platform="android")
88 |     # Start device
89 |     await android_device.start_device()
90 | 
91 |     encoded_ui, screenshot, ui = await android_device.get_state()
92 | 
93 |     # Open the app (Flexify - https://f-droid.org/en/packages/com.presley.flexify/)
94 |     await android_device.navigate("com.presley.flexify")
95 | 
96 |     # Press the button with the text 'Add a new task'
97 | 
98 |     encoded_ui, screenshot, ui = await android_device.get_state()
99 | 
100 |     # Create set of mark screenshot
101 |     set_of_mark: bytes = android_device.generate_set_of_mark(ui, screenshot)
102 | 
103 |     action_grounded: Dict[str, Any] = llm_call(
104 |         html_state=encoded_ui,
105 |         image=set_of_mark,
106 |         nlp_task="Press the button with the text 'Add a new task'",
107 |     )
108 | 
109 |     await android_device.perform_action(action_grounded)
110 | 
111 |     encoded_ui, screenshot, ui = await android_device.get_state()
112 | 
113 |     # save set of mark screens
114 | 
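    # One way to persist the annotated screenshot from above (illustrative,
    # not part of the original example; assumes `set_of_mark` still holds the
    # PNG bytes returned by generate_set_of_mark):
    #
    #   with open("set_of_mark.png", "wb") as f:
    #       f.write(set_of_mark)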
await android_device.stop_device() 116 | await android_device.start_device() 117 | 118 | 119 | if __name__ == "__main__": 120 | asyncio.run(main()) 121 | -------------------------------------------------------------------------------- /cookbook/examplescript2.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import io 3 | import os 4 | from datetime import datetime 5 | 6 | from PIL import Image 7 | 8 | from cognisim import mobileadapt 9 | 10 | 11 | async def save_screenshot(screenshot_data, filename): 12 | image = Image.open(io.BytesIO(screenshot_data)) 13 | image.save(filename) 14 | 15 | 16 | async def perform_actions(device): 17 | # Tap actions 18 | await device.tap(200, 300) 19 | print("Tapped at (200, 300)") 20 | await device.tap(100, 400) 21 | print("Tapped at (100, 400)") 22 | 23 | # Swipe actions 24 | await device.swipe("up") 25 | print("Swiped up") 26 | await device.swipe("down") 27 | print("Swiped down") 28 | await device.swipe("left") 29 | print("Swiped left") 30 | await device.swipe("right") 31 | print("Swiped right") 32 | 33 | # Input text 34 | await device.input(150, 500, "Hello, MobileAdapt!") 35 | print("Input text at (150, 500)") 36 | 37 | 38 | async def main(): 39 | android_device = mobileadapt(platform="android") 40 | await android_device.start_device() 41 | 42 | # Perform initial state capture 43 | encoded_ui, screenshot, ui = await android_device.get_state() 44 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 45 | filename = os.path.join( 46 | os.path.dirname(__file__), f"screenshot_initial_{timestamp}.png" 47 | ) 48 | await save_screenshot(screenshot, filename) 49 | print(f"Initial screenshot saved as {filename}") 50 | print("Initial UI state:", encoded_ui) 51 | 52 | # Perform a series of actions and capture states 53 | for i in range(3): 54 | print(f"\nPerforming action set {i+1}") 55 | await perform_actions(android_device) 56 | 57 | encoded_ui, screenshot, ui = await android_device.get_state() 58 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 59 | filename = os.path.join( 60 | os.path.dirname(__file__), f"screenshot_action{i+1}_{timestamp}.png" 61 | ) 62 | await save_screenshot(screenshot, filename) 63 | print(f"Screenshot after action set {i+1} saved as {filename}") 64 | print(f"UI state after action set {i+1}:", encoded_ui) 65 | 66 | # Additional complex interaction 67 | print("\nPerforming additional complex interaction") 68 | await android_device.tap(300, 300) 69 | await android_device.swipe("up") 70 | await android_device.input(200, 600, "Complex interaction") 71 | await android_device.swipe("left") 72 | await android_device.tap(150, 450) 73 | 74 | # Capture final state 75 | encoded_ui, screenshot, ui = await android_device.get_state() 76 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 77 | filename = os.path.join( 78 | os.path.dirname(__file__), f"screenshot_final_{timestamp}.png" 79 | ) 80 | await save_screenshot(screenshot, filename) 81 | print(f"Final screenshot saved as {filename}") 82 | print("Final UI state:", encoded_ui) 83 | 84 | 85 | if __name__ == "__main__": 86 | asyncio.run(main()) 87 | -------------------------------------------------------------------------------- /cookbook/smoke_example_android.py.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import io 4 | import os 5 | from datetime import datetime 6 | 7 | from loguru import logger 8 | from PIL import Image 9 | 10 | from cognisim import 
11 | 12 | """From the root directory, run this script with: 13 | python cookbook/smoke_example_android.py.py 14 | """ 15 | 16 | 17 | async def save_screenshot(screenshot_data, filename): 18 | # Open the screenshot data as an image and save it 19 | image = Image.open(io.BytesIO(screenshot_data)) 20 | image.save(filename) 21 | 22 | 23 | async def main(): 24 | # Create an Android device instance 25 | android_device = mobileadapt(platform="android") 26 | 27 | # Initialize the device (starts the Appium session) 28 | await android_device.start_device() 29 | 30 | # Get the current state of the device 31 | encoded_ui, screenshot, ui = await android_device.get_state() 32 | logger.info(f"Current state: {encoded_ui}") 33 | 34 | # Save the first screenshot 35 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 36 | # filename1 = os.path.join(os.path.dirname(__file__), f"screenshot_before_{timestamp}.png") 37 | # await save_screenshot(screenshot, filename1) 38 | # print(f"Screenshot saved as {filename1}") 39 | 40 | # Perform a tap action at coordinates (100, 100) 41 | await android_device.tap(100, 100) 42 | 43 | # Get the state again after the tap action 44 | new_encoded_ui, new_screenshot, new_ui = await android_device.get_state() 45 | print("New state after tap:", new_encoded_ui) 46 | 47 | # Save the second screenshot 48 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 49 | filename2 = os.path.join( 50 | os.path.dirname(__file__), f"screenshot_after_{timestamp}.png" 51 | ) 52 | await save_screenshot(new_screenshot, filename2) 53 | print(f"Screenshot saved as {filename2}") 54 | 55 | 56 | if __name__ == "__main__": 57 | # Run the main function asynchronously 58 | asyncio.run(main()) 59 | -------------------------------------------------------------------------------- /cookbook/smoke_example_ios.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import base64 4 | import io 5 | import os 6 | 7 | from datetime import datetime 8 | 9 | from PIL import Image 10 | from loguru import logger 11 | from cognisim import mobileadapt 12 | 13 | 14 | async def main(): 15 | 16 | ios_device = mobileadapt(platform="ios") 17 | 18 | await ios_device.start_device() 19 | 20 | 21 | encoded_ui, screenshot, ui = await ios_device.get_state() 22 | logger.info(f"Current state: {encoded_ui}") 23 | 24 | if __name__ == "__main__": 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /deploy/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | # Regular Colors 6 | Green='\033[0;32m' 7 | Yellow='\033[0;33m' 8 | Red='\033[0;31m' 9 | NC='\033[0m' # No Color 10 | 11 | # Change to the script's directory 12 | cd "$(dirname "$0")" 13 | 14 | is_command_present() { 15 | type "$1" >/dev/null 2>&1 16 | } 17 | 18 | check_os() { 19 | if [[ "$OSTYPE" == "darwin"* ]]; then 20 | echo "macOS detected" 21 | package_manager="brew" 22 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then 23 | echo "Linux detected" 24 | if is_command_present apt-get; then 25 | package_manager="apt-get" 26 | elif is_command_present yum; then 27 | package_manager="yum" 28 | else 29 | echo "Unsupported package manager. Please install Python3, pip3, and npm manually."
30 | exit 1 31 | fi 32 | else 33 | echo "Unsupported OS" 34 | exit 1 35 | fi 36 | } 37 | 38 | request_sudo() { 39 | if [[ $EUID != 0 ]]; then 40 | sudo_cmd="sudo" 41 | echo "We need sudo access to complete the installation." 42 | sudo -v 43 | fi 44 | } 45 | 46 | install_dependencies() { 47 | echo "Installing dependencies..." 48 | if [[ $package_manager == "brew" ]]; then 49 | brew install python node 50 | elif [[ $package_manager == "apt-get" ]]; then 51 | $sudo_cmd $package_manager update 52 | $sudo_cmd $package_manager install -y python3 python3-pip nodejs npm 53 | else 54 | $sudo_cmd $package_manager install -y python3 python3-pip nodejs npm 55 | fi 56 | } 57 | 58 | install_python_dependencies() { 59 | echo "Setting up Python virtual environment..." 60 | python3 -m venv venv 61 | source venv/bin/activate 62 | 63 | echo "Upgrading pip..." 64 | python3 -m pip install --upgrade pip 65 | 66 | echo "Installing Python dependencies..." 67 | python3 -m pip install -r ../requirements.txt 68 | } 69 | 70 | install_appium() { 71 | echo "Installing Appium..." 72 | $sudo_cmd npm install -g appium 73 | } 74 | 75 | start_appium() { 76 | echo "Starting Appium server..." 77 | appium & 78 | APPIUM_PID=$! 79 | echo "Appium server started with PID: $APPIUM_PID" 80 | sleep 5 # Give Appium some time to start up 81 | } 82 | 83 | # Main script execution 84 | echo -e "${Green}Setting up the mobile adapter environment...${NC}" 85 | 86 | check_os 87 | request_sudo 88 | install_dependencies 89 | install_python_dependencies 90 | install_appium 91 | start_appium 92 | 93 | echo -e "${Green}Mobile adapter setup complete.${NC}" 94 | echo -e "${Yellow}Activating the virtual environment...${NC}" 95 | source "$(dirname "$0")/venv/bin/activate" 96 | echo -e "${Green}Virtual environment activated. 
You can now use mobileadapt.${NC}" 97 | echo -e "${Yellow}To deactivate the virtual environment when you're done, type 'deactivate'.${NC}" 98 | echo -e "${Yellow}To stop the Appium server, run: kill $APPIUM_PID${NC}" 99 | 100 | # Keep the script running to maintain the Appium server 101 | wait $APPIUM_PID -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core>=1.0.0"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "cognisim" 7 | version = "0.1.0" 8 | description = "A package for mobile app adaptation and testing" 9 | authors = ["Revyl AI "] 10 | license = "MIT" 11 | readme = "README.md" 12 | repository = "https://github.com/RevylAI/Mobileadapt" 13 | homepage = "https://mobileadapt.revyl.ai" 14 | packages = [{include = "cognisim"}] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.12", 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.12" 26 | appium-python-client = "*" 27 | loguru = "*" 28 | lxml = "*" 29 | numpy = "*" 30 | attrs = "*" 31 | str2bool = "^1.1" 32 | pillow = "^10.4.0" 33 | opencv-python = "^4.10.0.84" 34 | retrying = "^1.3.4" 35 | openai = "^1.43.0" 36 | setuptools = "^75.2.0" 37 | 38 | [tool.poetry.dev-dependencies] 39 | pytest = "^6.2" 40 | 41 | [tool.poetry.urls] 42 | "Bug Tracker" = "https://github.com/RevylAI/Mobileadapt/issues" 43 | [tool.poetry.group.dev.dependencies] 44 | black = "^24.8.0" 45 | isort = "^5.13.2" 46 | mypy = "^1.11.2" 47 | 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appium-python-client # Core Appium client for Python, essential for mobile automation 2 | loguru # Advanced logging library for better debug information 3 | lxml # Efficient XML and HTML processing, used for parsing view hierarchies 4 | numpy # Numerical computing library, useful for data manipulation and analysis 5 | attrs # Reduces boilerplate for Python classes, used in defining UI objects 6 | pillow # Image handling for device screenshots, used by the cookbook scripts 7 | openai # OpenAI client, used by the agentic cookbook example -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd "$(dirname "$0")" || exit 1 3 | cd .. 4 | 5 | 6 | printf "\nFormatting Python 🧹\n" 7 | poetry run black . 8 | 9 | printf "\nSorting imports 🧹\n" 10 | poetry run isort . 11 | 12 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | # Change to the script's directory 6 | cd "$(dirname "$0")" 7 | 8 | check_os() { 9 | if [[ "$OSTYPE" == "darwin"* ]]; then 10 | echo "macOS detected" 11 | package_manager="brew" 12 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then 13 | echo "Linux detected" 14 | if type apt-get >/dev/null 2>&1; then 15 | package_manager="apt-get" 16 | elif type yum >/dev/null 2>&1; then 17 | package_manager="yum" 18 | else 19 | echo "Unsupported package manager. Please install Python3, pip3, and npm manually."
20 | exit 1 21 | fi 22 | else 23 | echo "Unsupported OS" 24 | exit 1 25 | fi 26 | } 27 | 28 | request_sudo() { 29 | if [[ $EUID != 0 ]]; then 30 | sudo_cmd="sudo" 31 | echo "We need sudo access to complete the installation." 32 | sudo -v 33 | fi 34 | } 35 | 36 | install_dependencies() { 37 | echo "Installing dependencies..." 38 | if [[ $package_manager == "brew" ]]; then 39 | brew install node 40 | elif [[ $package_manager == "apt-get" ]]; then 41 | $sudo_cmd $package_manager update 42 | $sudo_cmd $package_manager install -y nodejs npm 43 | else 44 | $sudo_cmd $package_manager install -y nodejs npm 45 | fi 46 | } 47 | 48 | install_appium() { 49 | echo "Installing Appium..." 50 | $sudo_cmd npm install -g appium 51 | } 52 | 53 | # Run the setup steps in order (mirrors deploy/run.sh) 54 | check_os 55 | request_sudo 56 | install_dependencies 57 | install_appium 58 | 59 | cd .. 60 | poetry install -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="cognisim", 8 | version="0.1.0", 9 | author="Revyl AI", 10 | author_email="anam@revyl.ai", 11 | description="A package for cross platform LLM agentic testing", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/revyl-ai/mobileadapt", 15 | packages=find_packages(), 16 | classifiers=[ 17 | "Development Status :: 3 - Alpha", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.12", 23 | ], 24 | python_requires=">=3.12", 25 | install_requires=[ 26 | "appium-python-client", 27 | "loguru", 28 | "lxml", 29 | "numpy", 30 | "attrs", 31 | ], 32 | ) 33 | --------------------------------------------------------------------------------