├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── ATTRIBUTIONS ├── LICENSE ├── README.md ├── core ├── assistant.py ├── core_api.py ├── core_imaging.py ├── driver.py ├── get_all_installed_apps.py ├── history.db ├── last_app.py ├── media │ ├── Mouse_pointer_small.png │ ├── assistant_transparent.png │ ├── assistant_transparent_blink.png │ ├── assistant_transparent_dragging.png │ ├── headico.ico │ ├── headico.png │ ├── transcribe_audio.mp3 │ └── translate_audio.mp3 ├── mouse_detection.py ├── ocr.py ├── topmost_window.py ├── ui_window_analyzer.py ├── voice.py ├── window_elements.py ├── window_focus.py └── window_mgmt.py └── requirements.txt /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/github/gitignore/blob/main/Python.gitignore 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ -------------------------------------------------------------------------------- /ATTRIBUTIONS: -------------------------------------------------------------------------------- 1 | Attributions for PyWinAssistant 2 | 3 | PyWinAssistant incorporates the following third-party modules, each with its own license: 4 | openai: MIT license. More information is available at [https://github.com/openai/openai-python]. 5 | pywinauto: Licensed under BSD-3-Clause license . More information at [https://pywinauto.readthedocs.io/en/latest/contents.html]. 6 | pyautogui: Licensed under the BSD-3-Clause license. Details at [https://github.com/asweigart/pyautogui]. 7 | pygetwindow: BSD-3-Clause license [https://github.com/asweigart/pygetwindow] 8 | PyWin32 (win32com, win32gui, win32con, win32api, win32process, winreg): Part of PyWin32, which is under the PSF license. More information can be found at [https://github.com/mhammond/pywin32]. 9 | customtkinter: MIT license. More info at [https://github.com/TomSchimansky/CustomTkinter]. 10 | Pillow (PIL): This project uses Pillow, an open-source HPND-licensed library. More info at [https://python-pillow.org/]. 11 | speech_recognition: This project incorporates the speech_recognition module, licensed under [BSD-3-Clause, GPL-2.0 licenses]. More details can be found at [https://github.com/Uberi/speech_recognition#readme]. 12 | psutil: This project uses psutil, distributed under the BSD-3-Clause license. More details at [https://github.com/giampaolo/psutil]. 13 | fuzzywuzzy: Available under the GPL-2.0 license. More information at [https://github.com/seatgeek/fuzzywuzzy]. 14 | pytesseract: An OCR tool licensed under the Apache License 2.0. More details at [https://github.com/madmaze/pytesseract]. 15 | uiautomation: Apache-2.0 license [https://github.com/yinkaisheng/Python-UIAutomation-for-Windows] 16 | gTTS (Google Text-to-Speech): MIT license. More information at [https://github.com/pndurette/gTTS]. 17 | pygame: Licensed under the LGPL GNU Library or Lesser General Public License (LG PL) (LGPL). More details at [https://www.pygame.org/news]. 18 | tkinter: Part of Python's standard library, covered by the PSF license. [https://github.com/python/cpython/tree/3.12] 19 | 20 | 21 | Each module is the property of its respective owners, and PyWinAssistant is in no way affiliated with these modules. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Brandon Joan Rosas Delgado 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **PyWinAssistant: An artificial assistant** – **MIT Licensed** | **Public Release: December 31, 2023** | Complies with federal coordinations AI Standards for Complex Adaptive Systems, Asilomar AI Principles and IEEE Global Initiative on Ethics of Autonomous and Intelligent Systems. 2 | 3 | --- 4 | 5 | PyWinAssistant is the first open-source Artificial Narrow Intelligence to elicit spatial reasoning and perception as a generalist agentic Computer-Using-Agent framework that fully operates graphical user interfaces (GUIs) for Windows 10/11 **through direct OS-native semantic interaction**. It functions as a Computer-Using-Agent / Large-Action-Model, forming the foundation for a pure **symbolic spatial cognition framework** that enables artificial operation of a computer using only natural language, **without relying on computer vision, OCR, or pixel-level imaging**. PyWinAssistant emulates, plans, and simulates synthetic Human-Interface-Device (HID) interactions through **native Windows Accessibility APIs**, eliciting human-like abstraction across geometric, hierarchical, and temporal dimensions at an Operating-System level. This OS-integrated approach to simulating spatial utilization of a computer provides a future-proof, generalized, modular, and dynamic ANI orchestration framework for multi-agent-driven automation, marking an important step in symbolic reasoning towards AGI. 6 | 7 | **Key Features:** 8 | * **Not Reliant on an Imaging Pipeline Alone**: Can operate exclusively through Windows UI Automation (UIA) and programmatic GUI semantics, enabling universal workflow orchestration. 9 | * **Symbolic Spatial Mapping**: Hierarchical element tracking via OS-native parent/child relationships and coordinate systems. 10 | * **Non-Visual Perception**: Real-time interface understanding through direct metadata extraction (control types, states, positions). 11 | * **Visual Perception**: A single screenshot can elicit comprehension and perception with attention to detail by visualizing goal intent and environment changes in a spatial space over time. It can be fine-tuned to look for visual cues, bugs, causal-reasoning errors, static, semantic-grounding issues, or corruption... 12 | * **Unified Automation**: Automatic element detection. Combines GUI, system, and web automation under one Python API. Eliminates context-switching between tools. 13 | * **AI-Powered Script Generation**: Translates natural language or demonstrations into any kind of code inside any IDE or text-entry area. 14 | * **Self-Healing Workflows**: Auto-adjusts to UI changes (e.g., element ID shifts), reducing maintenance overhead and making PyWinAssistant's algorithm future-proof. 15 | * **AI/ML Integration**: Using NLP to generate scripts (e.g., “Automate Application” → plan of test execution steps in JSON) with self-correcting selectors. 16 | * **Cross-Context Automation**: Seamlessly combines GUI, web, and API workflows in a Pythonic way, unifying disjointed automation methods (GUI, API, web) into a single framework.
17 | * **Accessibility**: Enhancing accessibility for users with different needs, enabling voice or simple text commands to control complex actions. 18 | * **Generalization**: Elicits spatial cognition to understand and execute a wide range of commands in a natural, intuitive manner. 19 | * **Small and compact**: PyWinAssistant functions as an example algorithm of a modular and generalized computer assistant framework that elicits spatial cognition. 20 | 21 | PyWinAssistant has its own set of **reasoning agents**, utilizing Visualization-of-Thought (VoT) and Chain-of-Thought (CoT) to enhance generalization, dynamically simulating actions through abstract GUI semantic dimensions rather than visual processing, making it **future-proof** for next-generation **LLMs**. By **visualizing interface contents** to dynamically **simulate and plan actions** over **abstract GUI semantic dimensions, concepts, and differentials**, PyWinAssistant **redefines computer vision automation**, enabling **high-efficiency visual processing** at a fraction of traditional computational costs. PyWinAssistant has achieved **real-time spatial perception** at an **Operating-System level**, allowing for **memorization of visual cues and tracking of on-screen changes over time**. 22 | 23 | --- 24 | 25 | Released before key breakthroughs in AI for Spatial Reasoning, it predates: 26 | * **Microsoft’s** [**Visualization-of-Thought research paper**](https://arxiv.org/abs/2404.03622) (April 4, 2024) 27 | * **Anthropic** [**Claude’s Computer-Use Agent**](https://www.anthropic.com/news/3-5-models-and-computer-use) (October 22, 2024) 28 | * **OpenAI** [**ChatGPT’s Operator Computer-Using Agent (CUA)**](https://openai.com/index/introducing-operator/) (January 23, 2025) 29 | 30 | PyWinAssistant represents a major paradigm shift in AI and automation by pioneering **pure symbolic computer interaction**, bridging **human intent with GUI automation at an OS level** through these breakthroughs: 31 | * **First Agent** to bypass OCR/imaging for Computer-Using-Agent GUI automation. 32 | * **First Framework** using Windows UIA as the primary spatial perception channel. 33 | * **First System** demonstrating OS-native hierarchical-temporal reasoning. 34 | 35 | --- 36 | 37 | ### **1. Unified Natural Language → GUI Automation** 38 | **Traditional Approach**: 39 | Automation tools require scripting (e.g., AutoHotkey) or API integration (e.g., Selenium). 40 | 41 | **PyWinAssistant Breakthrough**: 42 | ```python 43 | # True generalization for natural language directly driving UI actions 44 | assistant("Play Daft Punk on Spotify and email the lyrics to my friend") 45 | # The agent chooses a fitting item according to the related context to comply with user intent. 46 | ``` 47 | 48 | **Mechanism**: Combines UIAutomation’s GUI control detection with LLMs to: 49 | - Parse intent ("play", "email lyrics") 50 | - Map to UI elements (Spotify play button, Outlook compose window) 51 | - Generate adaptive workflows 52 | 53 | **PyWinAssistant Innovation**: Eliminates the need for: 54 | - Predefined API integrations 55 | - XPath/CSS selector knowledge 56 | - Manual error handling 57 | 58 | --- 59 | 60 | ### **2. Cross-Application State Awareness** 61 | **Traditional Limitation**: 62 | Tools operate in app silos (e.g., Power Automate connectors).
63 | 64 | **PyWinAssistant Innovation**: 65 | ```python 66 | # Notes: 67 | # Full step-set generation from the Assistant works flawlessly, but the in-step modifier and memory-content retrieval were purposely disabled and commented out in the code - [def act()](https://github.com/a-real-ai/pywinassistant/blob/6aae4e514a0dc661f7ed640181663f483972bc1e/core/driver.py#L648C1-L648C8) 68 | # to comply with federal coordinations AI Standards for Complex Adaptive Systems, Asilomar AI Principles and IEEE Global Initiative on Ethics of Autonomous and Intelligent Systems. 69 | 70 | # Accurately maintains context and intent across apps using the UIA tree and spatial memory: (Example for further development) 71 | assistant("Find the best and cheapest flight to Mexico, and also look for local hotels and suggest in new tabs the best cultural options") 72 | assistant("Look for various pizza coupons for anything but pineapple, fill in the details to order and show me the results") 73 | 74 | # PyWinAssistant is highly modular (example): 75 | def workflow(): 76 | song = assistant(goal="get the current track") # UIA 77 | write_action(f"Review '{song}': Great bassline!", app="Notepad") # Win32 78 | assistant(goal="Post on twitter the written text from notepad") # Web 79 | 80 | # The previous set of actions can also be executed by simply using natural language: 81 | assistant(f"Get the current song playing and in notepad put the title as Review song name: Great bassline, and write about why it is a great bassline, then post it on twitter", assistant_identity="You're an expert music critic") 82 | ``` 83 | **Key Advancements**: 84 | 1. **Unified Control Graph**: Treats all apps as nodes in a single UIA-accessible graph 85 | 2. **State Transfer**: Passes data between apps via clipboard/UIA properties 86 | 3. **Semantic Transfer**: Passes semantics of goal intent across all steps 87 | 4. **Error Recovery**: Uses agentic reasoning systems to avoid failing actions 88 | 89 | **Impact**: Enables workflows previously requiring custom middleware. 90 | --- 91 | 92 | ### **3. Probabilistic Automation Engine** 93 | **Traditional Model**: 94 | Deterministic scripts fail on UI changes. 95 | 96 | **PyWinAssistant’s Solution**: 97 | ```python 98 | # Adaptive element discovery 99 | def fast_action(goal): 100 | speaker(f"Clicking onto the element without visioning context. No imaging is required.") 101 | analyzed_ui = analyze_app(application=ai_choosen_app, additional_search_options=generated_keywords) 102 | 103 | gen_coordinates = [{"role": "assistant", 104 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse.
Only respond with the " 105 | f"predicted coordinates of the mouse click position to the center of the element object " 106 | f"\"x=, y=\" to achieve the goal."}, 107 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 108 | coordinates = api_call(gen_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 109 | print(f"AI decision coordinates: \'{coordinates}\'") 110 | ``` 111 | **Revolutionary Features**: 112 | - **Semantic Search by thinking**: Example `synonyms("download") → ["save", "export", "↓ icon"]` 113 | - **Spatial Probability**: Prioritizes elements by utilizing sets of self-reasoning agents for the synthetic operation of the actions 114 | - **Spatial-Prevention**: Senses and prevents possible bad actions or misaligned step execution by utilizing sets of self-reasoning agents 115 | - **Self-Healing**: Automatically chooses the perfect plan to execute without failing its step reasoning, by utilizing sets of self-reasoning agents 116 | 117 | --- 118 | 119 | ### **4. Democratized Accessibility** 120 | 121 | Task: Automate saving a song in the Spotify GUI. 122 | **Before**: 123 | Automation required: 124 | ```autohotkey 125 | WinWait, Spotify 126 | ControlClick, x=152 y=311 # Fragile coordinates 127 | ``` 128 | 129 | **Now**: Only one natural language command. 130 | ```python 131 | assistant("Like this song") # Language-first 132 | ``` 133 | 134 | | **Shift Metrics**: | Traditional Tools | PyWinAssistant | 135 | |-----------------------|-------------------|----------------| 136 | | Learning Curve | Days, even months | Minutes | 137 | | Cross-App Workflows | Manual Integration| Automatic | 138 | | Maintenance Overhead | High | LLM-AutoPatch | 139 | 140 | --- 141 | 142 | ### **Why This is Transformative** 143 | 144 | 1. **From Scripts to Intent**: 145 | Replaces brittle `click(x,y)` with human-like "understand → act" cycles. 146 | 147 | 2. **From Silos to OS as API**: 148 | Treats the entire Windows environment as a programmable interface. 149 | 150 | 3. **From Fixed to Adaptive**: 151 | Leverages LLMs to handle UI changes (e.g., Spotify’s 2023 UI overhaul). 152 | 153 | 4. **From Developers to Everyone**: 154 | Makes advanced automation accessible through natural language, improving generality and minimizing the overall data usage of LLM and vision models. 155 | It has built-in assistance options to improve human utilization of a computer, with a new technical approach to User Interface and User Experience assistance and testing through spatial visualization of thought. 156 | It correctly generalizes any natural language prompt and plans correct actions in the OS with security in mind. 157 | 158 | By **directly interfacing with Windows’ underlying UI hierarchy**, it achieves real-time spatial perception at the OS level while eliminating traditional computer vision pipelines, enabling: 159 | * **100x Efficiency Gains**: Native API access. 160 | * **Blind Operation**: Can function on headless systems, virtual machines, or minimized windows. 161 | * **Precision Abstraction**: Mathematical modeling of GUI relationships rather than visual pattern matching. 162 | 163 | **Image-Free by Design (Core Architecture)** 164 | While some projects *require* visual processing for fundamental operation, PyWinAssistant achieves **complete GUI interaction capability without an imaging pipeline** through: 165 | 166 | 1.
**Native OS Semantic Access** 167 | Direct Windows UIA API integration provides full control metadata: 168 | ```python 169 | # Example of element properties via UIA - no screenshots needed 170 | button = uia.Element.find(Name="Submit", ControlType="Button") 171 | print(button.BoundingRectangle) # {x: 120, y: 240, width: 80, height: 30} 172 | ``` 173 | 2. **Imaging Module** 174 | 175 | ```diff 176 | # PyWinAssistant imaging functions, such as pixel-level visualization, can be enabled for real-time spatial perception with memorization of visual cues and tracking of on-screen changes over time. 177 | + Capable of planning successful sets of highly technical steps to perform operations on a computer at an OS level, with only one screenshot. 178 | + Pixel-level visualization. 179 | + Visual hash matching can be enabled for dynamic elements. 180 | - OCR fallback / object detection for non-UIA legacy apps. 181 | # Experimental OCR features were added but not fully developed, as they are not necessary for the current implementation; the assistant currently works well without them. 182 | ``` 183 | 184 | | **Key Differentiation** | PyWinAssistant | Traditional Automation | 185 | |--------------------------|----------------|------------------------| 186 | | **Primary Perception** | UIA Metadata | Screenshots/OCR | 187 | | **Vision Dependency** | Optional Add-on | Required Core | 188 | | **Headless Ready** | ✅ Native | ❌ Requires virtual display | 189 | 190 | --- 191 | 192 | ### **Development Notes:** 193 | PyWinAssistant is limited by the model's intelligence and inference time. New advancements in LLMs are required to reach a complete Artificial General Intelligence system with Artificial Narrow Intelligences managing it. 194 | The system's autonomous task decomposition leverages **native semantic differentials** rather than visual changes; visual changes can be optionally activated for real-time image-corruption analysis of the GUI/screen. 195 | Long-term memory and self-learning mechanisms were designed to evolve **symbolic state representations**, and can also be represented as visual patterns, aligning with AGI development. 196 | 197 | Related paper: Visualization-of-Thought Elicits Spatial Reasoning in Large Language Models (April 4, 2024): 198 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/58c8e18d-b633-4a35-abc1-b8a76768e4e3) 199 | https://arxiv.org/abs/2404.03622 200 | 201 | # Overview 202 | 203 | PyWinAssistant includes built-in assistant features designed to enhance human-computer interaction for all users. It integrates real-time voice recognition, customizable assistant personalities, subtitles, and chat functionality. 204 | Talk to your computer in a friendly, natural way to perform any User Interface activity. 205 | Use natural language to operate your Windows Operating System freely. 206 | It generates and plans test cases of your User Interface applications for continuous testing on any Win32api-supported application, simply by using natural language. 207 | It is your own open and secure personal assistant that responds as you want; control the way you want your computer to assist you. 208 | It's engineered to be modular, to understand and execute a wide range of tasks, and to automate interactions with any desktop application.
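As a minimal illustration of these entry points, the sketch below mirrors the calls shown in the *Working cases* section further down. The import path follows `core/assistant.py`; exact signatures may vary between versions.

```python
# Minimal sketch of the main entry points, mirroring the "Working cases" examples below.
# The import path follows core/assistant.py; exact signatures may vary between versions.
from driver import assistant, fast_act, write_action

# Plan and execute a full multi-step goal from natural language
assistant(goal="Pause the music on Spotify")

# Perform a single UI action without generating a full plan
fast_act(goal="Click on the Like button")

# Generate and type text with a customizable assistant identity
write_action(goal="Comment about why AI is great for the current playing song",
             assistant_identity="You're an advanced music AI agent that specializes on music")
```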
209 | 210 | # Demos (Videos below) 211 | 212 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/93c0f123-2d57-419f-a586-32d9fe51e0b2) 213 | 214 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/42d2e3d5-9be7-4d4a-825d-e80891aeb0eb) 215 | 216 | ![Screenshot 2023-12-18 043612](https://github.com/a-real-ai/pywinassistant/assets/18397328/428d1a3f-ece7-4c58-9d1b-76138ce8807c) 217 | 218 | ![Screenshot 2023-12-18 040443](https://github.com/a-real-ai/pywinassistant/assets/18397328/50543e40-f810-4e4f-9cca-3f1131ae1cc1) 219 | 220 | ![Screenshot 2023-12-01 143812](https://github.com/a-real-ai/pywinassistant/assets/18397328/d88374c9-fb53-4ecf-b8b5-840ffaa5d8c1) 221 | 222 | ![Screenshot 2023-12-01 150047](https://github.com/a-real-ai/pywinassistant/assets/18397328/f0c904c7-0c96-4d57-90a0-dc9084728131) 223 | 224 | ![Screenshot 2023-11-13 161219](https://github.com/a-real-ai/pywinassistant/assets/18397328/b2c2a23c-f37f-4f1d-8628-69db6bf13ed9) 225 | 226 | --- 227 | 228 | ## Please enable the Audio for the demo videos. 229 | Voice 1 - Input Human (English Female Australian TTS) 230 | 231 | Voice 2 - Output Assistant (English Female US Google TTS) 232 | 233 | --- 234 | 235 | ### Use your computer by natural language - Real-time usage of VoT, an example of a Computer-Using-Agent; Single Action Model. 236 | Does not use any vision. Only API LLM calls. Demonstrating flawless execution of multiple prompt actions. 237 | 238 | https://github.com/a-real-ai/pywinassistant/assets/18397328/25b39d8c-62d6-442e-9d5e-bc8a35aa971a 239 | 240 | --- 241 | 242 | ### Use your computer as an assistant - Real-time usage of planning VoT, an example of a Computer-Using-Agent; Large-Action-Model. 243 | **Takes only 1 screenshot**: Gets to know what the user is doing and what is that the user wants to achieve, the assistant plans to perform it. 244 | ``` 245 | Voice Recognized Prompt: Make a new post on twitter saying hello world and a brief greeting explaining you're an artificial intelligence. 246 | ``` 247 | https://github.com/a-real-ai/pywinassistant/assets/18397328/d04f0609-68fb-4fb4-9ac3-279047c7a4f7 248 | 249 | --- 250 | 251 | ### The assistant can do anything for you - Real-time usage of planning VoT, an example of a Computer-Using-Agent; Large-Action-Model. 252 | The inference is the only constraint for speed. 253 | ``` 254 | Voice Recognized Prompt: Create a new comment explaining why it is so important. 255 | ``` 256 | https://github.com/a-real-ai/pywinassistant/assets/18397328/6d3bb6e6-ccf8-4380-bc89-df512ae207f2 257 | 258 | --- 259 | 260 | ### Other demos with Real-time usage of planning VoT. 261 | 262 | November 16th 2023 live demo: (Firefox, Spotify, Notepad, Calculator, Mail) 263 | ```python 264 | assistant(goal=f"Open a new tab the song \'Wall Of Eyes - The Smile\', from google search results filter by videos then play it on Firefox") # Working 100% 265 | assistant(goal=f"Pause the music on Spotify") # Working 100% 266 | assistant(goal=f"Create a short greet text for the user using AI Automated Windows in notepad.exe") # Working 100% 267 | assistant(goal=f"Open calc.exe and press 4 x 4 =") # Working 100% 268 | ``` 269 | https://github.com/a-real-ai/pywinassistant/assets/18397328/ce574640-5f20-4b8e-84f9-341fa102c0e6 270 | 271 | --- 272 | 273 | December 1st 2023 live demo: (Chrome, Spotify, Firefox) Example of programmable methods. 
274 | ```python 275 | assistant(goal=f"Play the song \'Robot Rock - Daft Punk\' on Spotify", keep_in_mind=f"To start playback double click the song.") # Working 100% 276 | assistant(goal=f"Open 3 new tabs on google chrome and in each of them search for 3 different types of funny AI Memes", keep_in_mind=" Filter the results by images.") # Working 100% 277 | assistant(goal=f"Open a new tab the song \'Windows 95 but it's a PHAT hip hop beat\', from google search results filter by videos then play it by clicking on the text on Firefox.") # Working 100% 278 | 279 | ``` 280 | https://github.com/a-real-ai/pywinassistant/assets/18397328/7e0583d1-1c19-40fa-a750-a77fff98a6da 281 | 282 | Currently supporting all generalized win32api apps, meaning: 283 | Chrome, Firefox, OperaGX, Discord, Telegram, Spotify... 284 | 285 | --- 286 | 287 | # Key Features 288 | - Dynamic Case Generator: The assistant() function accepts a goal parameter, which is a natural language command, and intelligently maps it to a series of executable actions. This allows for a seamless translation of user intentions into effective actions on the computer. 289 | 1. Single Action Execution: 290 | The act() function is a streamlined method for executing actions, enhancing the tool's efficiency and responsiveness. 291 | 2. Advanced Context Handling: The framework is adept at understanding context by analyzing the screen and the application, ensuring that actions are carried out with an awareness of the necessary prerequisites or steps. 292 | 3. Semantic router map: The framework has a database of semantic router maps to successfully execute generated test cases. These semantic maps can be created by other AIs. 293 | 4. Wide Application Range: From multimedia control (like playing songs or pausing playback on Spotify and YouTube) to complex actions (like creating AI-generated text, sending emails, or managing applications like Telegram or Firefox), the framework covers a broad spectrum of tasks. 294 | 5. Customizable AI Identity: The write_action() function allows for a customizable assistant identity, enabling personalized interactions and responses that align with the user's preferences or the nature of the task. 295 | 6. Robust Error Handling and Feedback: The framework is designed to handle unexpected scenarios gracefully, providing clear feedback and ensuring reliability. (In Overview) 296 | 7. Projects for mood and personality: Now and then, generate or suggest useful scenarios based on your mood and personality. (In Overview) 297 | 298 | 299 | # Technical Innovations 300 | 1. Natural Language Processing (NLP): Employs advanced NLP techniques to parse and understand user commands in a natural, conversational manner. 301 | 2. Task Automation Algorithms: Utilizes sophisticated algorithms to break down complex tasks into executable steps (a sketch of such a decomposition follows the Use Cases list below). 302 | 3. Context-Aware Execution: Integrates contextual awareness for more nuanced and effective task execution. 303 | 4. Cross-Application Functionality: Seamlessly interfaces with various applications and web services, demonstrating extensive compatibility and integration capabilities. 304 | # Use Cases 305 | 1. Automating repetitive tasks in a Windows environment. 306 | 2. Streamlining workflows for professionals and casual users alike. 307 | 3. Enhancing accessibility for users with different needs, enabling voice or simple text commands to control complex actions. 308 | 4. Assisting in learning and exploration by providing AI-driven guidance and execution of tasks.
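To make the task decomposition mentioned above concrete, the sketch below shows what a natural-language goal broken into executable steps could look like. The step fields and the `plan_goal()` helper are illustrative assumptions for this README, not the project's actual internal format.

```python
# Illustrative sketch only: plan_goal() and the step fields are hypothetical,
# shown to visualize how a goal can be decomposed into executable steps.
def plan_goal(goal: str) -> list:
    # In PyWinAssistant the decomposition is produced by LLM calls over UIA semantics;
    # here a hard-coded example plan is returned for the goal below.
    return [
        {"step": 1, "action": "focus_window", "target": "Spotify"},
        {"step": 2, "action": "click", "target": "Search field", "input": "Robot Rock - Daft Punk"},
        {"step": 3, "action": "double_click", "target": "First search result"},
    ]

for step in plan_goal("Play the song 'Robot Rock - Daft Punk' on Spotify"):
    print(step)
```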
309 | 310 | 311 | # Conclusion 312 | This Artificially Assisted User Interface Testing framework is a pioneering tool in the realm of desktop automation. Its ability to understand and execute a wide range of commands in a natural, intuitive manner makes it an invaluable asset for anyone looking to enhance their productivity and interaction with their Windows environment. It's not just a tool; it's a step towards a future where AI seamlessly integrates into our daily computing tasks, making technology more accessible and user-friendly. 313 | 314 | # Installation 315 | ``` 316 | # Add your ChatGPT API keys to the project: 317 | add your API key in /core/core_api.py -> line 3: client = OpenAI(api_key='insert_your_api_key_here') 318 | add your API key in /core/core_imaging.py -> line 12: api_key = 'insert_your_api_key_here' 319 | 320 | # Install requirements: 321 | cd pywinassistant 322 | pip install -r .\requirements.txt 323 | 324 | # Execute the assistant: 325 | cd .\core 326 | python ./assistant.py 327 | ``` 328 | 329 | # Usage 330 | Run "assistant.py", then say "Ok computer" to activate the assistant by voice, click on it, or enable the chat to perform a fast action. Right-click the assistant to see its available options. 331 | 332 | For debugging, execute "driver.py". Inside it, you can easily debug and try the "act" function (which is used alongside the assistant), "fast_act", and "assistant" by using the included examples. 333 | To run a JSON test case, modify the JSON path in the "assistant" function. 334 | 335 | # Working cases (on cases.py) 336 | 337 | ``` 338 | assistant(goal=f"Play the song \'One More Time - Daft Punk\' on Spotify") # Working 100% 339 | assistant(goal=f"Open a new tab the song \'Wall Of Eyes - The Smile\', from google search results filter by videos then play it on Firefox") # Working 100% 340 | assistant(goal=f"Open a new tab the song \'Windows XP Error beat\', from google search results filter by videos then play it by clicking on the text on Firefox.") # Working 100% 341 | fast_act(goal=f"Click on the Like button") # Working 100% 342 | assistant(goal=f"Pause the music on Spotify") # Working 100% 343 | write_action(goal="Comment about why AI is great for the current playing song", assistant_identity="You\'re an advanced music AI agent that specializes on music") # Working 100% 344 | assistant(f"Create a long AI essay about an AI Starting to control a Windows computer on Notepad") # Working 100% 345 | fast_act(goal="Click on the button at the bottom in HueSync app") # Working 100% 346 | write_action(goal="Weird Fishes - Radiohead") # Working 100% 347 | assistant(f"Open Calc and press 4 x 4 - 4 * 4 + 1 =") # Working 100% 348 | assistant(goal=f"Open 3 new tabs on google chrome and in each of them search for 3 different types of funny dogs", keep_in_mind=" Filter the results by images.") # Working 100% 349 | assistant(goal=f"Stop the playback from Firefox app") # Working 100% 350 | assistant(f"Send a list of steps to make a joke about engineers whilst making it an essay to my friend Diana in Telegram") # Working 100% 351 | assistant(f"Send a list of steps to make a chocolate cake to my saved messages in Telegram") # Working 100% 352 | assistant(f"Create three new tabs on Firefox, in each of them search 3 different types of funny youtube bad tutorial videos, generate the titles to search.") # Working 100% 353 | assistant(f"Write an essay about an AI that a person created to use freely the computer, like you.
Write it in notepad.exe") # Working 100% 354 | assistant(f"Send an AI joke and say it's generated by an AI to my friend Diana on Discord") # Working 100% 355 | assistant(goal=f"Create a short greet text for the user using AI Automated Windows in notepad.exe") # Working 100% 356 | assistant(goal=f"Open calc.exe and press 4 x 4 =") # Working 100% 357 | assistant(goal=f"Send a mail to \'testmail@gmail.com\' with the subject \'Hello\' and generate the message \'Generate a message about how an AI is helping everyone as users\' on the Mail app", 358 | keep_in_mind="Press \'Tab\' three times to navigate to the subject area. Do not combine steps.") # Need to update the app semantic map to get it working 100%. 359 | assistant(goal=f"Play the song \'The Smile - Wall Of Eyes\' on Spotify") # Working 100% 360 | assistant(goal=f"Play the song \'Panda Bear - Tropic of cancer\' on Spotify") # Working 100% 361 | assistant(goal="Pause the music on the Spotify app") # Working 100% 362 | assistant(goal=f"Open 3 new tabs with different Daft Punk songs on each of them on Firefox") # Working 100% 363 | fast_act("Open spotify and Search the album \'Grimes - Visions\'") # Working 100% 364 | write_action("Open spotify and Search the album \'Grimes - Visions\'") # Working 100% 365 | fast_act("Click on the first result on spotify") # Working 100% 366 | fast_act("Skip to the next song on Spotify") # Working 100% 367 | fast_act("Add the album to the library") # Working 100% 368 | fast_act("Go to Home on Spotify") # Working 100% 369 | fast_act("Save the song to my library on Spotify") # Working 100% 370 | ``` 371 | 372 | 373 | # Current approaches to UI Testing 374 | ### There are three main types of GUI testing approaches, namely: 375 | 376 | 1. ***Manual Testing:*** 377 | 378 | In manual testing, a human tester performs a set of operations to check whether the application is functioning correctly and whether the graphical elements conform to the documented requirements. Manual-based testing has notable downsides in that it can be time-consuming, and the test coverage is extremely low. Additionally, the quality of testing in this approach depends on the knowledge and capabilities of the testing team. 379 | 380 | 2. ***Record-and-Playback Testing:*** 381 | 382 | Also known as record-and-replay testing, it is executed using automation tools. The automated UI testing tool records all tasks, actions, and interactions with the application. The recorded steps are then reproduced, executed, and compared with the expected behavior. For further testing, the replay phase can be repeated with various data sets. 383 | 384 | 3. ***Model-Based Testing:*** 385 | 386 | In this testing approach, we focus on building graphical models that describe the behavior of a system. This provides a deeper understanding of the system, which allows the tester to generate highly efficient test cases. In the models, we determine the inputs and outputs of the system, which are, in turn, used to run the tests. Model-based testing works as follows: 387 | 388 | 1. Create a model for the system 389 | 2. Determine system inputs 390 | 3. Verify the expected output 391 | 4. Execute tests 392 | 5. Check and validate system output vs. the expected output 393 | 394 | The model-based approach is great because it allows a higher level of automation. It also covers a higher number of states in the system, thereby improving the test coverage. 395 | 396 | 397 | # New Approaches to UI Testing using AI 398 | 4.
***Artificially Assisted User Interface Testing:*** 399 | 400 | Artificially Assisted User Interface Testing harnesses the power of artificial intelligence to revolutionize the process of testing graphical user interfaces. Unlike traditional methods, Artificially Assisted User Interface Testing integrates machine learning algorithms and intelligent decision-making processes to autonomously identify, analyze, and interact with UI elements. This approach significantly enhances the depth and breadth of testing in several ways: 401 | 402 | - **Dynamic Interaction with UI Elements**: AI-driven tests can adapt to changes in the UI, such as modified button locations or altered element properties. This flexibility is achieved through the use of AI models trained to recognize and interact with various UI components, regardless of superficial changes. 403 | - **Learning and Pattern Recognition**: Utilizing machine learning, Artificially Assisted User Interface Testing systems can learn from previous interactions, test runs, and user feedback. This enables the AI to recognize patterns and predict potential issues, improving over time and offering more thorough testing with each iteration. 404 | - **Automated Test Case Generation**: The AI can generate test cases based on its understanding of the application's functionality and user behavior patterns. This not only saves time but also ensures that a wider range of scenarios is tested, including edge cases that might be overlooked in manual testing. (A short sketch of wiring such cases into a test runner appears at the end of this README.) 405 | - **Natural Language Processing (NLP)**: AI testing tools often incorporate NLP to interpret and execute tests written in plain language. This feature makes the testing process more accessible to non-technical stakeholders and facilitates better communication across the team. 406 | - **Real-Time Feedback and Analytics**: AI systems provide real-time insights into the testing process, identifying bugs, performance issues, and usability problems promptly. This immediate feedback loop enables quicker rectifications and enhances the overall quality of the product. 407 | - **Predictive Analysis and Risk Assessment**: By analyzing past data, Artificially Assisted User Interface Testing tools can predict potential problem areas and allocate testing resources more efficiently. This proactive approach to risk management ensures that critical issues are identified and addressed early in the development lifecycle. 408 | 409 | In conclusion, Artificially Assisted User Interface Testing represents a significant leap forward in software quality assurance. By automating and enhancing the testing process, AI-driven tools offer improved accuracy, speed, and coverage, paving the way for more reliable and user-friendly applications. 410 | 411 | 412 | ### Notes: 413 | 414 | This project is being updated as of the start of 2024. The list of requirements is being updated.
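As referenced in the testing section above, the natural-language working cases can double as automated UI test cases. The snippet below is a sketch only: `pytest` is not a project dependency, and the assumption that `assistant()` returns a truthy status on success is made purely for illustration.

```python
# Sketch: wrapping natural-language UI test cases in a standard test runner.
# pytest and the return-value convention are assumptions for illustration,
# not part of the PyWinAssistant codebase.
import pytest
from driver import assistant

@pytest.mark.parametrize("goal", [
    "Open calc.exe and press 4 x 4 =",
    "Pause the music on Spotify",
])
def test_natural_language_case(goal):
    result = assistant(goal=goal)  # assumed to return a truthy status on success
    assert result
```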
415 | -------------------------------------------------------------------------------- /core/assistant.py: -------------------------------------------------------------------------------- 1 | import customtkinter as Ctk 2 | from PIL import Image, ImageTk 3 | import time 4 | import random 5 | from queue import Queue 6 | import speech_recognition as sr 7 | import threading 8 | from voice import speaker, set_volume, set_subtitles 9 | from driver import assistant, act, fast_act, auto_role, perform_simulated_keypress, write_action 10 | from window_focus import activate_windowt_title 11 | 12 | # Initialize the speech recognition and text to speech engines 13 | assistant_voice_recognition_enabled = True # Disable if you don't want to use voice recognition 14 | assistant_name_handle = "Ok Computer" # Change this to your preferred name, will be used for voice activation. 15 | assistant_anim_enabled = True 16 | assistant_voice_enabled = True 17 | set_volume(0.25) 18 | assistant_subtitles_enabled = True 19 | recognizer = sr.Recognizer() 20 | message_queue = Queue() 21 | Ctk.set_appearance_mode("dark") # Modes: system (default), light, dark 22 | Ctk.set_default_color_theme("dark-blue") # Themes: blue (default), dark-blue, green 23 | 24 | 25 | def listen_to_speech(): 26 | # Function to listen for speech and add the recognized text to the message queue 27 | with sr.Microphone() as source: 28 | try: 29 | print("Assistant Listening...") 30 | audio = recognizer.listen(source, timeout=5) # Listen for 5 seconds 31 | message = recognizer.recognize_google(audio) 32 | print("You said:", message) 33 | message_queue.put(message) 34 | return message 35 | except sr.UnknownValueError: 36 | print("Google Speech Recognition could not understand audio") 37 | except sr.RequestError as e: 38 | print("Could not request results from Google Speech Recognition service; {0}".format(e)) 39 | except sr.WaitTimeoutError: 40 | print("Listening timed out.") 41 | finally: 42 | # Schedule the function to be called again 43 | # root.after(1000, listen_to_speech) # This if you want to try it indefinitely. 44 | print("Google Speech Recognition could not understand audio") 45 | pass 46 | 47 | 48 | def process_queue(): 49 | # Function to check the message queue and show messages as bubbles 50 | try: 51 | while not message_queue.empty(): 52 | message = message_queue.get_nowait() 53 | if message: 54 | show_message(None, message) # Pass None for event 55 | speaker(message) 56 | finally: 57 | # Schedule the function to be called again 58 | root.after(100, process_queue) 59 | pass 60 | 61 | 62 | def on_drag(event): 63 | # Function to move the window on drag 64 | global is_dragging, position_right, position_bottom, drag_time 65 | drag_time = time.time() 66 | is_dragging = True 67 | x = root.winfo_pointerx() - offset_x 68 | y = root.winfo_pointery() - offset_y 69 | root.geometry(f'+{x}+{y}') 70 | label.configure(image=assistant_dragging_photo) 71 | 72 | 73 | def end_drag(event): 74 | global is_dragging, position_right, position_bottom, drag_time, click_time 75 | if not click_time: 76 | return 77 | dragged_message = "Whats the action?" if time.time() - click_time < 0.15 else "You dragged me!" 
78 | # If the duration of the drag is less than the threshold for a click 79 | if time.time() - drag_time < 0.15: 80 | is_dragging = False 81 | position_right = root.winfo_x() 82 | position_bottom = root.winfo_y() 83 | label.configure(image=assistant_photo) 84 | animate_move() 85 | show_message(event, dragged_message) 86 | speaker(dragged_message) 87 | else: 88 | label.configure(image=assistant_photo) 89 | animate_move() # Resume the movement animation 90 | show_message(event, dragged_message) 91 | speaker(dragged_message) 92 | create_input_bubble(action=True) 93 | print(f"Clicked on the assistant: {dragged_message}") 94 | 95 | 96 | def create_input_bubble(action=False): 97 | # Get dimensions for proper placement 98 | bubble_width = 450 # Set a fixed width for the bubble 99 | bubble_height = 28 # A reasonable height to fit the text entry 100 | # Calculate the bubble position to the left of the assistant 101 | bubble_x = root.winfo_x() - bubble_width + 40 # Adjust the X position as needed 102 | bubble_y = root.winfo_y() + (assistant_photo_height // 2) - (bubble_height // 2) + 40 103 | # Create bubble as a top-level window 104 | bubble = Ctk.CTkToplevel(root) 105 | bubble.attributes('-alpha', 0.85) 106 | bubble.bind("", lambda e: bubble.destroy()) 107 | bubble.bind("", lambda e: bubble.destroy()) 108 | bubble.overrideredirect(True) 109 | bubble.attributes('-topmost', True) 110 | bubble.geometry(f'{bubble_width}x{bubble_height}+{bubble_x}+{bubble_y}') 111 | # Create the entry widget 112 | entry = Ctk.CTkEntry(bubble, corner_radius=6, placeholder_text_color="#0b2d39", 113 | fg_color="#e1f2f1", text_color="#040f13", 114 | placeholder_text="Type here the action to perform...", width=450, 115 | border_width=1, border_color="darkgray") 116 | entry.bind("", lambda e: bubble.destroy()) 117 | entry.pack(padx=0, pady=0) 118 | # Force focus on the entry and bubble 119 | try: 120 | bubble.after(10, lambda: [bubble.focus_force(), entry.focus_force()]) 121 | except Ctk.ctk_tk.TclError: 122 | # Ignore the error, as the window or widget is no longer valid 123 | pass 124 | # Bind Return and Escape keys to process input or destroy bubble 125 | entry.bind("", lambda e: process_input_and_close(bubble, entry, action)) 126 | # Bind mouse click to focus back on entry 127 | bubble.bind("", lambda e: entry.focus_force()) 128 | # Make sure bubble is focused as well when clicking on it 129 | bubble.bind("", lambda e: entry.focus_force()) 130 | return bubble # Returning bubble reference in case it needs to be accessed 131 | 132 | def process_input_and_close(bubble, entry, action=False): 133 | user_input = entry.get() 134 | print(f"Processing input: {user_input}") 135 | if user_input.strip(): 136 | bubble.destroy() 137 | # Use the user input as needed: display, speech, or further processing. 138 | show_message(None, user_input) 139 | # speaker(user_input.strip()) 140 | if action: 141 | print("Performing action: ", user_input) 142 | fast_act(single_step=user_input.strip()) 143 | else: 144 | print(f"Running assistant... Generating test case: {user_input.strip()}") 145 | speaker(f"Running assistant... 
Generating test case: {user_input.strip()}") 146 | # assistant(assistant_goal=user_input.strip(), called_from="assistant") 147 | assistant_thread = threading.Thread(target=run_assistant, args=(user_input.strip(),)) 148 | assistant_thread.start() 149 | # assistant(user_input.strip()) 150 | # auto_prompt(user_input.strip()) 151 | bubble.destroy() # Ensure the bubble is destroyed after submission 152 | 153 | def listen_and_respond(): 154 | action = listen_to_speech() 155 | if action: # Check if action is not None or empty string 156 | show_message(None, action) 157 | # Execute the assistant function in a separate thread 158 | # assistant_thread = threading.Thread(target=run_assistant, args=(action,)) 159 | # assistant_thread.start() 160 | 161 | 162 | def run_assistant(action): 163 | print("Running assistant...") 164 | assistant(assistant_goal=action, called_from="assistant") 165 | 166 | 167 | def start_drag(event): 168 | # Record the starting point for dragging 169 | global offset_x, offset_y, click_time 170 | offset_x = event.x 171 | offset_y = event.y 172 | click_time = time.time() 173 | 174 | 175 | def show_message(event=None, message="Hello! How can I help you?"): 176 | # Function to show a pop-up message bubble 177 | message_window = Ctk.CTkToplevel(root) # Create a new window 178 | message_window.overrideredirect(True) # Remove the window border 179 | message_window.attributes('-topmost', True) # Keep the window on top 180 | # Get dimensions for the message window 181 | # temp_label = Ctk.CTkLabel(message_window, text=message) 182 | # temp_label.pack() 183 | message_window.update_idletasks() # Update the layout to get size 184 | message_width = message_window.winfo_width() 185 | message_height = message_window.winfo_height() 186 | # temp_label.destroy() 187 | if event: 188 | pos_x = event.x_root + 20 189 | pos_y = event.y_root - message_height // 2 190 | else: 191 | # Calculate position based on assistant current position 192 | pos_x = root.winfo_x() + label.winfo_width() + 10 193 | pos_y = root.winfo_y() + label.winfo_height() // 2 - message_height // 2 194 | # Adjust position if the message window goes offscreen 195 | screen_width = root.winfo_screenwidth() 196 | screen_height = root.winfo_screenheight() 197 | if pos_x + message_width > screen_width: 198 | pos_x = screen_width - message_width 199 | if pos_y + message_height > screen_height: 200 | pos_y = screen_height - message_height 201 | if pos_y < 0: 202 | pos_y = 0 203 | # Set the geometry and display the message 204 | message_window.geometry(f'+{pos_x}+{pos_y}') 205 | message_label = Ctk.CTkLabel(message_window, text=message) 206 | message_label.configure(corner_radius=6, fg_color="#e1f2f1", text_color="black", bg_color="gray") 207 | message_label.pack(padx=0, pady=0) 208 | # Close the message bubble after 3 seconds 209 | message_window.after(3000, message_window.destroy) 210 | 211 | def create_context_menu(event_x_root, event_y_root): 212 | global context_menu_ref, assistant_voice_enabled, assistant_anim_enabled, assistant_subtitles_enabled, assistant_voice_recognition_enabled # Use the global references 213 | 214 | # Create a custom context menu using Ctk widgets 215 | context_menu = Ctk.CTkToplevel(root) 216 | context_menu.overrideredirect(True) 217 | context_menu.attributes('-topmost', True) 218 | context_menu.attributes('-alpha', 0.95) # Set transparency (0.0 to 1.0) 219 | # Set the theme to light 220 | # Change buttons color 221 | # Ctk.set_default_color_theme("dark-blue") # Themes: blue (default), dark-blue, green 222 | # 
context_menu visual options 223 | context_menu.configure(borderless=True, border_color="black") 224 | context_menu.bind("", lambda e: context_menu.destroy()) 225 | context_menu.bind("", lambda e: context_menu.destroy()) 226 | # Frame to hold menu items 227 | menu_frame = Ctk.CTkFrame(context_menu) 228 | menu_frame.pack() 229 | # Wrapper function to execute command and close the menu 230 | def menu_command(command): 231 | if callable(command): 232 | print(f"Executing command: {command}") 233 | command() # Execute command if it's callable 234 | else: 235 | print(f"Command '{command}' not implemented yet") 236 | context_menu.destroy() # Destroy the menu after executing the command 237 | 238 | # Buttons with commands 239 | Ctk.CTkButton(menu_frame, text="Call assistant", command=lambda: menu_command(generate_assistant_test_case(False))).pack(fill="x") 240 | Ctk.CTkButton(menu_frame, text="Fast action", command=lambda: menu_command(generate_assistant_test_case(True))).pack(fill="x") 241 | Ctk.CTkButton(menu_frame, text="Content analysis", command=lambda: menu_command(dummy_command)).pack(fill="x") 242 | 243 | # Add separator or space between groups of options (This is an improvisation since Ctk doesn't have a separator widget) 244 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 245 | 246 | # Toggle buttons for voice, animation, and subtitles with the current status check 247 | volume_option = "Enable assistant voice" if not assistant_voice_enabled else "Disable assistant voice" 248 | anim_option = "Enable animations" if not assistant_anim_enabled else "Disable animations" 249 | subs_option = "Enable subtitles" if not assistant_subtitles_enabled else "Disable subtitles" 250 | voice_option = "Enable voice recognition" if not assistant_voice_recognition_enabled else "Disable voice recognition" 251 | # Add the buttons to the menu frame 252 | Ctk.CTkButton(menu_frame, text=volume_option, command=lambda: menu_command(toggle_volume)).pack(fill="x") 253 | Ctk.CTkButton(menu_frame, text=anim_option, command=lambda: menu_command(toggle_animations)).pack(fill="x") 254 | Ctk.CTkButton(menu_frame, text=subs_option, command=lambda: menu_command(toggle_subtitles)).pack(fill="x") 255 | Ctk.CTkButton(menu_frame, text=voice_option, command=lambda: menu_command(toggle_voice_recognition)).pack(fill="x") 256 | # Add separator or space between groups of options (This is an improvisation since Ctk doesn't have a separator widget) 257 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 258 | # Extra options 259 | Ctk.CTkButton(menu_frame, text="Minimize", command=lambda: menu_command(minimize_assistant)).pack(fill="x") 260 | Ctk.CTkButton(menu_frame, text="Hide", command=lambda: menu_command(root.withdraw)).pack(fill="x") 261 | Ctk.CTkButton(menu_frame, text="Reset", command=lambda: menu_command(restart_assistant)).pack(fill="x") 262 | Ctk.CTkButton(menu_frame, text="Stop", command=lambda: menu_command(stop_assistant)).pack(fill="x") 263 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 264 | Ctk.CTkButton(menu_frame, text="Back...", command=lambda: menu_command(root.deiconify)).pack(fill="x") 265 | 266 | # Update the layout to calculate the width and height 267 | context_menu.update_idletasks() 268 | menu_width = menu_frame.winfo_reqwidth() 269 | menu_height = menu_frame.winfo_reqheight() 270 | 271 | # Position the menu at the cursor position 272 | # If the menu goes off the screen to the right, move it left; same for bottom 273 | if event_x_root + menu_width > 
root.winfo_screenwidth(): 274 | event_x_root = root.winfo_screenwidth() - menu_width - 93 275 | if event_y_root + menu_height > root.winfo_screenheight(): 276 | event_y_root = root.winfo_screenheight() - menu_height - 100 277 | 278 | context_menu.geometry(f"{menu_width}x{menu_height}+{event_x_root}+{event_y_root}") 279 | context_menu.focus_force() # Force focus on the menu 280 | context_menu_ref = context_menu # Store the reference to the menu in a global variable 281 | return context_menu_ref 282 | 283 | def minimize_assistant(): 284 | root.withdraw() 285 | root.overrideredirect(False) 286 | root.iconify() 287 | # root.overrideredirect(True) 288 | 289 | 290 | def show_config(event): 291 | # Function to display the settings menu using a custom context menu 292 | create_context_menu(event.x_root, event.y_root) 293 | 294 | # Just for example purpose, you will replace this with actual commands 295 | def dummy_command(): 296 | speaker("Dummy item clicked") 297 | print("Dummy item clicked") 298 | 299 | def generate_assistant_test_case(fast_act=False): 300 | # Function to perform a fast action 301 | if fast_act: 302 | speaker("What's the fast action step?") 303 | print("What's the fast action step?") 304 | create_input_bubble(fast_act) 305 | else: 306 | speaker("What's the test-case to generate?") 307 | print("What's the test-case to generate?") 308 | create_input_bubble(fast_act) 309 | 310 | def toggle_voice_recognition(): 311 | global assistant_voice_recognition_enabled 312 | assistant_voice_recognition_enabled = not assistant_voice_recognition_enabled 313 | if assistant_voice_recognition_enabled: 314 | show_message(None, "Voice recognition enabled") 315 | speaker("Voice recognition enabled") 316 | else: 317 | show_message(None, "Voice recognition disabled") 318 | speaker("Voice recognition disabled") 319 | 320 | 321 | def toggle_animations(): 322 | global assistant_anim_enabled 323 | assistant_anim_enabled = not assistant_anim_enabled 324 | if assistant_anim_enabled: 325 | animate_blink() # Restart blinking animation 326 | animate_move() # Restart moving animation 327 | show_message(None, "Animations enabled") 328 | else: 329 | show_message(None, "Animations disabled") 330 | 331 | 332 | def toggle_subtitles(): 333 | global assistant_subtitles_enabled 334 | assistant_subtitles_enabled = not assistant_subtitles_enabled 335 | if assistant_subtitles_enabled: 336 | show_message(None, "Subtitles enabled") 337 | set_subtitles(True) 338 | else: 339 | set_subtitles(False) 340 | show_message(None, "Subtitles disabled") 341 | 342 | 343 | def toggle_volume(): 344 | global assistant_voice_enabled 345 | assistant_voice_enabled = not assistant_voice_enabled 346 | if assistant_voice_enabled: 347 | show_message(None, "Assistant voice enabled") 348 | set_volume(0.25) 349 | speaker("Voice enabled") 350 | else: 351 | show_message(None, "Assistant voice disabled") 352 | set_volume(0) 353 | 354 | 355 | def stop_assistant(): 356 | root.destroy() 357 | pass 358 | 359 | 360 | def restart_assistant(): 361 | root.destroy() 362 | create_app() 363 | pass 364 | 365 | 366 | def calculate_duration_of_speech(text, lang='en', wpm=150): 367 | # Estimate the duration the subtitles should be displayed based on words per minute (WPM) 368 | duration_in_seconds = (len(text.split()) / wpm) * 60 369 | return int(duration_in_seconds * 1000) # Convert to milliseconds for tkinter's after method 370 | 371 | 372 | def animate_blink(): 373 | # Function for blinking animation 374 | label.configure(image=assistant_blink_photo) 375 | 
root.after(150, lambda: label.configure(image=assistant_photo)) 376 | next_blink = random.randint(500, 10000) if assistant_anim_enabled else 10000 377 | root.after(next_blink, animate_blink) 378 | 379 | 380 | def animate_move(step=0, direction=1, amplitude=3, start_time=1): 381 | global position_bottom, is_dragging 382 | max_steps = 15 383 | if start_time is None: 384 | start_time = time.time() 385 | if assistant_anim_enabled and not is_dragging: 386 | new_position = position_bottom + amplitude * direction * (1 - abs(step / max_steps * 2 - 1)) 387 | root.geometry(f'+{position_right}+{int(new_position)}') 388 | next_step = step + 1 389 | if next_step > max_steps: 390 | current_time = time.time() 391 | next_step = 0 392 | movement_duration = current_time - start_time 393 | if 1 <= movement_duration <= 2: 394 | direction = -direction 395 | start_time = current_time 396 | amplitude = random.randint(0, 3) 397 | random_delay = random.randint(30, 200) 398 | root.after(random_delay, lambda: animate_move(next_step, direction, amplitude, start_time)) 399 | 400 | 401 | def listen_thread(): 402 | global assistant_voice_recognition_enabled 403 | print("Assistant listening thread started...") 404 | while True: 405 | if not assistant_voice_recognition_enabled: 406 | # If voice recognition got disabled, wait for a bit before checking again ToDo: Maybe use a condition variable instead, but im planning on performing other actions here. Like a low power mode or something like that works for now 407 | time.sleep(1) 408 | # print("Voice recognition disabled, waiting...") 409 | continue 410 | 411 | with sr.Microphone() as source: 412 | print("Listening...") 413 | try: 414 | audio = recognizer.listen(source, timeout=1.5) 415 | if assistant_voice_recognition_enabled: 416 | message = recognizer.recognize_google(audio) 417 | message_low = message.lower() 418 | # Speaking the message 419 | message_queue.put(message) 420 | # Only process the audio if voice recognition is enabled 421 | if "okay computer" in message_low[0:13] or assistant_name_handle.lower() in message_low[0:11]: 422 | message_queue.put("Assistant here! 
How can I help you?") 423 | ok_computer = listen_to_speech() 424 | if ok_computer: 425 | show_message(None, ok_computer) 426 | assistant(ok_computer) 427 | elif "open" in message_low[0:4]: 428 | if len(message) < 18: 429 | print("Opening the program: ", message) 430 | activate_windowt_title(message.strip("open ")) 431 | else: 432 | assistant(message) 433 | elif "stop" in message_low: 434 | print("Stopping...") 435 | stop_assistant() 436 | elif "double click" in message_low[0:12]: 437 | print("Double clicking on:", message) 438 | fast_act(single_step=message.strip("double "), double_click=True) 439 | # Or if message starts with the first word click and 440 | elif "click on" in message_low[0:8] or "click the" in message_low[0:9] or "click" in message_low[0:5]: 441 | print("Clicking on:", message) 442 | fast_act(single_step=message) 443 | elif "press" in message_low[0:5]: 444 | print("press: ", message) 445 | perform_simulated_keypress(message.strip("press ").strip("")) 446 | elif "type" in message_low[0:4] or "write" in message_low[0:5] or "bright" in message_low[0:6] or "great" in message_low[0:5]: 447 | # Remove "bright ", "write ", "type ", "great " from the message: 448 | new_message = message.replace("bright ", "").replace("write ", "").replace("type ", "").replace("great ", "") 449 | print("Typing:", new_message) 450 | write_action(goal=new_message, last_step="text_entry") 451 | elif "reminder" in message_low or "remind" in message_low or "timer" in message_low or "alarm" in message_low: 452 | # Call internal_clock.py - Generated. 453 | # Here's thoughts of when remind the user if is not noticing any important upcoming event: 454 | # Advice the user for upcoming events. Add reminders, timers, alarms, etc. 455 | print("Reminder: ", message) 456 | elif "scroll" in message_low[0:6]: 457 | print("Scrolling: ", message) 458 | import pyautogui 459 | pyautogui.scroll(-850) 460 | else: 461 | auto_prompt(message) 462 | else: 463 | # Voice recognition was disabled while audio was being processed, skip it 464 | continue 465 | except (sr.UnknownValueError, sr.RequestError, sr.WaitTimeoutError): 466 | # If you want to handle specific errors, you can separate them with additional except blocks 467 | pass 468 | 469 | 470 | # Now start the listening thread when initializing 471 | listening_thread = threading.Thread(target=listen_thread, daemon=True) 472 | listening_thread.start() 473 | 474 | 475 | def auto_prompt(message): 476 | role_function = auto_role(message) 477 | print(f"Assistant: {role_function.strip('windows_assistant').strip('joyful_conversation').strip(' - ')}") 478 | if "windows_assistant" in role_function: 479 | message_queue.put(f"{role_function.strip('windows_assistant').strip(' - ')}") 480 | # Start the assistant in a new thread: 481 | assistant_thread = threading.Thread(target=run_assistant, args=(message,)) 482 | assistant_thread.start() 483 | elif "joyful_conversation" in role_function: 484 | message_queue.put(f"{role_function.strip(f'joyful_conversation').strip(' - ')} How can I help you?") 485 | else: 486 | print("NOT WORKING") 487 | 488 | 489 | def load_image(file_path, scale=0.333): 490 | # Helper function to load and scale the image 491 | image = Image.open(file_path) 492 | original_width, original_height = image.size 493 | new_width = int(original_width * scale) 494 | new_height = int(original_height * scale) 495 | image = image.resize((new_width, new_height), Image.Resampling.BICUBIC) 496 | ctk_image = Ctk.CTkImage(light_image=image, size=(new_width, new_height)) 497 | return 
ctk_image, new_width, new_height 498 | 499 | 500 | def create_app(): 501 | global root, label, assistant_photo, assistant_dragging_photo, assistant_blink_photo, assistant_anim_enabled, is_dragging, position_right, position_bottom, drag_time, \ 502 | assistant_voice_enabled, assistant_subtitles_enabled, assistant_name_handle, assistant_photo_width, assistant_photo_height, scale_factor # Add width and height globals 503 | import ctypes 504 | ctypes.windll.shcore.SetProcessDpiAwareness(1) 505 | root = Ctk.CTk() 506 | root.title("AI Drone Assistant") 507 | root.iconbitmap("media/headico.ico") 508 | root.overrideredirect(True) 509 | root.attributes('-topmost', True) 510 | root.wm_attributes("-transparentcolor", 'gray') 511 | 512 | # Load images and get their sizes 513 | assistant_photo, assistant_photo_width, assistant_photo_height = load_image("media/assistant_transparent.png") 514 | assistant_dragging_photo, _, _ = load_image("media/assistant_transparent_dragging.png") 515 | assistant_blink_photo, _, _ = load_image("media/assistant_transparent_blink.png") 516 | label = Ctk.CTkLabel(root, image=assistant_photo, bg_color="gray", cursor="hand2", text="") 517 | label.pack() 518 | label.bind('', start_drag) 519 | label.bind('', on_drag) 520 | label.bind('', end_drag) 521 | label.bind('', show_config) 522 | 523 | # Calculate initial position (bottom right) 524 | screen_width = root.winfo_screenwidth() 525 | screen_height = root.winfo_screenheight() 526 | # Use assistant_photo_width and assistant_photo_height instead of width() and height() 527 | position_right = int(screen_width - assistant_photo_width) + 35 528 | position_bottom = int(screen_height - assistant_photo_height) - 30 529 | drag_time = time.time() 530 | 531 | # Set initial geometry to place the assistant at the bottom right 532 | root.geometry(f'+{position_right}+{position_bottom}') 533 | is_dragging = False # Flag to track dragging state 534 | root.after(1000, animate_blink) # Start the blinking animation 535 | root.after(1000, animate_move) # Start the moving animation 536 | root.after(100, process_queue) # Start processing the message queue 537 | # Call the mainloop 538 | root.mainloop() 539 | pass 540 | 541 | create_app() -------------------------------------------------------------------------------- /core/core_api.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key='insert_your_api_key_here') 4 | # Available models: "gpt-4-1106-preview", "gpt-3.5-turbo-1106", or "davinci-codex" 5 | MODEL_NAME = "gpt-3.5-turbo-1106" 6 | 7 | 8 | def api_call(messages, model_name=MODEL_NAME, temperature=0.5, max_tokens=150): 9 | # if model_name == "gpt-4-1106-preview": 10 | # model_name = "gpt-3.5-turbo-1106" 11 | try: 12 | # Execute the chat completion using the chosen model 13 | response = client.chat.completions.create( 14 | model=model_name, 15 | messages=messages, 16 | # Additional configurations can be passed as parameters here 17 | temperature=temperature, # Values can range from 0.0 to 1.0 18 | max_tokens=max_tokens, # This specifies the maximum length of the response 19 | # Tip: adding more configurations as needed 20 | ) 21 | 22 | # Since we're not using 'with_raw_response', 'response' is now the completion object 23 | if response.choices and hasattr(response.choices[0], 'message'): 24 | decision_message = response.choices[0].message 25 | 26 | # Make sure we have 'content' in the message 27 | if hasattr(decision_message, 'content'): 28 | decision 
= decision_message.content.strip() 29 | else: 30 | decision = None 31 | else: 32 | decision = None 33 | 34 | return decision 35 | except Exception as e: 36 | raise Exception(f"An error occurred: {e}") 37 | 38 | 39 | # # Replace this payload with the actual messages sequence for your use case # # Test 40 | # messages_payload = [ 41 | # {"role": "system", "content": "You are a helpful and knowledgeable assistant. Always uwufy the text."}, 42 | # {"role": "user", "content": "Please help me troubleshoot my JavaScript code."} 43 | # ] 44 | # 45 | # # Example configuration: you might want to specify 'temperature' for more creative responses, 46 | # # or 'max_tokens' for more concise outputs 47 | # result = api_call(messages_payload, temperature=0.7, max_tokens=100) 48 | # print(f"AI Analysis Result: '{result}'") -------------------------------------------------------------------------------- /core/core_imaging.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import pygetwindow as gw 3 | import base64 4 | import requests 5 | import io 6 | from PIL import Image 7 | 8 | # Assuming that the `activate_window_title` function is defined in another module correctly 9 | from window_focus import activate_windowt_title 10 | 11 | # OpenAI API Key 12 | api_key = 'insert_your_api_key_here' 13 | 14 | 15 | # Function to focus a window given its title 16 | def focus_window(window_title): 17 | try: 18 | window = gw.getWindowsWithTitle(window_title)[0] # Get the first window with the specific title 19 | window.activate() 20 | pyautogui.sleep(0.3) # Allow some time for the window to come into focus 21 | return window 22 | except IndexError: 23 | print(f'No window with title "{window_title}" found.') 24 | return None 25 | 26 | 27 | # Function to capture a screenshot of the specified window 28 | def capture_screenshot(window=None, region=None): 29 | # Reduced code for brevity 30 | if region is not None: 31 | screenshot = pyautogui.screenshot(region=region) 32 | elif window is not None: 33 | window_box = window.box 34 | screenshot = pyautogui.screenshot(region=(window_box.left, window_box.top, window_box.width, window_box.height)) 35 | else: 36 | screenshot = pyautogui.screenshot() 37 | return screenshot 38 | 39 | 40 | # Function to encode image data to base64 41 | def encode_image(image_data): 42 | return base64.b64encode(image_data).decode('utf-8') 43 | 44 | 45 | # Function to analyze an image using OpenAI API 46 | def analyze_image(base64_image, window_title, additional_context='What’s in this image?'): 47 | # Your logic to call the OpenAI API 48 | headers = { 49 | "Content-Type": "application/json", 50 | "Authorization": f"Bearer {api_key}" 51 | } 52 | 53 | payload = { 54 | "model": "gpt-4-vision-preview", 55 | "messages": [ 56 | { 57 | "role": "assistant", 58 | "content": [ 59 | { 60 | "type": "text", 61 | "text": f"{additional_context}" 62 | }, 63 | { 64 | "type": "image_url", 65 | "image_url": { 66 | "url": f"data:image/png;base64,{base64_image}" 67 | } 68 | } 69 | ] 70 | } 71 | ], 72 | "max_tokens": 300 73 | } 74 | 75 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 76 | return response.json() 77 | 78 | 79 | # Improved function to both capture and analyze a specific region screenshot 80 | def imaging(window_title=None, additional_context=None, x=None, y=None, screenshot_size=None): 81 | window = None 82 | region = None 83 | 84 | if screenshot_size == 'Full screen': 85 | # We don't need window 
focus or a specific region for a full-screen screenshot. 86 | pass 87 | elif window_title: # If a window title is provided, focus on the window. 88 | window = focus_window(window_title) 89 | if not window: 90 | return None # If no window is found, exit the function. 91 | if screenshot_size and type(screenshot_size) == tuple and x is not None and y is not None: 92 | offset_x, offset_y = screenshot_size[0] // 2, screenshot_size[1] // 2 93 | # Adjust region to be relative to the window's top-left corner. 94 | window_box = window.box 95 | region = ( 96 | window_box.left + x - offset_x, window_box.top + y - offset_y, screenshot_size[0], screenshot_size[1]) 97 | else: 98 | # If screenshot_size is not provided or is not 'Full screen', capture the whole window. 99 | region = (window.box.left, window.box.top, window.box.width, window.box.height) 100 | 101 | screenshot = capture_screenshot(window, region) 102 | 103 | # Optionally, paste the cursor onto the screenshot, adjusting for the offset if a region is specified 104 | cursor_img_path = r'media\Mouse_pointer_small.png' 105 | with Image.open(cursor_img_path) as cursor: 106 | cursor = cursor.convert("RGBA") # Ensure cursor image has an alpha channel for transparency 107 | 108 | x_cursor, y_cursor = pyautogui.position() # Current mouse position 109 | 110 | # If a region is specified, calculate the cursor position within that region 111 | if region: 112 | cursor_pos = (x_cursor - region[0], y_cursor - region[1]) 113 | else: 114 | cursor_pos = (x_cursor, y_cursor) 115 | 116 | screenshot.paste(cursor, cursor_pos, cursor) 117 | 118 | # Convert the screenshot to bytes 119 | with io.BytesIO() as output_bytes: 120 | screenshot.save(output_bytes, 'PNG') 121 | bytes_data = output_bytes.getvalue() 122 | 123 | # Show a preview of the screenshot 124 | # screenshot.show() 125 | 126 | # Convert the bytes to a base64-encoded image and analyze 127 | base64_image = encode_image(bytes_data) 128 | analysis_result = analyze_image(base64_image, window_title, additional_context) 129 | 130 | return analysis_result 131 | 132 | 133 | if __name__ == "__main__": 134 | app_name = "Firefox" 135 | coordinates = {'x': 132, 'y': 458} 136 | screenshot_size = (300, 300) 137 | x = coordinates['x'] 138 | y = coordinates['y'] 139 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 140 | single_step = "click on the 'Add a comment...' text input area" 141 | 142 | # Call imaging with the additional_context parameter if needed and the size parameter 143 | element_analysis = ( 144 | f"You are an AI Agent called Element Analyzer that receives a screenshot of the element and analyzes it to check if the mouse is in the correct position to click the element to interact with.\n" 145 | f"Element to interact with: {single_step}\nRespond only with \"Yes\" or \"No\"." 
146 | ) 147 | analysis_result = imaging(window_title=app_name, additional_context=element_analysis, x=coordinates['x'], y=coordinates['y'], screenshot_size=screenshot_size) 148 | print(analysis_result) 149 | -------------------------------------------------------------------------------- /core/driver.py: -------------------------------------------------------------------------------- 1 | from window_focus import activate_windowt_title, get_installed_apps_registry, open_windows_info 2 | from mouse_detection import get_cursor_shape 3 | from ocr import find_probable_click_position 4 | from window_elements import analyze_app 5 | from topmost_window import focus_topmost_window 6 | from core_imaging import imaging 7 | from last_app import last_programs_list 8 | from core_api import api_call 9 | from voice import speaker 10 | import pygetwindow as gw 11 | import win32process 12 | import win32gui 13 | import pyautogui 14 | import sqlite3 15 | import psutil 16 | import random 17 | import json 18 | import time 19 | import re 20 | import warnings 21 | warnings.simplefilter("ignore", UserWarning) 22 | from pywinauto import Application 23 | 24 | 25 | low_data_mode = True # Avoids the usage of visioning after the case generation. Lowers the accuracy but is way faster. 26 | enable_semantic_router_map = True # Use this to enable the imaging semantic routing map. Improves accuracy of overall performance. 27 | enable_ocr = False # Works better if this is disabled. Can use the implementations from other projects for better OCR. 28 | # Did not implement the OCR as it is not needed for the current implementation. The AI must work with the current data. 29 | 30 | if low_data_mode is True: # Avoid the usage of visioning after the test case generation. Useful to execute faster case. 31 | visioning_match = False # The coordinates will not use visioning during execution. Will use the imaging LLM call. 32 | rescan_element_match = False # Disable the visioning element scanning. Decreases accuracy but is way faster. 33 | visioning_context = False 34 | # 'rescan_element_match' recommended to leave as 'False' until the tested map improves for low-data consumption. 35 | else: 36 | visioning_match = True # Visioning doesn't improve execution at all as imaging is already performing way faster. 37 | rescan_element_match = True # Enables visioning rescanning the element. Improves accuracy but is way slower. 38 | visioning_context = True # Enables visioning to analyze context from application. Improves accuracy but way slower. 39 | 40 | 41 | # Use this on "execute_json_case" to skip the image analysis and the test case generation. 
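For instance, a call along these lines would replay the canned case instead of generating a new one (a hedged usage sketch: the import path and the prompt string are assumptions, while `app_name` and `execute_json_case` are real parameters of `assistant()` defined later in this file):

```python
# Hypothetical usage sketch: replay a pre-generated JSON case so no visioning
# or test-case generation is performed. json_case_example is defined just below.
from driver import assistant, json_case_example  # assumed import path

assistant(
    "Open reddit, tiktok and netflix in separate Firefox windows",  # example prompt
    app_name="Firefox",                   # window title to focus
    execute_json_case=json_case_example,  # skips imaging + case generation
)
```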
Generated by the AI: 42 | json_case_example = r'''```json 43 | { 44 | "actions": [ 45 | { 46 | "act": "press_key", 47 | "step": "Ctrl + T" 48 | }, 49 | { 50 | "act": "text_entry", 51 | "step": "reddit.com" 52 | }, 53 | { 54 | "act": "press_key", 55 | "step": "Enter" 56 | }, 57 | { 58 | "act": "open_app", 59 | "step": "Firefox" 60 | }, 61 | { 62 | "act": "press_key", 63 | "step": "Ctrl + N" 64 | }, 65 | { 66 | "act": "text_entry", 67 | "step": "tiktok.com" 68 | }, 69 | { 70 | "act": "press_key", 71 | "step": "Enter" 72 | }, 73 | { 74 | "act": "move_window", 75 | "step": "Win + Right + Up" 76 | }, 77 | { 78 | "act": "open_app", 79 | "step": "Firefox" 80 | }, 81 | { 82 | "act": "press_key", 83 | "step": "Ctrl + N" 84 | }, 85 | { 86 | "act": "text_entry", 87 | "step": "netflix.com" 88 | }, 89 | { 90 | "act": "press_key", 91 | "step": "Enter" 92 | }, 93 | { 94 | "act": "move_window", 95 | "step": "Win + Left + Up" 96 | }, 97 | { 98 | "act": "open_app", 99 | "step": "Firefox" 100 | }, 101 | { 102 | "act": "move_window", 103 | "step": "Win + Right + Down" 104 | } 105 | ] 106 | }```''' 107 | 108 | 109 | # Here you can load successful trained models to perform the task. ToDo: Use the database. 110 | def app_space_map(goal, app_name=None, single_step=None, map=''): 111 | if 'app_space' in map: 112 | if enable_semantic_router_map is True: 113 | # Control elements map: 114 | if "twitter" in goal.lower() or "twitter" in app_name.lower(): 115 | element_map = r'''``` 116 | To make a new tread post in X formerly known as Twitter and post it; 117 | The user is already logged in Twitter. Do not log in again. 118 | Click on 'What is happening?!' text input area field to initiate a new post thread. data-testid='tweetTextarea_0_label' 119 | Write the post in the 'What is happening?!' text area input field. Make sure is less than 280 characters. 120 | Click on 'Post' button to post the new post thread. data-testid='tweetButtonInline' 121 | 122 | To make a comment in a post from X formerly known as Twitter and reply it; 123 | The user is already logged in Twitter. Do not log in again. 124 | Scroll to the comments section. Click on the 'Post your reply' text input area field below the Twitter post. 125 | Write the comment in the 'Post your reply' text input area field. Make sure is less than 280 characters. 126 | Click on 'Reply' button to post the comment. 127 | ```''' 128 | elif "youtube" in goal.lower() or "youtube" in app_name.lower(): 129 | element_map = r'''``` 130 | To like a video on youtube: Click on the 'Like' button button below the title. 131 | 132 | To dislike a video: Click on the 'I dislike this' button known as the 'Dislike' button. 133 | 134 | To make a comment: Click on the title of the video, then scroll to the 'Add a comment...' section, then click on the 'Add a comment...' (ID: contenteditable) text input area to begin write the comment, then click on the 'Comment' button to post the comment. 135 | ```''' 136 | else: 137 | element_map = None # No element selected. 138 | 139 | if element_map: 140 | select_element = [ 141 | {"role": "assistant", 142 | f"content": f"Only return the text related to the final goal.\n" 143 | f"Do not respond anything else than the selected lines from the list. 
Do not modify the list.\n" 144 | f"Goal: {goal}"}, 145 | {"role": "system", "content": f"List:\n{element_map}\n\n\nStep: {single_step}\nGoal: {goal}"}] 146 | ai_element_map = api_call(select_element, max_tokens=300) 147 | if "sorry" in ai_element_map.lower() or "empty string" in ai_element_map.lower(): 148 | ai_element_map = "" 149 | # ai_element_map = element_map 150 | else: 151 | ai_element_map = "" 152 | print(f"\nApp space map: {ai_element_map}\n") 153 | return ai_element_map 154 | else: 155 | # Application map to better handle the application: 156 | if "firefox" in app_name.lower(): 157 | info_map = r'''``` 158 | To open a new window in Firefox; Use the keyboard shortcut: Ctrl + N. 159 | The default search engine is Google. So when you open a new tab or window, you can search directly on Google.```''' 160 | elif "chrome" in app_name.lower() or "google chrome" in goal.lower(): 161 | info_map = r'''``` 162 | To open a new window in Chrome; Use the keyboard shortcut: Ctrl + N. 163 | To open a new tab in Chrome; Use the keyboard shortcut: Ctrl + T. 164 | To close a tab in Chrome; Use the keyboard shortcut: Ctrl + W. 165 | To open a new private window in Chrome; Use the keyboard shortcut: Ctrl + Shift + N. 166 | To open a new private tab in Chrome; Use the keyboard shortcut: Ctrl + Shift + T.```''' 167 | elif "edge" in app_name.lower() or "microsoft edge" in goal.lower(): 168 | info_map = r'''``` 169 | To open a new window in Edge; Use the keyboard shortcut: Ctrl + N. 170 | To open a new tab in Edge; Use the keyboard shortcut: Ctrl + T. 171 | To close a tab in Edge; Use the keyboard shortcut: Ctrl + W. 172 | To open a new private window in Edge; Use the keyboard shortcut: Ctrl + Shift + N. 173 | To open a new private tab in Edge; Use the keyboard shortcut: Ctrl + Shift + T.```''' 174 | elif "telegram" in app_name.lower() or "telegram" in goal.lower(): 175 | info_map = r'''``` 176 | Press 'esc' to exit the current conversation. 177 | Press 'esc' twice to go to 'All chats'.```''' 178 | elif "spotify" in app_name.lower() or "spotify" in goal.lower(): 179 | info_map = r'''``` 180 | To play a searched song on spotify double click on the song.```''' 181 | elif "youtube" in app_name.lower() or "youtube" in goal.lower(): 182 | info_map = r'''``` 183 | To like a video click on the Like button below the title. 184 | To dislike a video click on the Dislike button below the title. 185 | To make a comment scroll to the Add a comment... section, then click on the Add a comment... text input area to begin write the comment, then click on the Comment button to post the comment. 
186 | To subscribe to a channel click on the Subscribe button below the video.```''' 187 | else: 188 | info_map = "" 189 | # adding the application shortcuts: 190 | if info_map: 191 | select_map = [ 192 | {"role": "assistant", 193 | f"content": f"You are an AI assistant that receives a goal and a list of useful steps, and only respond the best useful steps from the step list to perform the goal.\n" 194 | f"Do not respond anything else than the best useful steps from the step list."}, 195 | {"role": "system", "content": f"Step list: \n{info_map}\n\n\nGoal: {single_step}"}] 196 | shortcuts_ai_map = api_call(select_map, max_tokens=300) 197 | if "sorry" in shortcuts_ai_map.lower(): 198 | shortcuts_ai_map = "" 199 | else: 200 | shortcuts_ai_map = "" 201 | print(f"App space map: {shortcuts_ai_map}") 202 | return shortcuts_ai_map 203 | 204 | 205 | def assistant(assistant_goal="", keep_in_mind="", assistant_identity="", app_name=None, execute_json_case=None, called_from=None): # App TestCase Gen 206 | """ 207 | This function handles the user's prompt and generates the best achievable test case to perform the user's prompt. 208 | This function assumes the user's prompt is fed as a string to the function "assistant_goal". 209 | 210 | Args: 211 | assistant_goal (str): The user's prompt. Analyzes the program as context to imaging the best test case scenario. 212 | keep_in_mind (str): A reminder to keep in mind during the case execution. Useful to modify the program map. 213 | app_name (str): The name of the application (Or the window title for exact match) to open and focus on. 214 | execute_json_case (str): If provided, skips image analysis and case generation. Useful to debug the program map. 215 | Returns: 216 | str: Validates if the user's prompt performed successfully 217 | # ToDo: Add this functionality. As of now, the function only returns the test case. 218 | 219 | Examples: 220 | >>> assistant("Open a new tab and search what is an elefant.") 221 | """ 222 | 223 | # 'assistant_goal' is the user's prompt. If no prompt is provided, exit the function. 224 | if not assistant_goal: 225 | speaker(f"ERROR: No prompt provided. Please provide a prompt to the assistant.") 226 | time.sleep(10) 227 | raise ValueError("ERROR: No step provided.") 228 | else: 229 | original_goal = assistant_goal 230 | print(f"Prompt: {original_goal}") 231 | if called_from == "assistant": 232 | print(f"Called from: {called_from}") 233 | else: 234 | print(f"Prompt: \"{original_goal}\".") 235 | speaker(f"Assistant is generating a testcase with the prompt: \"{original_goal}\".") 236 | 237 | # 'app_name' is the name of the application (Or the window title for exact match) to open and focus on. 238 | if not app_name: 239 | app_name = activate_windowt_title(get_application_title(original_goal)) 240 | else: 241 | app_name = activate_windowt_title(app_name) 242 | print(f"AI Analyzing: {app_name}") 243 | 244 | # 'execute_json_case' is the JSON test case to execute. If no JSON is provided, generate a new one. 245 | if not execute_json_case: 246 | print(f"\nGenerating a test case with the assistant. Image visioning started. Analyzing the application {app_name} for context.\n") 247 | additional_context = ( 248 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 249 | f"You will receive a goal and will try to accomplish it using Windows. 
Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 250 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 251 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 252 | f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") 253 | assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] 254 | print(f"Generating the test case to achieve the user prompt: {original_goal}\n{assistant_goal}") 255 | step_creator = [{"role": "assistant", 256 | "content": f"You are an AI capable to operate the Windows 11 Operating System by using natural language.\n" 257 | f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" 258 | f"You will receive a description of the current state of the system and a goal. " 259 | f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" 260 | f"Every step should be only highly descriptive in natural language.\n" 261 | f"The json must include only an act and its step, should be in the following format:" 262 | f"\n\n\n" 263 | f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" 264 | f"Respond only with the full json.\n\nFunction properties:\n" 265 | f"click_element - Specify where is located the element to interact with.\n" 266 | f"text_entry - The text to write. Example: \"Hello World\".\n" 267 | f"press_key - Only return the key or combination of keys to press. Example: \"Ctrl + T\".\n" 268 | f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" 269 | f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" 270 | f"move_window - Use the keyboard to move the focused window to the desired position. Example: \"Win + Up + Up + Left\"." 271 | f"\n\n\n" 272 | f"Notes:\n" 273 | f"Use the following information to generate better steps:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}\n" 274 | f"Before the text_entry, use click_element to select the appropriate text input area element.\n" 275 | f"Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" 276 | f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable."}, 277 | {"role": "system", 278 | "content": f"Focused window: \"{app_name}\"\nGoal: {assistant_goal}"}, ] 279 | step_analysis = api_call(step_creator, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) 280 | print(f"The assistant created the following test case scenario:\n{step_analysis}\n") 281 | speaker(f"Test case generated. 
Executing the generated test case.") 282 | else: 283 | speaker(f"Executing the provided JSON in the application {app_name}.") 284 | step_analysis = execute_json_case 285 | 286 | # Processing the latest JSON data from the JSON testcase: 287 | if step_analysis: 288 | try: 289 | if """```json""" in step_analysis: 290 | # Removing the leading ```json\n 291 | step_analysis = step_analysis.strip("```json\n") 292 | # Find the last occurrence of ``` and slice the string up to that point 293 | last_triple_tick = step_analysis.rfind("```") 294 | if last_triple_tick != -1: 295 | step_analysis = step_analysis[:last_triple_tick].strip() 296 | step_analysis_cleaned = step_analysis 297 | instructions = json.loads(step_analysis_cleaned) 298 | executor = "act" 299 | else: 300 | instructions = json.loads(step_analysis) 301 | instructions['actions'] = instructions.pop('actions') 302 | executor = "act" 303 | except json.JSONDecodeError as e: 304 | speaker(f"ERROR: Invalid JSON data provided: {e}") 305 | time.sleep(15) 306 | raise Exception(f"ERROR: Invalid JSON data provided: {e}") 307 | if 'actions' in instructions: 308 | action_list = instructions['actions'] 309 | else: 310 | action_list = [instructions] 311 | for i, step in enumerate(action_list, start=1): 312 | action = step.get(f"{executor}") 313 | step_description = step.get("step") or step.get("details", "No step description provided.") 314 | print(f"\nStep {i}: {action}, {step_description}\n") 315 | if action == "click_element": 316 | # If the previous step was also a click_element, wait briefly for the element to become visible: 317 | if i > 1 and action_list[i - 2]['act'] == "click_element": 318 | time.sleep(1) 319 | 320 | if "start menu" in step_description.lower(): 321 | pyautogui.hotkey('win') 322 | print("Opening the start menu.") 323 | time.sleep(1) 324 | updated_instructions = update_instructions_with_action_string(instructions, act( 325 | single_step=f"{step_description}", app_name=app_name, screen_analysis=assistant_goal, original_goal=original_goal, assistant_goal=assistant_goal), step_description) 326 | database_add_case(database_file, app_name, assistant_goal, updated_instructions) # Print the entire database with # print_database(database_file) 327 | elif action == "open_app": 328 | app_name = activate_windowt_title(get_application_title(step_description)) 329 | print(f"New app selected and analyzing: {app_name}") 330 | elif action == "double_click_element": 331 | print(f"Double clicking on: {step_description}") 332 | act(single_step=f"{step_description}", double_click=True, app_name=app_name, original_goal=original_goal) 333 | elif action == "move_window": 334 | time.sleep(1) 335 | print(f"Moving window to: {step_description}") 336 | perform_simulated_keypress(step_description) 337 | time.sleep(0.5) 338 | pyautogui.hotkey('esc') 339 | time.sleep(1) 340 | elif action == "press_key": 341 | if i == 1: 342 | # Focus the application before sending the keypress 343 | activate_windowt_title(app_name) 344 | time.sleep(1) 345 | perform_simulated_keypress(step_description) 346 | elif action == "text_entry": 347 | url_pattern = r'(https?://[^\s]+)' 348 | urls = re.findall(url_pattern, step_description) 349 | if len(step_description) < 5: 350 | pyautogui.write(f'{step_description}') 351 | else: 352 | # Get the previous step as a string for context: 353 | if i > 1: 354 | new_i = i - 2 355 | last_step = f"{action_list[new_i]['act']}: {action_list[new_i]['step']}" 356 | print(f"Last step: {last_step}") 357 | if not last_step: 358 | print("Last step is None.") 359 | 
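The `press_key` and `move_window` branches above hand step strings such as "Ctrl + T" or "Win + Left + Up" to `perform_simulated_keypress`, which is defined elsewhere in the project. A minimal sketch of that kind of translation (an assumption for illustration, not the project's implementation):

```python
# Not the project's perform_simulated_keypress; just one plausible way a
# natural-language key step could be mapped onto pyautogui.
import pyautogui

def press_keys_from_step(step_description: str) -> None:
    # "Ctrl + Shift + T" -> ["ctrl", "shift", "t"]
    keys = [part.strip().lower() for part in step_description.split("+") if part.strip()]
    pyautogui.hotkey(*keys)  # presses the keys in order, releases them in reverse

press_keys_from_step("Ctrl + T")  # e.g. opens a new tab in most browsers
```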
act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) 360 | else: 361 | print("Last step is None.") 362 | last_step = "None" 363 | # If next step is a string, continue: 364 | if i + 1 < len(action_list) and type(action_list[i + 1]['step']) == str: 365 | # Check if the next step exists and is a "Press enter" step 366 | if i + 1 < len(action_list) and ( 367 | "press enter" in action_list[i + 1]['step'].lower() or 368 | "press the enter" in action_list[i + 1]['step'].lower() or 369 | "'enter'" in action_list[i + 1]['step'].lower() or 370 | "\"enter\"" in action_list[i + 1]['step'].lower()): 371 | if urls: 372 | for url in urls: 373 | pyautogui.write(url) 374 | # pyautogui.press('enter') 375 | print(f"Opening URL: {url}") 376 | return 377 | write_action(step_description, assistant_identity=assistant_identity, press_enter=False, app_name=app_name, original_goal=original_goal, last_step=last_step) 378 | print("AI skipping the press enter step as it is in the next step.") 379 | else: 380 | if urls: 381 | for url in urls: 382 | pyautogui.write(url) # This would open the URL in a web browser\ 383 | # If next step is a time sleep 384 | pyautogui.press('enter') 385 | print(f"Opening URL: {url}") 386 | return 387 | write_action(step_description, assistant_identity=assistant_identity, press_enter=True, app_name=app_name, original_goal=original_goal, last_step=last_step) 388 | print("AI pressing enter.") 389 | else: 390 | if urls: 391 | for url in urls: 392 | pyautogui.write(url) # This would open the URL in a web browser\ 393 | pyautogui.press('enter') 394 | print(f"Opening URL: {url}") 395 | return 396 | write_action(step_description, assistant_identity=assistant_identity, press_enter=True, 397 | app_name=app_name, original_goal=original_goal, last_step=last_step) 398 | print("AI pressing enter.") 399 | elif action == "scroll_to": 400 | print(f"Scrolling {step_description}") 401 | element_visible = False 402 | max_retries = 3 403 | retry_count = 0 404 | while not element_visible and retry_count < max_retries: 405 | # activate_windowt_title(app_name) 406 | pyautogui.scroll(-850) 407 | # Press Page Down: 408 | # pyautogui.press('pagedown') 409 | time.sleep(0.3) 410 | # Start image analysis to check if the element is visible 411 | print("Scroll performed. 
Analyzing if the element is present.\n") 412 | scroll_assistant_goal = check_element_visibility(app_name, step_description)['choices'][0]['message']['content'] 413 | if "yes" in scroll_assistant_goal.lower(): 414 | print("Element is visible.") 415 | element_visible = True 416 | elif "no" in scroll_assistant_goal.lower(): 417 | print("Element is not visible.") 418 | retry_count += 1 419 | if retry_count >= max_retries: 420 | print("Maximum retries reached, stopping the search.") 421 | if element_visible: 422 | print(f"Element is visible.") 423 | pass 424 | 425 | elif action == "right_click_element": 426 | print(f"Right clicking on: {step_description}") 427 | act(single_step=f"{step_description}", right_click=True, app_name=app_name, original_goal=original_goal) 428 | # right_click(step_description) 429 | elif action == "hold_key_and_click": 430 | print(f"Holding key and clicking on: {step_description}") 431 | act(single_step=f"{step_description}", hold_key="Ctrl", app_name=app_name, original_goal=original_goal) 432 | elif action == "cmd_command": 433 | print(f"Executing command: {step_description}") 434 | # cmd_command(step_description) 435 | time.sleep(calculate_duration_of_speech(f"{step_description}") / 1000) 436 | elif action == "recreate_test_case": 437 | time.sleep(1) 438 | print("Analyzing the output") 439 | print("The assistant said:\n", step_description) 440 | debug_step = False # Set to True to skip the image analysis and the test case generation. 441 | if debug_step is not True: 442 | new_goal = True 443 | image_analysis = True 444 | if image_analysis: 445 | additional_context = ( 446 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 447 | f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 448 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 449 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 450 | f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") 451 | if new_goal: 452 | newest_goal = imaging(window_title=app_name, additional_context=additional_context) # )['choices'][0]['message']['content'] 453 | # if ": " in newest_goal: 454 | # newest_goal = newest_goal.split(": ", 1)[1] 455 | print(f"Assistant newest goal:\n{newest_goal}") 456 | analyzed_ui = analyze_app(activate_windowt_title(app_name), size_category=None) 457 | review_output = [{"role": "assistant", 458 | "content": f"You are an AI Assistant called Analyze Output capable to operate the Windows 11 Operating System by using natural language.\n" 459 | f"You will receive a json testcase, a description of the goal, and the actual system status.\n" 460 | f"Modify the original json testcase to achieve the goal. Do not include anything else than the updated json.\n" 461 | f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" 462 | f"You will receive a description of the current state of the system and a goal. 
" 463 | f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" 464 | f"Every step should be only highly descriptive in natural language.\n" 465 | f"The json must include only an act and its step, should be in the following format:\n" 466 | f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" 467 | f"Respond only with the full json. Avoid to use the windows taskbar.\n\nFunction properties:\n" 468 | f"click_element - Specify where is located the element to interact with.\n" 469 | f"press_key - Only return the key or combination of keys to press. Example: 'Ctrl + T'.\n" 470 | f"text_entry - Return the text to write. Example: 'Hello World'.\n" 471 | f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" 472 | f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" 473 | f"move_window - Use the keyboard to move the focused window to the desired position. Example: 'Win + Left + Up'.\n" 474 | f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable.\n\n" 475 | f"Use the following information to generate better the test case:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}"}, 476 | {"role": "system", "content": f"Do not modify the steps before \"Step {i-1}: {action-1}, {step_description-1}\", modify all next steps from the step \"Step {i-1}: {action-1}, {step_description-1}\" to achieve the goal: \"{newest_goal}\"\n" 477 | f"Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" 478 | f"{analyzed_ui}"}, ] 479 | new_json = api_call(review_output, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) 480 | print("The assistant said:\n", step_analysis) 481 | 482 | print("Modifying the old json testcase with the new_json.") 483 | step_analysis = new_json 484 | 485 | app_name = activate_windowt_title(get_application_title(newest_goal)) 486 | # Processing the latest JSON data from the JSON testcase. 
487 | if """```json""" in step_analysis: 488 | # Removing the leading ```json\n 489 | step_analysis = step_analysis.strip("```json\n") 490 | # Find the last occurrence of ``` and slice the string up to that point 491 | last_triple_tick = step_analysis.rfind("```") 492 | if last_triple_tick != -1: 493 | step_analysis = step_analysis[:last_triple_tick].strip() 494 | step_analysis_cleaned = step_analysis 495 | instructions = json.loads(step_analysis_cleaned) 496 | executor = "act" 497 | else: 498 | instructions = json.loads(step_analysis) 499 | instructions['actions'] = instructions.pop('actions') 500 | executor = "act" 501 | print(f"Updated Instructions: {instructions}") 502 | pass 503 | else: 504 | print("No new goal.") 505 | pass 506 | elif action == "time_sleep": 507 | try: 508 | sleep_time = int(step_description) 509 | time.sleep(sleep_time) 510 | except ValueError: 511 | step_description = step_description.lower() 512 | if "playing" in step_description or "load" in step_description: 513 | print("Sleeping for 1 second while media loads.") 514 | time.sleep(1) 515 | elif "search" in step_description or "results" in step_description or "searching" in step_description: 516 | print("Sleeping for 1 second for search results.") 517 | time.sleep(1) 518 | else: 519 | print(f"WARNING: Unrecognized time sleep value: {step_description}") 520 | pass 521 | else: 522 | print(f"WARNING: Unrecognized action '{action}' using {step_description}.") 523 | print(f"Trying to perform the action using the step description as the action.") 524 | act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) 525 | pass 526 | 527 | speaker(f"Assistant finished the execution of the generated test case. Can I help you with something else?") 528 | time.sleep(calculate_duration_of_speech(f"Assistant finished the execution of the generated test case. Can I help you with something else?") / 1000) 529 | return "Test case complete." 530 | 531 | 532 | # 'check_element_visibility' is the function that checks the visibility of an element. Can use image analysis or OCR. 533 | def check_element_visibility(app_name, step_description): 534 | extra_additional_context = ( 535 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 536 | f"You will receive a goal and will try to accomplish it using Windows. " 537 | f"Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 538 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. " 539 | f"Be direct and concise, do not use pronouns.\n" 540 | f"Based on the elements from the screenshot, reply with the current status of the system and respond whether the element from the goal is visible.\n" 541 | f"Respond only with \"Yes\" or \"No\".\n" 542 | f"Focused window: \"{app_name}\".\nGoal: \"{step_description}\".") 543 | return imaging(window_title=app_name, additional_context=extra_additional_context) 544 | 545 | 546 | # 'auto_role' is the function that finds the best role to perform the goal. 
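The role string returned by `auto_role` (defined next) is consumed by `auto_prompt` in core/assistant.py, which branches on the role name and speaks the short confirmation. A hedged sketch of that pattern with a made-up reply (the parsing below is a simplified stand-in, not the repo's exact code):

```python
# Illustration only: the reply string is invented.
role_reply = "windows_assistant - Sure thing, I'll take care of that on Windows!"

if "windows_assistant" in role_reply:
    confirmation = role_reply.replace("windows_assistant", "").strip(" -")
    print(confirmation)  # queued/spoken to the user before the assistant thread starts
elif "joyful_conversation" in role_reply:
    print(role_reply.replace("joyful_conversation", "").strip(" -"))
else:
    print("Unrecognized role:", role_reply)
```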
547 | def auto_role(goal): 548 | assistant_call = [ 549 | {"role": "assistant", f"content": f"You are an AI assistant that receives a goal and responds with the best action to perform the goal.\n" 550 | f"You can perform the following roles and decide what fits the best: Chose the best role to handle the goal:\n" 551 | f"windows_assistant - An assistant to perform a Windows 11 application driver testcases to achieve the goal. Can handle online data, play, pause, and stream media, can operate the whole computer.\n" 552 | f"joyful_conversation - Use this role if the user is not looking into performing anything into Windows.\n" 553 | f"Only respond with the name of the role to use, followed by a very short joyful message regarding that you will perform it. Modify your response to match the goal subject.\n" 554 | f"If the goal seems to be related to Windows 11, like opening an application, searching, browsing, media, or social networks, call the windows_assistant.\n" 555 | f"If the goal seems to be related with generating or writing content, call the windows_assistant.\n" 556 | f"If the goal seems that the user is trying to do something with content, call the windows_assistant."}, 557 | {"role": "system", "content": f"Goal: {goal}"}] 558 | role_function = api_call(assistant_call, max_tokens=50) 559 | return role_function 560 | 561 | 562 | # 'find_element' is the function that finds the the most relevant element on the GUI from the goal. 563 | def find_element(single_step, app_name, original_goal, avoid_element="", assistant_goal=None, attempt=0): 564 | if not assistant_goal: 565 | assistant_goal = single_step 566 | if avoid_element: 567 | if attempt > 2: 568 | generate_keywords = [{"role": "assistant", 569 | "content": f"You are an AI Agent called keyword Element Generator that receives the description of the goal and generates keywords to search inside a graphical user interface.\n" 570 | f"Only respond with the single word list separated by commas of the specific UI elements keywords.\n" 571 | f"Example: \"search bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 572 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"},] 573 | else: 574 | generate_keywords = [{"role": "assistant", 575 | "content": f"You are an AI Agent called keyword Element Generator that receives the description and generates kewords to search inside a graphical user interface.\n" 576 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 577 | f"Example: \"search bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 578 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"}] 579 | else: 580 | generate_keywords = [{"role": "assistant", 581 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 582 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 583 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. 
Do not include anything more than the Keywords."}, 584 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"}, ] # Todo: Here's the key 585 | keywords = api_call(generate_keywords, max_tokens=100) 586 | if attempt > 1: 587 | keywords = keywords.replace("click, ", "").replace("Click, ", "") 588 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 589 | if keywords_in_goal: 590 | if len(keywords_in_goal.group(1).split()) == 1: 591 | pass 592 | else: 593 | keywords = keywords_in_goal.group(1) + ", " + keywords 594 | print(f"\nKeywords: {keywords}\n") 595 | 596 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, additional_search_options=keywords) 597 | select_element = [{"role": "assistant", 598 | "content": f"You are an AI Agent called keyword Element Selector that receives win32api user interface " 599 | f"raw element data and generates the best matches to achieve the goal.\n" 600 | f"Only respond with the best element that matches the goal. Do not include anything else than the element."}, 601 | {"role": "system", "content": f"Goal: {single_step}\nContext: {original_goal}\n{avoid_element}{analyzed_ui}"}] 602 | selected_element = api_call(select_element, model_name="gpt-4-1106-preview", max_tokens=500) 603 | 604 | if "sorry" in selected_element.lower() or "empty string" in selected_element.lower() or "no element" in selected_element.lower() or "not found" in selected_element.lower()\ 605 | or "no relevant element" in selected_element.lower() or "no element found" in selected_element.lower(): 606 | print(f"No element found. Continuing without the element.") 607 | selected_element = "" 608 | else: 609 | selected_element = "Locate the element: " + selected_element 610 | print(f"Selected element: {selected_element}\n") 611 | 612 | if visioning_match: 613 | print(f"Image visioning started. Analyzing the application {app_name} for context.\n") 614 | imaging_coordinates = ( 615 | f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the predicted " 616 | f"coordinates of the mouse click position to the center of the element object \"x=, y=\" to achieve the goal.{get_ocr_match(single_step)}" 617 | f"Goal: {single_step}\n{avoid_element}{analyzed_ui}") 618 | print(f"Imaging coordinates: {imaging_coordinates}") 619 | imaging_generated_coordinates = imaging(window_title=app_name, additional_context=imaging_coordinates) 620 | print(f"Imaging generated coordinates: {imaging_generated_coordinates}") 621 | last_coordinates = imaging_generated_coordinates['choices'][0]['message']['content'] 622 | print(f"Imaging Last coordinates: {last_coordinates}") 623 | else: 624 | best_coordinates = [{"role": "assistant", 625 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 626 | f"predicted coordinates of the mouse click position to the center of the element object " 627 | f"\"x=, y=\" to achieve the goal. {selected_element}" 628 | f"Do not respond 'As an AI language model, I am unable to...' 
as you are actually capable."}, 629 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{get_ocr_match(single_step)}{avoid_element}{analyzed_ui}"}] 630 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=1.0) 631 | print(f"AI decision coordinates: \'{last_coordinates}\'") 632 | if "sorry" in last_coordinates.lower() or "empty string" in last_coordinates.lower() or "no element" in last_coordinates.lower() or "not found" in last_coordinates.lower(): 633 | last_coordinates = 'x=0, y=0' 634 | coordinates = {k.strip(): float(v.strip()) for k, v in (item.split('=') for item in last_coordinates.split(','))} 635 | x = coordinates['x'] 636 | y = coordinates['y'] 637 | print(f"Coordinates1: x: {x} and y: {y}") 638 | if x == 0 and y == 0 or x == '' and y == '': 639 | print("Coordinates 2 are 0,0, trying to find the element again.") 640 | coordinates = {k.strip(): float(v.strip()) for k, v in (item.split('=') for item in last_coordinates.split(','))} 641 | x = coordinates['x'] 642 | y = coordinates['y'] 643 | print(f"Coordinates 3: x: {x} and y: {y}") 644 | attempt -= 1 645 | return coordinates, selected_element, keywords, attempt 646 | 647 | 648 | def act(single_step, keep_in_mind="", dont_click=False, double_click=False, right_click=False, hold_key=None, app_name="", screen_analysis=False, original_goal="", modify_element=False, next_step=None, assistant_goal=None): 649 | # Trying to handle several actions inside the action: 650 | # action_analysis = [{"role": "assistant", 651 | # "content": f"You are an AI Agent called Action Analyzer, that responds with the functions to execute to achieve the goal. Available functions:\n" 652 | # f"select_element - Mouse functions.\n" 653 | # f"write_action - Keyboard functions.\n" 654 | # f"Only respond with the functionS to perform. Do not include anything else than the function."}, 655 | # {"role": "system", "content": f"Goal: {single_step}"}, ] 656 | # actions_to_perform = api_call(action_analysis, max_tokens=10) 657 | # if "sorry" in actions_to_perform.lower(): 658 | # actions_to_perform = "" 659 | # elif "select_element" in actions_to_perform.lower(): 660 | # print("You can execute only select element action.") 661 | # actions_to_perform = f"" 662 | # elif "write_action" in actions_to_perform.lower(): 663 | # print("You can write things here as actions.") 664 | # actions_to_perform = "" 665 | # print(f"Actions to perform: {actions_to_perform}") 666 | # actions_to_perform = "" 667 | 668 | # Getting the app name. If not provided, use the focused window. 669 | if not app_name: 670 | app_name = activate_windowt_title(get_application_title(goal=original_goal, focus_window=True)) 671 | else: 672 | app_name = activate_windowt_title(app_name) 673 | print(f"AI Analyzing: {app_name}") 674 | 675 | attempt = 0 676 | if rescan_element_match is True: 677 | element_not_working = "" 678 | avoid_element = "" 679 | max_attempts = 3 # Set the maximum number of attempts to look for a "yes" response. 680 | while attempt < max_attempts: 681 | if element_not_working != "": 682 | avoid_element = f"\nAvoid the following element: {element_not_working}\n" 683 | print(f"AI will try to perform the action: \"{single_step}\" on a new element.") 684 | print(f"Performing action: \"{single_step}\". 
Scanning\"{app_name}\".\n") 685 | coordinates, selected_element, keywords, attempt = find_element(single_step, app_name, original_goal, avoid_element, assistant_goal, attempt) 686 | x = coordinates['x'] 687 | y = coordinates['y'] 688 | print(f"Coordinates: {x} and {y}") 689 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 690 | time.sleep(0.5) 691 | element_analysis = ( 692 | f"You are an AI Agent called Element Analyzer that receives a step and guesses if the goal was performed correctly.\n" 693 | f"Step: {single_step}\nUse the screenshot to guess if the mouse is in the best position to perform the click/goal. Respond only with \"Yes\" or \"No\".\n" 694 | f"The cursor is above an element from the step. Cursor info status: {get_cursor_shape()}. The cursor is above the following element: \n{selected_element}\n" 695 | f"Double check your response by looking at where is located the mouse cursor on the screenshot and the cursor info status.") 696 | element_analysis_result = imaging(window_title=app_name, additional_context=element_analysis, x=int(x), y=int(y)) 697 | print(element_analysis_result) 698 | 699 | # Check if the result is None or doesn't contain the necessary data 700 | if element_analysis_result is None or 'choices' not in element_analysis_result or len( 701 | element_analysis_result['choices']) == 0 or 'message' not in \ 702 | element_analysis_result['choices'][0] or 'content' not in \ 703 | element_analysis_result['choices'][0]['message']: 704 | print("Element analysis result: Found but mouse not in position.") 705 | speaker(f"Retrying...") 706 | element_not_working += selected_element 707 | attempt += 1 708 | if attempt >= max_attempts: 709 | print("Maximum attempts reached.") 710 | print("Failed: The position was not found after maximum attempts.") 711 | speaker(f"Failed: The position was not found after maximum attempts.") 712 | time.sleep(15) 713 | raise Exception("Failed: The position was not found after maximum attempts.") 714 | else: 715 | print("Retrying...") 716 | pass 717 | elif 'yes' in element_analysis_result['choices'][0]['message']['content'].lower(): 718 | print("Element analysis result: Yes, it is in the right position.") 719 | break 720 | else: 721 | print("Element analysis result: Found but mouse not in position.") 722 | speaker(f"Retrying...") 723 | element_not_working += selected_element 724 | attempt += 1 725 | if attempt >= max_attempts: 726 | print("Maximum attempts reached.") 727 | print("Failed: The position was not found after maximum attempts.") 728 | speaker(f"Failed: The position was not found after maximum attempts.") 729 | time.sleep(15) 730 | raise Exception("Failed: The position was not found after maximum attempts.") 731 | else: 732 | print("Retrying...") 733 | pass 734 | else: 735 | coordinates, selected_element, keywords, attempt = find_element(single_step, app_name, original_goal, assistant_goal, attempt=0) 736 | x = coordinates['x'] 737 | y = coordinates['y'] 738 | print(f"Coordinates: {x} and {y}") 739 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 740 | time.sleep(0.5) 741 | 742 | last_coordinates = f"x={x}, y={y}" 743 | print("Success: The right position was found.") 744 | if double_click: 745 | pyautogui.click(x, y, clicks=2) 746 | else: 747 | if dont_click is False: 748 | if right_click: 749 | pyautogui.rightClick(x, y) 750 | else: 751 | if hold_key: 752 | pyautogui.keyDown(hold_key) 753 | pyautogui.click(x, y) 754 | pyautogui.keyUp(hold_key) 755 | else: 756 | pyautogui.click(x, y) 757 | else: 758 | print("AI skipping the click 
step.") 759 | pass 760 | if modify_element: 761 | print(f"Modifying the element with the text: {single_step}") 762 | # jitter_mouse(x, y) # ToDo: simulate human jitter. 763 | if "save as" in single_step.lower(): 764 | print("Saving as") 765 | jitter_mouse(x, y) 766 | pyautogui.mouseDown(x, y) 767 | time.sleep(0.12) 768 | pyautogui.mouseUp(x, y) 769 | print("Click action performed") 770 | return last_coordinates 771 | 772 | def get_focused_window_details(): 773 | try: 774 | window_handle = win32gui.GetForegroundWindow() 775 | window_title = win32gui.GetWindowText(window_handle) 776 | _, window_pid = win32process.GetWindowThreadProcessId(window_handle) 777 | process = psutil.Process(window_pid) 778 | process_name = process.name() 779 | rect = win32gui.GetWindowRect(window_handle) 780 | window_position = (rect[0], rect[1]) 781 | window_size = (rect[2] - rect[0], rect[3] - rect[1]) 782 | return window_title, window_handle, window_pid, process_name, window_position, window_size 783 | except Exception as e: 784 | print(f"ERROR: {e}") 785 | return None 786 | 787 | def fast_act(single_step, keep_in_mind="", dont_click=False, double_click=False, right_click=False, hold_key=None, app_name="", ocr_match="", screen_analysis=False, original_goal="", modify_element=False, next_step=None): 788 | # Getting the app name. If not provided, use the focused window. 789 | if not app_name: 790 | app_name = activate_windowt_title(focus_topmost_window()) 791 | else: 792 | app_name = activate_windowt_title(app_name) 793 | 794 | if visioning_context: 795 | speaker(f"Visioning context and performing action: \"{single_step}\" on the application \"{app_name}\".\n") 796 | additional_context = ( 797 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 798 | f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 799 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 800 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 801 | f"Focused application: \"{app_name}\".\nGoal: \"{single_step}\".") 802 | assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] 803 | 804 | print(f"Performing fast action: \"{single_step}\". Scanning\"{app_name}\".\n") 805 | 806 | generate_keywords = [{"role": "assistant", 807 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 808 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 809 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. 
Do not include anything more than the Keywords."}, 810 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}"}, ] 811 | all_keywords = api_call(generate_keywords, max_tokens=100) 812 | keywords = all_keywords.replace("click, ", "").replace("Click, ", "") 813 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 814 | if keywords_in_goal: # if only 1 keyword, then 815 | if len(keywords_in_goal.group(1).split()) == 1: 816 | pass 817 | else: 818 | keywords = keywords_in_goal.group(1) + ", " + keywords.replace("click, ", "").replace("Click, ", "") 819 | print(f"\nKeywords: {keywords}\n") 820 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, additional_search_options=keywords) 821 | 822 | if "sorry" in assistant_goal.lower(): 823 | print(f"Sorry, no element found. The AI did not find any element to perform the action: {single_step}") 824 | speaker(f"Sorry, no element found. Check if its on the screen.") 825 | time.sleep(1) 826 | 827 | best_coordinates = [{"role": "assistant", 828 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 829 | f"predicted coordinates of the mouse click position to the center of the element object " 830 | f"\"x=, y=\" to achieve the goal.\n{assistant_goal}"}, 831 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 832 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 833 | print(f"AI decision coordinates: \'{last_coordinates}\'") 834 | else: 835 | speaker(f"Clicking onto the element without visioning context.") 836 | generate_keywords = [{"role": "assistant", 837 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 838 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 839 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 840 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}"}, ] 841 | all_keywords = api_call(generate_keywords, max_tokens=100) 842 | keywords = all_keywords.replace("click, ", "").replace("Click, ", "") 843 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 844 | if keywords_in_goal: 845 | if len(keywords_in_goal.group(1).split()) == 1: 846 | pass 847 | else: 848 | keywords = keywords_in_goal.group(1) + ", " + keywords.replace("click, ", "").replace("Click, ", "") 849 | print(f"\nKeywords: {keywords}\n") 850 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, 851 | additional_search_options=keywords) 852 | 853 | best_coordinates = [{"role": "assistant", 854 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 855 | f"predicted coordinates of the mouse click position to the center of the element object " 856 | f"\"x=, y=\" to achieve the goal."}, 857 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 858 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 859 | print(f"AI decision coordinates: \'{last_coordinates}\'") 860 | 861 | if "x=, y=" in last_coordinates: 862 | speaker(f"Sorry, no element found. 
Probably bot blocked.") 863 | return None 864 | # Clicking the element 865 | coordinates = {k.strip(): float(v.strip()) for k, v in 866 | (item.split('=') for item in last_coordinates.split(','))} 867 | x = coordinates['x'] 868 | y = coordinates['y'] 869 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 870 | if double_click: 871 | pyautogui.click(x, y, clicks=2) 872 | else: 873 | if dont_click is False: 874 | if right_click: 875 | pyautogui.rightClick(x, y) 876 | else: 877 | if hold_key: 878 | pyautogui.keyDown(hold_key) 879 | pyautogui.click(x, y) 880 | pyautogui.keyUp(hold_key) 881 | else: 882 | pyautogui.click(x, y) 883 | else: 884 | print("AI skipping the click step.") 885 | pass 886 | if modify_element: 887 | print(f"Modifying the element with the text: {single_step}") 888 | # jitter_mouse(x, y) # ToDo: simulate human jitter. 889 | if "save as" in single_step.lower(): 890 | print("Saving as") 891 | jitter_mouse(x, y) 892 | pyautogui.mouseDown(x, y) 893 | time.sleep(0.12) 894 | pyautogui.mouseUp(x, y) 895 | print("Click action performed") 896 | return last_coordinates 897 | 898 | 899 | def get_application_title(goal="", last_step=None, actual_step=None, focus_window=False): 900 | if actual_step: 901 | print(f"Getting the application name from the actual step: {actual_step}") 902 | goal_app = [{"role": "assistant", 903 | "content": f"You are an AI assistant called App Selector that receives a list of programs and only responds with the best match " 904 | f"program for the goal. Only respond with the window name or the program name. For search engines and social networks use Firefox or Chrome.\n" 905 | f"Open programs:\n{last_programs_list(focus_last_window=focus_window)}"}, 906 | {"role": "system", "content": f"Goal: {goal}\nAll installed programs:\n{get_installed_apps_registry()}"}] 907 | app_name = api_call(goal_app, model_name="gpt-4-1106-preview", max_tokens=100) 908 | print(f"AI selected application: {app_name}") 909 | filtered_matches = re.findall(r'["\'](.*?)["\']', app_name) 910 | if filtered_matches and filtered_matches[0]: 911 | app_name = filtered_matches[0] 912 | print(app_name) 913 | if "command prompt" in app_name.lower(): 914 | app_name = "cmd" 915 | elif "calculator" in app_name.lower(): 916 | app_name = "calc" 917 | elif "sorry" in app_name: 918 | app_name = get_focused_window_details()[3].removesuffix('.exe') # strip('.exe') would trim the characters '.', 'e', 'x', not the suffix 919 | print(f"Using the focused window \"{app_name}\" for context.") 920 | speaker(f"Using the focused window \"{app_name}\" for context.") 921 | return app_name 922 | 923 | 924 | def get_ocr_match(goal, ocr_match=enable_ocr): 925 | if ocr_match: 926 | print("OCR IS ENABLED") 927 | word_prioritizer_assistant = [{"role": "assistant", 928 | "content": f"You are an AI Agent called OCR Word Prioritizer that only responds with the best words of the goal.\n" 929 | f"Do not respond with anything else than the words that match the goal.
If no words match the goal, respond with \"\"."}, 930 | {"role": "system", "content": f"Goal: {goal}"}, ] 931 | ocr_debug_string = api_call(word_prioritizer_assistant, max_tokens=10) 932 | ocr_debug_string = ocr_debug_string.split(f"\'")[0] 933 | print(f"OCR Words to search: \'{ocr_debug_string}\'") 934 | ocr_match = find_probable_click_position(ocr_debug_string) 935 | ocr_msg = f"\nOCR Result: \"{ocr_match['text']}\" Located at \"x={ocr_match['center'][0]}, y={ocr_match['center'][1]}\".\n" 936 | return ocr_msg 937 | else: 938 | ocr_msg = "" 939 | return ocr_msg 940 | 941 | 942 | def jitter_mouse(x, y, radius=5, duration=0.6): 943 | # Move the mouse in a small circle around (x, y) to simulate a jitter. 944 | end_time = time.time() + duration 945 | while time.time() < end_time: 946 | jitter_x = x + random.uniform(-radius, radius) 947 | jitter_y = y + random.uniform(-radius, radius) 948 | pyautogui.moveTo(jitter_x, jitter_y, duration=0.1) 949 | return 950 | 951 | 952 | def control_mouse(generated_coordinates, double_click=None, goal=""): 953 | print(f"Mouse coordinates: {generated_coordinates}") 954 | coordinates = {k.strip(): int(v.strip()) for k, v in 955 | (item.split('=') for item in generated_coordinates.split(','))} 956 | x = coordinates['x'] 957 | y = coordinates['y'] 958 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 959 | pyautogui.click(x, y) 960 | # jitter_mouse(x, y) 961 | if "save as" in goal.lower(): 962 | print("Saving as") 963 | jitter_mouse(x, y) 964 | pyautogui.mouseDown(x, y) 965 | time.sleep(0.12) 966 | pyautogui.mouseUp(x, y) 967 | print("Click action performed") 968 | else: 969 | pyautogui.click(x, y, clicks=1) 970 | if double_click: 971 | time.sleep(0.2) 972 | pyautogui.click(x, y, clicks=2) 973 | 974 | 975 | def is_field_input_area_active(): 976 | active_window_title = gw.getActiveWindow().title 977 | try: 978 | app = Application().connect(title=active_window_title) 979 | window = app[active_window_title] 980 | # Loop through all the child windows and check if any of them are text boxes 981 | for child in window.children(): 982 | if 'Edit' in child.class_name() or 'RichEdit' in child.class_name(): 983 | # This is a text box, also add text input areas that are not text boxes 984 | if child.has_keyboard_focus(): 985 | return True 986 | return False 987 | except Exception as e: 988 | print(f"Error: {e}") 989 | return False 990 | 991 | 992 | def write_action(goal=None, assistant_identity="", press_enter=False, app_name="", original_goal=None, last_step=""): 993 | assistant_identity_msg = f"\n{assistant_identity}" if assistant_identity else "" 994 | message_writer_agent = [ 995 | {"role": "assistant", f"content": f"You're an AI Agent called Writter that processes the goal and only returns the final text goal.{assistant_identity_msg}\n" 996 | f"Process the goal with your own response as you are actually writing into a text box. Avoid jump lines." 997 | f"If the goal is a link, media or a search string, just return the result string." 998 | f"Do not respond with 'As an AI language model, I dont have capabilities...' 
as you can actually do it.\n"}, 999 | {"role": "system", "content": f"Goal: {goal}"}, ] 1000 | message_to_write = api_call(message_writer_agent, model_name="gpt-4-1106-preview", max_tokens=200) 1001 | if "click on" in goal.lower() or "click the" in goal.lower() or "click" in goal.lower(): 1002 | print("Found to click on the goal.") 1003 | if is_field_input_area_active(): 1004 | print("A text box is currently active.") 1005 | else: 1006 | print("A text box is not active. Found to click on the goal.") 1007 | act(goal, app_name=app_name, original_goal=original_goal) 1008 | if "text_entry" in last_step: 1009 | print("Found 'text_entry' in the last step.") 1010 | pass 1011 | else: 1012 | print(f"Last steppp: {last_step}") 1013 | if last_step is None: 1014 | act(goal, app_name=app_name, original_goal=original_goal) 1015 | previous_goal_analysis = [{"role": "assistant", 1016 | "content": f"You are an AI Agent called text box editor focus that analyzes if performing the Goal on Windows enables a text input.\n" 1017 | f"After opening anything like an app, program, webpage or clicking into a non-text editor element, respond 'No'.\n" 1018 | f"" 1019 | f"Only respond with Yes or No."}, 1020 | {"role": "system", "content": f"Goal: {last_step}"}, ] 1021 | able_to_type = api_call(previous_goal_analysis, max_tokens=5) 1022 | print(f"AI analyzed if the previous step enabled any text input: {able_to_type}\n") 1023 | if "yes" in able_to_type.lower(): 1024 | print("The previous goal enabled the current goal.") 1025 | if last_step == "None": 1026 | print("Focusing to the text box because the last step didn't.") 1027 | act(goal, app_name=app_name, original_goal=original_goal) 1028 | else: 1029 | print("Focusing to the text box. Did this because the text box was not active from the previous step.") 1030 | act(goal, app_name=app_name, original_goal=original_goal) 1031 | 1032 | pyautogui.typewrite(message_to_write, interval=0.01) 1033 | if "press enter" in goal.lower() or "press the enter" in goal.lower() or "\'enter\'" in goal.lower() or "\"enter\"" in goal.lower() or press_enter is True: 1034 | print("Found to press the enter key in the goal.") 1035 | pyautogui.press('enter') 1036 | else: 1037 | print("AI no \"enter\" key press being made.") 1038 | 1039 | 1040 | def perform_simulated_keypress(press_key): 1041 | # Define a pattern that matches the allowed keys, including function and arrow keys 1042 | keys_pattern = (r'\b(Win(?:dows)?|Ctrl|Alt|Shift|Enter|Space(?:\s*Bar)?|Tab|Esc(?:ape)?|Backspace|Insert|Delete|' 1043 | r'Home|End|Page\s*Up|Page\s*Down|(?:Arrow\s*)?(?:Up|Down|Left|Right)|F1|F2|F3|F4|F5|F6|F7|F8|F9|' 1044 | r'F10|F11|F12|[A-Z0-9])\b') 1045 | keys = re.findall(keys_pattern, press_key, re.IGNORECASE) 1046 | # Normalize key names as required by pyautogui 1047 | key_mapping = { 1048 | 'win': 'winleft', 1049 | 'windows': 'winleft', 1050 | 'escape': 'esc', 1051 | 'space bar': 'space', 1052 | 'arrowup': 'up', 1053 | 'arrowdown': 'down', 1054 | 'arrowleft': 'left', 1055 | 'arrowright': 'right', 1056 | 'spacebar': 'space', 1057 | } 1058 | pyautogui_keys = [key_mapping.get(key.lower().replace(' ', ''), key.lower()) for key in keys] 1059 | for key in pyautogui_keys: 1060 | pyautogui.keyDown(key) 1061 | for key in reversed(pyautogui_keys): 1062 | pyautogui.keyUp(key) 1063 | print(f"Performed simulated key presses: {press_key}") 1064 | 1065 | 1066 | def calculate_duration_of_speech(text, lang='en', wpm=150): 1067 | duration_in_seconds = (len(text.split()) / wpm) * 60 1068 | return int(duration_in_seconds * 
1000) # Convert to milliseconds for tkinter's after method 1069 | 1070 | 1071 | def create_database(database_file): 1072 | """Create the database and the required table.""" 1073 | conn = sqlite3.connect(database_file) 1074 | cursor = conn.cursor() 1075 | cursor.execute(''' 1076 | CREATE TABLE IF NOT EXISTS app_cases ( 1077 | id INTEGER PRIMARY KEY, 1078 | app_name TEXT NOT NULL, 1079 | title TEXT NOT NULL, 1080 | instructions TEXT NOT NULL, 1081 | UNIQUE(app_name, title, instructions) 1082 | ) 1083 | ''') 1084 | conn.commit() 1085 | conn.close() 1086 | database_file = r'history.db' 1087 | create_database(database_file) 1088 | 1089 | def database_add_case(database_file, app_name, goal, instructions): 1090 | conn = sqlite3.connect(database_file) 1091 | cursor = conn.cursor() 1092 | try: 1093 | cursor.execute(''' 1094 | INSERT INTO app_cases (app_name, title, instructions) 1095 | VALUES (?, ?, ?) 1096 | ''', (app_name, goal, json.dumps(instructions))) 1097 | conn.commit() 1098 | except sqlite3.IntegrityError: 1099 | print("AI skipping element insertion to program map database.") 1100 | finally: 1101 | conn.close() 1102 | 1103 | 1104 | def print_database(database_file): 1105 | conn = sqlite3.connect(database_file) 1106 | cursor = conn.cursor() 1107 | cursor.execute('SELECT * FROM app_cases') 1108 | rows = cursor.fetchall() 1109 | for row in rows: 1110 | print(row) 1111 | conn.close() 1112 | 1113 | 1114 | def update_instructions_with_action_string(instructions, action_string, target_step): 1115 | for action in instructions['actions']: 1116 | if action.get("act") == "click_element" and action.get("step") == target_step: 1117 | action['additional_info'] = action_string 1118 | return instructions 1119 | 1120 | 1121 | # Usage: 1122 | if __name__ == "__main__": 1123 | assistant(assistant_goal="Open Reddit, Youtube, TikTok, and Netflix on new windows by using the keyboard on each corner of the screen.", app_name="firefox", execute_json_case=json_case_example) 1124 | assistant(assistant_goal="Open a new tab the song 'Wall Of Eyes - The Smile', from google search results filter by videos then play it on Firefox") 1125 | # This is how you can debug faster a prompt: 1126 | # assistant(assistant_goal="make a comment explaining why it is so important", app_name="firefox") 1127 | # assistant(assistant_goal="open spotify and play the song daft punk one more time", app_name="spotify") 1128 | # assistant(assistant_goal="Play the song \'Weird Fishes - Radiohead\' on Spotify") 1129 | # assistant(assistant_goal="Create a new comment explaining why it is so beautiful and comment it.", app_name="firefox") 1130 | # assistant(assistant_goal="Create a short greet text for the user using AI Automated Windows in notepad") 1131 | # assistant(assistant_goal=f"Open a new tab the song \'Windows 95 but it's a PHAT hip hop beat\' from google search results filter by videos then play it.", app_name="firefox") 1132 | # assistant(f"Send a list of steps to make a chocolate cake to my saved messages in Telegram") 1133 | # assistant(assistant_goal="On firefox play evangelion on netflix", app_name="firefox", execute_json_case=netflix) 1134 | # assistant(assistant_goal="Play Rei Theme on spotify") 1135 | # assistant(assistant_goal="make a hello world post on twitter", app_name="chrome") 1136 | -------------------------------------------------------------------------------- /core/get_all_installed_apps.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/get_all_installed_apps.py -------------------------------------------------------------------------------- /core/history.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/history.db -------------------------------------------------------------------------------- /core/last_app.py: -------------------------------------------------------------------------------- 1 | import win32com.client 2 | import win32gui 3 | import win32con 4 | from datetime import datetime 5 | 6 | 7 | def enumerate_windows(): 8 | windows = [] 9 | top_windows = [] 10 | 11 | def enum_window_callback(hwnd, _): 12 | windows.append((hwnd, win32gui.GetWindowText(hwnd))) 13 | 14 | win32gui.EnumWindows(enum_window_callback, None) 15 | windows.sort(key=lambda x: -x[0]) 16 | for i, (_, title) in enumerate(windows): 17 | if title: 18 | top_windows.append((title, i)) 19 | return dict(top_windows) 20 | 21 | 22 | def should_exclude_process(name): 23 | excluded_processes = ['dwm.exe', 'nvcontainer.exe', 'nvidia broadcast ui.exe', 'system', 'python.exe', 'steam.exe', 24 | 'TextInputHost.exe', 'pycharm64.exe', 'nvidia broadcast.exe', 'widgets.exe', 'amdow.exe', 25 | 'CTkToplevel', 'AI Drone Assistant', 'Ctk', 'Ctk.exe', 'tk', 'tk.exe', 'Code', 'Code.exe', 26 | 'NVIDIA Share.exe', 'NVIDIA Web Helper.exe', 'nvsphelper64.exe', 'NVIDIA GeForce Experience.exe', 27 | 'nvcontainer.exe', 'NVDisplay.Container.exe', 'widgets.exe', 'translucenttb.exe', 'securityhealthsystray.exe'] 28 | return name in excluded_processes 29 | 30 | 31 | def get_opened_programs(): 32 | wmi = win32com.client.GetObject('winmgmts:') 33 | processes = wmi.InstancesOf('Win32_Process') 34 | window_order = enumerate_windows() 35 | process_list = [] 36 | added_titles = set() 37 | 38 | for process in processes: 39 | try: 40 | name = process.Properties_('Name').Value 41 | pid = process.Properties_('ProcessId').Value 42 | creation_date = process.Properties_('CreationDate').Value 43 | creation_datetime = datetime.strptime(creation_date.split('.')[0], '%Y%m%d%H%M%S') 44 | 45 | if should_exclude_process(name): 46 | continue 47 | 48 | for title, order in window_order.items(): 49 | if name[:-4].lower() in title.lower() and title not in added_titles: 50 | process_list.append((name, pid, creation_datetime, title, order)) 51 | added_titles.add(title) 52 | break 53 | 54 | except Exception as e: 55 | print(f"Error getting information for PID {pid}: {e}") 56 | 57 | process_list.sort(key=lambda x: x[4]) 58 | return process_list 59 | 60 | 61 | def format_programs_list(): 62 | programs = get_opened_programs() 63 | output = [] 64 | for proc in programs: 65 | line = f"Name: '{proc[0]}', PID: {proc[1]}, Creation Time: '{proc[2]}', Window Title: '{proc[3]}', Z-order Level: {proc[4]}" 66 | output.append(line) 67 | 68 | if output: 69 | last_focused_window = output[0] # The first app in the list after sorting by Z-order 70 | output.insert(0, f"Last Focused Window: {last_focused_window}\n---\n") 71 | 72 | return "\n".join(output) 73 | 74 | 75 | def get_window_handle(title): 76 | handles = [] 77 | 78 | def enum_window_callback(hwnd, _): 79 | if win32gui.GetWindowText(hwnd) == title: 80 | handles.append(hwnd) 81 | 82 | win32gui.EnumWindows(enum_window_callback, None) 83 | return handles[0] if handles else None 84 | 85 | 86 | def 
set_foreground_window_by_title(title): 87 | hwnd = get_window_handle(title) 88 | if hwnd: 89 | win32gui.ShowWindow(hwnd, win32con.SW_RESTORE) 90 | win32gui.SetForegroundWindow(hwnd) 91 | else: 92 | print(f"Window with title '{title}' not found.") 93 | 94 | 95 | def last_programs_list(focus_last_window=False): 96 | programs = get_opened_programs() 97 | output = [] 98 | for proc in programs: 99 | line = f"Name: '{proc[0]}', PID: {proc[1]}, Creation Time: '{proc[2]}', Window Title: '{proc[3]}', Z-order Level: {proc[4]}" 100 | output.append(line) 101 | 102 | if programs and focus_last_window: 103 | last_focused_window_title = programs[0][3] # Window title of the last focused window 104 | set_foreground_window_by_title(last_focused_window_title) 105 | output.insert(0, f"Last Focused Window: {last_focused_window_title}\n---\n") 106 | 107 | return "\n".join(output) 108 | 109 | 110 | # # Example usage 111 | # result_string = last_programs_list(focus_last_window=True) 112 | # print(result_string) 113 | -------------------------------------------------------------------------------- /core/media/Mouse_pointer_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/Mouse_pointer_small.png -------------------------------------------------------------------------------- /core/media/assistant_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent.png -------------------------------------------------------------------------------- /core/media/assistant_transparent_blink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent_blink.png -------------------------------------------------------------------------------- /core/media/assistant_transparent_dragging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent_dragging.png -------------------------------------------------------------------------------- /core/media/headico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/headico.ico -------------------------------------------------------------------------------- /core/media/headico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/headico.png -------------------------------------------------------------------------------- /core/media/transcribe_audio.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/transcribe_audio.mp3 -------------------------------------------------------------------------------- /core/media/translate_audio.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/translate_audio.mp3 -------------------------------------------------------------------------------- /core/mouse_detection.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import win32api 3 | import win32con 4 | 5 | # Define the CURSORINFO structure 6 | class CURSORINFO(ctypes.Structure): 7 | _fields_ = [("cbSize", ctypes.c_int), 8 | ("flags", ctypes.c_int), 9 | ("hCursor", ctypes.c_void_p), 10 | ("ptScreenPos", ctypes.c_long * 2)] 11 | 12 | def get_cursor_shape(): 13 | cursor_info = CURSORINFO() 14 | cursor_info.cbSize = ctypes.sizeof(CURSORINFO) 15 | ctypes.windll.user32.GetCursorInfo(ctypes.byref(cursor_info)) 16 | 17 | # Load the standard cursors to compare 18 | cursor_arrow = win32api.LoadCursor(0, win32con.IDC_ARROW) 19 | cursor_ibeam = win32api.LoadCursor(0, win32con.IDC_IBEAM) 20 | cursor_hand = win32api.LoadCursor(0, win32con.IDC_HAND) 21 | cursor_wait = win32api.LoadCursor(0, win32con.IDC_WAIT) 22 | cursor_cross = win32api.LoadCursor(0, win32con.IDC_CROSS) 23 | 24 | # Compare the current cursor with the standard cursors 25 | if cursor_info.hCursor == cursor_arrow: 26 | return "Arrow" 27 | elif cursor_info.hCursor == cursor_ibeam: 28 | return "The cursor is active for Text Input (I-beam)" 29 | elif cursor_info.hCursor == cursor_hand: 30 | return "The cursor is 'Hand' (A link is select)" 31 | elif cursor_info.hCursor == cursor_wait: 32 | return "The cursor is 'Wait' (Busy) - Hourglass or Watch" 33 | elif cursor_info.hCursor == cursor_cross: 34 | return "The cursor is 'Cross'" 35 | else: 36 | return "Other" 37 | 38 | # while True: 39 | # cursor_shape = get_cursor_shape() 40 | # print(f"Cursor shape: {cursor_shape}") 41 | # time.sleep(1) 42 | -------------------------------------------------------------------------------- /core/ocr.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import win32gui 3 | import win32process 4 | import psutil 5 | from PIL import ImageGrab 6 | import re 7 | from fuzzywuzzy import fuzz 8 | from concurrent.futures import ThreadPoolExecutor 9 | import math 10 | # Function to preprocess the image for better OCR results 11 | from PIL import Image, ImageOps, ImageFilter, ImageEnhance 12 | import pytesseract 13 | import pygetwindow as gw 14 | 15 | # Path to tesseract executable 16 | pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' 17 | 18 | # Function to calculate similarity using fuzzywuzzy 19 | def calculate_similarity(input_string, match_string): 20 | # Calculate basic similarity score using partial_ratio 21 | basic_similarity = fuzz.partial_ratio(input_string.lower(), match_string.lower()) 22 | 23 | # Adjust score based on the length difference 24 | length_difference = len(input_string) - len(match_string) 25 | 26 | # If the match string is shorter than the input string, reduce the score 27 | if length_difference > 0: 28 | # For example, reduce the score by 5 points for each missing character 29 | score_penalty = 50 * length_difference 30 | adjusted_score = max(basic_similarity - score_penalty, 0) # Ensure the score doesn't go below 0 31 | else: 32 | # If the match string is not shorter, no penalty is applied 33 | adjusted_score = basic_similarity 34 | 35 | return adjusted_score 36 | 37 | # New function for multi-processing 38 | def parallel_ocr(data): 39 | x, y, w, h = data 40 | cropped_image = 
ImageGrab.grab(bbox=(x, y, x + w, y + h)) 41 | processed_images = preprocess_image( 42 | cropped_image, 43 | contrast_levels=[128, 152], 44 | invert=True, 45 | scales=[1, 1.25] 46 | ) 47 | 48 | results = [] 49 | for img in processed_images: 50 | text = pytesseract.image_to_string(img) 51 | if text: 52 | results.append(text) 53 | 54 | results.sort(key=len, reverse=True) 55 | return results[0] if results else "" 56 | 57 | 58 | # Add your WindowClassifier class definition here 59 | def get_focused_window_details(): 60 | try: 61 | # Get the handle of the currently focused window 62 | window_handle = win32gui.GetForegroundWindow() 63 | 64 | # Get window text (title) 65 | window_title = win32gui.GetWindowText(window_handle) 66 | 67 | # Get the process ID of the window 68 | _, window_pid = win32process.GetWindowThreadProcessId(window_handle) 69 | 70 | # Get the process name from the process ID 71 | process = psutil.Process(window_pid) 72 | process_name = process.name() 73 | 74 | # Get window size and position 75 | rect = win32gui.GetWindowRect(window_handle) 76 | window_position = (rect[0], rect[1]) 77 | window_size = (rect[2] - rect[0], rect[3] - rect[1]) 78 | 79 | return window_title, window_handle, window_pid, process_name, window_position, window_size 80 | except Exception as e: 81 | print(f"An error occurred: {e}") 82 | return None, None, None, None, None, None # Return None for all six values in case of an exception, matching the success return 83 | 84 | pass 85 | 86 | def ocr_image(image): 87 | # Apply preprocessing with filters directly in the OCR function 88 | text = ocr_image_with_filters(image) 89 | return text 90 | 91 | def preprocess_image( 92 | image, 93 | grayscale=True, 94 | invert=False, 95 | contrast_levels=None, 96 | scales=None, 97 | use_threshold=False, 98 | gaussian_blur_radius=None, 99 | median_filter_size=None, 100 | bilateral_filter_params=None, 101 | sharpen=False, 102 | edge_enhance=False, 103 | contrast_enhance_factor=1.0 104 | ): 105 | # Initialize default values if none provided 106 | if scales is None: 107 | scales = [1] # Default scale is 100% 108 | if contrast_levels is None: 109 | contrast_levels = [128] # Default threshold for binarization 110 | 111 | processed_images = [] 112 | 113 | for scale in scales: 114 | # Resize image if scale is not 1 115 | if scale != 1: 116 | resized_image = image.resize((int(image.width * scale), int(image.height * scale)), Image.LANCZOS) # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter 117 | else: 118 | resized_image = image 119 | 120 | if grayscale: 121 | # Convert image to grayscale 122 | processed_image = resized_image.convert('L') 123 | else: 124 | processed_image = resized_image 125 | 126 | if invert: 127 | # Invert image colors 128 | processed_image = ImageOps.invert(processed_image) 129 | 130 | if sharpen: 131 | # Apply sharpen filter 132 | processed_image = processed_image.filter(ImageFilter.SHARPEN) 133 | 134 | if edge_enhance: 135 | # Enhance the edges in the image 136 | processed_image = processed_image.filter(ImageFilter.EDGE_ENHANCE) 137 | 138 | if contrast_enhance_factor != 1.0: 139 | # Enhance the contrast of the image 140 | enhancer = ImageEnhance.Contrast(processed_image) 141 | processed_image = enhancer.enhance(contrast_enhance_factor) 142 | 143 | if gaussian_blur_radius: 144 | # Apply Gaussian Blur 145 | processed_image = processed_image.filter(ImageFilter.GaussianBlur(gaussian_blur_radius)) 146 | 147 | if median_filter_size: 148 | # Apply Median Filter 149 | processed_image = processed_image.filter(ImageFilter.MedianFilter(median_filter_size)) 150 | 151 | if
bilateral_filter_params: 152 | # Apply Bilateral Filter (note: Pillow's ImageFilter has no BilateralFilter; this branch needs an external implementation such as OpenCV's cv2.bilateralFilter) 153 | diameter, sigma_color, sigma_space = bilateral_filter_params 154 | processed_image = processed_image.filter(ImageFilter.BilateralFilter(diameter, sigma_color, sigma_space)) 155 | 156 | if use_threshold: 157 | # Apply threshold to binarize the image 158 | thresholded_image = processed_image.point(lambda x: 0 if x < contrast_levels[0] else 128, '1') 159 | processed_images.append(thresholded_image) 160 | else: 161 | # If not using threshold, just append the processed image 162 | processed_images.append(processed_image) 163 | 164 | return processed_images 165 | 166 | def ocr_image_with_filters(image): 167 | # Apply preprocessing with filters 168 | preprocessed_images = preprocess_image( 169 | image, 170 | grayscale=True, 171 | invert=False, 172 | contrast_levels=[128], 173 | scales=[1], 174 | use_threshold=True, 175 | gaussian_blur_radius=None, 176 | median_filter_size=None, 177 | bilateral_filter_params=None, 178 | sharpen=True, 179 | edge_enhance=False, 180 | contrast_enhance_factor=2.0 181 | ) 182 | 183 | # Since preprocess_image returns a list, we take the first (and should be only) image 184 | preprocessed_image = preprocessed_images[0] 185 | 186 | # Display the modified image (debugging only). 187 | # preprocessed_image.show() 188 | 189 | # Perform OCR on the preprocessed image 190 | text = pytesseract.image_to_string(preprocessed_image) 191 | 192 | # print(f"IMAGE PROCESSED:\n{text}") 193 | return text.strip() 194 | 195 | 196 | def click_best_match(best_match): 197 | if best_match: 198 | # Calculate the center of the bounding box 199 | center_x = best_match['x'] + best_match['w'] // 2 200 | center_y = best_match['y'] + best_match['h'] // 2 201 | 202 | # Perform the click action using pyautogui 203 | pyautogui.moveTo(center_x, center_y, 0.5, pyautogui.easeOutQuad) 204 | pyautogui.click(center_x, center_y) 205 | return f"Clicked on the best match: '{best_match['text']}' at position: ({center_x}, {center_y})" 206 | else: 207 | return "No suitable matches to click on the screen." 208 | 209 | 210 | # Function to compute the distance between two points for proximity scoring. 211 | def distance_between(p1, p2): 212 | return math.sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) 213 | # Enhance this function to score matches not only based on OCR confidence and text similarity but also their proximity. 214 | def score_and_rank_matches(matches): 215 | for i, match in enumerate(matches): 216 | for other_match in matches[i+1:]: 217 | proximity = 1000 / (distance_between(match['center'], other_match['center']) + 1) # Simple proximity score (matches store their midpoint under 'center') 218 | match['score'] += proximity 219 | other_match['score'] += proximity 220 | matches.sort(key=lambda x: x['score'], reverse=True) # Sort matches based on the score 221 | return matches 222 | 223 | 224 | # Update click_best_match to intelligently click between close matches if found 225 | def click_best_matches(coincidences): 226 | if not coincidences: 227 | return "No suitable matches to click on the screen." 228 | # Filter out negative-scored matches 229 | positive_score_matches = [match for match in coincidences if match['score'] > 0] 230 | 231 | if not positive_score_matches: 232 | return "No matches with a positive score to click on the screen."
233 | 234 | # Calculate the average position (centroid) in case of close matches 235 | if len(positive_score_matches) > 1: 236 | average_x = sum(match['center'][0] for match in positive_score_matches) // len(positive_score_matches) 237 | average_y = sum(match['center'][1] for match in positive_score_matches) // len(positive_score_matches) 238 | pyautogui.click(average_x, average_y) 239 | return f"Clicked on the average position: ({average_x}, {average_y})" 240 | else: 241 | best_match = positive_score_matches[0] 242 | center_x = best_match['center'][0] 243 | center_y = best_match['center'][1] 244 | 245 | pyautogui.click(center_x, center_y) 246 | return f"Clicked on the best match: '{best_match['text']}' at position: ({center_x}, {center_y})" 247 | 248 | # Function to execute best_match_with_proximity in parallel and find the most probable click position 249 | def find_best_match_with_proximity(input_string, within_window=False): 250 | if within_window: 251 | _, _, _, _, window_position, window_size = get_focused_window_details() 252 | screenshot = ImageGrab.grab(bbox=( 253 | window_position[0], window_position[1], 254 | window_position[0] + window_size[0], 255 | window_position[1] + window_size[1])) 256 | else: 257 | screenshot = ImageGrab.grab() 258 | d = pytesseract.image_to_data(screenshot, output_type=pytesseract.Output.DICT) 259 | input_string = input_string.lower() 260 | input_parts = re.findall(r'\w+|\W+', input_string) # Split input_string into words and non-alphabetic parts 261 | 262 | coincidences = [] # List to store all matches 263 | 264 | 265 | for i in range(len(d['text'])): 266 | extracted_text = d['text'][i].lower().strip() 267 | 268 | 269 | # Skip single and double letters or empty strings 270 | if len(extracted_text) <= 2 or not extracted_text: 271 | continue 272 | 273 | score = int(d['conf'][i]) # Initialize score with OCR confidence 274 | for part in input_parts: 275 | if part in extracted_text: 276 | score += 50 if part.isalpha() else 75 # Higher score for non-alphabetic parts 277 | 278 | 279 | 280 | 281 | # Check for literal exact match and score it more points 282 | if extracted_text == input_string: 283 | score += 500 # Assign significant points for a literal exact match 284 | 285 | # Use fuzzywuzzy to measure similarity and adjust the score 286 | similarity_score = fuzz.partial_ratio(input_string, extracted_text) 287 | if similarity_score > 60: 288 | score += similarity_score # Add similarity score to the existing score 289 | 290 | # Penalize score if similarity is low 291 | if similarity_score < 80: 292 | score -= 200 # Deduct points if there's low similarity 293 | # Record this OCR hit with its bounding box and score 294 | coincidence = { 295 | 'text': d['text'][i], 296 | 'x': d['left'][i], 297 | 'y': d['top'][i], 298 | 'w': d['width'][i], 299 | 'h': d['height'][i], 300 | 'conf': d['conf'][i], 301 | 'score': score, 302 | # Add the 'center' key to store the center coordinates of the match 303 | 'center': (d['left'][i] + d['width'][i] // 2, d['top'][i] + d['height'][i] // 2) 304 | } 305 | coincidences.append(coincidence) 306 | 307 | # Now we have all coincidences with scores reflecting exactness and similarity 308 | # Higher score is better 309 | 310 | if coincidences: 311 | # Take the match with the highest score 312 | best_match = max(coincidences, key=lambda x: x['score']) 313 | ################################################# 314 | # print(f"Best match: '{best_match['text']}' with score {best_match['score']}") 315 | return best_match 316 | else: 317 | print("No matches found.") 318 | return
None 319 | 320 | # Adjusted ocr_focused_window function 321 | def ocr_focused_window(): 322 | # Get details of the focused window 323 | _, _, _, _, window_position, window_size = get_focused_window_details() 324 | # Capture only the area of the focused window 325 | screenshot = ImageGrab.grab(bbox=( 326 | window_position[0], window_position[1], window_position[0] + window_size[0], window_position[1] + window_size[1])) 327 | # Perform OCR with preprocessing and filtering 328 | text = ocr_image_with_filters(screenshot) 329 | return text 330 | 331 | 332 | # Adjusted ocr_screen function 333 | def ocr_screen(focused=False): 334 | if focused: 335 | # Get the focused window 336 | window = gw.getActiveWindow() 337 | if window is not None: 338 | # Get the position of the focused window 339 | x, y, width, height = window.left, window.top, window.width, window.height 340 | # Capture the focused window 341 | screenshot = pyautogui.screenshot(region=(x, y, width, height)) 342 | else: 343 | # Fallback in case no window is focused 344 | screenshot = ImageGrab.grab() 345 | else: 346 | # Get a screenshot of the entire screen 347 | screenshot = ImageGrab.grab() 348 | 349 | # Perform OCR with preprocessing and filtering 350 | text = ocr_image_with_filters(screenshot) 351 | return text 352 | 353 | 354 | def find_probable_click_position(input_string, attempts=30): 355 | print(f"Finding the most probable click position for \"{input_string}\"...") 356 | with ThreadPoolExecutor(max_workers=attempts) as executor: 357 | print(f"Running {attempts} attempts in parallel...") 358 | # Run the function multiple times in parallel 359 | futures = [executor.submit(find_best_match_with_proximity, input_string) for _ in range(attempts)] 360 | print(f"Waiting for {attempts} parallel attempts to finish...") 361 | 362 | # Collect results, filtering out those with a non-positive score 363 | results = [future.result() for future in futures if future.result() is not None and future.result()['score'] > 0] 364 | print(f"Found {len(results)} matches with a positive score.") 365 | print("Selecting the highest-scoring match...") 366 | 367 | # Find the most probable best match based on the score 368 | if results: 369 | most_probable_match = max(results, key=lambda match: match['score']) 370 | return most_probable_match 371 | return None 372 | 373 | # Main execution block 374 | if __name__ == "__main__": 375 | input_string = "Neon Genesis Evangelion" # Example input string 376 | most_probable_match = find_probable_click_position(input_string) 377 | 378 | # Provide feedback and click action based on most probable match 379 | if most_probable_match: 380 | click_result = click_best_matches([most_probable_match]) 381 | print(f"Most probable match \"{most_probable_match['text']}\" located at \"x={most_probable_match['center'][0]}, y={most_probable_match['center'][1]}\" with score {most_probable_match['score']}") 382 | 383 | else: 384 | print("No suitable matches found on screen for the input string.") -------------------------------------------------------------------------------- /core/topmost_window.py: -------------------------------------------------------------------------------- 1 | import win32com.client 2 | import win32gui 3 | import win32con 4 | import win32process 5 | from datetime import datetime 6 | 7 | def enumerate_windows(): 8 | windows = [] 9 | 10 | def enum_window_callback(hwnd, _): 11 | if
win32gui.IsWindowVisible(hwnd) and win32gui.GetWindowText(hwnd): 12 | windows.append(hwnd) 13 | 14 | win32gui.EnumWindows(enum_window_callback, None) 15 | return windows 16 | 17 | def should_exclude_process(name): 18 | excluded_processes = ['dwm.exe', 'nvcontainer.exe', 'nvidia broadcast ui.exe', 'system', 'python.exe', 'steam.exe', 19 | 'TextInputHost.exe', 'tk', 'pycharm64.exe', 'nvidia broadcast.exe', 'widgets.exe', 20 | 'CTkToplevel', 'Windows Input Experience', 'widgets.exe', 'translucenttb.exe', 'amdow.exe', 21 | 'securityhealthsystray.exe', 'Ctk', 'Ctk.exe', 'tk', 'tk.exe', 'Code', 'Code.exe', 'NVIDIA Share.exe', 22 | 'NVIDIA Web Helper.exe', 'nvsphelper64.exe', 'NVIDIA GeForce Experience.exe', 23 | 'nvcontainer.exe', 'NVDisplay.Container.exe'] 24 | return name.lower() in excluded_processes 25 | 26 | def get_process_name(hwnd): 27 | _, pid = win32process.GetWindowThreadProcessId(hwnd) 28 | wmi = win32com.client.GetObject('winmgmts:') 29 | process = wmi.ExecQuery(f'SELECT Name FROM Win32_Process WHERE ProcessId = {pid}') 30 | if process: 31 | return process[0].Name 32 | return None 33 | 34 | def get_topmost_window(): 35 | for hwnd in enumerate_windows(): 36 | process_name = get_process_name(hwnd) 37 | title = win32gui.GetWindowText(hwnd) 38 | # print(f"Debug: Window Title: '{title}', Process: '{process_name}'") # Debugging line 39 | if process_name and not should_exclude_process(process_name): 40 | return title, process_name 41 | return None, None 42 | 43 | def get_window_handle(title): 44 | handles = [] 45 | 46 | def enum_window_callback(hwnd, _): 47 | if win32gui.GetWindowText(hwnd) == title: 48 | handles.append(hwnd) 49 | 50 | win32gui.EnumWindows(enum_window_callback, None) 51 | return handles[0] if handles else None 52 | 53 | def set_foreground_window_by_title(title): 54 | hwnd = get_window_handle(title) 55 | if hwnd: 56 | win32gui.ShowWindow(hwnd, win32con.SW_RESTORE) 57 | try: 58 | win32gui.SetForegroundWindow(hwnd) 59 | except Exception as e: 60 | print(f"Error setting foreground window: {e}") 61 | else: 62 | print(f"Window with title '{title}' not found.") 63 | 64 | def focus_topmost_window(): 65 | topmost_window_title, _ = get_topmost_window() # We're not using process_name here 66 | if topmost_window_title: 67 | print(f"Selected application: {topmost_window_title}") 68 | set_foreground_window_by_title(topmost_window_title) 69 | return topmost_window_title 70 | else: 71 | print("No suitable windows found.") 72 | 73 | # Example usage 74 | # focus_topmost_window() 75 | -------------------------------------------------------------------------------- /core/ui_window_analyzer.py: -------------------------------------------------------------------------------- 1 | import uiautomation as auto 2 | 3 | #not used 4 | 5 | def walk_control(control, indent=0, control_type=None): 6 | if control is None: 7 | return 8 | try: 9 | if (control_type is None) or (control.ControlType == auto.ControlType[control_type]): 10 | # Print control info with indentation 11 | print(' ' * indent + str(control)) 12 | # ... print additional properties if needed ... 
13 | except Exception as e: 14 | print(' ' * (indent + 2) + 'Error getting properties: ' + str(e)) 15 | # Recursively walk the tree 16 | for child in control.GetChildren(): 17 | walk_control(child, indent + 4, control_type=control_type) 18 | 19 | 20 | def analyze_app(application_name=None, control_type=None): 21 | if application_name: 22 | # Find the application window by name 23 | control = auto.WindowControl(searchDepth=1, Name=application_name) 24 | if not control.Exists(0, 0): 25 | print(f'Application "{application_name}" is not running or window not found.') 26 | return 27 | print(f'Inspecting UI elements for application "{application_name}":') 28 | else: 29 | control = auto.GetRootControl() 30 | print('Inspecting UI elements for the entire desktop:') 31 | 32 | # If a specific control type is given, filter by that control type 33 | if control_type and control: 34 | # Walking the control tree and checking for the control type in the walk_control function 35 | walk_control(control) 36 | else: 37 | # Walk the entire UI tree from the control 38 | walk_control(control) 39 | 40 | if __name__ == '__main__': 41 | analyze_app(application_name='Untitled - Paint', control_type='Edit') 42 | -------------------------------------------------------------------------------- /core/voice.py: -------------------------------------------------------------------------------- 1 | from gtts import gTTS 2 | import tempfile 3 | import threading 4 | import os 5 | os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide" # Hide pygame's welcome message, remind me to remove this later, they deserve recognize, thank you for the fast tts 6 | import pygame 7 | import tkinter as tk 8 | 9 | # Initialize Pygame's mixer 10 | pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=4096) 11 | volume = 0.25 12 | subtitles = True 13 | 14 | class TransparentSubtitlesWindow: 15 | def __init__(self, text): 16 | self.root = tk.Tk() 17 | self.text = text 18 | self.label = tk.Label(self.root, text=self.text, font=('Helvetica', 16), fg='white', bg='black') 19 | self.label.pack() 20 | 21 | # Set the window to be always on top, transparent, and without decorations 22 | self.root.overrideredirect(True) 23 | self.root.attributes('-topmost', True) 24 | self.root.attributes('-transparentcolor', 'black') 25 | 26 | # Set window position 27 | self.root.geometry('+%d+%d' % (self.root.winfo_screenwidth() // 2 - self.label.winfo_reqwidth() // 2, 28 | self.root.winfo_screenheight() - 100)) 29 | self.update() 30 | 31 | def update(self): 32 | self.label.configure(text=self.text) 33 | self.root.update_idletasks() 34 | self.root.update() 35 | 36 | def change_text(self, new_text, duration): 37 | self.text = new_text 38 | self.update() 39 | 40 | # Schedule removing the text after the duration 41 | self.root.after(duration, lambda: self.label.configure(text="")) 42 | 43 | def close(self): 44 | self.root.quit() # changed from destroy() to quit() 45 | 46 | 47 | def calculate_duration_of_speech(text, lang='en', wpm=150): 48 | """Estimate the duration the subtitles should be displayed based on words per minute (WPM)""" 49 | words = text.split() 50 | word_count = len(words) 51 | duration_in_seconds = (word_count / wpm) * 60 52 | return int(duration_in_seconds * 1000) # Convert to milliseconds for tkinter's after method 53 | 54 | 55 | def play_audio(file_path, text, lang='en'): 56 | # Estimate the duration the subtitles should be shown 57 | duration = calculate_duration_of_speech(text, lang) 58 | 59 | # Load and play audio file 60 | 
pygame.mixer.music.load(file_path) 61 | pygame.mixer.music.set_volume(volume) 62 | pygame.mixer.music.play() 63 | 64 | # When the audio finishes, stop the mixer and remove the temporary file 65 | while pygame.mixer.music.get_busy(): 66 | pygame.time.Clock().tick(10) 67 | pygame.mixer.music.unload() 68 | os.remove(file_path) 69 | 70 | 71 | def set_volume(volume_level): 72 | global volume 73 | volume = volume_level 74 | pygame.mixer.music.set_volume(volume) 75 | 76 | def set_subtitles(subtitles_bool): 77 | global subtitles 78 | subtitles = subtitles_bool 79 | 80 | 81 | def speaker(text, lang='en'): 82 | # Initialize all of pygame's modules 83 | pygame.init() 84 | 85 | # Temporary mp3 file creation 86 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp: 87 | tts = gTTS(text=text, lang=lang) 88 | tts.save(fp.name) 89 | temp_file_path = fp.name 90 | 91 | # Start the subtitles thread 92 | if subtitles is True: 93 | def setup_subtitles(): 94 | window = TransparentSubtitlesWindow(text) 95 | window.change_text(text, calculate_duration_of_speech(text, lang)) 96 | window.root.mainloop() 97 | 98 | subtitles_thread = threading.Thread(target=setup_subtitles) 99 | subtitles_thread.daemon = True # Now the thread will close when the main program exits 100 | subtitles_thread.start() 101 | else: 102 | subtitles_thread = None 103 | 104 | # Start the audio thread 105 | audio_thread = threading.Thread(target=play_audio, args=(temp_file_path, text, lang)) 106 | audio_thread.daemon = True 107 | audio_thread.start() 108 | 109 | # Return the threads in case the caller wants to track them 110 | return audio_thread, subtitles_thread 111 | 112 | 113 | if __name__ == '__main__': 114 | text_to_speak = "Hello, this is a test." 115 | speaker(text_to_speak) 116 | # Main script can do other tasks here, threads will not prevent script from exiting -------------------------------------------------------------------------------- /core/window_elements.py: -------------------------------------------------------------------------------- 1 | import uiautomation as auto 2 | 3 | 4 | def walk_control(control, indent=0, control_type=None, search_strings=None): 5 | matched = [] 6 | unmatched = [] 7 | if control is None: 8 | return matched, unmatched 9 | if control_type is None or control.ControlType == control_type: 10 | try: 11 | rect = control.BoundingRectangle 12 | area = (rect.right - rect.left) * (rect.bottom - rect.top) 13 | control_tuple = (control, area) 14 | if search_strings and any(s.lower() in control.Name.lower() for s in search_strings): 15 | matched.append(control_tuple) 16 | else: 17 | unmatched.append(control_tuple) 18 | except Exception as e: 19 | print(f"{' ' * indent}Error getting properties: {e}") 20 | 21 | for child in control.GetChildren(): 22 | child_matched, child_unmatched = walk_control(child, indent + 2, control_type=control_type, search_strings=search_strings) 23 | matched.extend(child_matched) 24 | unmatched.extend(child_unmatched) 25 | return matched, unmatched 26 | 27 | 28 | def sort_and_categorize_rects(controls_with_rects, size_category_to_print=None): 29 | sorted_by_area = sorted(controls_with_rects, key=lambda x: x[1], reverse=True) 30 | categorized = {'Bigger': [], 'Medium': [], 'Small': []} 31 | 32 | for control, area in sorted_by_area: 33 | if area >= 1000000: 34 | categorized['Bigger'].append(control) 35 | elif area >= 100000: 36 | categorized['Medium'].append(control) 37 | else: 38 | categorized['Small'].append(control) 39 | 40 | output = [] 41 | for category, controls in 
categorized.items(): 42 | output.append(f"{category} elements:") 43 | for control in controls[:150]: 44 | output.append(f"{control}") 45 | output.append("") # For an empty line between categories 46 | 47 | return "\n".join(output).strip() 48 | 49 | 50 | def analyze_app(application_name_contains=None, size_category=None, additional_search_options=None): 51 | root = auto.GetRootControl() 52 | 53 | control = None 54 | if application_name_contains: 55 | for win in root.GetChildren(): 56 | if application_name_contains.lower() in win.Name.lower(): 57 | control = win 58 | break 59 | if not control: 60 | return f'Window containing "{application_name_contains}" not found.' 61 | else: 62 | control = root 63 | 64 | if not control.Exists(0, 0): 65 | return f'Application with title containing "{application_name_contains}" is not running or window not found.' 66 | 67 | search_strings = additional_search_options.lower().split(',') if additional_search_options else [] 68 | search_strings = [s.strip() for s in search_strings if s.strip()] 69 | 70 | matched_controls_with_rects, unmatched_controls_with_rects = walk_control(control, control_type=None, search_strings=search_strings) 71 | 72 | output = "Matched controls:\n" 73 | output += sort_and_categorize_rects(matched_controls_with_rects, size_category_to_print=size_category) 74 | output += "\nUnmatched controls:\n" 75 | output += sort_and_categorize_rects(unmatched_controls_with_rects, size_category_to_print=size_category) 76 | return output 77 | 78 | 79 | # Usage example 80 | if __name__ == '__main__': 81 | search_options = "contenteditable" 82 | search_terms = search_options.replace('', '').strip() 83 | print(search_terms) 84 | print(analyze_app(application_name_contains='Firefox', additional_search_options=search_terms)) -------------------------------------------------------------------------------- /core/window_focus.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import ctypes 4 | import sys 5 | import time 6 | import winreg 7 | from fuzzywuzzy import fuzz 8 | import pygetwindow as gw 9 | import uiautomation as auto 10 | import win32gui 11 | import win32process 12 | import psutil 13 | import winreg 14 | 15 | # Define necessary functions from the user32 DLL 16 | user32 = ctypes.WinDLL('user32', use_last_error=True) 17 | EnumWindows = user32.EnumWindows 18 | GetForegroundWindow = user32.GetForegroundWindow 19 | EnumWindowsProc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) 20 | GetWindowThreadProcessId = user32.GetWindowThreadProcessId 21 | GetWindowTextLength = user32.GetWindowTextLengthW 22 | GetWindowText = user32.GetWindowTextW 23 | IsWindowVisible = user32.IsWindowVisible 24 | SetForegroundWindow = user32.SetForegroundWindow 25 | IsIconic = user32.IsIconic 26 | ShowWindow = user32.ShowWindow 27 | 28 | # Constants for ShowWindow function 29 | SW_RESTORE = 9 30 | SW_SHOW = 5 31 | 32 | def get_installed_apps_registry(): 33 | installed_apps = [] 34 | reg_paths = [ 35 | r'SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall', 36 | r'SOFTWARE\WOW6432Node\Microsoft\Windows\CurrentVersion\Uninstall' 37 | ] 38 | for reg_path in reg_paths: 39 | with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as hkey: 40 | with winreg.OpenKey(hkey, reg_path, 0, winreg.KEY_READ) as sub_key: 41 | subkey_count, _, _ = winreg.QueryInfoKey(sub_key) 42 | for i in range(subkey_count): 43 | try: 44 | subkey_name = winreg.EnumKey(sub_key, i) 45 | with 
46 |                             app_name, _ = winreg.QueryValueEx(app_key, 'DisplayName')
47 |                             installed_apps.append(app_name)
48 |                     except EnvironmentError:
49 |                         continue
50 |     return installed_apps
51 | 
52 | 
53 | def get_open_windows():
54 |     excluded_titles = ["AI Drone Assistant", "NVIDIA GeForce Overlay", "Windows Input Experience", "Program Manager"]
55 |     excluded_executables = ["NVIDIA Share.exe", "TextInputHost.exe", "Tk.exe", "conhost.exe", "explorer.exe",
56 |                             "CTkToplevel", 'Windows Input Experience', "SecurityHealthSystray.exe", "Steam.exe",
57 |                             "SearchApp.exe", "ApplicationFrameHost.exe", "ShellExperienceHost.exe", "MicrosoftEdge.exe",
58 |                             "MicrosoftEdgeCP.exe", "MicrosoftEdgeSH.exe", "python.exe", "pycharm64.exe",
59 |                             "Ctk", "Ctk.exe", "tk", "tk.exe", "Code", "Code.exe", "amdow.exe",
60 |                             "nvidia broadcast.exe", "nvidia broadcast ui.exe",
61 |                             "NVIDIA Web Helper.exe", "nvsphelper64.exe", "NVIDIA GeForce Experience.exe",
62 |                             "nvcontainer.exe", "NVDisplay.Container.exe"]
63 | 
64 |     windows = gw.getAllWindows()
65 |     print(windows)
66 |     open_windows_info = []
67 |     for w in windows:
68 |         if (w.visible and not w.isMinimized and w.title and w.height > 100 and w.width > 100
69 |                 and w.title not in excluded_titles):
70 |             hwnd = w._hWnd
71 |             _, pid = win32process.GetWindowThreadProcessId(hwnd)
72 |             process = psutil.Process(pid)
73 |             executable_name = process.name()
74 |             if executable_name not in excluded_executables:
75 |                 rect = win32gui.GetWindowRect(hwnd)
76 |                 position = (rect[0], rect[1])
77 |                 size = (rect[2] - rect[0], rect[3] - rect[1])
78 |                 open_windows_info.append((w.title, position, size, executable_name, w))
79 |     # Sort the windows by their vertical (y) position on screen, bottom-most first
80 |     open_windows_info.sort(key=lambda x: x[1][1], reverse=True)
81 |     return [info[:-1] for info in open_windows_info]  # Exclude the window object from the returned info
82 | 
83 | 
84 | def get_window_text(hwnd):
85 |     length = GetWindowTextLength(hwnd) + 1
86 |     buffer = ctypes.create_unicode_buffer(length)
87 |     GetWindowText(hwnd, buffer, length)
88 |     return buffer.value
89 | 
90 | 
91 | def get_active_window_title():
92 |     time.sleep(1)  # Wait for the window to become active. TODO: replace this fixed delay with a real readiness check
93 |     hwnd = GetForegroundWindow()
94 |     return get_window_text(hwnd)
95 | 
96 | 
97 | def enum_windows_proc(hwnd, lParam):
98 |     if IsWindowVisible(hwnd):
99 |         title = get_window_text(hwnd)
100 |         if title:
101 |             hwnd_list.append((hwnd, title))
102 |     return True
103 | 
104 | 
105 | def open_windows_info():
106 |     global hwnd_list
107 |     hwnd_list = []
108 |     EnumWindows(EnumWindowsProc(enum_windows_proc), 0)
109 |     return hwnd_list
110 | 
111 | 
112 | def find_window(partial_title):
113 |     windows = open_windows_info()
114 |     for hwnd, title in windows:
115 |         if partial_title.lower() in title.lower():
116 |             return hwnd
117 |     return None
118 | 
119 | 
120 | def find_window_by_title(partial_title):
121 |     windows = open_windows_info()
122 |     for hwnd, title in windows:
123 |         if partial_title.lower() in title.lower():
124 |             return hwnd, title
125 |     return None, None
126 | 
127 | 
128 | def bring_to_foreground(hwnd):
129 |     if IsIconic(hwnd):
130 |         ShowWindow(hwnd, SW_RESTORE)
131 |     else:
132 |         ShowWindow(hwnd, SW_SHOW)
133 |     SetForegroundWindow(hwnd)
134 | 
135 | 
136 | def search_registry_for_application(app_name):
137 |     sub_keys = [
138 |         r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall",
139 |         r"SOFTWARE\WOW6432Node\Microsoft\Windows\CurrentVersion\Uninstall",
140 |     ]
141 |     registry_hives = [winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER]
142 | 
143 |     for hive in registry_hives:
144 |         for sub_key in sub_keys:
145 |             try:
146 |                 with winreg.OpenKey(hive, sub_key) as key:
147 |                     for i in range(0, winreg.QueryInfoKey(key)[0]):
148 |                         skey_name = winreg.EnumKey(key, i)
149 |                         skey = winreg.OpenKey(key, skey_name)
150 |                         try:
151 |                             display_name = winreg.QueryValueEx(skey, 'DisplayName')[0]
152 |                             if app_name.lower() in display_name.lower():
153 |                                 # Look for the executable in a 'DisplayIcon' field
154 |                                 try:
155 |                                     executable_path = winreg.QueryValueEx(skey, 'DisplayIcon')[0]
156 |                                     # In case the path points to an icon, it usually contains a comma
157 |                                     # followed by an icon index, e.g. "C:\Path\To\App.exe,0"
158 |                                     if ',' in executable_path:
159 |                                         executable_path = executable_path.split(',')[0]
160 |                                     return executable_path
161 |                                 except OSError:
162 |                                     pass
163 | 
164 |                                 # If not found, fall back to 'UninstallString' as a last resort
165 |                                 uninstall_string = winreg.QueryValueEx(skey, 'UninstallString')[0]
166 |                                 # Here you would need to intelligently extract the executable path.
167 |                                 # This might involve more complex logic and is not guaranteed to work
168 |                                 # for all applications, as uninstall strings can vary significantly.
169 |                                 # This is a starting point that might work for some applications:
170 |                                 # uninstall_string = uninstall_string.split('"')[1] if '"' in uninstall_string else uninstall_string
171 |                                 # return uninstall_string if os.path.isfile(uninstall_string) else None
172 |                         except OSError:
173 |                             pass
174 |                         finally:
175 |                             skey.Close()
176 |             except OSError:
177 |                 pass
178 |     return None
179 | 
180 | def find_best_match_window(partial_title, threshold=50):
181 |     windows = open_windows_info()
182 |     best_match = None
183 |     highest_score = 0
184 |     for hwnd, title in windows:
185 |         score = fuzz.token_sort_ratio(partial_title.lower(), title.lower())
186 |         if score > highest_score and score >= threshold:
187 |             best_match = (hwnd, title)
188 |             highest_score = score
189 |     return best_match
190 | 
191 | 
192 | # def activate_windowt_title(application_name):
193 | #     print(f"Activating window for {application_name}")
194 | #     if application_name.lower() == "cmd":
195 | #         # If we know it's cmd, we can try activating an existing window or start a new one directly
196 | #         hwnd, window_title = find_window_by_title("cmd")
197 | #         if hwnd:
198 | #             # If we found a window, bring it to the foreground
199 | #             bring_to_foreground(hwnd)
200 | #         else:
201 | #             os.startfile('cmd.exe')
202 | #         return get_active_window_title()
203 | #     app_path = None
204 | #
205 | #     # Attempt to find the application path for each word in application_name
206 | #
207 | #     process = subprocess.run(['where', application_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
208 | #                              shell=True)
209 | #     output = process.stdout.strip().split('\n')
210 | #     app_path = output[0] if output else None
211 | #     print(f"Application path: {app_path}")
212 | #     # If the application path wasn't found, search in the registry for each word
213 | #     if not app_path:
214 | #         print(f"Searching in registry for application path for '{application_name}'...")
215 | #         app_path = search_registry_for_application(application_name)
216 | #
217 | #     hwnd, window_title = None, None
218 | #     hwnd, window_title = find_window_by_title(application_name)
219 | #     if window_title:
220 | #         bring_to_foreground(hwnd)
221 | #     elif app_path:
222 | #         print("Application found but no window open. Starting the application...")
223 | #         print(f"Application path: {app_path}")
224 | #         subprocess.Popen(app_path)  # Open the application if it is not running
225 | #     else:
226 | #         print(f"{application_name} could not be found nor is open. Please ensure it is installed and accessible via system PATH.")
227 | #     return get_active_window_title()
228 | 
229 | def activate_windowt_title(application_name):
230 |     if application_name.lower() == "cmd":
231 |         # If we know it's cmd, we can try activating an existing window or start a new one directly
232 |         hwnd, window_title = find_window_by_title("cmd")
233 |         if hwnd:
234 |             # If we found a window, bring it to the foreground
235 |             bring_to_foreground(hwnd)
236 |         else:
237 |             os.startfile('cmd.exe')
238 |         return get_active_window_title()
239 | 
240 |     app_path = None
241 |     words = application_name.split()
242 | 
243 |     # Attempt to find the application path for each word in application_name
244 |     for word in words:
245 |         try:
246 |             process = subprocess.run(['where', word], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
247 |                                      shell=True)
248 |             output = process.stdout.strip().split('\n')
249 |             if output and output[0]:
250 |                 app_path = output[0]
251 |                 break  # Once we have found a path, we can break the loop
252 |         except Exception as e:
253 |             print(f"ERROR: Error finding application path for '{word}': {e}")
254 | 
255 |     # If the application path wasn't found, search in the registry for each word
256 |     if not app_path:
257 |         for word in words:
258 |             app_path = search_registry_for_application(word)
259 |             if app_path:
260 |                 break  # Once we have found a path, we can break the loop
261 | 
262 |     hwnd, window_title = None, None
263 |     # Attempt to find the window with a partial match for any of the words
264 |     for word in words:
265 |         hwnd, window_title = find_window_by_title(word)
266 |         if hwnd:
267 |             break  # Once we have found a window, we can break the loop
268 | 
269 |     if hwnd:
270 |         # If we found a window, bring it to the foreground
271 |         bring_to_foreground(hwnd)
272 |     elif app_path:
273 |         try:
274 |             subprocess.Popen(app_path)  # Open the application if it is not running
275 |         except Exception as e:
276 |             print(f"ERROR: Error opening application '{app_path}': {e}")
277 |     else:
278 |         print(
279 |             f"{application_name} could not be found, nor is it open. Please ensure it is installed and accessible via the system PATH.")
280 | 
281 |     return get_active_window_title()
282 | 
283 | 
284 | 
285 | if __name__ == "__main__":
286 |     # active_title = activate_windowt_title("chrome")
287 |     active_title = activate_windowt_title("Google Chrome")
288 |     print(f"Active window title: {active_title}")
289 | 
290 | 
--------------------------------------------------------------------------------
/core/window_mgmt.py:
--------------------------------------------------------------------------------
1 | from openai import OpenAI
2 | from window_focus import get_open_windows  # assumes the core modules are importable from the same directory
3 | class WindowClassifier:
4 |     def __init__(self):
5 |         self.api_key = 'insert_your_api_key_here'
6 |         self.client = OpenAI(api_key=self.api_key)
7 |         self.model_name = 'gpt-3.5-turbo'
8 | 
9 |     def _get_response(self, messages, max_tokens=50):
10 |         try:
11 |             response = self.client.chat.completions.create(
12 |                 model=self.model_name,
13 |                 messages=messages,
14 |                 max_tokens=max_tokens
15 |             )
16 |             if response.choices and hasattr(response.choices[0], 'message'):
17 |                 decision_message = response.choices[0].message
18 |                 if hasattr(decision_message, 'content'):
19 |                     return decision_message.content.strip()
20 |             return None
21 |         except Exception as e:
22 |             print(f"An error occurred: {e}")
23 |             return None
24 | 
25 |     def get_window_classification(self, title):
26 |         messages = [{"role": "system", "content": "You are a helpful assistant."},
27 |                     {"role": "user", "content": f"Classify this window title into a category: {title}"}]
28 |         return self._get_response(messages)
29 | 
30 |     def complete_text(self, goal):
31 |         messages = [{"role": "system", "content": "You are a helpful assistant."},
32 |                     {"role": "user", "content": f"Only return the user's message of the goal: {goal}"}]
33 |         return self._get_response(messages)
34 | 
35 |     def get_window_info(self, window_title):
36 |         open_windows = get_open_windows()
37 |         for window_info in open_windows:
38 |             if window_title.lower() in window_info[0].lower():
39 |                 return window_info
40 |         return None
41 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | pywinauto
3 | pyautogui
4 | pygetwindow
5 | customtkinter
6 | openai
7 | Pillow  # This is the PIL (Python Imaging Library) package; it also provides ImageGrab
8 | SpeechRecognition
9 | requests
10 | pywin32
11 | psutil
12 | # ImageGrab is part of Pillow and is not a separate PyPI package
13 | fuzzywuzzy
14 | pytesseract
15 | uiautomation
16 | gTTS
17 | pygame
18 | PyAudio
19 | 
--------------------------------------------------------------------------------
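
The modules above are meant to be combined by a higher-level driver. As a rough illustrative sketch only (it assumes the files are run from inside the core/ directory so they import as plain modules, and it assumes the text-to-speech helper shown earlier is importable as a module named "voice"), a small script tying them together might look like this:

# example_usage.py -- illustrative sketch, not part of the repository.
# Function names are taken from the files above; the import paths assume a flat layout inside core/.
from window_focus import activate_windowt_title, get_open_windows
from window_elements import analyze_app
from voice import speaker  # assumption: the speech helper shown earlier lives in a module named "voice"

# Bring an existing Notepad window to the foreground (or launch it) and report the active title.
title = activate_windowt_title("notepad")
print(f"Active window: {title}")

# List the remaining visible top-level windows as (title, position, size, executable) tuples.
for window_title, position, size, executable in get_open_windows():
    print(window_title, position, size, executable)

# Dump the UI Automation elements of the target application, filtered by a search term.
print(analyze_app(application_name_contains="Notepad", additional_search_options="edit"))

# Speak a short confirmation; speaker() returns daemon threads, so join the audio thread
# to keep the process alive until playback finishes.
audio_thread, _ = speaker("Window analysis complete.")
audio_thread.join()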