├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── ATTRIBUTIONS ├── LICENSE ├── README.md ├── core ├── assistant.py ├── core_api.py ├── core_imaging.py ├── driver.py ├── get_all_installed_apps.py ├── history.db ├── last_app.py ├── media │ ├── Mouse_pointer_small.png │ ├── assistant_transparent.png │ ├── assistant_transparent_blink.png │ ├── assistant_transparent_dragging.png │ ├── headico.ico │ ├── headico.png │ ├── transcribe_audio.mp3 │ └── translate_audio.mp3 ├── mouse_detection.py ├── ocr.py ├── topmost_window.py ├── ui_window_analyzer.py ├── voice.py ├── window_elements.py ├── window_focus.py └── window_mgmt.py └── requirements.txt /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/github/gitignore/blob/main/Python.gitignore 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ -------------------------------------------------------------------------------- /ATTRIBUTIONS: -------------------------------------------------------------------------------- 1 | Attributions for PyWinAssistant 2 | 3 | PyWinAssistant incorporates the following third-party modules, each with its own license: 4 | openai: MIT license. More information is available at [https://github.com/openai/openai-python]. 5 | pywinauto: Licensed under BSD-3-Clause license . More information at [https://pywinauto.readthedocs.io/en/latest/contents.html]. 6 | pyautogui: Licensed under the BSD-3-Clause license. Details at [https://github.com/asweigart/pyautogui]. 7 | pygetwindow: BSD-3-Clause license [https://github.com/asweigart/pygetwindow] 8 | PyWin32 (win32com, win32gui, win32con, win32api, win32process, winreg): Part of PyWin32, which is under the PSF license. More information can be found at [https://github.com/mhammond/pywin32]. 9 | customtkinter: MIT license. More info at [https://github.com/TomSchimansky/CustomTkinter]. 10 | Pillow (PIL): This project uses Pillow, an open-source HPND-licensed library. More info at [https://python-pillow.org/]. 11 | speech_recognition: This project incorporates the speech_recognition module, licensed under [BSD-3-Clause, GPL-2.0 licenses]. More details can be found at [https://github.com/Uberi/speech_recognition#readme]. 12 | psutil: This project uses psutil, distributed under the BSD-3-Clause license. More details at [https://github.com/giampaolo/psutil]. 13 | fuzzywuzzy: Available under the GPL-2.0 license. More information at [https://github.com/seatgeek/fuzzywuzzy]. 14 | pytesseract: An OCR tool licensed under the Apache License 2.0. More details at [https://github.com/madmaze/pytesseract]. 15 | uiautomation: Apache-2.0 license [https://github.com/yinkaisheng/Python-UIAutomation-for-Windows] 16 | gTTS (Google Text-to-Speech): MIT license. More information at [https://github.com/pndurette/gTTS]. 17 | pygame: Licensed under the LGPL GNU Library or Lesser General Public License (LG PL) (LGPL). More details at [https://www.pygame.org/news]. 18 | tkinter: Part of Python's standard library, covered by the PSF license. [https://github.com/python/cpython/tree/3.12] 19 | 20 | 21 | Each module is the property of its respective owners, and PyWinAssistant is in no way affiliated with these modules. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Brandon Joan Rosas Delgado 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **PyWinAssistant: An artificial assistant** – **MIT Licensed** | **Public Release: December 31, 2023** | Complies with federal coordinations AI Standards for Complex Adaptive Systems, Asilomar AI Principles and IEEE Global Initiative on Ethics of Autonomous and Intelligent Systems. 2 | 3 | --- 4 | 5 | PyWinAssistant is the first open-source Artificial Narrow Intelligence to elicit spatial reasoning and perception as a generalist agentic Computer-Using-Agent framework that fully operates graphical user interfaces (GUIs) for Windows 10/11 **through direct OS-native semantic interaction**. It functions as a Computer-Using-Agent / Large-Action-Model, forming the foundation for a pure **symbolic spatial cognition framework** that enables artificial operation of a computer using only natural language, **without relying on computer vision, OCR, or pixel-level imaging**. PyWinAssistant emulates, plans, and simulates synthetic Human-Interface-Device (HID) interactions through **native Windows Accessibility APIs**, eliciting human-like abstraction across geometric, hierarchical, and temporal dimensions at an Operating-System level. This OS-integrated approach to simulating spatial utilization of a computer provides a future-proof, generalized, modular, and dynamic ANI orchestration framework for multi-agent-driven automation, marking an important step in symbolic reasoning towards AGI. 6 | 7 | **Key Features:** 8 | * **Not Reliant on an Imaging Pipeline Alone**: Can operate exclusively through Windows UI Automation (UIA) and programmatic GUI semantics, enabling universal workflow orchestration. 9 | * **Symbolic Spatial Mapping**: Hierarchical element tracking via OS-native parent/child relationships and coordinate systems. 10 | * **Non-Visual Perception**: Real-time interface understanding through direct metadata extraction (control types, states, positions). 11 | * **Visual Perception**: A single screenshot can elicit comprehension and perception with attention to detail by visualizing goal intent and environment changes in a spatial space over time. It can be fine-tuned to look for visual cues, bugs, causal-reasoning errors, static, semantic-grounding issues, or corruption... 12 | * **Unified Automation**: Automatic element detection. Combines GUI, system, and web automation under one Python API. Eliminates context-switching between tools. 13 | * **AI-Powered Script Generation**: Translates natural language or demonstrations into any kind of code inside any IDE or text-entry area. 14 | * **Self-Healing Workflows**: Auto-adjusts to UI changes (e.g., element ID shifts), reducing maintenance overhead and making PyWinAssistant's algorithm future-proof. 15 | * **AI/ML Integration**: Using NLP to generate scripts (e.g., “Automate Application” → plan of test execution steps in JSON) with self-correcting selectors. 16 | * **Cross-Context Automation**: Seamlessly combines GUI, web, and API workflows in a Pythonic way, unifying disjointed automation methods (GUI, API, web) into a single framework.
17 | * **Accessibility**: Enhancing accessibility for users with different needs, enabling voice or simple text commands to control complex actions. 18 | * **Generalization**: Elicits spatial cognition to understand and execute a wide range of commands in a natural, intuitive manner. 19 | * **Small and compact**: PyWinAssistant functions as an example algorithm of a modular and generalized computer assistant framework that elicits spatial cognition. 20 | 21 | PyWinAssistant has its own set of **reasoning agents**, utilizing Visualization-of-Thought (VoT) and Chain-of-Thought (CoT) to enhance generalization, dynamically simulating actions through abstract GUI semantic dimensions rather than visual processing, making it **future-proof** for next-generation **LLMs**. By **visualizing interface contents** to dynamically **simulate and plan actions** over **abstract GUI semantic dimensions, concepts, and differentials**, PyWinAssistant **redefines computer vision automation**, enabling **high-efficiency visual processing** at a fraction of traditional computational costs. PyWinAssistant has achieved **real-time spatial perception** at an **Operating-System level**, allowing for **memorization of visual cues and tracking of on-screen changes over time**. 22 | 23 | --- 24 | 25 | Released before key breakthroughs in AI for Spatial Reasoning, it predates: 26 | * **Microsoft’s** [**Visualization-of-Thought research paper**](https://arxiv.org/abs/2404.03622) (April 4, 2024) 27 | * **Anthropic** [**Claude’s Computer-Use Agent**](https://www.anthropic.com/news/3-5-models-and-computer-use) (October 22, 2024) 28 | * **OpenAI** [**ChatGPT’s Operator Computer-Using Agent (CUA)**](https://openai.com/index/introducing-operator/) (January 23, 2025) 29 | 30 | PyWinAssistant represents a major paradigm shift in AI and automation by pioneering **pure symbolic computer interaction**, bridging **human intent with GUI automation at an OS level** through these breakthroughs: 31 | * **First Agent** to bypass OCR/imaging for Computer-Using-Agent GUI automation. 32 | * **First Framework** using Windows UIA as the primary spatial perception channel. 33 | * **First System** demonstrating OS-native hierarchical-temporal reasoning. 34 | 35 | --- 36 | 37 | ### **1. Unified Natural Language → GUI Automation** 38 | **Traditional Approach**: 39 | Automation tools require scripting (e.g., AutoHotkey) or API integration (e.g., Selenium). 40 | 41 | **PyWinAssistant Breakthrough**: 42 | ```python 43 | # True generalization for natural language directly driving UI actions 44 | assistant("Play Daft Punk on Spotify and email the lyrics to my friend") 45 | # The agent chooses a fitting item according to the related context to comply with user intent. 46 | ``` 47 | 48 | **Mechanism**: Combines UIAutomation’s GUI control detection with LLMs to: 49 | - Parse intent ("play", "email lyrics") 50 | - Map to UI elements (Spotify play button, Outlook compose window) 51 | - Generate adaptive workflows 52 | 53 | **PyWinAssistant Innovation**: Eliminates the need for: 54 | - Predefined API integrations 55 | - XPath/CSS selector knowledge 56 | - Manual error handling 57 | 58 | --- 59 | 60 | ### **2. Cross-Application State Awareness** 61 | **Traditional Limitation**: 62 | Tools operate in app silos (e.g., Power Automate connectors).
63 | 64 | **PyWinAssistant Innovation**: 65 | ```python 66 | # Notes: 67 | # Full step-set generation from the Assistant works flawlessly, but the in-step modifier and memory-content retrieval were purposely disabled and commented out in the code - [def act()](https://github.com/a-real-ai/pywinassistant/blob/6aae4e514a0dc661f7ed640181663f483972bc1e/core/driver.py#L648C1-L648C8) 68 | # to comply with federal coordinations AI Standards for Complex Adaptive Systems, Asilomar AI Principles and IEEE Global Initiative on Ethics of Autonomous and Intelligent Systems. 69 | 70 | # Accurately maintains context and intent across apps using the UIA tree and spatial memory: (Example for further development) 71 | assistant("Find the best and cheapest flight to Mexico, and also look for local hotels and suggest in new tabs the best cultural options") 72 | assistant("Look for various pizza coupons for anything but pineapple, fill in the details to order and show me the results") 73 | 74 | # PyWinAssistant is highly modular (example): 75 | def workflow(): 76 | song = assistant(goal="get the current track") # UIA 77 | write_action(f"Review '{song}': Great bassline!", app="Notepad") # Win32 78 | assistant(goal="Post on twitter the written text from notepad") # Web 79 | 80 | # The previous set of actions can also be executed by simply using natural language: 81 | assistant(f"Get the current song playing and in notepad put the title as Review song name: Great bassline, and write about why it is a great bassline, then post it on twitter", assistant_identity="You're an expert music critic") 82 | ``` 83 | **Key Advancements**: 84 | 1. **Unified Control Graph**: Treats all apps as nodes in a single UIA-accessible graph 85 | 2. **State Transfer**: Passes data between apps via clipboard/UIA properties 86 | 3. **Semantic Transfer**: Passes semantics of goal intent across all steps 87 | 4. **Error Recovery**: Uses agentic reasoning systems to avoid failing actions 88 | 89 | **Impact**: Enables workflows previously requiring custom middleware. 90 | --- 91 | 92 | ### **3. Probabilistic Automation Engine** 93 | **Traditional Model**: 94 | Deterministic scripts fail on UI changes. 95 | 96 | **PyWinAssistant’s Solution**: 97 | ```python 98 | # Adaptive element discovery 99 | def fast_action(goal): 100 | speaker(f"Clicking onto the element without visioning context. No imaging is required.") 101 | analyzed_ui = analyze_app(application=ai_choosen_app, additional_search_options=generated_keywords) 102 | 103 | gen_coordinates = [{"role": "assistant", 104 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse.
Only respond with the " 105 | f"predicted coordinates of the mouse click position to the center of the element object " 106 | f"\"x=, y=\" to achieve the goal."}, 107 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 108 | coordinates = api_call(gen_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 109 | print(f"AI decision coordinates: \'{coordinates}\'") 110 | ``` 111 | **Revolutionary Features**: 112 | - **Semantic Search by thinking**: Example `synonyms("download") → ["save", "export", "↓ icon"]` 113 | - **Spatial Probability**: Prioritizes elements by utilizing sets of self-reasoning agents for the synthetic operation of the actions 114 | - **Spatial-Prevention**: Senses and prevents possible bad actions or misaligned step execution by utilizing sets of self-reasoning agents 115 | - **Self-Healing**: Automatically chooses the perfect plan to execute without failing its step reasoning, by utilizing sets of self-reasoning agents 116 | 117 | --- 118 | 119 | ### **4. Democratized Accessibility** 120 | 121 | Task: Automate saving a song in the Spotify GUI. 122 | **Before**: 123 | Automation required: 124 | ```autohotkey 125 | WinWait, Spotify 126 | ControlClick, x=152 y=311 # Fragile coordinates 127 | ``` 128 | 129 | **Now**: Only one natural language command. 130 | ```python 131 | assistant("Like this song") # Language-first 132 | ``` 133 | 134 | | **Shift Metrics**: | Traditional Tools | PyWinAssistant | 135 | |-----------------------|-------------------|----------------| 136 | | Learning Curve | Days, even months | Minutes | 137 | | Cross-App Workflows | Manual Integration| Automatic | 138 | | Maintenance Overhead | High | LLM-AutoPatch | 139 | 140 | --- 141 | 142 | ### **Why This is Transformative** 143 | 144 | 1. **From Scripts to Intent**: 145 | Replaces brittle `click(x,y)` with human-like "understand → act" cycles. 146 | 147 | 2. **From Silos to OS as API**: 148 | Treats the entire Windows environment as a programmable interface. 149 | 150 | 3. **From Fixed to Adaptive**: 151 | Leverages LLMs to handle UI changes (e.g., Spotify’s 2023 UI overhaul). 152 | 153 | 4. **From Developers to Everyone**: 154 | Makes advanced automation accessible through natural language, improving generality and minimizing the overall data usage of LLM and vision models. 155 | It has built-in assistance options to improve human utilization of a computer, with a new technical approach to User Interface and User Experience assistance and testing through spatial visualization of thought. 156 | It correctly generalizes any natural language prompt and plans correct actions in the OS with security in mind. 157 | 158 | By **directly interfacing with Windows’ underlying UI hierarchy**, it achieves real-time spatial perception at the OS level while eliminating traditional computer vision pipelines, enabling: 159 | * **100x Efficiency Gains**: Native API access. 160 | * **Blind Operation**: Can function on headless systems, virtual machines, or minimized windows. 161 | * **Precision Abstraction**: Mathematical modeling of GUI relationships rather than visual pattern matching. 162 | 163 | **Image-Free by Design (Core Architecture)** 164 | While some projects *require* visual processing for fundamental operation, PyWinAssistant achieves **complete GUI interaction capability without an imaging pipeline** through: 165 | 166 | 1.
**Native OS Semantic Access** 167 | Direct Windows UIA API integration provides full control metadata: 168 | ```python 169 | # Example of element properties via UIA - no screenshots needed 170 | button = uia.Element.find(Name="Submit", ControlType="Button") 171 | print(button.BoundingRectangle) # {x: 120, y: 240, width: 80, height: 30} 172 | ``` 173 | 2. **Imaging Module** 174 | 175 | ```diff 176 | # PyWinAssistant imaging functions, such as pixel-level visualization, can be enabled for real-time spatial perception with memorization of visual cues and tracking of on-screen changes over time. 177 | + Capable of planning successful sets of highly technical steps to perform operations on a computer at an OS level, with only one screenshot. 178 | + Pixel-level visualization. 179 | + Visual hash matching can be enabled for dynamic elements. 180 | - OCR fallback / object detection for non-UIA legacy apps. 181 | # Experimental OCR features were added but not fully developed, as they are not necessary for the current implementation; the assistant currently works well without them. 182 | ``` 183 | 184 | | **Key Differentiation** | PyWinAssistant | Traditional Automation | 185 | |--------------------------|----------------|------------------------| 186 | | **Primary Perception** | UIA Metadata | Screenshots/OCR | 187 | | **Vision Dependency** | Optional Add-on | Required Core | 188 | | **Headless Ready** | ✅ Native | ❌ Requires virtual display | 189 | 190 | --- 191 | 192 | ### **Development Notes:** 193 | PyWinAssistant is limited by the model's intelligence and inference time. New advancements in LLMs are required to reach a complete Artificial General Intelligence system with Artificial Narrow Intelligences managing it. 194 | The system's autonomous task decomposition leverages **native semantic differentials** rather than visual changes; visual changes can be optionally activated for real-time image-corruption analysis of the GUI/screen. 195 | Long-term memory and self-learning mechanisms were designed to evolve **symbolic state representations**, and can also be represented as visual patterns, aligning with AGI development. 196 | 197 | Related paper: Visualization-of-Thought Elicits Spatial Reasoning in Large Language Models (April 4, 2024): 198 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/58c8e18d-b633-4a35-abc1-b8a76768e4e3) 199 | https://arxiv.org/abs/2404.03622 200 | 201 | # Overview 202 | 203 | PyWinAssistant includes built-in assistant features designed to enhance human-computer interaction for all users. It integrates real-time voice recognition, customizable assistant personalities, subtitles, and chat functionality. 204 | Talk to your computer in a friendly, natural way to perform any User Interface activity. 205 | Use natural language to operate your Windows Operating System freely. 206 | It generates and plans test cases of your User Interface applications for continuous testing on any Win32api-supported application, simply by using natural language. 207 | It is your own open and secure personal assistant that responds as you want; control the way you want your computer to assist you. 208 | It's engineered to be modular, to understand and execute a wide range of tasks, and to automate interactions with any desktop application.
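As a minimal illustration of these entry points, the sketch below mirrors the calls shown in the *Working cases* section further down. The import path follows `core/assistant.py`; exact signatures may vary between versions.

```python
# Minimal sketch of the main entry points, mirroring the "Working cases" examples below.
# The import path follows core/assistant.py; exact signatures may vary between versions.
from driver import assistant, fast_act, write_action

# Plan and execute a full multi-step goal from natural language
assistant(goal="Pause the music on Spotify")

# Perform a single UI action without generating a full plan
fast_act(goal="Click on the Like button")

# Generate and type text with a customizable assistant identity
write_action(goal="Comment about why AI is great for the current playing song",
             assistant_identity="You're an advanced music AI agent that specializes on music")
```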
209 | 210 | # Demos (Videos below) 211 | 212 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/93c0f123-2d57-419f-a586-32d9fe51e0b2) 213 | 214 | ![image](https://github.com/a-real-ai/pywinassistant/assets/18397328/42d2e3d5-9be7-4d4a-825d-e80891aeb0eb) 215 | 216 | ![Screenshot 2023-12-18 043612](https://github.com/a-real-ai/pywinassistant/assets/18397328/428d1a3f-ece7-4c58-9d1b-76138ce8807c) 217 | 218 | ![Screenshot 2023-12-18 040443](https://github.com/a-real-ai/pywinassistant/assets/18397328/50543e40-f810-4e4f-9cca-3f1131ae1cc1) 219 | 220 | ![Screenshot 2023-12-01 143812](https://github.com/a-real-ai/pywinassistant/assets/18397328/d88374c9-fb53-4ecf-b8b5-840ffaa5d8c1) 221 | 222 | ![Screenshot 2023-12-01 150047](https://github.com/a-real-ai/pywinassistant/assets/18397328/f0c904c7-0c96-4d57-90a0-dc9084728131) 223 | 224 | ![Screenshot 2023-11-13 161219](https://github.com/a-real-ai/pywinassistant/assets/18397328/b2c2a23c-f37f-4f1d-8628-69db6bf13ed9) 225 | 226 | --- 227 | 228 | ## Please enable the Audio for the demo videos. 229 | Voice 1 - Input Human (English Female Australian TTS) 230 | 231 | Voice 2 - Output Assistant (English Female US Google TTS) 232 | 233 | --- 234 | 235 | ### Use your computer by natural language - Real-time usage of VoT, an example of a Computer-Using-Agent; Single Action Model. 236 | Does not use any vision. Only API LLM calls. Demonstrating flawless execution of multiple prompt actions. 237 | 238 | https://github.com/a-real-ai/pywinassistant/assets/18397328/25b39d8c-62d6-442e-9d5e-bc8a35aa971a 239 | 240 | --- 241 | 242 | ### Use your computer as an assistant - Real-time usage of planning VoT, an example of a Computer-Using-Agent; Large-Action-Model. 243 | **Takes only 1 screenshot**: Gets to know what the user is doing and what is that the user wants to achieve, the assistant plans to perform it. 244 | ``` 245 | Voice Recognized Prompt: Make a new post on twitter saying hello world and a brief greeting explaining you're an artificial intelligence. 246 | ``` 247 | https://github.com/a-real-ai/pywinassistant/assets/18397328/d04f0609-68fb-4fb4-9ac3-279047c7a4f7 248 | 249 | --- 250 | 251 | ### The assistant can do anything for you - Real-time usage of planning VoT, an example of a Computer-Using-Agent; Large-Action-Model. 252 | The inference is the only constraint for speed. 253 | ``` 254 | Voice Recognized Prompt: Create a new comment explaining why it is so important. 255 | ``` 256 | https://github.com/a-real-ai/pywinassistant/assets/18397328/6d3bb6e6-ccf8-4380-bc89-df512ae207f2 257 | 258 | --- 259 | 260 | ### Other demos with Real-time usage of planning VoT. 261 | 262 | November 16th 2023 live demo: (Firefox, Spotify, Notepad, Calculator, Mail) 263 | ```python 264 | assistant(goal=f"Open a new tab the song \'Wall Of Eyes - The Smile\', from google search results filter by videos then play it on Firefox") # Working 100% 265 | assistant(goal=f"Pause the music on Spotify") # Working 100% 266 | assistant(goal=f"Create a short greet text for the user using AI Automated Windows in notepad.exe") # Working 100% 267 | assistant(goal=f"Open calc.exe and press 4 x 4 =") # Working 100% 268 | ``` 269 | https://github.com/a-real-ai/pywinassistant/assets/18397328/ce574640-5f20-4b8e-84f9-341fa102c0e6 270 | 271 | --- 272 | 273 | December 1st 2023 live demo: (Chrome, Spotify, Firefox) Example of programmable methods. 
274 | ```python 275 | assistant(goal=f"Play the song \'Robot Rock - Daft Punk\' on Spotify", keep_in_mind=f"To start playback double click the song.") # Working 100% 276 | assistant(goal=f"Open 3 new tabs on google chrome and in each of them search for 3 different types of funny AI Memes", keep_in_mind=" Filter the results by images.") # Working 100% 277 | assistant(goal=f"Open a new tab the song \'Windows 95 but it's a PHAT hip hop beat\', from google search results filter by videos then play it by clicking on the text on Firefox.") # Working 100% 278 | 279 | ``` 280 | https://github.com/a-real-ai/pywinassistant/assets/18397328/7e0583d1-1c19-40fa-a750-a77fff98a6da 281 | 282 | Currently supporting all generalized win32api apps, meaning: 283 | Chrome, Firefox, OperaGX, Discord, Telegram, Spotify... 284 | 285 | --- 286 | 287 | # Key Features 288 | - Dynamic Case Generator: The assistant() function accepts a goal parameter, which is a natural language command, and intelligently maps it to a series of executable actions. This allows for a seamless translation of user intentions into effective actions on the computer. 289 | 1. Single Action Execution: 290 | The act() function is a streamlined method for executing actions, enhancing the tool's efficiency and responsiveness. 291 | 2. Advanced Context Handling: The framework is adept at understanding context by analyzing the screen and the application, ensuring that actions are carried out with an awareness of the necessary prerequisites or steps. 292 | 3. Semantic router map: The framework has a database of semantic router maps to successfully execute generated test cases. These semantic maps can be created by other AIs. 293 | 4. Wide Application Range: From multimedia control (like playing songs or pausing playback on Spotify and YouTube) to complex actions (like creating AI-generated text, sending emails, or managing applications like Telegram or Firefox), the framework covers a broad spectrum of tasks. 294 | 5. Customizable AI Identity: The write_action() function allows for a customizable assistant identity, enabling personalized interactions and responses that align with the user's preferences or the nature of the task. 295 | 6. Robust Error Handling and Feedback: The framework is designed to handle unexpected scenarios gracefully, providing clear feedback and ensuring reliability. (In Overview) 296 | 7. Projects for mood and personality: Now and then, generate or suggest useful scenarios based on your mood and personality. (In Overview) 297 | 298 | 299 | # Technical Innovations 300 | 1. Natural Language Processing (NLP): Employs advanced NLP techniques to parse and understand user commands in a natural, conversational manner. 301 | 2. Task Automation Algorithms: Utilizes sophisticated algorithms to break down complex tasks into executable steps (a sketch of such a decomposition follows the Use Cases list below). 302 | 3. Context-Aware Execution: Integrates contextual awareness for more nuanced and effective task execution. 303 | 4. Cross-Application Functionality: Seamlessly interfaces with various applications and web services, demonstrating extensive compatibility and integration capabilities. 304 | # Use Cases 305 | 1. Automating repetitive tasks in a Windows environment. 306 | 2. Streamlining workflows for professionals and casual users alike. 307 | 3. Enhancing accessibility for users with different needs, enabling voice or simple text commands to control complex actions. 308 | 4. Assisting in learning and exploration by providing AI-driven guidance and execution of tasks.
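To make the task decomposition mentioned above concrete, the sketch below shows what a natural-language goal broken into executable steps could look like. The step fields and the `plan_goal()` helper are illustrative assumptions for this README, not the project's actual internal format.

```python
# Illustrative sketch only: plan_goal() and the step fields are hypothetical,
# shown to visualize how a goal can be decomposed into executable steps.
def plan_goal(goal: str) -> list:
    # In PyWinAssistant the decomposition is produced by LLM calls over UIA semantics;
    # here a hard-coded example plan is returned for the goal below.
    return [
        {"step": 1, "action": "focus_window", "target": "Spotify"},
        {"step": 2, "action": "click", "target": "Search field", "input": "Robot Rock - Daft Punk"},
        {"step": 3, "action": "double_click", "target": "First search result"},
    ]

for step in plan_goal("Play the song 'Robot Rock - Daft Punk' on Spotify"):
    print(step)
```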
309 | 310 | 311 | # Conclusion 312 | This Artificially Assisted User Interface Testing framework is a pioneering tool in the realm of desktop automation. Its ability to understand and execute a wide range of commands in a natural, intuitive manner makes it an invaluable asset for anyone looking to enhance their productivity and interaction with their Windows environment. It's not just a tool; it's a step towards a future where AI seamlessly integrates into our daily computing tasks, making technology more accessible and user-friendly. 313 | 314 | # Installation 315 | ``` 316 | # Add your ChatGPT API keys to the project: 317 | add your API key in /core/core_api.py -> line 3: client = OpenAI(api_key='insert_your_api_key_here') 318 | add your API key in /core/core_imaging.py -> line 12: api_key = 'insert_your_api_key_here' 319 | 320 | # Install requirements: 321 | cd pywinassistant 322 | pip install -r .\requirements.txt 323 | 324 | # Execute the assistant: 325 | cd .\core 326 | python ./assistant.py 327 | ``` 328 | 329 | # Usage 330 | Run "assistant.py", then say "Ok computer" to activate the assistant by voice, click on it, or enable the chat to perform a fast action. Right-click the assistant to see its available options. 331 | 332 | For debugging, execute "driver.py". Inside it, you can easily debug and try the "act" function (which is used alongside the assistant), "fast_act", and "assistant" by using the included examples. 333 | To run a JSON test case, modify the JSON path in the "assistant" function. 334 | 335 | # Working cases (on cases.py) 336 | 337 | ``` 338 | assistant(goal=f"Play the song \'One More Time - Daft Punk\' on Spotify") # Working 100% 339 | assistant(goal=f"Open a new tab the song \'Wall Of Eyes - The Smile\', from google search results filter by videos then play it on Firefox") # Working 100% 340 | assistant(goal=f"Open a new tab the song \'Windows XP Error beat\', from google search results filter by videos then play it by clicking on the text on Firefox.") # Working 100% 341 | fast_act(goal=f"Click on the Like button") # Working 100% 342 | assistant(goal=f"Pause the music on Spotify") # Working 100% 343 | write_action(goal="Comment about why AI is great for the current playing song", assistant_identity="You\'re an advanced music AI agent that specializes on music") # Working 100% 344 | assistant(f"Create a long AI essay about an AI Starting to control a Windows computer on Notepad") # Working 100% 345 | fast_act(goal="Click on the button at the bottom in HueSync app") # Working 100% 346 | write_action(goal="Weird Fishes - Radiohead") # Working 100% 347 | assistant(f"Open Calc and press 4 x 4 - 4 * 4 + 1 =") # Working 100% 348 | assistant(goal=f"Open 3 new tabs on google chrome and in each of them search for 3 different types of funny dogs", keep_in_mind=" Filter the results by images.") # Working 100% 349 | assistant(goal=f"Stop the playback from Firefox app") # Working 100% 350 | assistant(f"Send a list of steps to make a joke about engineers whilst making it an essay to my friend Diana in Telegram") # Working 100% 351 | assistant(f"Send a list of steps to make a chocolate cake to my saved messages in Telegram") # Working 100% 352 | assistant(f"Create three new tabs on Firefox, in each of them search 3 different types of funny youtube bad tutorial videos, generate the titles to search.") # Working 100% 353 | assistant(f"Write an essay about an AI that a person created to use freely the computer, like you.
Write it in notepad.exe") # Working 100% 354 | assistant(f"Send an AI joke and say it's generated by an AI to my friend Diana on Discord") # Working 100% 355 | assistant(goal=f"Create a short greet text for the user using AI Automated Windows in notepad.exe") # Working 100% 356 | assistant(goal=f"Open calc.exe and press 4 x 4 =") # Working 100% 357 | assistant(goal=f"Send a mail to \'testmail@gmail.com\' with the subject \'Hello\' and generate the message \'Generate a message about how an AI is helping everyone as users\' on the Mail app", 358 | keep_in_mind="Press \'Tab\' three times to navigate to the subject area. Do not combine steps.") # Need to update the app semantic map to get it working 100%. 359 | assistant(goal=f"Play the song \'The Smile - Wall Of Eyes\' on Spotify") # Working 100% 360 | assistant(goal=f"Play the song \'Panda Bear - Tropic of cancer\' on Spotify") # Working 100% 361 | assistant(goal="Pause the music on the Spotify app") # Working 100% 362 | assistant(goal=f"Open 3 new tabs with different Daft Punk songs on each of them on Firefox") # Working 100% 363 | fast_act("Open spotify and Search the album \'Grimes - Visions\'") # Working 100% 364 | write_action("Open spotify and Search the album \'Grimes - Visions\'") # Working 100% 365 | fast_act("Click on the first result on spotify") # Working 100% 366 | fast_act("Skip to the next song on Spotify") # Working 100% 367 | fast_act("Add the album to the library") # Working 100% 368 | fast_act("Go to Home on Spotify") # Working 100% 369 | fast_act("Save the song to my library on Spotify") # Working 100% 370 | ``` 371 | 372 | 373 | # Current approaches to UI Testing 374 | ### There are three main types of GUI testing approaches, namely: 375 | 376 | 1. ***Manual Testing:*** 377 | 378 | In manual testing, a human tester performs a set of operations to check whether the application is functioning correctly and whether the graphical elements conform to the documented requirements. Manual-based testing has notable downsides in that it can be time-consuming, and the test coverage is extremely low. Additionally, the quality of testing in this approach depends on the knowledge and capabilities of the testing team. 379 | 380 | 2. ***Record-and-Playback Testing:*** 381 | 382 | Also known as record-and-replay testing, it is executed using automation tools. The automated UI testing tool records all tasks, actions, and interactions with the application. The recorded steps are then reproduced, executed, and compared with the expected behavior. For further testing, the replay phase can be repeated with various data sets. 383 | 384 | 3. ***Model-Based Testing:*** 385 | 386 | In this testing approach, we focus on building graphical models that describe the behavior of a system. This provides a deeper understanding of the system, which allows the tester to generate highly efficient test cases. In the models, we determine the inputs and outputs of the system, which are, in turn, used to run the tests. Model-based testing works as follows: 387 | 388 | 1. Create a model for the system 389 | 2. Determine system inputs 390 | 3. Verify the expected output 391 | 4. Execute tests 392 | 5. Check and validate system output vs. the expected output 393 | 394 | The model-based approach is great because it allows a higher level of automation. It also covers a higher number of states in the system, thereby improving the test coverage. 395 | 396 | 397 | # New Approaches to UI Testing using AI 398 | 4.
***Artificially Assisted User Interface Testing:*** 399 | 400 | Artificially Assisted User Interface Testing harnesses the power of artificial intelligence to revolutionize the process of testing graphical user interfaces. Unlike traditional methods, Artificially Assisted User Interface Testing integrates machine learning algorithms and intelligent decision-making processes to autonomously identify, analyze, and interact with UI elements. This approach significantly enhances the depth and breadth of testing in several ways: 401 | 402 | - **Dynamic Interaction with UI Elements**: AI-driven tests can adapt to changes in the UI, such as modified button locations or altered element properties. This flexibility is achieved through the use of AI models trained to recognize and interact with various UI components, regardless of superficial changes. 403 | - **Learning and Pattern Recognition**: Utilizing machine learning, Artificially Assisted User Interface Testing systems can learn from previous interactions, test runs, and user feedback. This enables the AI to recognize patterns and predict potential issues, improving over time and offering more thorough testing with each iteration. 404 | - **Automated Test Case Generation**: The AI can generate test cases based on its understanding of the application's functionality and user behavior patterns. This not only saves time but also ensures that a wider range of scenarios is tested, including edge cases that might be overlooked in manual testing. (A short sketch of wiring such cases into a test runner appears at the end of this README.) 405 | - **Natural Language Processing (NLP)**: AI testing tools often incorporate NLP to interpret and execute tests written in plain language. This feature makes the testing process more accessible to non-technical stakeholders and facilitates better communication across the team. 406 | - **Real-Time Feedback and Analytics**: AI systems provide real-time insights into the testing process, identifying bugs, performance issues, and usability problems promptly. This immediate feedback loop enables quicker rectifications and enhances the overall quality of the product. 407 | - **Predictive Analysis and Risk Assessment**: By analyzing past data, Artificially Assisted User Interface Testing tools can predict potential problem areas and allocate testing resources more efficiently. This proactive approach to risk management ensures that critical issues are identified and addressed early in the development lifecycle. 408 | 409 | In conclusion, Artificially Assisted User Interface Testing represents a significant leap forward in software quality assurance. By automating and enhancing the testing process, AI-driven tools offer improved accuracy, speed, and coverage, paving the way for more reliable and user-friendly applications. 410 | 411 | 412 | ### Notes: 413 | 414 | This project is being updated as of the start of 2024. The list of requirements is being updated.
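As referenced in the testing section above, the natural-language working cases can double as automated UI test cases. The snippet below is a sketch only: `pytest` is not a project dependency, and the assumption that `assistant()` returns a truthy status on success is made purely for illustration.

```python
# Sketch: wrapping natural-language UI test cases in a standard test runner.
# pytest and the return-value convention are assumptions for illustration,
# not part of the PyWinAssistant codebase.
import pytest
from driver import assistant

@pytest.mark.parametrize("goal", [
    "Open calc.exe and press 4 x 4 =",
    "Pause the music on Spotify",
])
def test_natural_language_case(goal):
    result = assistant(goal=goal)  # assumed to return a truthy status on success
    assert result
```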
415 | -------------------------------------------------------------------------------- /core/assistant.py: -------------------------------------------------------------------------------- 1 | import customtkinter as Ctk 2 | from PIL import Image, ImageTk 3 | import time 4 | import random 5 | from queue import Queue 6 | import speech_recognition as sr 7 | import threading 8 | from voice import speaker, set_volume, set_subtitles 9 | from driver import assistant, act, fast_act, auto_role, perform_simulated_keypress, write_action 10 | from window_focus import activate_windowt_title 11 | 12 | # Initialize the speech recognition and text to speech engines 13 | assistant_voice_recognition_enabled = True # Disable if you don't want to use voice recognition 14 | assistant_name_handle = "Ok Computer" # Change this to your preferred name, will be used for voice activation. 15 | assistant_anim_enabled = True 16 | assistant_voice_enabled = True 17 | set_volume(0.25) 18 | assistant_subtitles_enabled = True 19 | recognizer = sr.Recognizer() 20 | message_queue = Queue() 21 | Ctk.set_appearance_mode("dark") # Modes: system (default), light, dark 22 | Ctk.set_default_color_theme("dark-blue") # Themes: blue (default), dark-blue, green 23 | 24 | 25 | def listen_to_speech(): 26 | # Function to listen for speech and add the recognized text to the message queue 27 | with sr.Microphone() as source: 28 | try: 29 | print("Assistant Listening...") 30 | audio = recognizer.listen(source, timeout=5) # Listen for 5 seconds 31 | message = recognizer.recognize_google(audio) 32 | print("You said:", message) 33 | message_queue.put(message) 34 | return message 35 | except sr.UnknownValueError: 36 | print("Google Speech Recognition could not understand audio") 37 | except sr.RequestError as e: 38 | print("Could not request results from Google Speech Recognition service; {0}".format(e)) 39 | except sr.WaitTimeoutError: 40 | print("Listening timed out.") 41 | finally: 42 | # Schedule the function to be called again 43 | # root.after(1000, listen_to_speech) # This if you want to try it indefinitely. 44 | print("Google Speech Recognition could not understand audio") 45 | pass 46 | 47 | 48 | def process_queue(): 49 | # Function to check the message queue and show messages as bubbles 50 | try: 51 | while not message_queue.empty(): 52 | message = message_queue.get_nowait() 53 | if message: 54 | show_message(None, message) # Pass None for event 55 | speaker(message) 56 | finally: 57 | # Schedule the function to be called again 58 | root.after(100, process_queue) 59 | pass 60 | 61 | 62 | def on_drag(event): 63 | # Function to move the window on drag 64 | global is_dragging, position_right, position_bottom, drag_time 65 | drag_time = time.time() 66 | is_dragging = True 67 | x = root.winfo_pointerx() - offset_x 68 | y = root.winfo_pointery() - offset_y 69 | root.geometry(f'+{x}+{y}') 70 | label.configure(image=assistant_dragging_photo) 71 | 72 | 73 | def end_drag(event): 74 | global is_dragging, position_right, position_bottom, drag_time, click_time 75 | if not click_time: 76 | return 77 | dragged_message = "Whats the action?" if time.time() - click_time < 0.15 else "You dragged me!" 
78 | # If the duration of the drag is less than the threshold for a click 79 | if time.time() - drag_time < 0.15: 80 | is_dragging = False 81 | position_right = root.winfo_x() 82 | position_bottom = root.winfo_y() 83 | label.configure(image=assistant_photo) 84 | animate_move() 85 | show_message(event, dragged_message) 86 | speaker(dragged_message) 87 | else: 88 | label.configure(image=assistant_photo) 89 | animate_move() # Resume the movement animation 90 | show_message(event, dragged_message) 91 | speaker(dragged_message) 92 | create_input_bubble(action=True) 93 | print(f"Clicked on the assistant: {dragged_message}") 94 | 95 | 96 | def create_input_bubble(action=False): 97 | # Get dimensions for proper placement 98 | bubble_width = 450 # Set a fixed width for the bubble 99 | bubble_height = 28 # A reasonable height to fit the text entry 100 | # Calculate the bubble position to the left of the assistant 101 | bubble_x = root.winfo_x() - bubble_width + 40 # Adjust the X position as needed 102 | bubble_y = root.winfo_y() + (assistant_photo_height // 2) - (bubble_height // 2) + 40 103 | # Create bubble as a top-level window 104 | bubble = Ctk.CTkToplevel(root) 105 | bubble.attributes('-alpha', 0.85) 106 | bubble.bind("", lambda e: bubble.destroy()) 107 | bubble.bind("", lambda e: bubble.destroy()) 108 | bubble.overrideredirect(True) 109 | bubble.attributes('-topmost', True) 110 | bubble.geometry(f'{bubble_width}x{bubble_height}+{bubble_x}+{bubble_y}') 111 | # Create the entry widget 112 | entry = Ctk.CTkEntry(bubble, corner_radius=6, placeholder_text_color="#0b2d39", 113 | fg_color="#e1f2f1", text_color="#040f13", 114 | placeholder_text="Type here the action to perform...", width=450, 115 | border_width=1, border_color="darkgray") 116 | entry.bind("", lambda e: bubble.destroy()) 117 | entry.pack(padx=0, pady=0) 118 | # Force focus on the entry and bubble 119 | try: 120 | bubble.after(10, lambda: [bubble.focus_force(), entry.focus_force()]) 121 | except Ctk.ctk_tk.TclError: 122 | # Ignore the error, as the window or widget is no longer valid 123 | pass 124 | # Bind Return and Escape keys to process input or destroy bubble 125 | entry.bind("", lambda e: process_input_and_close(bubble, entry, action)) 126 | # Bind mouse click to focus back on entry 127 | bubble.bind("", lambda e: entry.focus_force()) 128 | # Make sure bubble is focused as well when clicking on it 129 | bubble.bind("", lambda e: entry.focus_force()) 130 | return bubble # Returning bubble reference in case it needs to be accessed 131 | 132 | def process_input_and_close(bubble, entry, action=False): 133 | user_input = entry.get() 134 | print(f"Processing input: {user_input}") 135 | if user_input.strip(): 136 | bubble.destroy() 137 | # Use the user input as needed: display, speech, or further processing. 138 | show_message(None, user_input) 139 | # speaker(user_input.strip()) 140 | if action: 141 | print("Performing action: ", user_input) 142 | fast_act(single_step=user_input.strip()) 143 | else: 144 | print(f"Running assistant... Generating test case: {user_input.strip()}") 145 | speaker(f"Running assistant... 
Generating test case: {user_input.strip()}") 146 | # assistant(assistant_goal=user_input.strip(), called_from="assistant") 147 | assistant_thread = threading.Thread(target=run_assistant, args=(user_input.strip(),)) 148 | assistant_thread.start() 149 | # assistant(user_input.strip()) 150 | # auto_prompt(user_input.strip()) 151 | bubble.destroy() # Ensure the bubble is destroyed after submission 152 | 153 | def listen_and_respond(): 154 | action = listen_to_speech() 155 | if action: # Check if action is not None or empty string 156 | show_message(None, action) 157 | # Execute the assistant function in a separate thread 158 | # assistant_thread = threading.Thread(target=run_assistant, args=(action,)) 159 | # assistant_thread.start() 160 | 161 | 162 | def run_assistant(action): 163 | print("Running assistant...") 164 | assistant(assistant_goal=action, called_from="assistant") 165 | 166 | 167 | def start_drag(event): 168 | # Record the starting point for dragging 169 | global offset_x, offset_y, click_time 170 | offset_x = event.x 171 | offset_y = event.y 172 | click_time = time.time() 173 | 174 | 175 | def show_message(event=None, message="Hello! How can I help you?"): 176 | # Function to show a pop-up message bubble 177 | message_window = Ctk.CTkToplevel(root) # Create a new window 178 | message_window.overrideredirect(True) # Remove the window border 179 | message_window.attributes('-topmost', True) # Keep the window on top 180 | # Get dimensions for the message window 181 | # temp_label = Ctk.CTkLabel(message_window, text=message) 182 | # temp_label.pack() 183 | message_window.update_idletasks() # Update the layout to get size 184 | message_width = message_window.winfo_width() 185 | message_height = message_window.winfo_height() 186 | # temp_label.destroy() 187 | if event: 188 | pos_x = event.x_root + 20 189 | pos_y = event.y_root - message_height // 2 190 | else: 191 | # Calculate position based on assistant current position 192 | pos_x = root.winfo_x() + label.winfo_width() + 10 193 | pos_y = root.winfo_y() + label.winfo_height() // 2 - message_height // 2 194 | # Adjust position if the message window goes offscreen 195 | screen_width = root.winfo_screenwidth() 196 | screen_height = root.winfo_screenheight() 197 | if pos_x + message_width > screen_width: 198 | pos_x = screen_width - message_width 199 | if pos_y + message_height > screen_height: 200 | pos_y = screen_height - message_height 201 | if pos_y < 0: 202 | pos_y = 0 203 | # Set the geometry and display the message 204 | message_window.geometry(f'+{pos_x}+{pos_y}') 205 | message_label = Ctk.CTkLabel(message_window, text=message) 206 | message_label.configure(corner_radius=6, fg_color="#e1f2f1", text_color="black", bg_color="gray") 207 | message_label.pack(padx=0, pady=0) 208 | # Close the message bubble after 3 seconds 209 | message_window.after(3000, message_window.destroy) 210 | 211 | def create_context_menu(event_x_root, event_y_root): 212 | global context_menu_ref, assistant_voice_enabled, assistant_anim_enabled, assistant_subtitles_enabled, assistant_voice_recognition_enabled # Use the global references 213 | 214 | # Create a custom context menu using Ctk widgets 215 | context_menu = Ctk.CTkToplevel(root) 216 | context_menu.overrideredirect(True) 217 | context_menu.attributes('-topmost', True) 218 | context_menu.attributes('-alpha', 0.95) # Set transparency (0.0 to 1.0) 219 | # Set the theme to light 220 | # Change buttons color 221 | # Ctk.set_default_color_theme("dark-blue") # Themes: blue (default), dark-blue, green 222 | # 
context_menu visual options 223 | context_menu.configure(borderless=True, border_color="black") 224 | context_menu.bind("", lambda e: context_menu.destroy()) 225 | context_menu.bind("", lambda e: context_menu.destroy()) 226 | # Frame to hold menu items 227 | menu_frame = Ctk.CTkFrame(context_menu) 228 | menu_frame.pack() 229 | # Wrapper function to execute command and close the menu 230 | def menu_command(command): 231 | if callable(command): 232 | print(f"Executing command: {command}") 233 | command() # Execute command if it's callable 234 | else: 235 | print(f"Command '{command}' not implemented yet") 236 | context_menu.destroy() # Destroy the menu after executing the command 237 | 238 | # Buttons with commands 239 | Ctk.CTkButton(menu_frame, text="Call assistant", command=lambda: menu_command(generate_assistant_test_case(False))).pack(fill="x") 240 | Ctk.CTkButton(menu_frame, text="Fast action", command=lambda: menu_command(generate_assistant_test_case(True))).pack(fill="x") 241 | Ctk.CTkButton(menu_frame, text="Content analysis", command=lambda: menu_command(dummy_command)).pack(fill="x") 242 | 243 | # Add separator or space between groups of options (This is an improvisation since Ctk doesn't have a separator widget) 244 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 245 | 246 | # Toggle buttons for voice, animation, and subtitles with the current status check 247 | volume_option = "Enable assistant voice" if not assistant_voice_enabled else "Disable assistant voice" 248 | anim_option = "Enable animations" if not assistant_anim_enabled else "Disable animations" 249 | subs_option = "Enable subtitles" if not assistant_subtitles_enabled else "Disable subtitles" 250 | voice_option = "Enable voice recognition" if not assistant_voice_recognition_enabled else "Disable voice recognition" 251 | # Add the buttons to the menu frame 252 | Ctk.CTkButton(menu_frame, text=volume_option, command=lambda: menu_command(toggle_volume)).pack(fill="x") 253 | Ctk.CTkButton(menu_frame, text=anim_option, command=lambda: menu_command(toggle_animations)).pack(fill="x") 254 | Ctk.CTkButton(menu_frame, text=subs_option, command=lambda: menu_command(toggle_subtitles)).pack(fill="x") 255 | Ctk.CTkButton(menu_frame, text=voice_option, command=lambda: menu_command(toggle_voice_recognition)).pack(fill="x") 256 | # Add separator or space between groups of options (This is an improvisation since Ctk doesn't have a separator widget) 257 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 258 | # Extra options 259 | Ctk.CTkButton(menu_frame, text="Minimize", command=lambda: menu_command(minimize_assistant)).pack(fill="x") 260 | Ctk.CTkButton(menu_frame, text="Hide", command=lambda: menu_command(root.withdraw)).pack(fill="x") 261 | Ctk.CTkButton(menu_frame, text="Reset", command=lambda: menu_command(restart_assistant)).pack(fill="x") 262 | Ctk.CTkButton(menu_frame, text="Stop", command=lambda: menu_command(stop_assistant)).pack(fill="x") 263 | Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") 264 | Ctk.CTkButton(menu_frame, text="Back...", command=lambda: menu_command(root.deiconify)).pack(fill="x") 265 | 266 | # Update the layout to calculate the width and height 267 | context_menu.update_idletasks() 268 | menu_width = menu_frame.winfo_reqwidth() 269 | menu_height = menu_frame.winfo_reqheight() 270 | 271 | # Position the menu at the cursor position 272 | # If the menu goes off the screen to the right, move it left; same for bottom 273 | if event_x_root + menu_width > 
root.winfo_screenwidth(): 274 | event_x_root = root.winfo_screenwidth() - menu_width - 93 275 | if event_y_root + menu_height > root.winfo_screenheight(): 276 | event_y_root = root.winfo_screenheight() - menu_height - 100 277 | 278 | context_menu.geometry(f"{menu_width}x{menu_height}+{event_x_root}+{event_y_root}") 279 | context_menu.focus_force() # Force focus on the menu 280 | context_menu_ref = context_menu # Store the reference to the menu in a global variable 281 | return context_menu_ref 282 | 283 | def minimize_assistant(): 284 | root.withdraw() 285 | root.overrideredirect(False) 286 | root.iconify() 287 | # root.overrideredirect(True) 288 | 289 | 290 | def show_config(event): 291 | # Function to display the settings menu using a custom context menu 292 | create_context_menu(event.x_root, event.y_root) 293 | 294 | # Just for example purpose, you will replace this with actual commands 295 | def dummy_command(): 296 | speaker("Dummy item clicked") 297 | print("Dummy item clicked") 298 | 299 | def generate_assistant_test_case(fast_act=False): 300 | # Function to perform a fast action 301 | if fast_act: 302 | speaker("What's the fast action step?") 303 | print("What's the fast action step?") 304 | create_input_bubble(fast_act) 305 | else: 306 | speaker("What's the test-case to generate?") 307 | print("What's the test-case to generate?") 308 | create_input_bubble(fast_act) 309 | 310 | def toggle_voice_recognition(): 311 | global assistant_voice_recognition_enabled 312 | assistant_voice_recognition_enabled = not assistant_voice_recognition_enabled 313 | if assistant_voice_recognition_enabled: 314 | show_message(None, "Voice recognition enabled") 315 | speaker("Voice recognition enabled") 316 | else: 317 | show_message(None, "Voice recognition disabled") 318 | speaker("Voice recognition disabled") 319 | 320 | 321 | def toggle_animations(): 322 | global assistant_anim_enabled 323 | assistant_anim_enabled = not assistant_anim_enabled 324 | if assistant_anim_enabled: 325 | animate_blink() # Restart blinking animation 326 | animate_move() # Restart moving animation 327 | show_message(None, "Animations enabled") 328 | else: 329 | show_message(None, "Animations disabled") 330 | 331 | 332 | def toggle_subtitles(): 333 | global assistant_subtitles_enabled 334 | assistant_subtitles_enabled = not assistant_subtitles_enabled 335 | if assistant_subtitles_enabled: 336 | show_message(None, "Subtitles enabled") 337 | set_subtitles(True) 338 | else: 339 | set_subtitles(False) 340 | show_message(None, "Subtitles disabled") 341 | 342 | 343 | def toggle_volume(): 344 | global assistant_voice_enabled 345 | assistant_voice_enabled = not assistant_voice_enabled 346 | if assistant_voice_enabled: 347 | show_message(None, "Assistant voice enabled") 348 | set_volume(0.25) 349 | speaker("Voice enabled") 350 | else: 351 | show_message(None, "Assistant voice disabled") 352 | set_volume(0) 353 | 354 | 355 | def stop_assistant(): 356 | root.destroy() 357 | pass 358 | 359 | 360 | def restart_assistant(): 361 | root.destroy() 362 | create_app() 363 | pass 364 | 365 | 366 | def calculate_duration_of_speech(text, lang='en', wpm=150): 367 | # Estimate the duration the subtitles should be displayed based on words per minute (WPM) 368 | duration_in_seconds = (len(text.split()) / wpm) * 60 369 | return int(duration_in_seconds * 1000) # Convert to milliseconds for tkinter's after method 370 | 371 | 372 | def animate_blink(): 373 | # Function for blinking animation 374 | label.configure(image=assistant_blink_photo) 375 | 
root.after(150, lambda: label.configure(image=assistant_photo)) 376 | next_blink = random.randint(500, 10000) if assistant_anim_enabled else 10000 377 | root.after(next_blink, animate_blink) 378 | 379 | 380 | def animate_move(step=0, direction=1, amplitude=3, start_time=1): 381 | global position_bottom, is_dragging 382 | max_steps = 15 383 | if start_time is None: 384 | start_time = time.time() 385 | if assistant_anim_enabled and not is_dragging: 386 | new_position = position_bottom + amplitude * direction * (1 - abs(step / max_steps * 2 - 1)) 387 | root.geometry(f'+{position_right}+{int(new_position)}') 388 | next_step = step + 1 389 | if next_step > max_steps: 390 | current_time = time.time() 391 | next_step = 0 392 | movement_duration = current_time - start_time 393 | if 1 <= movement_duration <= 2: 394 | direction = -direction 395 | start_time = current_time 396 | amplitude = random.randint(0, 3) 397 | random_delay = random.randint(30, 200) 398 | root.after(random_delay, lambda: animate_move(next_step, direction, amplitude, start_time)) 399 | 400 | 401 | def listen_thread(): 402 | global assistant_voice_recognition_enabled 403 | print("Assistant listening thread started...") 404 | while True: 405 | if not assistant_voice_recognition_enabled: 406 | # If voice recognition got disabled, wait for a bit before checking again ToDo: Maybe use a condition variable instead, but im planning on performing other actions here. Like a low power mode or something like that works for now 407 | time.sleep(1) 408 | # print("Voice recognition disabled, waiting...") 409 | continue 410 | 411 | with sr.Microphone() as source: 412 | print("Listening...") 413 | try: 414 | audio = recognizer.listen(source, timeout=1.5) 415 | if assistant_voice_recognition_enabled: 416 | message = recognizer.recognize_google(audio) 417 | message_low = message.lower() 418 | # Speaking the message 419 | message_queue.put(message) 420 | # Only process the audio if voice recognition is enabled 421 | if "okay computer" in message_low[0:13] or assistant_name_handle.lower() in message_low[0:11]: 422 | message_queue.put("Assistant here! 
How can I help you?") 423 | ok_computer = listen_to_speech() 424 | if ok_computer: 425 | show_message(None, ok_computer) 426 | assistant(ok_computer) 427 | elif "open" in message_low[0:4]: 428 | if len(message) < 18: 429 | print("Opening the program: ", message) 430 | activate_windowt_title(message.strip("open ")) 431 | else: 432 | assistant(message) 433 | elif "stop" in message_low: 434 | print("Stopping...") 435 | stop_assistant() 436 | elif "double click" in message_low[0:12]: 437 | print("Double clicking on:", message) 438 | fast_act(single_step=message.strip("double "), double_click=True) 439 | # Or if message starts with the first word click and 440 | elif "click on" in message_low[0:8] or "click the" in message_low[0:9] or "click" in message_low[0:5]: 441 | print("Clicking on:", message) 442 | fast_act(single_step=message) 443 | elif "press" in message_low[0:5]: 444 | print("press: ", message) 445 | perform_simulated_keypress(message.strip("press ").strip("")) 446 | elif "type" in message_low[0:4] or "write" in message_low[0:5] or "bright" in message_low[0:6] or "great" in message_low[0:5]: 447 | # Remove "bright ", "write ", "type ", "great " from the message: 448 | new_message = message.replace("bright ", "").replace("write ", "").replace("type ", "").replace("great ", "") 449 | print("Typing:", new_message) 450 | write_action(goal=new_message, last_step="text_entry") 451 | elif "reminder" in message_low or "remind" in message_low or "timer" in message_low or "alarm" in message_low: 452 | # Call internal_clock.py - Generated. 453 | # Here's thoughts of when remind the user if is not noticing any important upcoming event: 454 | # Advice the user for upcoming events. Add reminders, timers, alarms, etc. 455 | print("Reminder: ", message) 456 | elif "scroll" in message_low[0:6]: 457 | print("Scrolling: ", message) 458 | import pyautogui 459 | pyautogui.scroll(-850) 460 | else: 461 | auto_prompt(message) 462 | else: 463 | # Voice recognition was disabled while audio was being processed, skip it 464 | continue 465 | except (sr.UnknownValueError, sr.RequestError, sr.WaitTimeoutError): 466 | # If you want to handle specific errors, you can separate them with additional except blocks 467 | pass 468 | 469 | 470 | # Now start the listening thread when initializing 471 | listening_thread = threading.Thread(target=listen_thread, daemon=True) 472 | listening_thread.start() 473 | 474 | 475 | def auto_prompt(message): 476 | role_function = auto_role(message) 477 | print(f"Assistant: {role_function.strip('windows_assistant').strip('joyful_conversation').strip(' - ')}") 478 | if "windows_assistant" in role_function: 479 | message_queue.put(f"{role_function.strip('windows_assistant').strip(' - ')}") 480 | # Start the assistant in a new thread: 481 | assistant_thread = threading.Thread(target=run_assistant, args=(message,)) 482 | assistant_thread.start() 483 | elif "joyful_conversation" in role_function: 484 | message_queue.put(f"{role_function.strip(f'joyful_conversation').strip(' - ')} How can I help you?") 485 | else: 486 | print("NOT WORKING") 487 | 488 | 489 | def load_image(file_path, scale=0.333): 490 | # Helper function to load and scale the image 491 | image = Image.open(file_path) 492 | original_width, original_height = image.size 493 | new_width = int(original_width * scale) 494 | new_height = int(original_height * scale) 495 | image = image.resize((new_width, new_height), Image.Resampling.BICUBIC) 496 | ctk_image = Ctk.CTkImage(light_image=image, size=(new_width, new_height)) 497 | return 
ctk_image, new_width, new_height 498 | 499 | 500 | def create_app(): 501 | global root, label, assistant_photo, assistant_dragging_photo, assistant_blink_photo, assistant_anim_enabled, is_dragging, position_right, position_bottom, drag_time, \ 502 | assistant_voice_enabled, assistant_subtitles_enabled, assistant_name_handle, assistant_photo_width, assistant_photo_height, scale_factor # Add width and height globals 503 | import ctypes 504 | ctypes.windll.shcore.SetProcessDpiAwareness(1) 505 | root = Ctk.CTk() 506 | root.title("AI Drone Assistant") 507 | root.iconbitmap("media/headico.ico") 508 | root.overrideredirect(True) 509 | root.attributes('-topmost', True) 510 | root.wm_attributes("-transparentcolor", 'gray') 511 | 512 | # Load images and get their sizes 513 | assistant_photo, assistant_photo_width, assistant_photo_height = load_image("media/assistant_transparent.png") 514 | assistant_dragging_photo, _, _ = load_image("media/assistant_transparent_dragging.png") 515 | assistant_blink_photo, _, _ = load_image("media/assistant_transparent_blink.png") 516 | label = Ctk.CTkLabel(root, image=assistant_photo, bg_color="gray", cursor="hand2", text="") 517 | label.pack() 518 | label.bind('', start_drag) 519 | label.bind('', on_drag) 520 | label.bind('', end_drag) 521 | label.bind('', show_config) 522 | 523 | # Calculate initial position (bottom right) 524 | screen_width = root.winfo_screenwidth() 525 | screen_height = root.winfo_screenheight() 526 | # Use assistant_photo_width and assistant_photo_height instead of width() and height() 527 | position_right = int(screen_width - assistant_photo_width) + 35 528 | position_bottom = int(screen_height - assistant_photo_height) - 30 529 | drag_time = time.time() 530 | 531 | # Set initial geometry to place the assistant at the bottom right 532 | root.geometry(f'+{position_right}+{position_bottom}') 533 | is_dragging = False # Flag to track dragging state 534 | root.after(1000, animate_blink) # Start the blinking animation 535 | root.after(1000, animate_move) # Start the moving animation 536 | root.after(100, process_queue) # Start processing the message queue 537 | # Call the mainloop 538 | root.mainloop() 539 | pass 540 | 541 | create_app() -------------------------------------------------------------------------------- /core/core_api.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key='insert_your_api_key_here') 4 | # Available models: "gpt-4-1106-preview", "gpt-3.5-turbo-1106", or "davinci-codex" 5 | MODEL_NAME = "gpt-3.5-turbo-1106" 6 | 7 | 8 | def api_call(messages, model_name=MODEL_NAME, temperature=0.5, max_tokens=150): 9 | # if model_name == "gpt-4-1106-preview": 10 | # model_name = "gpt-3.5-turbo-1106" 11 | try: 12 | # Execute the chat completion using the chosen model 13 | response = client.chat.completions.create( 14 | model=model_name, 15 | messages=messages, 16 | # Additional configurations can be passed as parameters here 17 | temperature=temperature, # Values can range from 0.0 to 1.0 18 | max_tokens=max_tokens, # This specifies the maximum length of the response 19 | # Tip: adding more configurations as needed 20 | ) 21 | 22 | # Since we're not using 'with_raw_response', 'response' is now the completion object 23 | if response.choices and hasattr(response.choices[0], 'message'): 24 | decision_message = response.choices[0].message 25 | 26 | # Make sure we have 'content' in the message 27 | if hasattr(decision_message, 'content'): 28 | decision 
= decision_message.content.strip() 29 | else: 30 | decision = None 31 | else: 32 | decision = None 33 | 34 | return decision 35 | except Exception as e: 36 | raise Exception(f"An error occurred: {e}") 37 | 38 | 39 | # # Replace this payload with the actual messages sequence for your use case # # Test 40 | # messages_payload = [ 41 | # {"role": "system", "content": "You are a helpful and knowledgeable assistant. Always uwufy the text."}, 42 | # {"role": "user", "content": "Please help me troubleshoot my JavaScript code."} 43 | # ] 44 | # 45 | # # Example configuration: you might want to specify 'temperature' for more creative responses, 46 | # # or 'max_tokens' for more concise outputs 47 | # result = api_call(messages_payload, temperature=0.7, max_tokens=100) 48 | # print(f"AI Analysis Result: '{result}'") -------------------------------------------------------------------------------- /core/core_imaging.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import pygetwindow as gw 3 | import base64 4 | import requests 5 | import io 6 | from PIL import Image 7 | 8 | # Assuming that the `activate_window_title` function is defined in another module correctly 9 | from window_focus import activate_windowt_title 10 | 11 | # OpenAI API Key 12 | api_key = 'insert_your_api_key_here' 13 | 14 | 15 | # Function to focus a window given its title 16 | def focus_window(window_title): 17 | try: 18 | window = gw.getWindowsWithTitle(window_title)[0] # Get the first window with the specific title 19 | window.activate() 20 | pyautogui.sleep(0.3) # Allow some time for the window to come into focus 21 | return window 22 | except IndexError: 23 | print(f'No window with title "{window_title}" found.') 24 | return None 25 | 26 | 27 | # Function to capture a screenshot of the specified window 28 | def capture_screenshot(window=None, region=None): 29 | # Reduced code for brevity 30 | if region is not None: 31 | screenshot = pyautogui.screenshot(region=region) 32 | elif window is not None: 33 | window_box = window.box 34 | screenshot = pyautogui.screenshot(region=(window_box.left, window_box.top, window_box.width, window_box.height)) 35 | else: 36 | screenshot = pyautogui.screenshot() 37 | return screenshot 38 | 39 | 40 | # Function to encode image data to base64 41 | def encode_image(image_data): 42 | return base64.b64encode(image_data).decode('utf-8') 43 | 44 | 45 | # Function to analyze an image using OpenAI API 46 | def analyze_image(base64_image, window_title, additional_context='What’s in this image?'): 47 | # Your logic to call the OpenAI API 48 | headers = { 49 | "Content-Type": "application/json", 50 | "Authorization": f"Bearer {api_key}" 51 | } 52 | 53 | payload = { 54 | "model": "gpt-4-vision-preview", 55 | "messages": [ 56 | { 57 | "role": "assistant", 58 | "content": [ 59 | { 60 | "type": "text", 61 | "text": f"{additional_context}" 62 | }, 63 | { 64 | "type": "image_url", 65 | "image_url": { 66 | "url": f"data:image/png;base64,{base64_image}" 67 | } 68 | } 69 | ] 70 | } 71 | ], 72 | "max_tokens": 300 73 | } 74 | 75 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 76 | return response.json() 77 | 78 | 79 | # Improved function to both capture and analyze a specific region screenshot 80 | def imaging(window_title=None, additional_context=None, x=None, y=None, screenshot_size=None): 81 | window = None 82 | region = None 83 | 84 | if screenshot_size == 'Full screen': 85 | # We don't need window 
focus or a specific region for a full-screen screenshot. 86 | pass 87 | elif window_title: # If a window title is provided, focus on the window. 88 | window = focus_window(window_title) 89 | if not window: 90 | return None # If no window is found, exit the function. 91 | if screenshot_size and type(screenshot_size) == tuple and x is not None and y is not None: 92 | offset_x, offset_y = screenshot_size[0] // 2, screenshot_size[1] // 2 93 | # Adjust region to be relative to the window's top-left corner. 94 | window_box = window.box 95 | region = ( 96 | window_box.left + x - offset_x, window_box.top + y - offset_y, screenshot_size[0], screenshot_size[1]) 97 | else: 98 | # If screenshot_size is not provided or is not 'Full screen', capture the whole window. 99 | region = (window.box.left, window.box.top, window.box.width, window.box.height) 100 | 101 | screenshot = capture_screenshot(window, region) 102 | 103 | # Optionally, paste the cursor onto the screenshot, adjusting for the offset if a region is specified 104 | cursor_img_path = r'media\Mouse_pointer_small.png' 105 | with Image.open(cursor_img_path) as cursor: 106 | cursor = cursor.convert("RGBA") # Ensure cursor image has an alpha channel for transparency 107 | 108 | x_cursor, y_cursor = pyautogui.position() # Current mouse position 109 | 110 | # If a region is specified, calculate the cursor position within that region 111 | if region: 112 | cursor_pos = (x_cursor - region[0], y_cursor - region[1]) 113 | else: 114 | cursor_pos = (x_cursor, y_cursor) 115 | 116 | screenshot.paste(cursor, cursor_pos, cursor) 117 | 118 | # Convert the screenshot to bytes 119 | with io.BytesIO() as output_bytes: 120 | screenshot.save(output_bytes, 'PNG') 121 | bytes_data = output_bytes.getvalue() 122 | 123 | # Show a preview of the screenshot 124 | # screenshot.show() 125 | 126 | # Convert the bytes to a base64-encoded image and analyze 127 | base64_image = encode_image(bytes_data) 128 | analysis_result = analyze_image(base64_image, window_title, additional_context) 129 | 130 | return analysis_result 131 | 132 | 133 | if __name__ == "__main__": 134 | app_name = "Firefox" 135 | coordinates = {'x': 132, 'y': 458} 136 | screenshot_size = (300, 300) 137 | x = coordinates['x'] 138 | y = coordinates['y'] 139 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 140 | single_step = "click on the 'Add a comment...' text input area" 141 | 142 | # Call imaging with the additional_context parameter if needed and the size parameter 143 | element_analysis = ( 144 | f"You are an AI Agent called Element Analyzer that receives a screenshot of the element and analyzes it to check if the mouse is in the correct position to click the element to interact with.\n" 145 | f"Element to interact with: {single_step}\nRespond only with \"Yes\" or \"No\"." 
146 | ) 147 | analysis_result = imaging(window_title=app_name, additional_context=element_analysis, x=coordinates['x'], y=coordinates['y'], screenshot_size=screenshot_size) 148 | print(analysis_result) 149 | -------------------------------------------------------------------------------- /core/driver.py: -------------------------------------------------------------------------------- 1 | from window_focus import activate_windowt_title, get_installed_apps_registry, open_windows_info 2 | from mouse_detection import get_cursor_shape 3 | from ocr import find_probable_click_position 4 | from window_elements import analyze_app 5 | from topmost_window import focus_topmost_window 6 | from core_imaging import imaging 7 | from last_app import last_programs_list 8 | from core_api import api_call 9 | from voice import speaker 10 | import pygetwindow as gw 11 | import win32process 12 | import win32gui 13 | import pyautogui 14 | import sqlite3 15 | import psutil 16 | import random 17 | import json 18 | import time 19 | import re 20 | import warnings 21 | warnings.simplefilter("ignore", UserWarning) 22 | from pywinauto import Application 23 | 24 | 25 | low_data_mode = True # Avoids the usage of visioning after the case generation. Lowers the accuracy but is way faster. 26 | enable_semantic_router_map = True # Use this to enable the imaging semantic routing map. Improves accuracy of overall performance. 27 | enable_ocr = False # Works better if this is disabled. Can use the implementations from other projects for better OCR. 28 | # Did not implement the OCR as it is not needed for the current implementation. The AI must work with the current data. 29 | 30 | if low_data_mode is True: # Avoid the usage of visioning after the test case generation. Useful to execute faster case. 31 | visioning_match = False # The coordinates will not use visioning during execution. Will use the imaging LLM call. 32 | rescan_element_match = False # Disable the visioning element scanning. Decreases accuracy but is way faster. 33 | visioning_context = False 34 | # 'rescan_element_match' recommended to leave as 'False' until the tested map improves for low-data consumption. 35 | else: 36 | visioning_match = True # Visioning doesn't improve execution at all as imaging is already performing way faster. 37 | rescan_element_match = True # Enables visioning rescanning the element. Improves accuracy but is way slower. 38 | visioning_context = True # Enables visioning to analyze context from application. Improves accuracy but way slower. 39 | 40 | 41 | # Use this on "execute_json_case" to skip the image analysis and the test case generation. 
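For instance, a call along these lines would replay the canned case instead of generating a new one (a hedged usage sketch: the import path and the prompt string are assumptions, while `app_name` and `execute_json_case` are real parameters of `assistant()` defined later in this file):

```python
# Hypothetical usage sketch: replay a pre-generated JSON case so no visioning
# or test-case generation is performed. json_case_example is defined just below.
from driver import assistant, json_case_example  # assumed import path

assistant(
    "Open reddit, tiktok and netflix in separate Firefox windows",  # example prompt
    app_name="Firefox",                   # window title to focus
    execute_json_case=json_case_example,  # skips imaging + case generation
)
```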
Generated by the AI: 42 | json_case_example = r'''```json 43 | { 44 | "actions": [ 45 | { 46 | "act": "press_key", 47 | "step": "Ctrl + T" 48 | }, 49 | { 50 | "act": "text_entry", 51 | "step": "reddit.com" 52 | }, 53 | { 54 | "act": "press_key", 55 | "step": "Enter" 56 | }, 57 | { 58 | "act": "open_app", 59 | "step": "Firefox" 60 | }, 61 | { 62 | "act": "press_key", 63 | "step": "Ctrl + N" 64 | }, 65 | { 66 | "act": "text_entry", 67 | "step": "tiktok.com" 68 | }, 69 | { 70 | "act": "press_key", 71 | "step": "Enter" 72 | }, 73 | { 74 | "act": "move_window", 75 | "step": "Win + Right + Up" 76 | }, 77 | { 78 | "act": "open_app", 79 | "step": "Firefox" 80 | }, 81 | { 82 | "act": "press_key", 83 | "step": "Ctrl + N" 84 | }, 85 | { 86 | "act": "text_entry", 87 | "step": "netflix.com" 88 | }, 89 | { 90 | "act": "press_key", 91 | "step": "Enter" 92 | }, 93 | { 94 | "act": "move_window", 95 | "step": "Win + Left + Up" 96 | }, 97 | { 98 | "act": "open_app", 99 | "step": "Firefox" 100 | }, 101 | { 102 | "act": "move_window", 103 | "step": "Win + Right + Down" 104 | } 105 | ] 106 | }```''' 107 | 108 | 109 | # Here you can load successful trained models to perform the task. ToDo: Use the database. 110 | def app_space_map(goal, app_name=None, single_step=None, map=''): 111 | if 'app_space' in map: 112 | if enable_semantic_router_map is True: 113 | # Control elements map: 114 | if "twitter" in goal.lower() or "twitter" in app_name.lower(): 115 | element_map = r'''``` 116 | To make a new tread post in X formerly known as Twitter and post it; 117 | The user is already logged in Twitter. Do not log in again. 118 | Click on 'What is happening?!' text input area field to initiate a new post thread. data-testid='tweetTextarea_0_label' 119 | Write the post in the 'What is happening?!' text area input field. Make sure is less than 280 characters. 120 | Click on 'Post' button to post the new post thread. data-testid='tweetButtonInline' 121 | 122 | To make a comment in a post from X formerly known as Twitter and reply it; 123 | The user is already logged in Twitter. Do not log in again. 124 | Scroll to the comments section. Click on the 'Post your reply' text input area field below the Twitter post. 125 | Write the comment in the 'Post your reply' text input area field. Make sure is less than 280 characters. 126 | Click on 'Reply' button to post the comment. 127 | ```''' 128 | elif "youtube" in goal.lower() or "youtube" in app_name.lower(): 129 | element_map = r'''``` 130 | To like a video on youtube: Click on the 'Like' button button below the title. 131 | 132 | To dislike a video: Click on the 'I dislike this' button known as the 'Dislike' button. 133 | 134 | To make a comment: Click on the title of the video, then scroll to the 'Add a comment...' section, then click on the 'Add a comment...' (ID: contenteditable) text input area to begin write the comment, then click on the 'Comment' button to post the comment. 135 | ```''' 136 | else: 137 | element_map = None # No element selected. 138 | 139 | if element_map: 140 | select_element = [ 141 | {"role": "assistant", 142 | f"content": f"Only return the text related to the final goal.\n" 143 | f"Do not respond anything else than the selected lines from the list. 
Do not modify the list.\n" 144 | f"Goal: {goal}"}, 145 | {"role": "system", "content": f"List:\n{element_map}\n\n\nStep: {single_step}\nGoal: {goal}"}] 146 | ai_element_map = api_call(select_element, max_tokens=300) 147 | if "sorry" in ai_element_map.lower() or "empty string" in ai_element_map.lower(): 148 | ai_element_map = "" 149 | # ai_element_map = element_map 150 | else: 151 | ai_element_map = "" 152 | print(f"\nApp space map: {ai_element_map}\n") 153 | return ai_element_map 154 | else: 155 | # Application map to better handle the application: 156 | if "firefox" in app_name.lower(): 157 | info_map = r'''``` 158 | To open a new window in Firefox; Use the keyboard shortcut: Ctrl + N. 159 | The default search engine is Google. So when you open a new tab or window, you can search directly on Google.```''' 160 | elif "chrome" in app_name.lower() or "google chrome" in goal.lower(): 161 | info_map = r'''``` 162 | To open a new window in Chrome; Use the keyboard shortcut: Ctrl + N. 163 | To open a new tab in Chrome; Use the keyboard shortcut: Ctrl + T. 164 | To close a tab in Chrome; Use the keyboard shortcut: Ctrl + W. 165 | To open a new private window in Chrome; Use the keyboard shortcut: Ctrl + Shift + N. 166 | To open a new private tab in Chrome; Use the keyboard shortcut: Ctrl + Shift + T.```''' 167 | elif "edge" in app_name.lower() or "microsoft edge" in goal.lower(): 168 | info_map = r'''``` 169 | To open a new window in Edge; Use the keyboard shortcut: Ctrl + N. 170 | To open a new tab in Edge; Use the keyboard shortcut: Ctrl + T. 171 | To close a tab in Edge; Use the keyboard shortcut: Ctrl + W. 172 | To open a new private window in Edge; Use the keyboard shortcut: Ctrl + Shift + N. 173 | To open a new private tab in Edge; Use the keyboard shortcut: Ctrl + Shift + T.```''' 174 | elif "telegram" in app_name.lower() or "telegram" in goal.lower(): 175 | info_map = r'''``` 176 | Press 'esc' to exit the current conversation. 177 | Press 'esc' twice to go to 'All chats'.```''' 178 | elif "spotify" in app_name.lower() or "spotify" in goal.lower(): 179 | info_map = r'''``` 180 | To play a searched song on spotify double click on the song.```''' 181 | elif "youtube" in app_name.lower() or "youtube" in goal.lower(): 182 | info_map = r'''``` 183 | To like a video click on the Like button below the title. 184 | To dislike a video click on the Dislike button below the title. 185 | To make a comment scroll to the Add a comment... section, then click on the Add a comment... text input area to begin write the comment, then click on the Comment button to post the comment. 
186 | To subscribe to a channel click on the Subscribe button below the video.```''' 187 | else: 188 | info_map = "" 189 | # adding the application shortcuts: 190 | if info_map: 191 | select_map = [ 192 | {"role": "assistant", 193 | f"content": f"You are an AI assistant that receives a goal and a list of useful steps, and only respond the best useful steps from the step list to perform the goal.\n" 194 | f"Do not respond anything else than the best useful steps from the step list."}, 195 | {"role": "system", "content": f"Step list: \n{info_map}\n\n\nGoal: {single_step}"}] 196 | shortcuts_ai_map = api_call(select_map, max_tokens=300) 197 | if "sorry" in shortcuts_ai_map.lower(): 198 | shortcuts_ai_map = "" 199 | else: 200 | shortcuts_ai_map = "" 201 | print(f"App space map: {shortcuts_ai_map}") 202 | return shortcuts_ai_map 203 | 204 | 205 | def assistant(assistant_goal="", keep_in_mind="", assistant_identity="", app_name=None, execute_json_case=None, called_from=None): # App TestCase Gen 206 | """ 207 | This function handles the user's prompt and generates the best achievable test case to perform the user's prompt. 208 | This function assumes the user's prompt is fed as a string to the function "assistant_goal". 209 | 210 | Args: 211 | assistant_goal (str): The user's prompt. Analyzes the program as context to imaging the best test case scenario. 212 | keep_in_mind (str): A reminder to keep in mind during the case execution. Useful to modify the program map. 213 | app_name (str): The name of the application (Or the window title for exact match) to open and focus on. 214 | execute_json_case (str): If provided, skips image analysis and case generation. Useful to debug the program map. 215 | Returns: 216 | str: Validates if the user's prompt performed successfully 217 | # ToDo: Add this functionality. As of now, the function only returns the test case. 218 | 219 | Examples: 220 | >>> assistant("Open a new tab and search what is an elefant.") 221 | """ 222 | 223 | # 'assistant_goal' is the user's prompt. If no prompt is provided, exit the function. 224 | if not assistant_goal: 225 | speaker(f"ERROR: No prompt provided. Please provide a prompt to the assistant.") 226 | time.sleep(10) 227 | raise ValueError("ERROR: No step provided.") 228 | else: 229 | original_goal = assistant_goal 230 | print(f"Prompt: {original_goal}") 231 | if called_from == "assistant": 232 | print(f"Called from: {called_from}") 233 | else: 234 | print(f"Prompt: \"{original_goal}\".") 235 | speaker(f"Assistant is generating a testcase with the prompt: \"{original_goal}\".") 236 | 237 | # 'app_name' is the name of the application (Or the window title for exact match) to open and focus on. 238 | if not app_name: 239 | app_name = activate_windowt_title(get_application_title(original_goal)) 240 | else: 241 | app_name = activate_windowt_title(app_name) 242 | print(f"AI Analyzing: {app_name}") 243 | 244 | # 'execute_json_case' is the JSON test case to execute. If no JSON is provided, generate a new one. 245 | if not execute_json_case: 246 | print(f"\nGenerating a test case with the assistant. Image visioning started. Analyzing the application {app_name} for context.\n") 247 | additional_context = ( 248 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 249 | f"You will receive a goal and will try to accomplish it using Windows. 
Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 250 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 251 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 252 | f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") 253 | assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] 254 | print(f"Generating the test case to achieve the user prompt: {original_goal}\n{assistant_goal}") 255 | step_creator = [{"role": "assistant", 256 | "content": f"You are an AI capable to operate the Windows 11 Operating System by using natural language.\n" 257 | f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" 258 | f"You will receive a description of the current state of the system and a goal. " 259 | f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" 260 | f"Every step should be only highly descriptive in natural language.\n" 261 | f"The json must include only an act and its step, should be in the following format:" 262 | f"\n\n\n" 263 | f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" 264 | f"Respond only with the full json.\n\nFunction properties:\n" 265 | f"click_element - Specify where is located the element to interact with.\n" 266 | f"text_entry - The text to write. Example: \"Hello World\".\n" 267 | f"press_key - Only return the key or combination of keys to press. Example: \"Ctrl + T\".\n" 268 | f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" 269 | f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" 270 | f"move_window - Use the keyboard to move the focused window to the desired position. Example: \"Win + Up + Up + Left\"." 271 | f"\n\n\n" 272 | f"Notes:\n" 273 | f"Use the following information to generate better steps:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}\n" 274 | f"Before the text_entry, use click_element to select the appropriate text input area element.\n" 275 | f"Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" 276 | f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable."}, 277 | {"role": "system", 278 | "content": f"Focused window: \"{app_name}\"\nGoal: {assistant_goal}"}, ] 279 | step_analysis = api_call(step_creator, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) 280 | print(f"The assistant created the following test case scenario:\n{step_analysis}\n") 281 | speaker(f"Test case generated. 
Executing the generated test case.") 282 | else: 283 | speaker(f"Executing the provided JSON in the application {app_name}.") 284 | step_analysis = execute_json_case 285 | 286 | # Processing the latest JSON data from the JSON testcase: 287 | if step_analysis: 288 | try: 289 | if """```json""" in step_analysis: 290 | # Removing the leading ```json\n 291 | step_analysis = step_analysis.strip("```json\n") 292 | # Find the last occurrence of ``` and slice the string up to that point 293 | last_triple_tick = step_analysis.rfind("```") 294 | if last_triple_tick != -1: 295 | step_analysis = step_analysis[:last_triple_tick].strip() 296 | step_analysis_cleaned = step_analysis 297 | instructions = json.loads(step_analysis_cleaned) 298 | executor = "act" 299 | else: 300 | instructions = json.loads(step_analysis) 301 | instructions['actions'] = instructions.pop('actions') 302 | executor = "act" 303 | except json.JSONDecodeError as e: 304 | speaker(f"ERROR: Invalid JSON data provided: {e}") 305 | time.sleep(15) 306 | raise Exception(f"ERROR: Invalid JSON data provided: {e}") 307 | if 'actions' in instructions: 308 | action_list = instructions['actions'] 309 | else: 310 | action_list = [instructions] 311 | for i, step in enumerate(action_list, start=1): 312 | action = step.get(f"{executor}") 313 | step_description = step.get("step") or step.get("details", "No step description provided.") 314 | print(f"\nStep {i}: {action}, {step_description}\n") 315 | if action == "click_element": 316 | # If the previous step was also a click_element, wait briefly for the element to become visible: 317 | if i > 1 and action_list[i - 2]['act'] == "click_element": 318 | time.sleep(1) 319 | 320 | if "start menu" in step_description.lower(): 321 | pyautogui.hotkey('win') 322 | print("Opening the start menu.") 323 | time.sleep(1) 324 | updated_instructions = update_instructions_with_action_string(instructions, act( 325 | single_step=f"{step_description}", app_name=app_name, screen_analysis=assistant_goal, original_goal=original_goal, assistant_goal=assistant_goal), step_description) 326 | database_add_case(database_file, app_name, assistant_goal, updated_instructions) # Print the entire database with # print_database(database_file) 327 | elif action == "open_app": 328 | app_name = activate_windowt_title(get_application_title(step_description)) 329 | print(f"New app selected and analyzing: {app_name}") 330 | elif action == "double_click_element": 331 | print(f"Double clicking on: {step_description}") 332 | act(single_step=f"{step_description}", double_click=True, app_name=app_name, original_goal=original_goal) 333 | elif action == "move_window": 334 | time.sleep(1) 335 | print(f"Moving window to: {step_description}") 336 | perform_simulated_keypress(step_description) 337 | time.sleep(0.5) 338 | pyautogui.hotkey('esc') 339 | time.sleep(1) 340 | elif action == "press_key": 341 | if i == 1: 342 | # Focus the application before sending the keypress 343 | activate_windowt_title(app_name) 344 | time.sleep(1) 345 | perform_simulated_keypress(step_description) 346 | elif action == "text_entry": 347 | url_pattern = r'(https?://[^\s]+)' 348 | urls = re.findall(url_pattern, step_description) 349 | if len(step_description) < 5: 350 | pyautogui.write(f'{step_description}') 351 | else: 352 | # Get the previous step as a string for context: 353 | if i > 1: 354 | new_i = i - 2 355 | last_step = f"{action_list[new_i]['act']}: {action_list[new_i]['step']}" 356 | print(f"Last step: {last_step}") 357 | if not last_step: 358 | print("Last step is None.") 359 | 
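The `press_key` and `move_window` branches above hand step strings such as "Ctrl + T" or "Win + Left + Up" to `perform_simulated_keypress`, which is defined elsewhere in the project. A minimal sketch of that kind of translation (an assumption for illustration, not the project's implementation):

```python
# Not the project's perform_simulated_keypress; just one plausible way a
# natural-language key step could be mapped onto pyautogui.
import pyautogui

def press_keys_from_step(step_description: str) -> None:
    # "Ctrl + Shift + T" -> ["ctrl", "shift", "t"]
    keys = [part.strip().lower() for part in step_description.split("+") if part.strip()]
    pyautogui.hotkey(*keys)  # presses the keys in order, releases them in reverse

press_keys_from_step("Ctrl + T")  # e.g. opens a new tab in most browsers
```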
act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) 360 | else: 361 | print("Last step is None.") 362 | last_step = "None" 363 | # If next step is a string, continue: 364 | if i + 1 < len(action_list) and type(action_list[i + 1]['step']) == str: 365 | # Check if the next step exists and is a "Press enter" step 366 | if i + 1 < len(action_list) and ( 367 | "press enter" in action_list[i + 1]['step'].lower() or 368 | "press the enter" in action_list[i + 1]['step'].lower() or 369 | "'enter'" in action_list[i + 1]['step'].lower() or 370 | "\"enter\"" in action_list[i + 1]['step'].lower()): 371 | if urls: 372 | for url in urls: 373 | pyautogui.write(url) 374 | # pyautogui.press('enter') 375 | print(f"Opening URL: {url}") 376 | return 377 | write_action(step_description, assistant_identity=assistant_identity, press_enter=False, app_name=app_name, original_goal=original_goal, last_step=last_step) 378 | print("AI skipping the press enter step as it is in the next step.") 379 | else: 380 | if urls: 381 | for url in urls: 382 | pyautogui.write(url) # This would open the URL in a web browser\ 383 | # If next step is a time sleep 384 | pyautogui.press('enter') 385 | print(f"Opening URL: {url}") 386 | return 387 | write_action(step_description, assistant_identity=assistant_identity, press_enter=True, app_name=app_name, original_goal=original_goal, last_step=last_step) 388 | print("AI pressing enter.") 389 | else: 390 | if urls: 391 | for url in urls: 392 | pyautogui.write(url) # This would open the URL in a web browser\ 393 | pyautogui.press('enter') 394 | print(f"Opening URL: {url}") 395 | return 396 | write_action(step_description, assistant_identity=assistant_identity, press_enter=True, 397 | app_name=app_name, original_goal=original_goal, last_step=last_step) 398 | print("AI pressing enter.") 399 | elif action == "scroll_to": 400 | print(f"Scrolling {step_description}") 401 | element_visible = False 402 | max_retries = 3 403 | retry_count = 0 404 | while not element_visible and retry_count < max_retries: 405 | # activate_windowt_title(app_name) 406 | pyautogui.scroll(-850) 407 | # Press Page Down: 408 | # pyautogui.press('pagedown') 409 | time.sleep(0.3) 410 | # Start image analysis to check if the element is visible 411 | print("Scroll performed. 
Analyzing if the element is present.\n") 412 | scroll_assistant_goal = check_element_visibility(app_name, step_description)['choices'][0]['message']['content'] 413 | if "yes" in scroll_assistant_goal.lower(): 414 | print("Element is visible.") 415 | element_visible = True 416 | elif "no" in scroll_assistant_goal.lower(): 417 | print("Element is not visible.") 418 | retry_count += 1 419 | if retry_count >= max_retries: 420 | print("Maximum retries reached, stopping the search.") 421 | if element_visible: 422 | print(f"Element is visible.") 423 | pass 424 | 425 | elif action == "right_click_element": 426 | print(f"Right clicking on: {step_description}") 427 | act(single_step=f"{step_description}", right_click=True, app_name=app_name, original_goal=original_goal) 428 | # right_click(step_description) 429 | elif action == "hold_key_and_click": 430 | print(f"Holding key and clicking on: {step_description}") 431 | act(single_step=f"{step_description}", hold_key="Ctrl", app_name=app_name, original_goal=original_goal) 432 | elif action == "cmd_command": 433 | print(f"Executing command: {step_description}") 434 | # cmd_command(step_description) 435 | time.sleep(calculate_duration_of_speech(f"{step_description}") / 1000) 436 | elif action == "recreate_test_case": 437 | time.sleep(1) 438 | print("Analyzing the output") 439 | print("The assistant said:\n", step_description) 440 | debug_step = False # Set to True to skip the image analysis and the test case generation. 441 | if debug_step is not True: 442 | new_goal = True 443 | image_analysis = True 444 | if image_analysis: 445 | additional_context = ( 446 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 447 | f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 448 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 449 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 450 | f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") 451 | if new_goal: 452 | newest_goal = imaging(window_title=app_name, additional_context=additional_context) # )['choices'][0]['message']['content'] 453 | # if ": " in newest_goal: 454 | # newest_goal = newest_goal.split(": ", 1)[1] 455 | print(f"Assistant newest goal:\n{newest_goal}") 456 | analyzed_ui = analyze_app(activate_windowt_title(app_name), size_category=None) 457 | review_output = [{"role": "assistant", 458 | "content": f"You are an AI Assistant called Analyze Output capable to operate the Windows 11 Operating System by using natural language.\n" 459 | f"You will receive a json testcase, a description of the goal, and the actual system status.\n" 460 | f"Modify the original json testcase to achieve the goal. Do not include anything else than the updated json.\n" 461 | f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" 462 | f"You will receive a description of the current state of the system and a goal. 
" 463 | f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" 464 | f"Every step should be only highly descriptive in natural language.\n" 465 | f"The json must include only an act and its step, should be in the following format:\n" 466 | f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" 467 | f"Respond only with the full json. Avoid to use the windows taskbar.\n\nFunction properties:\n" 468 | f"click_element - Specify where is located the element to interact with.\n" 469 | f"press_key - Only return the key or combination of keys to press. Example: 'Ctrl + T'.\n" 470 | f"text_entry - Return the text to write. Example: 'Hello World'.\n" 471 | f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" 472 | f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" 473 | f"move_window - Use the keyboard to move the focused window to the desired position. Example: 'Win + Left + Up'.\n" 474 | f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable.\n\n" 475 | f"Use the following information to generate better the test case:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}"}, 476 | {"role": "system", "content": f"Do not modify the steps before \"Step {i-1}: {action-1}, {step_description-1}\", modify all next steps from the step \"Step {i-1}: {action-1}, {step_description-1}\" to achieve the goal: \"{newest_goal}\"\n" 477 | f"Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" 478 | f"{analyzed_ui}"}, ] 479 | new_json = api_call(review_output, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) 480 | print("The assistant said:\n", step_analysis) 481 | 482 | print("Modifying the old json testcase with the new_json.") 483 | step_analysis = new_json 484 | 485 | app_name = activate_windowt_title(get_application_title(newest_goal)) 486 | # Processing the latest JSON data from the JSON testcase. 
487 | if """```json""" in step_analysis: 488 | # Removing the leading ```json\n 489 | step_analysis = step_analysis.strip("```json\n") 490 | # Find the last occurrence of ``` and slice the string up to that point 491 | last_triple_tick = step_analysis.rfind("```") 492 | if last_triple_tick != -1: 493 | step_analysis = step_analysis[:last_triple_tick].strip() 494 | step_analysis_cleaned = step_analysis 495 | instructions = json.loads(step_analysis_cleaned) 496 | executor = "act" 497 | else: 498 | instructions = json.loads(step_analysis) 499 | instructions['actions'] = instructions.pop('actions') 500 | executor = "act" 501 | print(f"Updated Instructions: {instructions}") 502 | pass 503 | else: 504 | print("No new goal.") 505 | pass 506 | elif action == "time_sleep": 507 | try: 508 | sleep_time = int(step_description) 509 | time.sleep(sleep_time) 510 | except ValueError: 511 | step_description = step_description.lower() 512 | if "playing" in step_description or "load" in step_description: 513 | print("Sleeping for 1 second while media loads.") 514 | time.sleep(1) 515 | elif "search" in step_description or "results" in step_description or "searching" in step_description: 516 | print("Sleeping for 1 second for search results.") 517 | time.sleep(1) 518 | else: 519 | print(f"WARNING: Unrecognized time sleep value: {step_description}") 520 | pass 521 | else: 522 | print(f"WARNING: Unrecognized action '{action}' using {step_description}.") 523 | print(f"Trying to perform the action using the step description as the action.") 524 | act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) 525 | pass 526 | 527 | speaker(f"Assistant finished the execution of the generated test case. Can I help you with something else?") 528 | time.sleep(calculate_duration_of_speech(f"Assistant finished the execution of the generated test case. Can I help you with something else?") / 1000) 529 | return "Test case complete." 530 | 531 | 532 | # 'check_element_visibility' is the function that checks the visibility of an element. Can use image analysis or OCR. 533 | def check_element_visibility(app_name, step_description): 534 | extra_additional_context = ( 535 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 536 | f"You will receive a goal and will try to accomplish it using Windows. " 537 | f"Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 538 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. " 539 | f"Be direct and concise, do not use pronouns.\n" 540 | f"Based on the elements from the screenshot, reply with the current status of the system and respond whether the element from the goal is visible.\n" 541 | f"Respond only with \"Yes\" or \"No\".\n" 542 | f"Focused window: \"{app_name}\".\nGoal: \"{step_description}\".") 543 | return imaging(window_title=app_name, additional_context=extra_additional_context) 544 | 545 | 546 | # 'auto_role' is the function that finds the best role to perform the goal. 
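The role string returned by `auto_role` (defined next) is consumed by `auto_prompt` in core/assistant.py, which branches on the role name and speaks the short confirmation. A hedged sketch of that pattern with a made-up reply (the parsing below is a simplified stand-in, not the repo's exact code):

```python
# Illustration only: the reply string is invented.
role_reply = "windows_assistant - Sure thing, I'll take care of that on Windows!"

if "windows_assistant" in role_reply:
    confirmation = role_reply.replace("windows_assistant", "").strip(" -")
    print(confirmation)  # queued/spoken to the user before the assistant thread starts
elif "joyful_conversation" in role_reply:
    print(role_reply.replace("joyful_conversation", "").strip(" -"))
else:
    print("Unrecognized role:", role_reply)
```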
547 | def auto_role(goal): 548 | assistant_call = [ 549 | {"role": "assistant", f"content": f"You are an AI assistant that receives a goal and responds with the best action to perform the goal.\n" 550 | f"You can perform the following roles and decide what fits the best: Chose the best role to handle the goal:\n" 551 | f"windows_assistant - An assistant to perform a Windows 11 application driver testcases to achieve the goal. Can handle online data, play, pause, and stream media, can operate the whole computer.\n" 552 | f"joyful_conversation - Use this role if the user is not looking into performing anything into Windows.\n" 553 | f"Only respond with the name of the role to use, followed by a very short joyful message regarding that you will perform it. Modify your response to match the goal subject.\n" 554 | f"If the goal seems to be related to Windows 11, like opening an application, searching, browsing, media, or social networks, call the windows_assistant.\n" 555 | f"If the goal seems to be related with generating or writing content, call the windows_assistant.\n" 556 | f"If the goal seems that the user is trying to do something with content, call the windows_assistant."}, 557 | {"role": "system", "content": f"Goal: {goal}"}] 558 | role_function = api_call(assistant_call, max_tokens=50) 559 | return role_function 560 | 561 | 562 | # 'find_element' is the function that finds the the most relevant element on the GUI from the goal. 563 | def find_element(single_step, app_name, original_goal, avoid_element="", assistant_goal=None, attempt=0): 564 | if not assistant_goal: 565 | assistant_goal = single_step 566 | if avoid_element: 567 | if attempt > 2: 568 | generate_keywords = [{"role": "assistant", 569 | "content": f"You are an AI Agent called keyword Element Generator that receives the description of the goal and generates keywords to search inside a graphical user interface.\n" 570 | f"Only respond with the single word list separated by commas of the specific UI elements keywords.\n" 571 | f"Example: \"search bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 572 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"},] 573 | else: 574 | generate_keywords = [{"role": "assistant", 575 | "content": f"You are an AI Agent called keyword Element Generator that receives the description and generates kewords to search inside a graphical user interface.\n" 576 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 577 | f"Example: \"search bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 578 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"}] 579 | else: 580 | generate_keywords = [{"role": "assistant", 581 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 582 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 583 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. 
Do not include anything more than the Keywords."}, 584 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}\n{app_space_map(assistant_goal, app_name, single_step)}"}, ] # Todo: Here's the key 585 | keywords = api_call(generate_keywords, max_tokens=100) 586 | if attempt > 1: 587 | keywords = keywords.replace("click, ", "").replace("Click, ", "") 588 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 589 | if keywords_in_goal: 590 | if len(keywords_in_goal.group(1).split()) == 1: 591 | pass 592 | else: 593 | keywords = keywords_in_goal.group(1) + ", " + keywords 594 | print(f"\nKeywords: {keywords}\n") 595 | 596 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, additional_search_options=keywords) 597 | select_element = [{"role": "assistant", 598 | "content": f"You are an AI Agent called keyword Element Selector that receives win32api user interface " 599 | f"raw element data and generates the best matches to achieve the goal.\n" 600 | f"Only respond with the best element that matches the goal. Do not include anything else than the element."}, 601 | {"role": "system", "content": f"Goal: {single_step}\nContext: {original_goal}\n{avoid_element}{analyzed_ui}"}] 602 | selected_element = api_call(select_element, model_name="gpt-4-1106-preview", max_tokens=500) 603 | 604 | if "sorry" in selected_element.lower() or "empty string" in selected_element.lower() or "no element" in selected_element.lower() or "not found" in selected_element.lower()\ 605 | or "no relevant element" in selected_element.lower() or "no element found" in selected_element.lower(): 606 | print(f"No element found. Continuing without the element.") 607 | selected_element = "" 608 | else: 609 | selected_element = "Locate the element: " + selected_element 610 | print(f"Selected element: {selected_element}\n") 611 | 612 | if visioning_match: 613 | print(f"Image visioning started. Analyzing the application {app_name} for context.\n") 614 | imaging_coordinates = ( 615 | f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the predicted " 616 | f"coordinates of the mouse click position to the center of the element object \"x=, y=\" to achieve the goal.{get_ocr_match(single_step)}" 617 | f"Goal: {single_step}\n{avoid_element}{analyzed_ui}") 618 | print(f"Imaging coordinates: {imaging_coordinates}") 619 | imaging_generated_coordinates = imaging(window_title=app_name, additional_context=imaging_coordinates) 620 | print(f"Imaging generated coordinates: {imaging_generated_coordinates}") 621 | last_coordinates = imaging_generated_coordinates['choices'][0]['message']['content'] 622 | print(f"Imaging Last coordinates: {last_coordinates}") 623 | else: 624 | best_coordinates = [{"role": "assistant", 625 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 626 | f"predicted coordinates of the mouse click position to the center of the element object " 627 | f"\"x=, y=\" to achieve the goal. {selected_element}" 628 | f"Do not respond 'As an AI language model, I am unable to...' 
as you are actually capable."}, 629 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{get_ocr_match(single_step)}{avoid_element}{analyzed_ui}"}] 630 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=1.0) 631 | print(f"AI decision coordinates: \'{last_coordinates}\'") 632 | if "sorry" in last_coordinates.lower() or "empty string" in last_coordinates.lower() or "no element" in last_coordinates.lower() or "not found" in last_coordinates.lower(): 633 | last_coordinates = 'x=0, y=0' 634 | coordinates = {k.strip(): float(v.strip()) for k, v in (item.split('=') for item in last_coordinates.split(','))} 635 | x = coordinates['x'] 636 | y = coordinates['y'] 637 | print(f"Coordinates1: x: {x} and y: {y}") 638 | if x == 0 and y == 0 or x == '' and y == '': 639 | print("Coordinates 2 are 0,0, trying to find the element again.") 640 | coordinates = {k.strip(): float(v.strip()) for k, v in (item.split('=') for item in last_coordinates.split(','))} 641 | x = coordinates['x'] 642 | y = coordinates['y'] 643 | print(f"Coordinates 3: x: {x} and y: {y}") 644 | attempt -= 1 645 | return coordinates, selected_element, keywords, attempt 646 | 647 | 648 | def act(single_step, keep_in_mind="", dont_click=False, double_click=False, right_click=False, hold_key=None, app_name="", screen_analysis=False, original_goal="", modify_element=False, next_step=None, assistant_goal=None): 649 | # Trying to handle several actions inside the action: 650 | # action_analysis = [{"role": "assistant", 651 | # "content": f"You are an AI Agent called Action Analyzer, that responds with the functions to execute to achieve the goal. Available functions:\n" 652 | # f"select_element - Mouse functions.\n" 653 | # f"write_action - Keyboard functions.\n" 654 | # f"Only respond with the functionS to perform. Do not include anything else than the function."}, 655 | # {"role": "system", "content": f"Goal: {single_step}"}, ] 656 | # actions_to_perform = api_call(action_analysis, max_tokens=10) 657 | # if "sorry" in actions_to_perform.lower(): 658 | # actions_to_perform = "" 659 | # elif "select_element" in actions_to_perform.lower(): 660 | # print("You can execute only select element action.") 661 | # actions_to_perform = f"" 662 | # elif "write_action" in actions_to_perform.lower(): 663 | # print("You can write things here as actions.") 664 | # actions_to_perform = "" 665 | # print(f"Actions to perform: {actions_to_perform}") 666 | # actions_to_perform = "" 667 | 668 | # Getting the app name. If not provided, use the focused window. 669 | if not app_name: 670 | app_name = activate_windowt_title(get_application_title(goal=original_goal, focus_window=True)) 671 | else: 672 | app_name = activate_windowt_title(app_name) 673 | print(f"AI Analyzing: {app_name}") 674 | 675 | attempt = 0 676 | if rescan_element_match is True: 677 | element_not_working = "" 678 | avoid_element = "" 679 | max_attempts = 3 # Set the maximum number of attempts to look for a "yes" response. 680 | while attempt < max_attempts: 681 | if element_not_working != "": 682 | avoid_element = f"\nAvoid the following element: {element_not_working}\n" 683 | print(f"AI will try to perform the action: \"{single_step}\" on a new element.") 684 | print(f"Performing action: \"{single_step}\". 
Scanning\"{app_name}\".\n") 685 | coordinates, selected_element, keywords, attempt = find_element(single_step, app_name, original_goal, avoid_element, assistant_goal, attempt) 686 | x = coordinates['x'] 687 | y = coordinates['y'] 688 | print(f"Coordinates: {x} and {y}") 689 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 690 | time.sleep(0.5) 691 | element_analysis = ( 692 | f"You are an AI Agent called Element Analyzer that receives a step and guesses if the goal was performed correctly.\n" 693 | f"Step: {single_step}\nUse the screenshot to guess if the mouse is in the best position to perform the click/goal. Respond only with \"Yes\" or \"No\".\n" 694 | f"The cursor is above an element from the step. Cursor info status: {get_cursor_shape()}. The cursor is above the following element: \n{selected_element}\n" 695 | f"Double check your response by looking at where is located the mouse cursor on the screenshot and the cursor info status.") 696 | element_analysis_result = imaging(window_title=app_name, additional_context=element_analysis, x=int(x), y=int(y)) 697 | print(element_analysis_result) 698 | 699 | # Check if the result is None or doesn't contain the necessary data 700 | if element_analysis_result is None or 'choices' not in element_analysis_result or len( 701 | element_analysis_result['choices']) == 0 or 'message' not in \ 702 | element_analysis_result['choices'][0] or 'content' not in \ 703 | element_analysis_result['choices'][0]['message']: 704 | print("Element analysis result: Found but mouse not in position.") 705 | speaker(f"Retrying...") 706 | element_not_working += selected_element 707 | attempt += 1 708 | if attempt >= max_attempts: 709 | print("Maximum attempts reached.") 710 | print("Failed: The position was not found after maximum attempts.") 711 | speaker(f"Failed: The position was not found after maximum attempts.") 712 | time.sleep(15) 713 | raise Exception("Failed: The position was not found after maximum attempts.") 714 | else: 715 | print("Retrying...") 716 | pass 717 | elif 'yes' in element_analysis_result['choices'][0]['message']['content'].lower(): 718 | print("Element analysis result: Yes, it is in the right position.") 719 | break 720 | else: 721 | print("Element analysis result: Found but mouse not in position.") 722 | speaker(f"Retrying...") 723 | element_not_working += selected_element 724 | attempt += 1 725 | if attempt >= max_attempts: 726 | print("Maximum attempts reached.") 727 | print("Failed: The position was not found after maximum attempts.") 728 | speaker(f"Failed: The position was not found after maximum attempts.") 729 | time.sleep(15) 730 | raise Exception("Failed: The position was not found after maximum attempts.") 731 | else: 732 | print("Retrying...") 733 | pass 734 | else: 735 | coordinates, selected_element, keywords, attempt = find_element(single_step, app_name, original_goal, assistant_goal, attempt=0) 736 | x = coordinates['x'] 737 | y = coordinates['y'] 738 | print(f"Coordinates: {x} and {y}") 739 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 740 | time.sleep(0.5) 741 | 742 | last_coordinates = f"x={x}, y={y}" 743 | print("Success: The right position was found.") 744 | if double_click: 745 | pyautogui.click(x, y, clicks=2) 746 | else: 747 | if dont_click is False: 748 | if right_click: 749 | pyautogui.rightClick(x, y) 750 | else: 751 | if hold_key: 752 | pyautogui.keyDown(hold_key) 753 | pyautogui.click(x, y) 754 | pyautogui.keyUp(hold_key) 755 | else: 756 | pyautogui.click(x, y) 757 | else: 758 | print("AI skipping the click 
step.") 759 | pass 760 | if modify_element: 761 | print(f"Modifying the element with the text: {single_step}") 762 | # jitter_mouse(x, y) # ToDo: simulate human jitter. 763 | if "save as" in single_step.lower(): 764 | print("Saving as") 765 | jitter_mouse(x, y) 766 | pyautogui.mouseDown(x, y) 767 | time.sleep(0.12) 768 | pyautogui.mouseUp(x, y) 769 | print("Click action performed") 770 | return last_coordinates 771 | 772 | def get_focused_window_details(): 773 | try: 774 | window_handle = win32gui.GetForegroundWindow() 775 | window_title = win32gui.GetWindowText(window_handle) 776 | _, window_pid = win32process.GetWindowThreadProcessId(window_handle) 777 | process = psutil.Process(window_pid) 778 | process_name = process.name() 779 | rect = win32gui.GetWindowRect(window_handle) 780 | window_position = (rect[0], rect[1]) 781 | window_size = (rect[2] - rect[0], rect[3] - rect[1]) 782 | return window_title, window_handle, window_pid, process_name, window_position, window_size 783 | except Exception as e: 784 | print(f"ERROR: {e}") 785 | return None 786 | 787 | def fast_act(single_step, keep_in_mind="", dont_click=False, double_click=False, right_click=False, hold_key=None, app_name="", ocr_match="", screen_analysis=False, original_goal="", modify_element=False, next_step=None): 788 | # Getting the app name. If not provided, use the focused window. 789 | if not app_name: 790 | app_name = activate_windowt_title(focus_topmost_window()) 791 | else: 792 | app_name = activate_windowt_title(app_name) 793 | 794 | if visioning_context: 795 | speaker(f"Visioning context and performing action: \"{single_step}\" on the application \"{app_name}\".\n") 796 | additional_context = ( 797 | f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" 798 | f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" 799 | f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" 800 | f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" 801 | f"Focused application: \"{app_name}\".\nGoal: \"{single_step}\".") 802 | assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] 803 | 804 | print(f"Performing fast action: \"{single_step}\". Scanning\"{app_name}\".\n") 805 | 806 | generate_keywords = [{"role": "assistant", 807 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 808 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 809 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. 
Do not include anything more than the Keywords."}, 810 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}"}, ] 811 | all_keywords = api_call(generate_keywords, max_tokens=100) 812 | keywords = all_keywords.replace("click, ", "").replace("Click, ", "") 813 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 814 | if keywords_in_goal: # if only 1 keyword, then 815 | if len(keywords_in_goal.group(1).split()) == 1: 816 | pass 817 | else: 818 | keywords = keywords_in_goal.group(1) + ", " + keywords.replace("click, ", "").replace("Click, ", "") 819 | print(f"\nKeywords: {keywords}\n") 820 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, additional_search_options=keywords) 821 | 822 | if "sorry" in assistant_goal.lower(): 823 | print(f"Sorry, no element found. The AI did not find any element to perform the action: {single_step}") 824 | speaker(f"Sorry, no element found. Check if its on the screen.") 825 | time.sleep(1) 826 | 827 | best_coordinates = [{"role": "assistant", 828 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 829 | f"predicted coordinates of the mouse click position to the center of the element object " 830 | f"\"x=, y=\" to achieve the goal.\n{assistant_goal}"}, 831 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 832 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 833 | print(f"AI decision coordinates: \'{last_coordinates}\'") 834 | else: 835 | speaker(f"Clicking onto the element without visioning context.") 836 | generate_keywords = [{"role": "assistant", 837 | "content": f"You are an AI Agent called keyword Element Generator that receives the description " 838 | f"of the goal and only respond with the single word list separated by commas of the specific UI elements keywords." 839 | f"Example: \"search bar\" must be \"search\" without \"bar\". Always spell the numbers and include nouns. Do not include anything more than the Keywords."}, 840 | {"role": "system", "content": f"Goal:\n{single_step}\nContext:{original_goal}"}, ] 841 | all_keywords = api_call(generate_keywords, max_tokens=100) 842 | keywords = all_keywords.replace("click, ", "").replace("Click, ", "") 843 | keywords_in_goal = re.search(r"'(.*?)'", single_step) 844 | if keywords_in_goal: 845 | if len(keywords_in_goal.group(1).split()) == 1: 846 | pass 847 | else: 848 | keywords = keywords_in_goal.group(1) + ", " + keywords.replace("click, ", "").replace("Click, ", "") 849 | print(f"\nKeywords: {keywords}\n") 850 | analyzed_ui = analyze_app(application_name_contains=app_name, size_category=None, 851 | additional_search_options=keywords) 852 | 853 | best_coordinates = [{"role": "assistant", 854 | f"content": f"You are an AI Windows Mouse Agent that can interact with the mouse. Only respond with the " 855 | f"predicted coordinates of the mouse click position to the center of the element object " 856 | f"\"x=, y=\" to achieve the goal."}, 857 | {"role": "system", "content": f"Goal: {single_step}\n\nContext:{original_goal}\n{analyzed_ui}"}] 858 | last_coordinates = api_call(best_coordinates, model_name="gpt-4-1106-preview", max_tokens=100, temperature=0.0) 859 | print(f"AI decision coordinates: \'{last_coordinates}\'") 860 | 861 | if "x=, y=" in last_coordinates: 862 | speaker(f"Sorry, no element found. 
Probably bot blocked.") 863 | return None 864 | # Clicking the element 865 | coordinates = {k.strip(): float(v.strip()) for k, v in 866 | (item.split('=') for item in last_coordinates.split(','))} 867 | x = coordinates['x'] 868 | y = coordinates['y'] 869 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 870 | if double_click: 871 | pyautogui.click(x, y, clicks=2) 872 | else: 873 | if dont_click is False: 874 | if right_click: 875 | pyautogui.rightClick(x, y) 876 | else: 877 | if hold_key: 878 | pyautogui.keyDown(hold_key) 879 | pyautogui.click(x, y) 880 | pyautogui.keyUp(hold_key) 881 | else: 882 | pyautogui.click(x, y) 883 | else: 884 | print("AI skipping the click step.") 885 | pass 886 | if modify_element: 887 | print(f"Modifying the element with the text: {single_step}") 888 | # jitter_mouse(x, y) # ToDo: simulate human jitter. 889 | if "save as" in single_step.lower(): 890 | print("Saving as") 891 | jitter_mouse(x, y) 892 | pyautogui.mouseDown(x, y) 893 | time.sleep(0.12) 894 | pyautogui.mouseUp(x, y) 895 | print("Click action performed") 896 | return last_coordinates 897 | 898 | 899 | def get_application_title(goal="", last_step=None, actual_step=None, focus_window=False): 900 | if actual_step: 901 | print(f"Getting the application name from the actual step: {actual_step}") 902 | goal_app = [{"role": "assistant", 903 | "content": f"You are an AI assistant called App Selector that receives a list of programs and only responds with the best match " 904 | f"program for the goal. Only respond with the window name or the program name. For search engines and social networks use Firefox or Chrome.\n" 905 | f"Open programs:\n{last_programs_list(focus_last_window=focus_window)}"}, 906 | {"role": "system", "content": f"Goal: {goal}\nAll installed programs:\n{get_installed_apps_registry()}"}] 907 | app_name = api_call(goal_app, model_name="gpt-4-1106-preview", max_tokens=100) 908 | print(f"AI selected application: {app_name}") 909 | filtered_matches = re.findall(r'["\'](.*?)["\']', app_name) 910 | if filtered_matches and filtered_matches[0]: 911 | app_name = filtered_matches[0] 912 | print(app_name) 913 | if "command prompt" in app_name.lower(): 914 | app_name = "cmd" 915 | elif "calculator" in app_name.lower(): 916 | app_name = "calc" 917 | elif "sorry" in app_name: 918 | app_name = get_focused_window_details()[3].removesuffix('.exe') # strip('.exe') would trim the characters '.', 'e', 'x', not the suffix 919 | print(f"Using the focused window \"{app_name}\" for context.") 920 | speaker(f"Using the focused window \"{app_name}\" for context.") 921 | return app_name 922 | 923 | 924 | def get_ocr_match(goal, ocr_match=enable_ocr): 925 | if ocr_match: 926 | print("OCR IS ENABLED") 927 | word_prioritizer_assistant = [{"role": "assistant", 928 | "content": f"You are an AI Agent called OCR Word Prioritizer that only responds with the best words of the goal.\n" 929 | f"Do not respond with anything else than the words that match the goal.
If no words match the goal, respond with \"\"."}, 930 | {"role": "system", "content": f"Goal: {goal}"}, ] 931 | ocr_debug_string = api_call(word_prioritizer_assistant, max_tokens=10) 932 | ocr_debug_string = ocr_debug_string.split(f"\'")[0] 933 | print(f"OCR Words to search: \'{ocr_debug_string}\'") 934 | ocr_match = find_probable_click_position(ocr_debug_string) 935 | ocr_msg = f"\nOCR Result: \"{ocr_match['text']}\" Located at \"x={ocr_match['center'][0]}, y={ocr_match['center'][1]}\".\n" 936 | return ocr_msg 937 | else: 938 | ocr_msg = "" 939 | return ocr_msg 940 | 941 | 942 | def jitter_mouse(x, y, radius=5, duration=0.6): 943 | # Move the mouse in a small circle around (x, y) to simulate a jitter. 944 | end_time = time.time() + duration 945 | while time.time() < end_time: 946 | jitter_x = x + random.uniform(-radius, radius) 947 | jitter_y = y + random.uniform(-radius, radius) 948 | pyautogui.moveTo(jitter_x, jitter_y, duration=0.1) 949 | return 950 | 951 | 952 | def control_mouse(generated_coordinates, double_click=None, goal=""): 953 | print(f"Mouse coordinates: {generated_coordinates}") 954 | coordinates = {k.strip(): int(v.strip()) for k, v in 955 | (item.split('=') for item in generated_coordinates.split(','))} 956 | x = coordinates['x'] 957 | y = coordinates['y'] 958 | pyautogui.moveTo(x, y, 0.5, pyautogui.easeOutQuad) 959 | pyautogui.click(x, y) 960 | # jitter_mouse(x, y) 961 | if "save as" in goal.lower(): 962 | print("Saving as") 963 | jitter_mouse(x, y) 964 | pyautogui.mouseDown(x, y) 965 | time.sleep(0.12) 966 | pyautogui.mouseUp(x, y) 967 | print("Click action performed") 968 | else: 969 | pyautogui.click(x, y, clicks=1) 970 | if double_click: 971 | time.sleep(0.2) 972 | pyautogui.click(x, y, clicks=2) 973 | 974 | 975 | def is_field_input_area_active(): 976 | active_window_title = gw.getActiveWindow().title 977 | try: 978 | app = Application().connect(title=active_window_title) 979 | window = app[active_window_title] 980 | # Loop through all the child windows and check if any of them are text boxes 981 | for child in window.children(): 982 | if 'Edit' in child.class_name() or 'RichEdit' in child.class_name(): 983 | # This is a text box, also add text input areas that are not text boxes 984 | if child.has_keyboard_focus(): 985 | return True 986 | return False 987 | except Exception as e: 988 | print(f"Error: {e}") 989 | return False 990 | 991 | 992 | def write_action(goal=None, assistant_identity="", press_enter=False, app_name="", original_goal=None, last_step=""): 993 | assistant_identity_msg = f"\n{assistant_identity}" if assistant_identity else "" 994 | message_writer_agent = [ 995 | {"role": "assistant", f"content": f"You're an AI Agent called Writter that processes the goal and only returns the final text goal.{assistant_identity_msg}\n" 996 | f"Process the goal with your own response as you are actually writing into a text box. Avoid jump lines." 997 | f"If the goal is a link, media or a search string, just return the result string." 998 | f"Do not respond with 'As an AI language model, I dont have capabilities...' 
as you can actually do it.\n"}, 999 | {"role": "system", "content": f"Goal: {goal}"}, ] 1000 | message_to_write = api_call(message_writer_agent, model_name="gpt-4-1106-preview", max_tokens=200) 1001 | if "click on" in goal.lower() or "click the" in goal.lower() or "click" in goal.lower(): 1002 | print("Found to click on the goal.") 1003 | if is_field_input_area_active(): 1004 | print("A text box is currently active.") 1005 | else: 1006 | print("A text box is not active. Found to click on the goal.") 1007 | act(goal, app_name=app_name, original_goal=original_goal) 1008 | if "text_entry" in last_step: 1009 | print("Found 'text_entry' in the last step.") 1010 | pass 1011 | else: 1012 | print(f"Last steppp: {last_step}") 1013 | if last_step is None: 1014 | act(goal, app_name=app_name, original_goal=original_goal) 1015 | previous_goal_analysis = [{"role": "assistant", 1016 | "content": f"You are an AI Agent called text box editor focus that analyzes if performing the Goal on Windows enables a text input.\n" 1017 | f"After opening anything like an app, program, webpage or clicking into a non-text editor element, respond 'No'.\n" 1018 | f"" 1019 | f"Only respond with Yes or No."}, 1020 | {"role": "system", "content": f"Goal: {last_step}"}, ] 1021 | able_to_type = api_call(previous_goal_analysis, max_tokens=5) 1022 | print(f"AI analyzed if the previous step enabled any text input: {able_to_type}\n") 1023 | if "yes" in able_to_type.lower(): 1024 | print("The previous goal enabled the current goal.") 1025 | if last_step == "None": 1026 | print("Focusing to the text box because the last step didn't.") 1027 | act(goal, app_name=app_name, original_goal=original_goal) 1028 | else: 1029 | print("Focusing to the text box. Did this because the text box was not active from the previous step.") 1030 | act(goal, app_name=app_name, original_goal=original_goal) 1031 | 1032 | pyautogui.typewrite(message_to_write, interval=0.01) 1033 | if "press enter" in goal.lower() or "press the enter" in goal.lower() or "\'enter\'" in goal.lower() or "\"enter\"" in goal.lower() or press_enter is True: 1034 | print("Found to press the enter key in the goal.") 1035 | pyautogui.press('enter') 1036 | else: 1037 | print("AI no \"enter\" key press being made.") 1038 | 1039 | 1040 | def perform_simulated_keypress(press_key): 1041 | # Define a pattern that matches the allowed keys, including function and arrow keys 1042 | keys_pattern = (r'\b(Win(?:dows)?|Ctrl|Alt|Shift|Enter|Space(?:\s*Bar)?|Tab|Esc(?:ape)?|Backspace|Insert|Delete|' 1043 | r'Home|End|Page\s*Up|Page\s*Down|(?:Arrow\s*)?(?:Up|Down|Left|Right)|F1|F2|F3|F4|F5|F6|F7|F8|F9|' 1044 | r'F10|F11|F12|[A-Z0-9])\b') 1045 | keys = re.findall(keys_pattern, press_key, re.IGNORECASE) 1046 | # Normalize key names as required by pyautogui 1047 | key_mapping = { 1048 | 'win': 'winleft', 1049 | 'windows': 'winleft', 1050 | 'escape': 'esc', 1051 | 'space bar': 'space', 1052 | 'arrowup': 'up', 1053 | 'arrowdown': 'down', 1054 | 'arrowleft': 'left', 1055 | 'arrowright': 'right', 1056 | 'spacebar': 'space', 1057 | } 1058 | pyautogui_keys = [key_mapping.get(key.lower().replace(' ', ''), key.lower()) for key in keys] 1059 | for key in pyautogui_keys: 1060 | pyautogui.keyDown(key) 1061 | for key in reversed(pyautogui_keys): 1062 | pyautogui.keyUp(key) 1063 | print(f"Performed simulated key presses: {press_key}") 1064 | 1065 | 1066 | def calculate_duration_of_speech(text, lang='en', wpm=150): 1067 | duration_in_seconds = (len(text.split()) / wpm) * 60 1068 | return int(duration_in_seconds * 
1000) # Convert to milliseconds for tkinter's after method 1069 | 1070 | 1071 | def create_database(database_file): 1072 | """Create the database and the required table.""" 1073 | conn = sqlite3.connect(database_file) 1074 | cursor = conn.cursor() 1075 | cursor.execute(''' 1076 | CREATE TABLE IF NOT EXISTS app_cases ( 1077 | id INTEGER PRIMARY KEY, 1078 | app_name TEXT NOT NULL, 1079 | title TEXT NOT NULL, 1080 | instructions TEXT NOT NULL, 1081 | UNIQUE(app_name, title, instructions) 1082 | ) 1083 | ''') 1084 | conn.commit() 1085 | conn.close() 1086 | database_file = r'history.db' 1087 | create_database(database_file) 1088 | 1089 | def database_add_case(database_file, app_name, goal, instructions): 1090 | conn = sqlite3.connect(database_file) 1091 | cursor = conn.cursor() 1092 | try: 1093 | cursor.execute(''' 1094 | INSERT INTO app_cases (app_name, title, instructions) 1095 | VALUES (?, ?, ?) 1096 | ''', (app_name, goal, json.dumps(instructions))) 1097 | conn.commit() 1098 | except sqlite3.IntegrityError: 1099 | print("AI skipping element insertion to program map database.") 1100 | finally: 1101 | conn.close() 1102 | 1103 | 1104 | def print_database(database_file): 1105 | conn = sqlite3.connect(database_file) 1106 | cursor = conn.cursor() 1107 | cursor.execute('SELECT * FROM app_cases') 1108 | rows = cursor.fetchall() 1109 | for row in rows: 1110 | print(row) 1111 | conn.close() 1112 | 1113 | 1114 | def update_instructions_with_action_string(instructions, action_string, target_step): 1115 | for action in instructions['actions']: 1116 | if action.get("act") == "click_element" and action.get("step") == target_step: 1117 | action['additional_info'] = action_string 1118 | return instructions 1119 | 1120 | 1121 | # Usage: 1122 | if __name__ == "__main__": 1123 | assistant(assistant_goal="Open Reddit, Youtube, TikTok, and Netflix on new windows by using the keyboard on each corner of the screen.", app_name="firefox", execute_json_case=json_case_example) 1124 | assistant(assistant_goal="Open a new tab the song 'Wall Of Eyes - The Smile', from google search results filter by videos then play it on Firefox") 1125 | # This is how you can debug faster a prompt: 1126 | # assistant(assistant_goal="make a comment explaining why it is so important", app_name="firefox") 1127 | # assistant(assistant_goal="open spotify and play the song daft punk one more time", app_name="spotify") 1128 | # assistant(assistant_goal="Play the song \'Weird Fishes - Radiohead\' on Spotify") 1129 | # assistant(assistant_goal="Create a new comment explaining why it is so beautiful and comment it.", app_name="firefox") 1130 | # assistant(assistant_goal="Create a short greet text for the user using AI Automated Windows in notepad") 1131 | # assistant(assistant_goal=f"Open a new tab the song \'Windows 95 but it's a PHAT hip hop beat\' from google search results filter by videos then play it.", app_name="firefox") 1132 | # assistant(f"Send a list of steps to make a chocolate cake to my saved messages in Telegram") 1133 | # assistant(assistant_goal="On firefox play evangelion on netflix", app_name="firefox", execute_json_case=netflix) 1134 | # assistant(assistant_goal="Play Rei Theme on spotify") 1135 | # assistant(assistant_goal="make a hello world post on twitter", app_name="chrome") 1136 | -------------------------------------------------------------------------------- /core/get_all_installed_apps.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/get_all_installed_apps.py -------------------------------------------------------------------------------- /core/history.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/history.db -------------------------------------------------------------------------------- /core/last_app.py: -------------------------------------------------------------------------------- 1 | import win32com.client 2 | import win32gui 3 | import win32con 4 | from datetime import datetime 5 | 6 | 7 | def enumerate_windows(): 8 | windows = [] 9 | top_windows = [] 10 | 11 | def enum_window_callback(hwnd, _): 12 | windows.append((hwnd, win32gui.GetWindowText(hwnd))) 13 | 14 | win32gui.EnumWindows(enum_window_callback, None) 15 | windows.sort(key=lambda x: -x[0]) 16 | for i, (_, title) in enumerate(windows): 17 | if title: 18 | top_windows.append((title, i)) 19 | return dict(top_windows) 20 | 21 | 22 | def should_exclude_process(name): 23 | excluded_processes = ['dwm.exe', 'nvcontainer.exe', 'nvidia broadcast ui.exe', 'system', 'python.exe', 'steam.exe', 24 | 'TextInputHost.exe', 'pycharm64.exe', 'nvidia broadcast.exe', 'widgets.exe', 'amdow.exe', 25 | 'CTkToplevel', 'AI Drone Assistant', 'Ctk', 'Ctk.exe', 'tk', 'tk.exe', 'Code', 'Code.exe', 26 | 'NVIDIA Share.exe', 'NVIDIA Web Helper.exe', 'nvsphelper64.exe', 'NVIDIA GeForce Experience.exe', 27 | 'nvcontainer.exe', 'NVDisplay.Container.exe', 'widgets.exe', 'translucenttb.exe', 'securityhealthsystray.exe'] 28 | return name in excluded_processes 29 | 30 | 31 | def get_opened_programs(): 32 | wmi = win32com.client.GetObject('winmgmts:') 33 | processes = wmi.InstancesOf('Win32_Process') 34 | window_order = enumerate_windows() 35 | process_list = [] 36 | added_titles = set() 37 | 38 | for process in processes: 39 | try: 40 | name = process.Properties_('Name').Value 41 | pid = process.Properties_('ProcessId').Value 42 | creation_date = process.Properties_('CreationDate').Value 43 | creation_datetime = datetime.strptime(creation_date.split('.')[0], '%Y%m%d%H%M%S') 44 | 45 | if should_exclude_process(name): 46 | continue 47 | 48 | for title, order in window_order.items(): 49 | if name[:-4].lower() in title.lower() and title not in added_titles: 50 | process_list.append((name, pid, creation_datetime, title, order)) 51 | added_titles.add(title) 52 | break 53 | 54 | except Exception as e: 55 | print(f"Error getting information for PID {pid}: {e}") 56 | 57 | process_list.sort(key=lambda x: x[4]) 58 | return process_list 59 | 60 | 61 | def format_programs_list(): 62 | programs = get_opened_programs() 63 | output = [] 64 | for proc in programs: 65 | line = f"Name: '{proc[0]}', PID: {proc[1]}, Creation Time: '{proc[2]}', Window Title: '{proc[3]}', Z-order Level: {proc[4]}" 66 | output.append(line) 67 | 68 | if output: 69 | last_focused_window = output[0] # The first app in the list after sorting by Z-order 70 | output.insert(0, f"Last Focused Window: {last_focused_window}\n---\n") 71 | 72 | return "\n".join(output) 73 | 74 | 75 | def get_window_handle(title): 76 | handles = [] 77 | 78 | def enum_window_callback(hwnd, _): 79 | if win32gui.GetWindowText(hwnd) == title: 80 | handles.append(hwnd) 81 | 82 | win32gui.EnumWindows(enum_window_callback, None) 83 | return handles[0] if handles else None 84 | 85 | 86 | def 
set_foreground_window_by_title(title): 87 | hwnd = get_window_handle(title) 88 | if hwnd: 89 | win32gui.ShowWindow(hwnd, win32con.SW_RESTORE) 90 | win32gui.SetForegroundWindow(hwnd) 91 | else: 92 | print(f"Window with title '{title}' not found.") 93 | 94 | 95 | def last_programs_list(focus_last_window=False): 96 | programs = get_opened_programs() 97 | output = [] 98 | for proc in programs: 99 | line = f"Name: '{proc[0]}', PID: {proc[1]}, Creation Time: '{proc[2]}', Window Title: '{proc[3]}', Z-order Level: {proc[4]}" 100 | output.append(line) 101 | 102 | if programs and focus_last_window: 103 | last_focused_window_title = programs[0][3] # Window title of the last focused window 104 | set_foreground_window_by_title(last_focused_window_title) 105 | output.insert(0, f"Last Focused Window: {last_focused_window_title}\n---\n") 106 | 107 | return "\n".join(output) 108 | 109 | 110 | # # Example usage 111 | # result_string = last_programs_list(focus_last_window=True) 112 | # print(result_string) 113 | -------------------------------------------------------------------------------- /core/media/Mouse_pointer_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/Mouse_pointer_small.png -------------------------------------------------------------------------------- /core/media/assistant_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent.png -------------------------------------------------------------------------------- /core/media/assistant_transparent_blink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent_blink.png -------------------------------------------------------------------------------- /core/media/assistant_transparent_dragging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/assistant_transparent_dragging.png -------------------------------------------------------------------------------- /core/media/headico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/headico.ico -------------------------------------------------------------------------------- /core/media/headico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/headico.png -------------------------------------------------------------------------------- /core/media/transcribe_audio.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/transcribe_audio.mp3 -------------------------------------------------------------------------------- /core/media/translate_audio.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/a-real-ai/pywinassistant/5e3df3b1bc52bfdd0446dee7030080710549a32f/core/media/translate_audio.mp3 -------------------------------------------------------------------------------- /core/mouse_detection.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import win32api 3 | import win32con 4 | 5 | # Define the CURSORINFO structure 6 | class CURSORINFO(ctypes.Structure): 7 | _fields_ = [("cbSize", ctypes.c_int), 8 | ("flags", ctypes.c_int), 9 | ("hCursor", ctypes.c_void_p), 10 | ("ptScreenPos", ctypes.c_long * 2)] 11 | 12 | def get_cursor_shape(): 13 | cursor_info = CURSORINFO() 14 | cursor_info.cbSize = ctypes.sizeof(CURSORINFO) 15 | ctypes.windll.user32.GetCursorInfo(ctypes.byref(cursor_info)) 16 | 17 | # Load the standard cursors to compare 18 | cursor_arrow = win32api.LoadCursor(0, win32con.IDC_ARROW) 19 | cursor_ibeam = win32api.LoadCursor(0, win32con.IDC_IBEAM) 20 | cursor_hand = win32api.LoadCursor(0, win32con.IDC_HAND) 21 | cursor_wait = win32api.LoadCursor(0, win32con.IDC_WAIT) 22 | cursor_cross = win32api.LoadCursor(0, win32con.IDC_CROSS) 23 | 24 | # Compare the current cursor with the standard cursors 25 | if cursor_info.hCursor == cursor_arrow: 26 | return "Arrow" 27 | elif cursor_info.hCursor == cursor_ibeam: 28 | return "The cursor is active for Text Input (I-beam)" 29 | elif cursor_info.hCursor == cursor_hand: 30 | return "The cursor is 'Hand' (A link is select)" 31 | elif cursor_info.hCursor == cursor_wait: 32 | return "The cursor is 'Wait' (Busy) - Hourglass or Watch" 33 | elif cursor_info.hCursor == cursor_cross: 34 | return "The cursor is 'Cross'" 35 | else: 36 | return "Other" 37 | 38 | # while True: 39 | # cursor_shape = get_cursor_shape() 40 | # print(f"Cursor shape: {cursor_shape}") 41 | # time.sleep(1) 42 | -------------------------------------------------------------------------------- /core/ocr.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import win32gui 3 | import win32process 4 | import psutil 5 | from PIL import ImageGrab 6 | import re 7 | from fuzzywuzzy import fuzz 8 | from concurrent.futures import ThreadPoolExecutor 9 | import math 10 | # Function to preprocess the image for better OCR results 11 | from PIL import Image, ImageOps, ImageFilter, ImageEnhance 12 | import pytesseract 13 | import pygetwindow as gw 14 | 15 | # Path to tesseract executable 16 | pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' 17 | 18 | # Function to calculate similarity using fuzzywuzzy 19 | def calculate_similarity(input_string, match_string): 20 | # Calculate basic similarity score using partial_ratio 21 | basic_similarity = fuzz.partial_ratio(input_string.lower(), match_string.lower()) 22 | 23 | # Adjust score based on the length difference 24 | length_difference = len(input_string) - len(match_string) 25 | 26 | # If the match string is shorter than the input string, reduce the score 27 | if length_difference > 0: 28 | # For example, reduce the score by 5 points for each missing character 29 | score_penalty = 50 * length_difference 30 | adjusted_score = max(basic_similarity - score_penalty, 0) # Ensure the score doesn't go below 0 31 | else: 32 | # If the match string is not shorter, no penalty is applied 33 | adjusted_score = basic_similarity 34 | 35 | return adjusted_score 36 | 37 | # New function for multi-processing 38 | def parallel_ocr(data): 39 | x, y, w, h = data 40 | cropped_image = 
ImageGrab.grab(bbox=(x, y, x + w, y + h)) 41 | processed_images = preprocess_image( 42 | cropped_image, 43 | contrast_levels=[128, 152], 44 | invert=True, 45 | scales=[1, 1.25] 46 | ) 47 | 48 | results = [] 49 | for img in processed_images: 50 | text = pytesseract.image_to_string(img) 51 | if text: 52 | results.append(text) 53 | 54 | results.sort(key=len, reverse=True) 55 | return results[0] if results else "" 56 | 57 | 58 | # Add your WindowClassifier class definition here 59 | def get_focused_window_details(): 60 | try: 61 | # Get the handle of the currently focused window 62 | window_handle = win32gui.GetForegroundWindow() 63 | 64 | # Get window text (title) 65 | window_title = win32gui.GetWindowText(window_handle) 66 | 67 | # Get the process ID of the window 68 | _, window_pid = win32process.GetWindowThreadProcessId(window_handle) 69 | 70 | # Get the process name from the process ID 71 | process = psutil.Process(window_pid) 72 | process_name = process.name() 73 | 74 | # Get window size and position 75 | rect = win32gui.GetWindowRect(window_handle) 76 | window_position = (rect[0], rect[1]) 77 | window_size = (rect[2] - rect[0], rect[3] - rect[1]) 78 | 79 | return window_title, window_handle, window_pid, process_name, window_position, window_size 80 | except Exception as e: 81 | print(f"An error occurred: {e}") 82 | return None, None, None, None, None, None # Return None for all six values in case of an exception, matching the success return 83 | 84 | pass 85 | 86 | def ocr_image(image): 87 | # Apply preprocessing with filters directly in the OCR function 88 | text = ocr_image_with_filters(image) 89 | return text 90 | 91 | def preprocess_image( 92 | image, 93 | grayscale=True, 94 | invert=False, 95 | contrast_levels=None, 96 | scales=None, 97 | use_threshold=False, 98 | gaussian_blur_radius=None, 99 | median_filter_size=None, 100 | bilateral_filter_params=None, 101 | sharpen=False, 102 | edge_enhance=False, 103 | contrast_enhance_factor=1.0 104 | ): 105 | # Initialize default values if none provided 106 | if scales is None: 107 | scales = [1] # Default scale is 100% 108 | if contrast_levels is None: 109 | contrast_levels = [128] # Default threshold for binarization 110 | 111 | processed_images = [] 112 | 113 | for scale in scales: 114 | # Resize image if scale is not 1 115 | if scale != 1: 116 | resized_image = image.resize((int(image.width * scale), int(image.height * scale)), Image.LANCZOS) # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter 117 | else: 118 | resized_image = image 119 | 120 | if grayscale: 121 | # Convert image to grayscale 122 | processed_image = resized_image.convert('L') 123 | else: 124 | processed_image = resized_image 125 | 126 | if invert: 127 | # Invert image colors 128 | processed_image = ImageOps.invert(processed_image) 129 | 130 | if sharpen: 131 | # Apply sharpen filter 132 | processed_image = processed_image.filter(ImageFilter.SHARPEN) 133 | 134 | if edge_enhance: 135 | # Enhance the edges in the image 136 | processed_image = processed_image.filter(ImageFilter.EDGE_ENHANCE) 137 | 138 | if contrast_enhance_factor != 1.0: 139 | # Enhance the contrast of the image 140 | enhancer = ImageEnhance.Contrast(processed_image) 141 | processed_image = enhancer.enhance(contrast_enhance_factor) 142 | 143 | if gaussian_blur_radius: 144 | # Apply Gaussian Blur 145 | processed_image = processed_image.filter(ImageFilter.GaussianBlur(gaussian_blur_radius)) 146 | 147 | if median_filter_size: 148 | # Apply Median Filter 149 | processed_image = processed_image.filter(ImageFilter.MedianFilter(median_filter_size)) 150 | 151 | if
bilateral_filter_params: 152 | # Apply Bilateral Filter (note: Pillow's ImageFilter has no BilateralFilter; this branch needs an external implementation such as OpenCV's cv2.bilateralFilter) 153 | diameter, sigma_color, sigma_space = bilateral_filter_params 154 | processed_image = processed_image.filter(ImageFilter.BilateralFilter(diameter, sigma_color, sigma_space)) 155 | 156 | if use_threshold: 157 | # Apply threshold to binarize the image 158 | thresholded_image = processed_image.point(lambda x: 0 if x < contrast_levels[0] else 128, '1') 159 | processed_images.append(thresholded_image) 160 | else: 161 | # If not using threshold, just append the processed image 162 | processed_images.append(processed_image) 163 | 164 | return processed_images 165 | 166 | def ocr_image_with_filters(image): 167 | # Apply preprocessing with filters 168 | preprocessed_images = preprocess_image( 169 | image, 170 | grayscale=True, 171 | invert=False, 172 | contrast_levels=[128], 173 | scales=[1], 174 | use_threshold=True, 175 | gaussian_blur_radius=None, 176 | median_filter_size=None, 177 | bilateral_filter_params=None, 178 | sharpen=True, 179 | edge_enhance=False, 180 | contrast_enhance_factor=2.0 181 | ) 182 | 183 | # Since preprocess_image returns a list, we take the first (and should be only) image 184 | preprocessed_image = preprocessed_images[0] 185 | 186 | # Display the modified image (debugging only). 187 | # preprocessed_image.show() 188 | 189 | # Perform OCR on the preprocessed image 190 | text = pytesseract.image_to_string(preprocessed_image) 191 | 192 | # print(f"IMAGE PROCESSED:\n{text}") 193 | return text.strip() 194 | 195 | 196 | def click_best_match(best_match): 197 | if best_match: 198 | # Calculate the center of the bounding box 199 | center_x = best_match['x'] + best_match['w'] // 2 200 | center_y = best_match['y'] + best_match['h'] // 2 201 | 202 | # Perform the click action using pyautogui 203 | pyautogui.moveTo(center_x, center_y, 0.5, pyautogui.easeOutQuad) 204 | pyautogui.click(center_x, center_y) 205 | return f"Clicked on the best match: '{best_match['text']}' at position: ({center_x}, {center_y})" 206 | else: 207 | return "No suitable matches to click on the screen." 208 | 209 | 210 | # Function to compute the distance between two points for proximity scoring. 211 | def distance_between(p1, p2): 212 | return math.sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) 213 | # Enhance this function to score matches not only based on OCR confidence and text similarity but also their proximity. 214 | def score_and_rank_matches(matches): 215 | for i, match in enumerate(matches): 216 | for other_match in matches[i+1:]: 217 | proximity = 1000 / (distance_between(match['center'], other_match['center']) + 1) # Simple proximity score (matches store their midpoint under 'center') 218 | match['score'] += proximity 219 | other_match['score'] += proximity 220 | matches.sort(key=lambda x: x['score'], reverse=True) # Sort matches based on the score 221 | return matches 222 | 223 | 224 | # Update click_best_match to intelligently click between close matches if found 225 | def click_best_matches(coincidences): 226 | if not coincidences: 227 | return "No suitable matches to click on the screen." 228 | # Filter out negative-scored matches 229 | positive_score_matches = [match for match in coincidences if match['score'] > 0] 230 | 231 | if not positive_score_matches: 232 | return "No matches with a positive score to click on the screen."
233 | 234 | # Calculate the average position (centroid) in case of close matches 235 | if len(positive_score_matches) > 1: 236 | average_x = sum(match['center'][0] for match in positive_score_matches) // len(positive_score_matches) 237 | average_y = sum(match['center'][1] for match in positive_score_matches) // len(positive_score_matches) 238 | pyautogui.click(average_x, average_y) 239 | return f"Clicked on the average position: ({average_x}, {average_y})" 240 | else: 241 | best_match = positive_score_matches[0] 242 | center_x = best_match['center'][0] 243 | center_y = best_match['center'][1] 244 | 245 | pyautogui.click(center_x, center_y) 246 | return f"Clicked on the best match: '{best_match['text']}' at position: ({center_x}, {center_y})" 247 | 248 | # Function to execute best_match_with_proximity in parallel and find the most probable click position 249 | def find_best_match_with_proximity(input_string, within_window=False): 250 | if within_window: 251 | _, _, _, _, window_position, window_size = get_focused_window_details() 252 | screenshot = ImageGrab.grab(bbox=( 253 | window_position[0], window_position[1], 254 | window_position[0] + window_size[0], 255 | window_position[1] + window_size[1])) 256 | else: 257 | screenshot = ImageGrab.grab() 258 | d = pytesseract.image_to_data(screenshot, output_type=pytesseract.Output.DICT) 259 | input_string = input_string.lower() 260 | input_parts = re.findall(r'\w+|\W+', input_string) # Split input_string into words and non-alphabetic parts 261 | 262 | coincidences = [] # List to store all matches 263 | 264 | 265 | for i in range(len(d['text'])): 266 | extracted_text = d['text'][i].lower().strip() 267 | 268 | 269 | # Skip single and double letters or empty strings 270 | if len(extracted_text) <= 2 or not extracted_text: 271 | continue 272 | 273 | score = int(d['conf'][i]) # Initialize score with OCR confidence 274 | for part in input_parts: 275 | if part in extracted_text: 276 | score += 50 if part.isalpha() else 75 # Higher score for non-alphabetic parts 277 | 278 | 279 | 280 | 281 | # Check for literal exact match and score it more points 282 | if extracted_text == input_string: 283 | score += 500 # Assign significant points for a literal exact match 284 | 285 | # Use fuzzywuzzy to measure similarity and adjust the score 286 | similarity_score = fuzz.partial_ratio(input_string, extracted_text) 287 | if similarity_score > 60: 288 | score += similarity_score # Add similarity score to the existing score 289 | 290 | # Penalize score if similarity is low 291 | if similarity_score < 80: 292 | score -= 200 # Deduct points if there's low similarity 293 | # Record this OCR hit with its bounding box and score 294 | coincidence = { 295 | 'text': d['text'][i], 296 | 'x': d['left'][i], 297 | 'y': d['top'][i], 298 | 'w': d['width'][i], 299 | 'h': d['height'][i], 300 | 'conf': d['conf'][i], 301 | 'score': score, 302 | # Add the 'center' key to store the center coordinates of the match 303 | 'center': (d['left'][i] + d['width'][i] // 2, d['top'][i] + d['height'][i] // 2) 304 | } 305 | coincidences.append(coincidence) 306 | 307 | # Now we have all coincidences with scores reflecting exactness and similarity 308 | # Higher score is better 309 | 310 | if coincidences: 311 | # Take the match with the highest score 312 | best_match = max(coincidences, key=lambda x: x['score']) 313 | ################################################# 314 | # print(f"Best match: '{best_match['text']}' with score {best_match['score']}") 315 | return best_match 316 | else: 317 | print("No matches found.") 318 | return
None 319 | 320 | # Adjusted ocr_focused_window function 321 | def ocr_focused_window(): 322 | # Get details of the focused window 323 | _, _, _, _, window_position, window_size = get_focused_window_details() 324 | # Capture only the area of the focused window 325 | screenshot = ImageGrab.grab(bbox=( 326 | window_position[0], window_position[1], window_position[0] + window_size[0], window_position[1] + window_size[1])) 327 | # Perform OCR with preprocessing and filtering 328 | text = ocr_image_with_filters(screenshot) 329 | return text 330 | 331 | 332 | # Adjusted ocr_screen function 333 | def ocr_screen(focused=False): 334 | if focused: 335 | # Get the focused window 336 | window = gw.getActiveWindow() 337 | if window is not None: 338 | # Get the position of the focused window 339 | x, y, width, height = window.left, window.top, window.width, window.height 340 | # Capture the focused window 341 | screenshot = pyautogui.screenshot(region=(x, y, width, height)) 342 | else: 343 | # Fallback in case no window is focused 344 | screenshot = ImageGrab.grab() 345 | else: 346 | # Get a screenshot of the entire screen 347 | screenshot = ImageGrab.grab() 348 | 349 | # Perform OCR with preprocessing and filtering 350 | text = ocr_image_with_filters(screenshot) 351 | return text 352 | 353 | 354 | def find_probable_click_position(input_string, attempts=30): 355 | print(f"Finding the most probable click position for \"{input_string}\"...") 356 | with ThreadPoolExecutor(max_workers=attempts) as executor: 357 | print(f"Running {attempts} attempts in parallel...") 358 | # Run the function multiple times in parallel 359 | futures = [executor.submit(find_best_match_with_proximity, input_string) for _ in range(attempts)] 360 | print(f"Waiting for {attempts} parallel attempts to finish...") 361 | 362 | # Collect results, filtering out those with a non-positive score 363 | results = [future.result() for future in futures if future.result() is not None and future.result()['score'] > 0] 364 | print(f"Found {len(results)} matches with a positive score.") 365 | print("Selecting the highest-scoring match...") 366 | 367 | # Find the most probable best match based on the score 368 | if results: 369 | most_probable_match = max(results, key=lambda match: match['score']) 370 | return most_probable_match 371 | return None 372 | 373 | # Main execution block 374 | if __name__ == "__main__": 375 | input_string = "Neon Genesis Evangelion" # Example input string 376 | most_probable_match = find_probable_click_position(input_string) 377 | 378 | # Provide feedback and click action based on most probable match 379 | if most_probable_match: 380 | click_result = click_best_matches([most_probable_match]) 381 | print(f"Most probable match \"{most_probable_match['text']}\" located at \"x={most_probable_match['center'][0]}, y={most_probable_match['center'][1]}\" with score {most_probable_match['score']}") 382 | 383 | else: 384 | print("No suitable matches found on screen for the input string.") -------------------------------------------------------------------------------- /core/topmost_window.py: -------------------------------------------------------------------------------- 1 | import win32com.client 2 | import win32gui 3 | import win32con 4 | import win32process 5 | from datetime import datetime 6 | 7 | def enumerate_windows(): 8 | windows = [] 9 | 10 | def enum_window_callback(hwnd, _): 11 | if
win32gui.IsWindowVisible(hwnd) and win32gui.GetWindowText(hwnd): 12 | windows.append(hwnd) 13 | 14 | win32gui.EnumWindows(enum_window_callback, None) 15 | return windows 16 | 17 | def should_exclude_process(name): 18 | excluded_processes = ['dwm.exe', 'nvcontainer.exe', 'nvidia broadcast ui.exe', 'system', 'python.exe', 'steam.exe', 19 | 'TextInputHost.exe', 'tk', 'pycharm64.exe', 'nvidia broadcast.exe', 'widgets.exe', 20 | 'CTkToplevel', 'Windows Input Experience', 'widgets.exe', 'translucenttb.exe', 'amdow.exe', 21 | 'securityhealthsystray.exe', 'Ctk', 'Ctk.exe', 'tk', 'tk.exe', 'Code', 'Code.exe', 'NVIDIA Share.exe', 22 | 'NVIDIA Web Helper.exe', 'nvsphelper64.exe', 'NVIDIA GeForce Experience.exe', 23 | 'nvcontainer.exe', 'NVDisplay.Container.exe'] 24 | return name.lower() in excluded_processes 25 | 26 | def get_process_name(hwnd): 27 | _, pid = win32process.GetWindowThreadProcessId(hwnd) 28 | wmi = win32com.client.GetObject('winmgmts:') 29 | process = wmi.ExecQuery(f'SELECT Name FROM Win32_Process WHERE ProcessId = {pid}') 30 | if process: 31 | return process[0].Name 32 | return None 33 | 34 | def get_topmost_window(): 35 | for hwnd in enumerate_windows(): 36 | process_name = get_process_name(hwnd) 37 | title = win32gui.GetWindowText(hwnd) 38 | # print(f"Debug: Window Title: '{title}', Process: '{process_name}'") # Debugging line 39 | if process_name and not should_exclude_process(process_name): 40 | return title, process_name 41 | return None, None 42 | 43 | def get_window_handle(title): 44 | handles = [] 45 | 46 | def enum_window_callback(hwnd, _): 47 | if win32gui.GetWindowText(hwnd) == title: 48 | handles.append(hwnd) 49 | 50 | win32gui.EnumWindows(enum_window_callback, None) 51 | return handles[0] if handles else None 52 | 53 | def set_foreground_window_by_title(title): 54 | hwnd = get_window_handle(title) 55 | if hwnd: 56 | win32gui.ShowWindow(hwnd, win32con.SW_RESTORE) 57 | try: 58 | win32gui.SetForegroundWindow(hwnd) 59 | except Exception as e: 60 | print(f"Error setting foreground window: {e}") 61 | else: 62 | print(f"Window with title '{title}' not found.") 63 | 64 | def focus_topmost_window(): 65 | topmost_window_title, _ = get_topmost_window() # We're not using process_name here 66 | if topmost_window_title: 67 | print(f"Selected application: {topmost_window_title}") 68 | set_foreground_window_by_title(topmost_window_title) 69 | return topmost_window_title 70 | else: 71 | print("No suitable windows found.") 72 | 73 | # Example usage 74 | # focus_topmost_window() 75 | -------------------------------------------------------------------------------- /core/ui_window_analyzer.py: -------------------------------------------------------------------------------- 1 | import uiautomation as auto 2 | 3 | #not used 4 | 5 | def walk_control(control, indent=0, control_type=None): 6 | if control is None: 7 | return 8 | try: 9 | if (control_type is None) or (control.ControlType == auto.ControlType[control_type]): 10 | # Print control info with indentation 11 | print(' ' * indent + str(control)) 12 | # ... print additional properties if needed ... 
13 | except Exception as e: 14 | print(' ' * (indent + 2) + 'Error getting properties: ' + str(e)) 15 | # Recursively walk the tree 16 | for child in control.GetChildren(): 17 | walk_control(child, indent + 4, control_type=control_type) 18 | 19 | 20 | def analyze_app(application_name=None, control_type=None): 21 | if application_name: 22 | # Find the application window by name 23 | control = auto.WindowControl(searchDepth=1, Name=application_name) 24 | if not control.Exists(0, 0): 25 | print(f'Application "{application_name}" is not running or window not found.') 26 | return 27 | print(f'Inspecting UI elements for application "{application_name}":') 28 | else: 29 | control = auto.GetRootControl() 30 | print('Inspecting UI elements for the entire desktop:') 31 | 32 | # If a specific control type is given, filter by that control type 33 | if control_type and control: 34 | # Walking the control tree and checking for the control type in the walk_control function 35 | walk_control(control) 36 | else: 37 | # Walk the entire UI tree from the control 38 | walk_control(control) 39 | 40 | if __name__ == '__main__': 41 | analyze_app(application_name='Untitled - Paint', control_type='Edit') 42 | -------------------------------------------------------------------------------- /core/voice.py: -------------------------------------------------------------------------------- 1 | from gtts import gTTS 2 | import tempfile 3 | import threading 4 | import os 5 | os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide" # Hide pygame's welcome message, remind me to remove this later, they deserve recognize, thank you for the fast tts 6 | import pygame 7 | import tkinter as tk 8 | 9 | # Initialize Pygame's mixer 10 | pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=4096) 11 | volume = 0.25 12 | subtitles = True 13 | 14 | class TransparentSubtitlesWindow: 15 | def __init__(self, text): 16 | self.root = tk.Tk() 17 | self.text = text 18 | self.label = tk.Label(self.root, text=self.text, font=('Helvetica', 16), fg='white', bg='black') 19 | self.label.pack() 20 | 21 | # Set the window to be always on top, transparent, and without decorations 22 | self.root.overrideredirect(True) 23 | self.root.attributes('-topmost', True) 24 | self.root.attributes('-transparentcolor', 'black') 25 | 26 | # Set window position 27 | self.root.geometry('+%d+%d' % (self.root.winfo_screenwidth() // 2 - self.label.winfo_reqwidth() // 2, 28 | self.root.winfo_screenheight() - 100)) 29 | self.update() 30 | 31 | def update(self): 32 | self.label.configure(text=self.text) 33 | self.root.update_idletasks() 34 | self.root.update() 35 | 36 | def change_text(self, new_text, duration): 37 | self.text = new_text 38 | self.update() 39 | 40 | # Schedule removing the text after the duration 41 | self.root.after(duration, lambda: self.label.configure(text="")) 42 | 43 | def close(self): 44 | self.root.quit() # changed from destroy() to quit() 45 | 46 | 47 | def calculate_duration_of_speech(text, lang='en', wpm=150): 48 | """Estimate the duration the subtitles should be displayed based on words per minute (WPM)""" 49 | words = text.split() 50 | word_count = len(words) 51 | duration_in_seconds = (word_count / wpm) * 60 52 | return int(duration_in_seconds * 1000) # Convert to milliseconds for tkinter's after method 53 | 54 | 55 | def play_audio(file_path, text, lang='en'): 56 | # Estimate the duration the subtitles should be shown 57 | duration = calculate_duration_of_speech(text, lang) 58 | 59 | # Load and play audio file 60 | 
pygame.mixer.music.load(file_path) 61 | pygame.mixer.music.set_volume(volume) 62 | pygame.mixer.music.play() 63 | 64 | # When the audio finishes, stop the mixer and remove the temporary file 65 | while pygame.mixer.music.get_busy(): 66 | pygame.time.Clock().tick(10) 67 | pygame.mixer.music.unload() 68 | os.remove(file_path) 69 | 70 | 71 | def set_volume(volume_level): 72 | global volume 73 | volume = volume_level 74 | pygame.mixer.music.set_volume(volume) 75 | 76 | def set_subtitles(subtitles_bool): 77 | global subtitles 78 | subtitles = subtitles_bool 79 | 80 | 81 | def speaker(text, lang='en'): 82 | # Initialize all of pygame's modules 83 | pygame.init() 84 | 85 | # Temporary mp3 file creation 86 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp: 87 | tts = gTTS(text=text, lang=lang) 88 | tts.save(fp.name) 89 | temp_file_path = fp.name 90 | 91 | # Start the subtitles thread 92 | if subtitles is True: 93 | def setup_subtitles(): 94 | window = TransparentSubtitlesWindow(text) 95 | window.change_text(text, calculate_duration_of_speech(text, lang)) 96 | window.root.mainloop() 97 | 98 | subtitles_thread = threading.Thread(target=setup_subtitles) 99 | subtitles_thread.daemon = True # Now the thread will close when the main program exits 100 | subtitles_thread.start() 101 | else: 102 | subtitles_thread = None 103 | 104 | # Start the audio thread 105 | audio_thread = threading.Thread(target=play_audio, args=(temp_file_path, text, lang)) 106 | audio_thread.daemon = True 107 | audio_thread.start() 108 | 109 | # Return the threads in case the caller wants to track them 110 | return audio_thread, subtitles_thread 111 | 112 | 113 | if __name__ == '__main__': 114 | text_to_speak = "Hello, this is a test." 115 | speaker(text_to_speak) 116 | # Main script can do other tasks here, threads will not prevent script from exiting -------------------------------------------------------------------------------- /core/window_elements.py: -------------------------------------------------------------------------------- 1 | import uiautomation as auto 2 | 3 | 4 | def walk_control(control, indent=0, control_type=None, search_strings=None): 5 | matched = [] 6 | unmatched = [] 7 | if control is None: 8 | return matched, unmatched 9 | if control_type is None or control.ControlType == control_type: 10 | try: 11 | rect = control.BoundingRectangle 12 | area = (rect.right - rect.left) * (rect.bottom - rect.top) 13 | control_tuple = (control, area) 14 | if search_strings and any(s.lower() in control.Name.lower() for s in search_strings): 15 | matched.append(control_tuple) 16 | else: 17 | unmatched.append(control_tuple) 18 | except Exception as e: 19 | print(f"{' ' * indent}Error getting properties: {e}") 20 | 21 | for child in control.GetChildren(): 22 | child_matched, child_unmatched = walk_control(child, indent + 2, control_type=control_type, search_strings=search_strings) 23 | matched.extend(child_matched) 24 | unmatched.extend(child_unmatched) 25 | return matched, unmatched 26 | 27 | 28 | def sort_and_categorize_rects(controls_with_rects, size_category_to_print=None): 29 | sorted_by_area = sorted(controls_with_rects, key=lambda x: x[1], reverse=True) 30 | categorized = {'Bigger': [], 'Medium': [], 'Small': []} 31 | 32 | for control, area in sorted_by_area: 33 | if area >= 1000000: 34 | categorized['Bigger'].append(control) 35 | elif area >= 100000: 36 | categorized['Medium'].append(control) 37 | else: 38 | categorized['Small'].append(control) 39 | 40 | output = [] 41 | for category, controls in 
categorized.items(): 42 | output.append(f"{category} elements:") 43 | for control in controls[:150]: 44 | output.append(f"{control}") 45 | output.append("") # For an empty line between categories 46 | 47 | return "\n".join(output).strip() 48 | 49 | 50 | def analyze_app(application_name_contains=None, size_category=None, additional_search_options=None): 51 | root = auto.GetRootControl() 52 | 53 | control = None 54 | if application_name_contains: 55 | for win in root.GetChildren(): 56 | if application_name_contains.lower() in win.Name.lower(): 57 | control = win 58 | break 59 | if not control: 60 | return f'Window containing "{application_name_contains}" not found.' 61 | else: 62 | control = root 63 | 64 | if not control.Exists(0, 0): 65 | return f'Application with title containing "{application_name_contains}" is not running or window not found.' 66 | 67 | search_strings = additional_search_options.lower().split(',') if additional_search_options else [] 68 | search_strings = [s.strip() for s in search_strings if s.strip()] 69 | 70 | matched_controls_with_rects, unmatched_controls_with_rects = walk_control(control, control_type=None, search_strings=search_strings) 71 | 72 | output = "Matched controls:\n" 73 | output += sort_and_categorize_rects(matched_controls_with_rects, size_category_to_print=size_category) 74 | output += "\nUnmatched controls:\n" 75 | output += sort_and_categorize_rects(unmatched_controls_with_rects, size_category_to_print=size_category) 76 | return output 77 | 78 | 79 | # Usage example 80 | if __name__ == '__main__': 81 | search_options = "contenteditable" 82 | search_terms = search_options.replace('', '').strip() 83 | print(search_terms) 84 | print(analyze_app(application_name_contains='Firefox', additional_search_options=search_terms)) -------------------------------------------------------------------------------- /core/window_focus.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import ctypes 4 | import sys 5 | import time 6 | import winreg 7 | from fuzzywuzzy import fuzz 8 | import pygetwindow as gw 9 | import uiautomation as auto 10 | import win32gui 11 | import win32process 12 | import psutil 13 | import winreg 14 | 15 | # Define necessary functions from the user32 DLL 16 | user32 = ctypes.WinDLL('user32', use_last_error=True) 17 | EnumWindows = user32.EnumWindows 18 | GetForegroundWindow = user32.GetForegroundWindow 19 | EnumWindowsProc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) 20 | GetWindowThreadProcessId = user32.GetWindowThreadProcessId 21 | GetWindowTextLength = user32.GetWindowTextLengthW 22 | GetWindowText = user32.GetWindowTextW 23 | IsWindowVisible = user32.IsWindowVisible 24 | SetForegroundWindow = user32.SetForegroundWindow 25 | IsIconic = user32.IsIconic 26 | ShowWindow = user32.ShowWindow 27 | 28 | # Constants for ShowWindow function 29 | SW_RESTORE = 9 30 | SW_SHOW = 5 31 | 32 | def get_installed_apps_registry(): 33 | installed_apps = [] 34 | reg_paths = [ 35 | r'SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall', 36 | r'SOFTWARE\WOW6432Node\Microsoft\Windows\CurrentVersion\Uninstall' 37 | ] 38 | for reg_path in reg_paths: 39 | with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as hkey: 40 | with winreg.OpenKey(hkey, reg_path, 0, winreg.KEY_READ) as sub_key: 41 | subkey_count, _, _ = winreg.QueryInfoKey(sub_key) 42 | for i in range(subkey_count): 43 | try: 44 | subkey_name = winreg.EnumKey(sub_key, i) 45 | with 
46 |                             app_name, _ = winreg.QueryValueEx(app_key, 'DisplayName')
47 |                             installed_apps.append(app_name)
48 |                     except EnvironmentError:
49 |                         continue
50 |     return installed_apps
51 | 
52 | 
53 | def get_open_windows():
54 |     excluded_titles = ["AI Drone Assistant", "NVIDIA GeForce Overlay", "Windows Input Experience", "Program Manager"]
55 |     excluded_executables = ["NVIDIA Share.exe", "TextInputHost.exe", "Tk.exe", "conhost.exe", "explorer.exe",
56 |                             "CTkToplevel", 'Windows Input Experience', "SecurityHealthSystray.exe", "Steam.exe",
57 |                             "SearchApp.exe", "ApplicationFrameHost.exe", "ShellExperienceHost.exe", "MicrosoftEdge.exe",
58 |                             "MicrosoftEdgeCP.exe", "MicrosoftEdgeSH.exe", "python.exe", "pycharm64.exe",
59 |                             "Ctk", "Ctk.exe", "tk", "tk.exe", "Code", "Code.exe", "amdow.exe",
60 |                             "nvidia broadcast.exe", "nvidia broadcast ui.exe",
61 |                             "NVIDIA Web Helper.exe", "nvsphelper64.exe", "NVIDIA GeForce Experience.exe",
62 |                             "nvcontainer.exe", "NVDisplay.Container.exe"]
63 | 
64 |     windows = gw.getAllWindows()
65 |     print(windows)
66 |     open_windows_info = []
67 |     for w in windows:
68 |         if (w.visible and not w.isMinimized and w.title and w.height > 100 and w.width > 100
69 |                 and w.title not in excluded_titles):
70 |             hwnd = w._hWnd
71 |             _, pid = win32process.GetWindowThreadProcessId(hwnd)
72 |             process = psutil.Process(pid)
73 |             executable_name = process.name()
74 |             if executable_name not in excluded_executables:
75 |                 rect = win32gui.GetWindowRect(hwnd)
76 |                 position = (rect[0], rect[1])
77 |                 size = (rect[2] - rect[0], rect[3] - rect[1])
78 |                 open_windows_info.append((w.title, position, size, executable_name, w))
79 |     # Sort the windows by their vertical (y) position on screen, bottom-most first
80 |     open_windows_info.sort(key=lambda x: x[1][1], reverse=True)
81 |     return [info[:-1] for info in open_windows_info]  # Exclude the window object from the returned info
82 | 
83 | 
84 | def get_window_text(hwnd):
85 |     length = GetWindowTextLength(hwnd) + 1
86 |     buffer = ctypes.create_unicode_buffer(length)
87 |     GetWindowText(hwnd, buffer, length)
88 |     return buffer.value
89 | 
90 | 
91 | def get_active_window_title():
92 |     time.sleep(1)  # Wait for the window to become active. TODO: replace this fixed delay with a real readiness check
93 |     hwnd = GetForegroundWindow()
94 |     return get_window_text(hwnd)
95 | 
96 | 
97 | def enum_windows_proc(hwnd, lParam):
98 |     if IsWindowVisible(hwnd):
99 |         title = get_window_text(hwnd)
100 |         if title:
101 |             hwnd_list.append((hwnd, title))
102 |     return True
103 | 
104 | 
105 | def open_windows_info():
106 |     global hwnd_list
107 |     hwnd_list = []
108 |     EnumWindows(EnumWindowsProc(enum_windows_proc), 0)
109 |     return hwnd_list
110 | 
111 | 
112 | def find_window(partial_title):
113 |     windows = open_windows_info()
114 |     for hwnd, title in windows:
115 |         if partial_title.lower() in title.lower():
116 |             return hwnd
117 |     return None
118 | 
119 | 
120 | def find_window_by_title(partial_title):
121 |     windows = open_windows_info()
122 |     for hwnd, title in windows:
123 |         if partial_title.lower() in title.lower():
124 |             return hwnd, title
125 |     return None, None
126 | 
127 | 
128 | def bring_to_foreground(hwnd):
129 |     if IsIconic(hwnd):
130 |         ShowWindow(hwnd, SW_RESTORE)
131 |     else:
132 |         ShowWindow(hwnd, SW_SHOW)
133 |     SetForegroundWindow(hwnd)
134 | 
135 | 
136 | def search_registry_for_application(app_name):
137 |     sub_keys = [
138 |         r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall",
139 |         r"SOFTWARE\WOW6432Node\Microsoft\Windows\CurrentVersion\Uninstall",
140 |     ]
141 |     registry_hives = [winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER]
142 | 
143 |     for hive in registry_hives:
144 |         for sub_key in sub_keys:
145 |             try:
146 |                 with winreg.OpenKey(hive, sub_key) as key:
147 |                     for i in range(0, winreg.QueryInfoKey(key)[0]):
148 |                         skey_name = winreg.EnumKey(key, i)
149 |                         skey = winreg.OpenKey(key, skey_name)
150 |                         try:
151 |                             display_name = winreg.QueryValueEx(skey, 'DisplayName')[0]
152 |                             if app_name.lower() in display_name.lower():
153 |                                 # Look for the executable in a 'DisplayIcon' field
154 |                                 try:
155 |                                     executable_path = winreg.QueryValueEx(skey, 'DisplayIcon')[0]
156 |                                     # In case the path points to an icon, it usually contains a comma
157 |                                     # followed by an icon index, e.g. "C:\Path\To\App.exe,0"
158 |                                     if ',' in executable_path:
159 |                                         executable_path = executable_path.split(',')[0]
160 |                                     return executable_path
161 |                                 except OSError:
162 |                                     pass
163 | 
164 |                                 # If not found, fall back to 'UninstallString' as a last resort
165 |                                 uninstall_string = winreg.QueryValueEx(skey, 'UninstallString')[0]
166 |                                 # Here you would need to intelligently extract the executable path.
167 |                                 # This might involve more complex logic and is not guaranteed to work
168 |                                 # for all applications, as uninstall strings can vary significantly.
169 |                                 # This is a starting point that might work for some applications:
170 |                                 # uninstall_string = uninstall_string.split('"')[1] if '"' in uninstall_string else uninstall_string
171 |                                 # return uninstall_string if os.path.isfile(uninstall_string) else None
172 |                         except OSError:
173 |                             pass
174 |                         finally:
175 |                             skey.Close()
176 |             except OSError:
177 |                 pass
178 |     return None
179 | 
180 | def find_best_match_window(partial_title, threshold=50):
181 |     windows = open_windows_info()
182 |     best_match = None
183 |     highest_score = 0
184 |     for hwnd, title in windows:
185 |         score = fuzz.token_sort_ratio(partial_title.lower(), title.lower())
186 |         if score > highest_score and score >= threshold:
187 |             best_match = (hwnd, title)
188 |             highest_score = score
189 |     return best_match
190 | 
191 | 
192 | # def activate_windowt_title(application_name):
193 | #     print(f"Activating window for {application_name}")
194 | #     if application_name.lower() == "cmd":
195 | #         # If we know it's cmd, we can try activating an existing window or start a new one directly
196 | #         hwnd, window_title = find_window_by_title("cmd")
197 | #         if hwnd:
198 | #             # If we found a window, bring it to the foreground
199 | #             bring_to_foreground(hwnd)
200 | #         else:
201 | #             os.startfile('cmd.exe')
202 | #         return get_active_window_title()
203 | #     app_path = None
204 | #
205 | #     # Attempt to find the application path for each word in application_name
206 | #
207 | #     process = subprocess.run(['where', application_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
208 | #                              shell=True)
209 | #     output = process.stdout.strip().split('\n')
210 | #     app_path = output[0] if output else None
211 | #     print(f"Application path: {app_path}")
212 | #     # If the application path wasn't found, search in the registry for each word
213 | #     if not app_path:
214 | #         print(f"Searching in registry for application path for '{application_name}'...")
215 | #         app_path = search_registry_for_application(application_name)
216 | #
217 | #     hwnd, window_title = None, None
218 | #     hwnd, window_title = find_window_by_title(application_name)
219 | #     if window_title:
220 | #         bring_to_foreground(hwnd)
221 | #     elif app_path:
222 | #         print("Application found but no window open. Starting the application...")
223 | #         print(f"Application path: {app_path}")
224 | #         subprocess.Popen(app_path)  # Open the application if it is not running
225 | #     else:
226 | #         print(f"{application_name} could not be found nor is open. Please ensure it is installed and accessible via system PATH.")
227 | #     return get_active_window_title()
228 | 
229 | def activate_windowt_title(application_name):
230 |     if application_name.lower() == "cmd":
231 |         # If we know it's cmd, we can try activating an existing window or start a new one directly
232 |         hwnd, window_title = find_window_by_title("cmd")
233 |         if hwnd:
234 |             # If we found a window, bring it to the foreground
235 |             bring_to_foreground(hwnd)
236 |         else:
237 |             os.startfile('cmd.exe')
238 |         return get_active_window_title()
239 | 
240 |     app_path = None
241 |     words = application_name.split()
242 | 
243 |     # Attempt to find the application path for each word in application_name
244 |     for word in words:
245 |         try:
246 |             process = subprocess.run(['where', word], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
247 |                                      shell=True)
248 |             output = process.stdout.strip().split('\n')
249 |             if output and output[0]:
250 |                 app_path = output[0]
251 |                 break  # Once we have found a path, we can break the loop
252 |         except Exception as e:
253 |             print(f"ERROR: Error finding application path for '{word}': {e}")
254 | 
255 |     # If the application path wasn't found, search in the registry for each word
256 |     if not app_path:
257 |         for word in words:
258 |             app_path = search_registry_for_application(word)
259 |             if app_path:
260 |                 break  # Once we have found a path, we can break the loop
261 | 
262 |     hwnd, window_title = None, None
263 |     # Attempt to find the window with a partial match for any of the words
264 |     for word in words:
265 |         hwnd, window_title = find_window_by_title(word)
266 |         if hwnd:
267 |             break  # Once we have found a window, we can break the loop
268 | 
269 |     if hwnd:
270 |         # If we found a window, bring it to the foreground
271 |         bring_to_foreground(hwnd)
272 |     elif app_path:
273 |         try:
274 |             subprocess.Popen(app_path)  # Open the application if it is not running
275 |         except Exception as e:
276 |             print(f"ERROR: Error opening application '{app_path}': {e}")
277 |     else:
278 |         print(
279 |             f"{application_name} could not be found, nor is it open. Please ensure it is installed and accessible via the system PATH.")
280 | 
281 |     return get_active_window_title()
282 | 
283 | 
284 | 
285 | if __name__ == "__main__":
286 |     # active_title = activate_windowt_title("chrome")
287 |     active_title = activate_windowt_title("Google Chrome")
288 |     print(f"Active window title: {active_title}")
289 | 
290 | 
--------------------------------------------------------------------------------
/core/window_mgmt.py:
--------------------------------------------------------------------------------
1 | from openai import OpenAI
2 | from window_focus import get_open_windows  # assumes the core modules are importable from the same directory
3 | class WindowClassifier:
4 |     def __init__(self):
5 |         self.api_key = 'insert_your_api_key_here'
6 |         self.client = OpenAI(api_key=self.api_key)
7 |         self.model_name = 'gpt-3.5-turbo'
8 | 
9 |     def _get_response(self, messages, max_tokens=50):
10 |         try:
11 |             response = self.client.chat.completions.create(
12 |                 model=self.model_name,
13 |                 messages=messages,
14 |                 max_tokens=max_tokens
15 |             )
16 |             if response.choices and hasattr(response.choices[0], 'message'):
17 |                 decision_message = response.choices[0].message
18 |                 if hasattr(decision_message, 'content'):
19 |                     return decision_message.content.strip()
20 |             return None
21 |         except Exception as e:
22 |             print(f"An error occurred: {e}")
23 |             return None
24 | 
25 |     def get_window_classification(self, title):
26 |         messages = [{"role": "system", "content": "You are a helpful assistant."},
27 |                     {"role": "user", "content": f"Classify this window title into a category: {title}"}]
28 |         return self._get_response(messages)
29 | 
30 |     def complete_text(self, goal):
31 |         messages = [{"role": "system", "content": "You are a helpful assistant."},
32 |                     {"role": "user", "content": f"Only return the user's message of the goal: {goal}"}]
33 |         return self._get_response(messages)
34 | 
35 |     def get_window_info(self, window_title):
36 |         open_windows = get_open_windows()
37 |         for window_info in open_windows:
38 |             if window_title.lower() in window_info[0].lower():
39 |                 return window_info
40 |         return None
41 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | pywinauto
3 | pyautogui
4 | pygetwindow
5 | customtkinter
6 | openai
7 | Pillow  # This is the PIL (Python Imaging Library) package; it also provides ImageGrab
8 | SpeechRecognition
9 | requests
10 | pywin32
11 | psutil
12 | # ImageGrab is part of Pillow and is not a separate PyPI package
13 | fuzzywuzzy
14 | pytesseract
15 | uiautomation
16 | gTTS
17 | pygame
18 | PyAudio
19 | 
--------------------------------------------------------------------------------
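
The modules above are meant to be combined by a higher-level driver. As a rough illustrative sketch only (it assumes the files are run from inside the core/ directory so they import as plain modules, and it assumes the text-to-speech helper shown earlier is importable as a module named "voice"), a small script tying them together might look like this:

# example_usage.py -- illustrative sketch, not part of the repository.
# Function names are taken from the files above; the import paths assume a flat layout inside core/.
from window_focus import activate_windowt_title, get_open_windows
from window_elements import analyze_app
from voice import speaker  # assumption: the speech helper shown earlier lives in a module named "voice"

# Bring an existing Notepad window to the foreground (or launch it) and report the active title.
title = activate_windowt_title("notepad")
print(f"Active window: {title}")

# List the remaining visible top-level windows as (title, position, size, executable) tuples.
for window_title, position, size, executable in get_open_windows():
    print(window_title, position, size, executable)

# Dump the UI Automation elements of the target application, filtered by a search term.
print(analyze_app(application_name_contains="Notepad", additional_search_options="edit"))

# Speak a short confirmation; speaker() returns daemon threads, so join the audio thread
# to keep the process alive until playback finishes.
audio_thread, _ = speaker("Window analysis complete.")
audio_thread.join()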