├── .python-version ├── src └── windows_mcp │ ├── __init__.py │ ├── tree │ ├── __init__.py │ ├── utils.py │ ├── config.py │ ├── views.py │ └── service.py │ ├── desktop │ ├── __init__.py │ ├── config.py │ ├── views.py │ └── service.py │ ├── analytics.py │ └── __main__.py ├── .mcpbignore ├── assets ├── demo1.mov ├── demo2.mov ├── logo.png └── screenshots │ ├── screenshot_1.png │ ├── screenshot_2.png │ └── screenshot_3.png ├── server.json ├── LICENSE.md ├── pyproject.toml ├── .gitignore ├── manifest.json ├── CONTRIBUTING.md ├── SECURITY.md └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /src/windows_mcp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/windows_mcp/tree/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/windows_mcp/desktop/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.mcpbignore: -------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__ 3 | build 4 | dist 5 | notebook.ipynb -------------------------------------------------------------------------------- /assets/demo1.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/demo1.mov -------------------------------------------------------------------------------- /assets/demo2.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/demo2.mov -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/logo.png -------------------------------------------------------------------------------- /assets/screenshots/screenshot_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/screenshots/screenshot_1.png -------------------------------------------------------------------------------- /assets/screenshots/screenshot_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/screenshots/screenshot_2.png -------------------------------------------------------------------------------- /assets/screenshots/screenshot_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CursorTouch/Windows-MCP/HEAD/assets/screenshots/screenshot_3.png -------------------------------------------------------------------------------- /src/windows_mcp/desktop/config.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | BROWSER_NAMES=set([ 4 | 'msedge.exe', 5 | 'chrome.exe', 6 | 'firefox.exe' 7 | ]) 8 | 9 | AVOIDED_APPS:Set[str]=set([ 10 | 'AgentUI' 11 | ]) 12 | 13 | EXCLUDED_APPS:Set[str]=set([ 14 | 'Progman', 15 | 'Shell_TrayWnd', 16 | 'Shell_SecondaryTrayWnd', 17 | 'Microsoft.UI.Content.PopupWindowSiteBridge', 18 | 'Windows.UI.Core.CoreWindow', 19 | ]) 20 | 21 | PROCESS_PER_MONITOR_DPI_AWARE = 2 -------------------------------------------------------------------------------- /server.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://static.modelcontextprotocol.io/schemas/2025-07-09/server.schema.json", 3 | "name": "io.github.CursorTouch/Windows-MCP", 4 | "description": "An MCP Server for computer-use in Windows OS", 5 | "status": "active", 6 | "repository": { 7 | "url": "https://github.com/CursorTouch/Windows-MCP", 8 | "source": "github" 9 | }, 10 | "version": "1.0.0", 11 | "packages": [ 12 | { 13 | "registry_type": "pypi", 14 | "registry_base_url": "https://pypi.org", 15 | "identifier": "windows_mcp", 16 | "version": "0.5.4", 17 | "runtime_hint": "uvx", 18 | "transport": { 19 | "type": "stdio" 20 | } 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /src/windows_mcp/tree/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from uiautomation import Control 3 | 4 | def random_point_within_bounding_box(node: Control, scale_factor: float = 1.0) -> tuple[int, int]: 5 | """ 6 | Generate a random point within a scaled-down bounding box. 7 | 8 | Args: 9 | node (Control): The node with a bounding rectangle 10 | scale_factor (float, optional): The factor to scale down the bounding box. Defaults to 1.0. 11 | 12 | Returns: 13 | tuple: A random point (x, y) within the scaled-down bounding box 14 | """ 15 | box = node.BoundingRectangle 16 | scaled_width = int(box.width() * scale_factor) 17 | scaled_height = int(box.height() * scale_factor) 18 | scaled_left = box.left + (box.width() - scaled_width) // 2 19 | scaled_top = box.top + (box.height() - scaled_height) // 2 20 | x = random.randint(scaled_left, scaled_left + scaled_width) 21 | y = random.randint(scaled_top, scaled_top + scaled_height) 22 | return (x, y) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 JEOMON GEORGE 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /src/windows_mcp/tree/config.py: -------------------------------------------------------------------------------- 1 | INTERACTIVE_CONTROL_TYPE_NAMES=set([ 2 | 'ButtonControl', 3 | 'ListItemControl', 4 | 'MenuItemControl', 5 | 'EditControl', 6 | 'CheckBoxControl', 7 | 'RadioButtonControl', 8 | 'ComboBoxControl', 9 | 'HyperlinkControl', 10 | 'SplitButtonControl', 11 | 'TabItemControl', 12 | 'TreeItemControl', 13 | 'DataItemControl', 14 | 'HeaderItemControl', 15 | 'TextBoxControl', 16 | 'SpinnerControl', 17 | 'ScrollBarControl' 18 | ]) 19 | 20 | DOCUMENT_CONTROL_TYPE_NAMES=set([ 21 | 'DocumentControl' 22 | ]) 23 | 24 | STRUCTURAL_CONTROL_TYPE_NAMES = set([ 25 | 'PaneControl', 26 | 'GroupControl', 27 | 'CustomControl' 28 | ]) 29 | 30 | INFORMATIVE_CONTROL_TYPE_NAMES=set([ 31 | 'TextControl', 32 | 'ImageControl', 33 | 'StatusBarControl', 34 | # 'ProgressBarControl', 35 | # 'ToolTipControl', 36 | # 'TitleBarControl', 37 | # 'SeparatorControl', 38 | # 'HeaderControl', 39 | # 'HeaderItemControl', 40 | ]) 41 | 42 | DEFAULT_ACTIONS=set([ 43 | 'Click', 44 | 'Press', 45 | 'Jump', 46 | 'Check', 47 | 'Uncheck', 48 | 'Double Click' 49 | ]) 50 | 51 | THREAD_MAX_RETRIES = 3 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "windows-mcp" 3 | version = "0.5.7" 4 | description = "Lightweight MCP Server for interacting with Windows Operating System." 5 | authors = [ 6 | { name = "Jeomon George", email = "jeogeoalukka@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | license = { file = "LICENSE.md" } 10 | urls = { homepage = "https://github.com/CursorTouch" } 11 | keywords = ["windows", "mcp", "ai", "desktop","ai agent"] 12 | requires-python = ">=3.13" 13 | dependencies = [ 14 | "click>=8.2.1", 15 | "fastmcp>=2.8.1", 16 | "fuzzywuzzy>=0.18.0", 17 | "humancursor>=1.1.5", 18 | "ipykernel>=6.30.0", 19 | "live-inspect>=0.1.2", 20 | "markdownify>=1.1.0", 21 | "pdfplumber>=0.11.7", 22 | "pillow>=11.2.1", 23 | "posthog>=7.4.0", 24 | "psutil>=7.0.0", 25 | "pyautogui>=0.9.54", 26 | "pygetwindow>=0.0.9", 27 | "python-dotenv>=1.1.0", 28 | "python-levenshtein>=0.27.1", 29 | "pywinauto>=0.6.9", 30 | "requests>=2.32.3", 31 | "tabulate>=0.9.0", 32 | "uiautomation>=2.0.24", 33 | "uuid7>=0.1.0", 34 | ] 35 | 36 | [project.scripts] 37 | windows-mcp = "windows_mcp.__main__:main" 38 | 39 | [build-system] 40 | requires = ["hatchling"] 41 | build-backend = "hatchling.build" 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/windows_mcp/desktop/views.py: -------------------------------------------------------------------------------- 1 | from windows_mcp.tree.views import TreeState 2 | from dataclasses import dataclass 3 | from tabulate import tabulate 4 | from typing import Optional 5 | from PIL.Image import Image 6 | from enum import Enum 7 | 8 | class Browser(Enum): 9 | CHROME='Chrome' 10 | EDGE='Edge' 11 | FIREFOX='Firefox' 12 | 13 | class Status(Enum): 14 | MAXIMIZED='Maximized' 15 | MINIMIZED='Minimized' 16 | NORMAL='Normal' 17 | HIDDEN='Hidden' 18 | 19 | 20 | @dataclass 21 | class App: 22 | name:str 23 | depth:int 24 | status:Status 25 | size:'Size' 26 | handle: int 27 | process_id:int 28 | 29 | def to_row(self): 30 | return [self.name, self.depth, self.status.value, self.size.width, self.size.height, self.handle] 31 | 32 | @dataclass 33 | class Size: 34 | width:int 35 | height:int 36 | 37 | def to_string(self): 38 | return f'({self.width},{self.height})' 39 | 40 | @dataclass 41 | class DesktopState: 42 | apps:list[App] 43 | active_app:Optional[App] 44 | screenshot:Image|None 45 | tree_state:TreeState 46 | 47 | def active_app_to_string(self): 48 | if self.active_app is None: 49 | return 'No active app found' 50 | headers = ["Name", "Depth", "Status", "Width", "Height", "Handle"] 51 | return tabulate([self.active_app.to_row()], headers=headers, tablefmt="simple") 52 | 53 | def apps_to_string(self): 54 | if not self.apps: 55 | return 'No apps running in background' 56 | headers = ["Name", "Depth", "Status", "Width", "Height", "Handle"] 57 | rows = [app.to_row() for app in self.apps] 58 | return tabulate(rows, headers=headers, tablefmt="simple") -------------------------------------------------------------------------------- /src/windows_mcp/tree/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass,field 2 | from tabulate import tabulate 3 | from typing import Optional 4 | 5 | @dataclass 6 | class DOMInfo: 7 | horizontal_scrollable: bool 8 | horizontal_scroll_percent: float 9 | vertical_scrollable: bool 10 | vertical_scroll_percent: float 11 | 12 | @dataclass 13 | class TreeState: 14 | interactive_nodes:list['TreeElementNode']=field(default_factory=list) 15 | scrollable_nodes:list['ScrollElementNode']=field(default_factory=list) 16 | dom_informative_nodes:list['TextElementNode']=field(default_factory=list) 17 | dom_info:Optional['DOMInfo']=None 18 | 19 | def interactive_elements_to_string(self) -> str: 20 | if not self.interactive_nodes: 21 | return "No interactive elements" 22 | headers = ["Label", "App Name", "ControlType", "Name", "Value", "Shortcut", "Coordinates" ,"IsFocused"] 23 | rows = [node.to_row(idx) for idx, node in enumerate(self.interactive_nodes)] 24 | return tabulate(rows, headers=headers, tablefmt="simple") 25 | 26 | def scrollable_elements_to_string(self) -> str: 27 | if not self.scrollable_nodes: 28 | return "No scrollable elements" 29 | headers = [ 30 | "Label", "App Name", "ControlType", "Name", "Coordinates", 31 | "Horizontal Scrollable", "Horizontal Scroll Percent(%)", "Vertical Scrollable", "Vertical Scroll Percent(%)", "IsFocused" 32 | ] 33 | base_index = len(self.interactive_nodes) 34 | rows = [node.to_row(idx, base_index) for idx, node in enumerate(self.scrollable_nodes)] 35 | return tabulate(rows, headers=headers, tablefmt="simple") 36 | 37 | @dataclass 38 | class BoundingBox: 39 | left:int 40 | top:int 41 | right:int 42 | bottom:int 43 | width:int 44 | height:int 45 | 46 | def get_center(self)->'Center': 47 | return Center(x=self.left+self.width//2,y=self.top+self.height//2) 48 | 49 | def xywh_to_string(self): 50 | return f'({self.left},{self.top},{self.width},{self.height})' 51 | 52 | def xyxy_to_string(self): 53 | x1,y1,x2,y2=self.convert_xywh_to_xyxy() 54 | return f'({x1},{y1},{x2},{y2})' 55 | 56 | def convert_xywh_to_xyxy(self)->tuple[int,int,int,int]: 57 | x1,y1=self.left,self.top 58 | x2,y2=self.left+self.width,self.top+self.height 59 | return x1,y1,x2,y2 60 | 61 | @dataclass 62 | class Center: 63 | x:int 64 | y:int 65 | 66 | def to_string(self)->str: 67 | return f'({self.x},{self.y})' 68 | 69 | @dataclass 70 | class TreeElementNode: 71 | name: str 72 | control_type: str 73 | app_name: str 74 | value:str 75 | shortcut: str 76 | bounding_box: BoundingBox 77 | center: Center 78 | xpath:str 79 | is_focused:bool 80 | 81 | def to_row(self, index: int): 82 | return [index, self.app_name, self.control_type, self.name, self.value, self.shortcut, self.center.to_string(),self.is_focused] 83 | 84 | @dataclass 85 | class ScrollElementNode: 86 | name: str 87 | control_type: str 88 | xpath:str 89 | app_name: str 90 | bounding_box: BoundingBox 91 | center: Center 92 | horizontal_scrollable: bool 93 | horizontal_scroll_percent: float 94 | vertical_scrollable: bool 95 | vertical_scroll_percent: float 96 | is_focused: bool 97 | 98 | def to_row(self, index: int, base_index: int): 99 | return [ 100 | base_index + index, 101 | self.app_name, 102 | self.control_type, 103 | self.name, 104 | self.center.to_string(), 105 | self.horizontal_scrollable, 106 | self.horizontal_scroll_percent, 107 | self.vertical_scrollable, 108 | self.vertical_scroll_percent, 109 | self.is_focused 110 | ] 111 | 112 | @dataclass 113 | class TextElementNode: 114 | text:str 115 | 116 | ElementNode=TreeElementNode|ScrollElementNode -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .vscode 165 | .mcpregistry_github_token 166 | .mcpregistry_registry_token 167 | sandbox 168 | *.ipynb 169 | *.mcpb -------------------------------------------------------------------------------- /manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": "0.2", 3 | "name": "Windows-MCP", 4 | "version": "0.5.6", 5 | "description": "MCP Server that enables Claude to interact with Windows OS", 6 | "long_description": "Windows MCP is an open-source project that enables seamless integration between AI agents and the Windows operating system. Acting as an MCP server bridges the gap between LLMs and the Windows operating system, allowing agents to perform tasks such as **file navigation, application control, UI interaction, QA testing,** and more.\\n\\n## Key Features\\n\\n- **Seamless Windows Integration**: Interacts natively with Windows UI elements, opens apps, controls windows, simulates user input, and more.\\n- **Use Any LLM (Vision Optional)**: Unlike many automation tools, Windows MCP doesn't rely on any traditional computer vision techniques or specific fine-tuned models; it works with any LLMs, reducing complexity and setup time.\\n- **Rich Toolset for UI Automation**: Includes tools for basic keyboard, mouse operation and capturing window/UI state.\\n- **Lightweight & Open-Source**: Minimal dependencies and easy setup with full source code available under MIT license.\\n- **Customizable & Extendable**: Easily adapt or extend tools to suit your unique automation or AI integration needs.\\n- **Real-Time Interaction**: Typical latency between actions (e.g., from one mouse click to the next) ranges from **1.5 to 2.3 secs**, and may slightly vary based on the number of active applications and system load, also the inferencing speed of the llm.\\n\\n## Requirements\\n\\n### UV Package Manager\\nThis MCP server requires [UV](https://github.com/astral-sh/uv), a fast Python package manager. \\n\\n```bash\\npip install uv\\n```\\n\\nFor detailed installation instructions, see the [UV documentation](https://github.com/astral-sh/uv#installation).", 7 | "author": { 8 | "name": "CursorTouch", 9 | "url": "https://cursortouch.com/" 10 | }, 11 | "homepage": "https://cursortouch.com/", 12 | "documentation": "https://github.com/CursorTouch/Windows-MCP", 13 | "support": "https://github.com/CursorTouch/Windows-MCP", 14 | "icon": "./assets/logo.png", 15 | "screenshots": [ 16 | "./assets/screenshots/screenshot_1.png", 17 | "./assets/screenshots/screenshot_2.png", 18 | "./assets/screenshots/screenshot_3.png" 19 | ], 20 | "server": { 21 | "type": "python", 22 | "entry_point": "./src/windows_mcp/__main__.py", 23 | "mcp_config": { 24 | "command": "uv", 25 | "args": [ 26 | "--directory", 27 | "${__dirname}", 28 | "run", 29 | "windows-mcp" 30 | ], 31 | "env": { 32 | "ANONYMIZED_TELEMETRY": "${user_config.anonymized_telemetry}" 33 | } 34 | } 35 | }, 36 | "user_config": { 37 | "anonymized_telemetry": { 38 | "type": "boolean", 39 | "title": "Anonymized Telemetry", 40 | "description": "Windows-MCP collects basic usage data to help improve the MCP server. No personal information, tool arguments, or tool outputs are tracked.", 41 | "required": false, 42 | "default": true 43 | } 44 | }, 45 | "tools": [ 46 | { 47 | "name": "App Tool", 48 | "description": "Manages Windows applications through launch, resize, and window switching operations." 49 | }, 50 | { 51 | "name": "Powershell Tool", 52 | "description": "Execute PowerShell commands and return the output with status code" 53 | }, 54 | { 55 | "name": "State Tool", 56 | "description": "Capture comprehensive desktop state including focused/opened applications, interactive UI elements (buttons, text fields, menus), informative content (text, labels, status), and scrollable areas. Optionally includes visual screenshot when use_vision=True. Essential for understanding current desktop context and available UI interactions." 57 | }, 58 | { 59 | "name": "Click Tool", 60 | "description": "Click on UI elements at specific coordinates. Supports left/right/middle mouse buttons and single/double/triple clicks. Use coordinates from State Tool output." 61 | }, 62 | { 63 | "name": "Type Tool", 64 | "description": "Type text into input fields, text areas, or focused elements. Set clear=True to replace existing text, False to append. Click on target element coordinates first." 65 | }, 66 | { 67 | "name": "Scroll Tool", 68 | "description": "Scroll at specific coordinates or current mouse position. Use wheel_times to control scroll amount (1 wheel = ~3-5 lines). Essential for navigating lists, web pages, and long content." 69 | }, 70 | { 71 | "name": "Drag Tool", 72 | "description": "Drag and drop operation from source coordinates to destination coordinates. Useful for moving files, resizing windows, or drag-and-drop interactions." 73 | }, 74 | { 75 | "name": "Move Tool", 76 | "description": "Move mouse cursor to specific coordinates without clicking. Useful for hovering over elements or positioning cursor before other actions." 77 | }, 78 | { 79 | "name": "Shortcut Tool", 80 | "description": "Execute keyboard shortcuts using key combinations. Pass keys as list (e.g., 'ctrl'+'c' for copy, 'alt'+'tab' for app switching, 'win'+'r' for Run dialog, 'win' is for opening the start menu)." 81 | }, 82 | { 83 | "name": "Wait Tool", 84 | "description": "Pause execution for specified duration in seconds. Useful for waiting for applications to load, animations to complete, or adding delays between actions." 85 | }, 86 | { 87 | "name": "Scrape Tool", 88 | "description": "Fetch content from a URL or the active browser tab. By default (use_dom=False), performs a lightweight HTTP request to the URL and returns markdown content of complete webpage. Note: Some websites may block automated HTTP requests. If this fails, open the page in a browser and retry with use_dom=True to extract visible text from the active tab's DOM within the viewport." 89 | } 90 | ], 91 | "compatibility": { 92 | "platforms": [ 93 | "win32" 94 | ], 95 | "runtimes": { 96 | "python": ">=3.13" 97 | } 98 | }, 99 | "keywords": [ 100 | "windows", 101 | "automation", 102 | "ai", 103 | "mcp", 104 | "computer-use" 105 | ], 106 | "license": "MIT", 107 | "repository": { 108 | "type": "git", 109 | "url": "https://github.com/CursorTouch/Windows-MCP" 110 | } 111 | } -------------------------------------------------------------------------------- /src/windows_mcp/analytics.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any, TypeVar, Callable, Protocol, Awaitable 2 | from tempfile import TemporaryDirectory 3 | from uuid_extensions import uuid7str 4 | from fastmcp import Context 5 | from functools import wraps 6 | from pathlib import Path 7 | import posthog 8 | import asyncio 9 | import logging 10 | import time 11 | import os 12 | 13 | logging.basicConfig(level=logging.DEBUG) 14 | logger = logging.getLogger(__name__) 15 | 16 | T = TypeVar("T") 17 | 18 | class Analytics(Protocol): 19 | async def track_tool(self, tool_name: str, result: Dict[str, Any]) -> None: 20 | """Tracks the execution of a tool.""" 21 | ... 22 | 23 | async def track_error(self, error: Exception, context: Dict[str, Any]) -> None: 24 | """Tracks an error that occurred during the execution of a tool.""" 25 | ... 26 | 27 | async def is_feature_enabled(self, feature: str) -> bool: 28 | """Checks if a feature flag is enabled.""" 29 | ... 30 | 31 | async def close(self) -> None: 32 | """Closes the analytics client.""" 33 | ... 34 | 35 | class PostHogAnalytics: 36 | TEMP_FOLDER = Path(TemporaryDirectory().name).parent 37 | API_KEY = 'phc_uxdCItyVTjXNU0sMPr97dq3tcz39scQNt3qjTYw5vLV' 38 | HOST = 'https://us.i.posthog.com' 39 | 40 | def __init__(self): 41 | self.client = posthog.Posthog( 42 | self.API_KEY, 43 | host=self.HOST, 44 | disable_geoip=False, 45 | enable_exception_autocapture=True, 46 | debug=True 47 | ) 48 | self._user_id = None 49 | self.mcp_interaction_id = f"mcp_{int(time.time()*1000)}_{os.getpid()}" 50 | 51 | if self.client: 52 | logger.debug(f"Initialized with user ID: {self.user_id} and session ID: {self.mcp_interaction_id}") 53 | 54 | @property 55 | def user_id(self) -> str: 56 | if self._user_id: 57 | return self._user_id 58 | 59 | user_id_file = self.TEMP_FOLDER / '.windows-mcp-user-id' 60 | if user_id_file.exists(): 61 | self._user_id = user_id_file.read_text(encoding='utf-8').strip() 62 | else: 63 | self._user_id = uuid7str() 64 | try: 65 | user_id_file.write_text(self._user_id, encoding='utf-8') 66 | except Exception as e: 67 | logger.warning(f"Could not persist user ID: {e}") 68 | 69 | return self._user_id 70 | 71 | async def track_tool(self, tool_name: str, result: Dict[str, Any]) -> None: 72 | if self.client: 73 | self.client.capture( 74 | distinct_id=self.user_id, 75 | event="tool_executed", 76 | properties={ 77 | "tool_name": tool_name, 78 | "session_id": self.mcp_interaction_id, 79 | "process_person_profile": True, 80 | **result 81 | } 82 | ) 83 | 84 | duration = result.get("duration_ms", 0) 85 | success_mark = "SUCCESS" if result.get("success") else "FAILED" 86 | # Using print for immediate visibility in console during debugging 87 | print(f"[Analytics] {tool_name}: {success_mark} ({duration}ms)") 88 | logger.info(f"{tool_name}: {success_mark} ({duration}ms)") 89 | if self.client: 90 | self.client.flush() 91 | 92 | async def track_error(self, error: Exception, context: Dict[str, Any]) -> None: 93 | if self.client: 94 | self.client.capture( 95 | distinct_id=self.user_id, 96 | event="exception", 97 | properties={ 98 | "exception": str(error), 99 | "traceback": str(error) if not hasattr(error, '__traceback__') else str(error), 100 | "session_id": self.mcp_interaction_id, 101 | "process_person_profile": True, 102 | **context 103 | } 104 | ) 105 | 106 | if self.client: 107 | self.client.flush() 108 | 109 | logger.error(f"ERROR in {context.get('tool_name')}: {error}") 110 | 111 | async def is_feature_enabled(self, feature: str) -> bool: 112 | if not self.client: 113 | return False 114 | return self.client.is_feature_enabled(feature, self.user_id) 115 | 116 | async def close(self) -> None: 117 | if self.client: 118 | self.client.shutdown() 119 | logger.debug("Closed analytics") 120 | 121 | def with_analytics(analytics_instance: Optional[Analytics], tool_name: str): 122 | """ 123 | Decorator to wrap tool functions with analytics tracking. 124 | """ 125 | def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: 126 | @wraps(func) 127 | async def wrapper(*args, **kwargs) -> T: 128 | start = time.time() 129 | 130 | # Capture client info from Context passed as argument 131 | client_data = {} 132 | try: 133 | ctx = next((arg for arg in args if isinstance(arg, Context)), None) 134 | if not ctx: 135 | ctx = next((val for val in kwargs.values() if isinstance(val, Context)), None) 136 | 137 | if ctx and ctx.session and ctx.session.client_params and ctx.session.client_params.clientInfo: 138 | info = ctx.session.client_params.clientInfo 139 | client_data["client_name"] = info.name 140 | client_data["client_version"] = info.version 141 | except Exception: 142 | pass 143 | 144 | try: 145 | if asyncio.iscoroutinefunction(func): 146 | result = await func(*args, **kwargs) 147 | else: 148 | # Run sync function in thread to avoid blocking loop 149 | result = await asyncio.to_thread(func, *args, **kwargs) 150 | 151 | duration_ms = int((time.time() - start) * 1000) 152 | 153 | if analytics_instance: 154 | await analytics_instance.track_tool(tool_name, { 155 | "duration_ms": duration_ms, 156 | "success": True, 157 | **client_data 158 | }) 159 | 160 | return result 161 | except Exception as error: 162 | duration_ms = int((time.time() - start) * 1000) 163 | if analytics_instance: 164 | await analytics_instance.track_error(error, { 165 | "tool_name": tool_name, 166 | "duration_ms": duration_ms, 167 | **client_data 168 | }) 169 | raise error 170 | return wrapper 171 | return decorator 172 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Windows-MCP 2 | 3 | Thank you for your interest in contributing to Windows-MCP! We welcome contributions from the community to help make this project better. This document provides guidelines and instructions for contributing. 4 | 5 | ## Table of Contents 6 | 7 | - [Code of Conduct](#code-of-conduct) 8 | - [Getting Started](#getting-started) 9 | - [Prerequisites](#prerequisites) 10 | - [Development Environment Setup](#development-environment-setup) 11 | - [Development Workflow](#development-workflow) 12 | - [Branching Strategy](#branching-strategy) 13 | - [Making Changes](#making-changes) 14 | - [Commit Messages](#commit-messages) 15 | - [Code Style](#code-style) 16 | - [Testing](#testing) 17 | - [Running Tests](#running-tests) 18 | - [Adding Tests](#adding-tests) 19 | - [Pull Requests](#pull-requests) 20 | - [Before Submitting](#before-submitting) 21 | - [Pull Request Process](#pull-request-process) 22 | - [Review Process](#review-process) 23 | - [Documentation](#documentation) 24 | - [Reporting Issues](#reporting-issues) 25 | - [Security Vulnerabilities](#security-vulnerabilities) 26 | - [Getting Help](#getting-help) 27 | 28 | ## Code of Conduct 29 | 30 | By participating in this project, you agree to maintain a respectful and inclusive environment. We expect all contributors to: 31 | 32 | - Be respectful and considerate in communication 33 | - Welcome newcomers and help them get started 34 | - Accept constructive criticism gracefully 35 | - Focus on what's best for the community and project 36 | 37 | ## Getting Started 38 | 39 | ### Prerequisites 40 | 41 | Before you begin, ensure you have: 42 | 43 | - **Windows OS**: Windows 7, 8, 8.1, 10, or 11 44 | - **Python 3.13+**: [Download Python](https://www.python.org/downloads/) 45 | - **UV Package Manager**: Install with `pip install uv` or see [UV documentation](https://github.com/astral-sh/uv) 46 | - **Git**: [Download Git](https://git-scm.com/downloads) 47 | - **A GitHub account**: [Sign up here](https://github.com/join) 48 | 49 | ### Development Environment Setup 50 | 51 | 1. **Fork the Repository** 52 | 53 | Click the "Fork" button on the [Windows-MCP repository](https://github.com/CursorTouch/Windows-MCP) to create your own copy. 54 | 55 | 2. **Clone Your Fork** 56 | 57 | ```bash 58 | git clone https://github.com/YOUR_USERNAME/Windows-MCP.git 59 | cd Windows-MCP 60 | ``` 61 | 62 | 3. **Add Upstream Remote** 63 | 64 | ```bash 65 | git remote add upstream https://github.com/CursorTouch/Windows-MCP.git 66 | ``` 67 | 68 | 4. **Install Dependencies** 69 | 70 | ```bash 71 | uv sync 72 | ``` 73 | 74 | 5. **Verify Installation** 75 | 76 | ```bash 77 | uv run main.py --help 78 | ``` 79 | 80 | ## Development Workflow 81 | 82 | ### Branching Strategy 83 | 84 | - **`main`** branch contains the latest stable code 85 | - Create feature branches from `main` using descriptive names: 86 | - Features: `feature/add-new-tool` 87 | - Bug fixes: `fix/click-tool-coordinates` 88 | - Documentation: `docs/update-readme` 89 | - Refactoring: `refactor/desktop-service` 90 | 91 | ### Making Changes 92 | 93 | 1. **Create a New Branch** 94 | 95 | ```bash 96 | git checkout -b feature/your-feature-name 97 | ``` 98 | 99 | 2. **Make Your Changes** 100 | 101 | - Write clean, readable code 102 | - Follow the existing code structure 103 | - Add comments for complex logic 104 | - Update documentation as needed 105 | 106 | 3. **Test Your Changes** 107 | 108 | - Test manually in a safe environment (VM recommended) 109 | - Add automated tests if applicable 110 | - Ensure existing functionality isn't broken 111 | 112 | 4. **Commit Your Changes** 113 | 114 | ```bash 115 | git add . 116 | git commit -m "Add feature: description of your changes" 117 | ``` 118 | 119 | ### Commit Messages 120 | 121 | While we don't enforce a strict commit message format, please make your commits informative: 122 | 123 | **Good examples:** 124 | - `Add support for multi-monitor setups in State-Tool` 125 | - `Fix Click-Tool coordinate offset on high DPI displays` 126 | - `Update README with Perplexity Desktop installation steps` 127 | - `Refactor Desktop class to improve error handling` 128 | 129 | **Avoid:** 130 | - `fix bug` 131 | - `update` 132 | - `changes` 133 | 134 | ### Code Style 135 | 136 | We use **[Ruff](https://github.com/astral-sh/ruff)** for code formatting and linting. 137 | 138 | **Key Guidelines:** 139 | - **Line length**: 100 characters maximum 140 | - **Quotes**: Use double quotes for strings 141 | - **Naming conventions**: Follow PEP 8 142 | - `snake_case` for functions and variables 143 | - `PascalCase` for classes 144 | - `UPPER_CASE` for constants 145 | - **Type hints**: Add type annotations to function signatures 146 | - **Docstrings**: Use Google-style docstrings for all public functions and classes 147 | 148 | **Example:** 149 | 150 | ```python 151 | def click_tool( 152 | loc: list[int], 153 | button: Literal['left', 'right', 'middle'] = 'left', 154 | clicks: int = 1 155 | ) -> str: 156 | """Click on UI elements at specific coordinates. 157 | 158 | Args: 159 | loc: List of [x, y] coordinates to click 160 | button: Mouse button to use (left, right, or middle) 161 | clicks: Number of clicks (1=single, 2=double, 3=triple) 162 | 163 | Returns: 164 | Confirmation message describing the action performed 165 | 166 | Raises: 167 | ValueError: If loc doesn't contain exactly 2 integers 168 | """ 169 | if len(loc) != 2: 170 | raise ValueError("Location must be a list of exactly 2 integers [x, y]") 171 | # Implementation... 172 | ``` 173 | 174 | **Format Code:** 175 | 176 | ```bash 177 | ruff format . 178 | ``` 179 | 180 | **Run Linter:** 181 | 182 | ```bash 183 | ruff check . 184 | ``` 185 | 186 | ## Testing 187 | 188 | ### Running Tests 189 | 190 | If the project has tests (check the `tests/` directory): 191 | 192 | ```bash 193 | pytest 194 | ``` 195 | 196 | Run specific test files: 197 | 198 | ```bash 199 | pytest tests/test_desktop.py 200 | ``` 201 | 202 | Run with coverage: 203 | 204 | ```bash 205 | pytest --cov=src tests/ 206 | ``` 207 | 208 | ### Adding Tests 209 | 210 | When adding new features: 211 | 212 | 1. **Create test files** in the `tests/` directory matching the module structure 213 | 2. **Write unit tests** for individual functions 214 | 3. **Write integration tests** for tool workflows 215 | 4. **Use fixtures** for common test setup 216 | 5. **Mock external dependencies** (Windows API calls, file system operations) 217 | 218 | **Example Test:** 219 | 220 | ```python 221 | import pytest 222 | from src.desktop.service import Desktop 223 | 224 | def test_click_tool_validates_coordinates(): 225 | """Test that click_tool raises ValueError for invalid coordinates.""" 226 | with pytest.raises(ValueError, match="exactly 2 integers"): 227 | click_tool([100]) # Missing y coordinate 228 | ``` 229 | 230 | ## Pull Requests 231 | 232 | ### Before Submitting 233 | 234 | - [ ] Code follows the project's style guidelines 235 | - [ ] All tests pass (if applicable) 236 | - [ ] Documentation is updated (README, docstrings, etc.) 237 | - [ ] Commit messages are clear and descriptive 238 | - [ ] Changes are tested in a safe environment (VM recommended) 239 | - [ ] No sensitive information (API keys, passwords) is included 240 | 241 | ### Pull Request Process 242 | 243 | 1. **Update Your Branch** 244 | 245 | ```bash 246 | git fetch upstream 247 | git rebase upstream/main 248 | ``` 249 | 250 | 2. **Push to Your Fork** 251 | 252 | ```bash 253 | git push origin feature/your-feature-name 254 | ``` 255 | 256 | 3. **Create Pull Request** 257 | 258 | - Go to the [Windows-MCP repository](https://github.com/CursorTouch/Windows-MCP) 259 | - Click "New Pull Request" 260 | - Select your fork and branch 261 | - Fill out the PR template with: 262 | - **Description**: What does this PR do? 263 | - **Motivation**: Why is this change needed? 264 | - **Testing**: How was this tested? 265 | - **Screenshots**: If applicable (UI changes, new features) 266 | - **Related Issues**: Link any related issues 267 | 268 | 4. **Respond to Feedback** 269 | 270 | - Address reviewer comments promptly 271 | - Make requested changes in new commits 272 | - Push updates to the same branch 273 | 274 | ### Review Process 275 | 276 | - Maintainers will review your PR within a few days 277 | - You may be asked to make changes or provide clarification 278 | - Once approved, a maintainer will merge your PR 279 | - Your contribution will be acknowledged in release notes 280 | 281 | ## Documentation 282 | 283 | Good documentation is crucial! When contributing: 284 | 285 | ### Code Documentation 286 | 287 | - **Docstrings**: Add to all public functions, classes, and methods 288 | - **Comments**: Explain complex logic or non-obvious decisions 289 | - **Type hints**: Help users and tools understand your code 290 | 291 | ### User Documentation 292 | 293 | Update relevant documentation files: 294 | 295 | - **README.md**: For user-facing features or installation changes 296 | - **SECURITY.md**: For security-related changes 297 | - **CONTRIBUTING.md**: For development process changes 298 | 299 | ### Tool Documentation 300 | 301 | When adding or modifying tools: 302 | 303 | 1. Update the tool's `description` parameter in `main.py` 304 | 2. Add appropriate `ToolAnnotations` 305 | 3. Update the tools list in `README.md` 306 | 4. Update `manifest.json` if needed 307 | 308 | ## Reporting Issues 309 | 310 | Found a bug or have a feature request? Please open an issue! 311 | 312 | ### Bug Reports 313 | 314 | Include: 315 | - **Description**: Clear description of the bug 316 | - **Steps to Reproduce**: Detailed steps to recreate the issue 317 | - **Expected Behavior**: What should happen 318 | - **Actual Behavior**: What actually happens 319 | - **Environment**: Windows version, Python version, MCP client 320 | - **Screenshots/Logs**: If applicable 321 | 322 | ### Feature Requests 323 | 324 | Include: 325 | - **Description**: What feature do you want? 326 | - **Use Case**: Why is this feature needed? 327 | - **Proposed Solution**: How might this be implemented? 328 | - **Alternatives**: Other approaches you've considered 329 | 330 | ## Security Vulnerabilities 331 | 332 | **DO NOT** report security vulnerabilities through public GitHub issues. 333 | 334 | Instead, please: 335 | 1. Email the maintainers at [jeogeoalukka@gmail.com](mailto:jeogeoalukka@gmail.com) 336 | 2. Or use [GitHub Security Advisories](https://github.com/CursorTouch/Windows-MCP/security/advisories) 337 | 338 | See our [Security Policy](SECURITY.md) for more details. 339 | 340 | ## Getting Help 341 | 342 | Need help with your contribution? 343 | 344 | - **Discord**: Join our [Discord Community](https://discord.com/invite/Aue9Yj2VzS) 345 | - **Twitter/X**: Follow [@CursorTouch](https://x.com/CursorTouch) 346 | - **GitHub Discussions**: Ask questions in [Discussions](https://github.com/CursorTouch/Windows-MCP/discussions) 347 | - **Issues**: Open an issue for technical questions 348 | 349 | ## Types of Contributions 350 | 351 | We welcome many types of contributions: 352 | 353 | ### Code Contributions 354 | 355 | - **New Tools**: Add new MCP tools for Windows automation 356 | - **Bug Fixes**: Fix issues in existing tools 357 | - **Performance Improvements**: Optimize code for speed or efficiency 358 | - **Refactoring**: Improve code structure and maintainability 359 | 360 | ### Non-Code Contributions 361 | 362 | - **Documentation**: Improve README, guides, or docstrings 363 | - **Testing**: Add test cases or improve test coverage 364 | - **Bug Reports**: Report issues with detailed information 365 | - **Feature Requests**: Suggest new features or improvements 366 | - **Community Support**: Help others in Discord or Discussions 367 | - **Translations**: Help translate documentation (future) 368 | 369 | ## Recognition 370 | 371 | Contributors are recognized in: 372 | - GitHub contributors page 373 | - Release notes for significant contributions 374 | - Special mentions for major features or fixes 375 | 376 | ## License 377 | 378 | By contributing to Windows-MCP, you agree that your contributions will be licensed under the [MIT License](LICENSE.md). 379 | 380 | --- 381 | 382 | Thank you for contributing to Windows-MCP! Your efforts help make this project better for everyone. 🙏 383 | 384 | Made with ❤️ by the CursorTouch community -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Overview 4 | 5 | Windows-MCP provides powerful automation capabilities that interact directly with your Windows operating system. This document outlines security considerations, best practices, and our commitment to maintaining a secure project. 6 | 7 | ## ⚠️ CRITICAL WARNING 8 | 9 | **READ THIS BEFORE DEPLOYING WINDOWS-MCP** 10 | 11 | ### Direct Operating System Interaction 12 | 13 | Windows-MCP is **NOT** a sandboxed or isolated tool. It interacts **directly with your actual Windows operating system** on behalf of the connected LLM agent. This means: 14 | 15 | - **Real System Actions**: Every tool call executes real actions on your physical or virtual Windows machine 16 | - **No Safety Net**: There is no intermediate layer, simulation, or preview mode 17 | - **User Permissions**: The MCP server operates on behalf of the user running it 18 | 19 | ### Irreversible and Destructive Changes 20 | 21 | Many operations performed by Windows-MCP **CANNOT BE UNDONE**: 22 | 23 | - **File Deletions**: Files deleted through PowerShell or UI interactions may be permanently lost 24 | - **Data Overwrites**: Text typed with `clear=True` replaces existing content without recovery options 25 | - **System Modifications**: PowerShell commands can modify registry, services, and system configurations 26 | - **Application Actions**: Clicking "Delete", "Yes", or "Confirm" buttons has real consequences 27 | - **No Undo/Rollback**: Unlike text editors or IDEs, most Windows operations don't have an undo feature 28 | 29 | ### Where NOT to Deploy 30 | 31 | **DO NOT** deploy Windows-MCP on systems where you cannot tolerate the risk of: 32 | 33 | - ❌ Accidental data loss or corruption 34 | - ❌ Unintended system configuration changes 35 | - ❌ Exposure of sensitive information through screenshots 36 | - ❌ Execution of malicious commands if the LLM is compromised 37 | - ❌ Compliance violations in regulated environments 38 | 39 | **Specifically, NEVER deploy on:** 40 | 41 | - Production servers or workstations 42 | - Systems containing irreplaceable data 43 | - Machines with access to sensitive databases or networks 44 | - Compliance-regulated environments (healthcare, finance, government) 45 | - Shared systems or multi-user environments without explicit consent 46 | - Any system you don't fully control and can't afford to lose 47 | 48 | ### Recommended Safe Deployment 49 | 50 | For safer experimentation and usage, **strongly consider** deploying Windows-MCP in: 51 | 52 | ✅ **Virtual Machines (VMs)** 53 | - Use VMware, VirtualBox, Hyper-V, or similar virtualization platforms 54 | - Take snapshots before each session for easy rollback 55 | - Isolate the VM from production networks 56 | - Limit VM access to non-sensitive resources only 57 | 58 | ✅ **Sandboxed Environments** 59 | - Windows Sandbox (built into Windows 10/11 Pro/Enterprise) 60 | - Containerized Windows environments 61 | - Dedicated test machines with no production data 62 | - Isolated network segments with restricted access 63 | 64 | ✅ **Dedicated Test Systems** 65 | - Separate physical machines used only for testing 66 | - Systems with regular backups and disaster recovery plans 67 | - Machines that can be wiped and rebuilt without consequence 68 | 69 | ### Impact Limitation Strategies 70 | 71 | If you must use Windows-MCP on a regular system: 72 | 73 | 1. **Create a Dedicated User Account**: Run the MCP server under a restricted user account with minimal permissions 74 | 2. **Regular Backups**: Maintain frequent, verified backups of all important data 75 | 3. **Network Isolation**: Disconnect from production networks or use firewall rules 76 | 4. **Supervised Operation**: Always monitor the agent's actions in real-time 77 | 5. **Disable High-Risk Tools**: Remove or restrict access to PowerShell-Tool and other destructive tools 78 | 6. **Test First**: Thoroughly test workflows in a safe environment before production use 79 | 80 | ## Security Considerations 81 | 82 | ### System Access Level 83 | 84 | Windows-MCP operates with the same permissions as the user running it. This means: 85 | 86 | - **Full System Access**: The MCP server can perform any action that the current user can perform 87 | - **No Sandboxing**: Tools execute directly on your Windows system without isolation 88 | - **Persistent Changes**: Actions taken by the MCP server can permanently modify your system state 89 | 90 | ### Tool-Specific Security Implications 91 | 92 | Based on our tool annotations, here's the security profile of each tool: 93 | 94 | #### **High-Risk Tools** (Potentially Destructive) 95 | 96 | These tools can make permanent changes to your system: 97 | 98 | | Tool | Risk | Description | 99 | |------|------|-------------| 100 | | **Powershell-Tool** | Critical | Can execute arbitrary PowerShell commands, including system modifications, file deletions, and network operations | 101 | | **Click-Tool** | High | Can trigger destructive UI actions (delete confirmations, system dialogs) | 102 | | **Type-Tool** | High | Can overwrite text, potentially destroying data when `clear=True` | 103 | | **Drag-Tool** | High | Can move/reorganize files, potentially overwriting existing files | 104 | | **Shortcut-Tool** | High | Can execute destructive keyboard shortcuts (Ctrl+D delete, Alt+F4 close) | 105 | 106 | #### **Medium-Risk Tools** (Modifying but Non-Destructive) 107 | 108 | These tools modify system state but are generally safe: 109 | 110 | | Tool | Risk | Description | 111 | |------|------|-------------| 112 | | **App-Tool** | Medium | Launches/manages applications but doesn't modify data | 113 | | **Scroll-Tool** | Low | Only changes viewport position | 114 | | **Move-Tool** | Low | Only positions mouse cursor | 115 | 116 | #### **Low-Risk Tools** (Read-Only) 117 | 118 | These tools only read information without making changes: 119 | 120 | | Tool | Risk | Description | 121 | |------|------|-------------| 122 | | **State-Tool** | Safe | Only captures desktop state and screenshots | 123 | | **Wait-Tool** | Safe | Only pauses execution | 124 | | **Scrape-Tool** | Safe* | Fetches web content (*may expose browsing activity) | 125 | 126 | ## Best Practices 127 | 128 | ### 1. **Run with Least Privilege** 129 | 130 | - Use a standard user account, not an administrator account, when possible 131 | - Avoid running Windows-MCP with elevated privileges unless absolutely necessary 132 | - Consider creating a dedicated user account for automation tasks 133 | 134 | ### 2. **Trusted LLM Clients Only** 135 | 136 | - Only connect Windows-MCP to trusted MCP clients 137 | - Be cautious when using with third-party or experimental LLM applications 138 | - Review the client application's security practices before integration 139 | 140 | ### 3. **Monitor Tool Usage** 141 | 142 | - Regularly review logs to understand what actions are being performed 143 | - Be especially vigilant with high-risk tools (Powershell-Tool, Click-Tool, etc.) 144 | - Set up alerts for unexpected or suspicious activity 145 | 146 | ### 4. **Network Security** 147 | 148 | - When using SSE or HTTP transport modes, ensure proper network isolation 149 | - Use localhost binding (`127.0.0.1`) instead of `0.0.0.0` when possible 150 | - Implement firewall rules to restrict access to the MCP server ports 151 | - Never expose the MCP server directly to the internet without proper authentication 152 | 153 | ### 5. **Data Protection** 154 | 155 | - Be aware that **State-Tool** captures screenshots that may contain sensitive information 156 | - **Scrape-Tool** may fetch content from untrusted websites 157 | - Avoid using Windows-MCP in environments with highly sensitive data 158 | - Consider disabling screenshot functionality (`use_vision=False`) when handling confidential information 159 | 160 | ### 6. **Code Review** 161 | 162 | - Review the source code before deployment in production environments 163 | - Audit any custom extensions or modifications 164 | - Keep dependencies up to date to patch known vulnerabilities 165 | 166 | ### 7. **Backup and Recovery** 167 | 168 | - Maintain regular backups before using automation tools 169 | - Test automation workflows in a safe environment first 170 | - Have a recovery plan in case of unintended system changes 171 | 172 | ## Deployment Recommendations 173 | 174 | ### **Recommended Use Cases** 175 | 176 | - Personal productivity automation on your own machine 177 | - Development and testing environments 178 | - QA automation in isolated test systems 179 | - Controlled demonstrations with supervision 180 | 181 | ### **Use with Caution** 182 | 183 | - Shared workstations or multi-user systems 184 | - Systems with access to production data 185 | - Environments with compliance requirements (HIPAA, PCI-DSS, etc.) 186 | - Automated workflows without human oversight 187 | 188 | ### **Not Recommended** 189 | 190 | - Production servers or critical infrastructure 191 | - Systems handling highly sensitive data (financial, medical, personal) 192 | - Public-facing systems or kiosks 193 | - Environments where destructive actions cannot be tolerated 194 | - Systems without proper backups 195 | 196 | ## Vulnerability Reporting 197 | 198 | We take security vulnerabilities seriously. If you discover a security issue, please follow responsible disclosure practices: 199 | 200 | ### How to Report 201 | 202 | **DO NOT** open a public GitHub issue for security vulnerabilities. 203 | 204 | Instead, please report security issues via: 205 | 206 | 1. **Email**: Send details to the project maintainers at [jeogeoalukka@gmail.com](mailto:jeogeoalukka@gmail.com) 207 | 2. **GitHub Security Advisories**: Use the [GitHub Security Advisory](https://github.com/CursorTouch/Windows-MCP/security/advisories) feature (preferred) 208 | 209 | ### What to Include 210 | 211 | Please provide: 212 | 213 | - Description of the vulnerability 214 | - Steps to reproduce the issue 215 | - Potential impact assessment 216 | - Suggested fix (if available) 217 | - Your contact information for follow-up 218 | 219 | ### Response Timeline 220 | 221 | - **Initial Response**: Within 48 hours 222 | - **Status Update**: Within 7 days 223 | - **Fix Timeline**: Depends on severity (critical issues prioritized) 224 | 225 | We will acknowledge your contribution in the security advisory and release notes (unless you prefer to remain anonymous). 226 | 227 | ## Security Updates 228 | 229 | ### Staying Informed 230 | 231 | - Watch this repository for security announcements 232 | - Follow [@CursorTouch](https://x.com/CursorTouch) on X for updates 233 | - Join our [Discord Community](https://discord.com/invite/Aue9Yj2VzS) for discussions 234 | 235 | ### Update Policy 236 | 237 | - Security patches will be released as soon as possible 238 | - Critical vulnerabilities will be addressed within 7 days 239 | - Users will be notified via GitHub releases and community channels 240 | 241 | ## Dependency Security 242 | 243 | Windows-MCP relies on several third-party libraries. We: 244 | 245 | - Regularly update dependencies to patch known vulnerabilities 246 | - Monitor security advisories for our dependencies 247 | - Use `uv` for reproducible dependency management 248 | 249 | ### Key Dependencies 250 | 251 | - **PyAutoGUI**: Mouse and keyboard automation 252 | - **UIAutomation**: Windows UI interaction 253 | - **FastMCP**: MCP server framework 254 | - **httpx**: HTTP client for web scraping 255 | 256 | ## Compliance and Auditing 257 | 258 | ### Logging 259 | 260 | Windows-MCP does not implement comprehensive audit logging by default. For compliance-sensitive environments, consider: 261 | 262 | - Implementing custom logging middleware 263 | - Using Windows Event Logging for system-level auditing 264 | - Monitoring file system and registry changes 265 | 266 | ### Data Privacy 267 | 268 | - Windows-MCP collects basic usage data to help improve the MCP server. 269 | - **No personal information, tool arguments, or tool outputs are tracked.** 270 | - Telemetry is enabled by default but can be disabled by setting the `ANONYMIZED_TELEMETRY` environment variable to `false` in the MCP server configuration. 271 | - Windows-MCP processes commands locally on your machine. 272 | - Screenshots and state captures remain on your local system. 273 | - Web scraping may expose browsing activity to target websites. 274 | 275 | ## Tool Annotations Reference 276 | 277 | All tools include security-relevant annotations: 278 | 279 | - **readOnlyHint**: `true` if the tool only reads data 280 | - **destructiveHint**: `true` if the tool may perform destructive updates 281 | - **idempotentHint**: `true` if repeated calls have no additional effect 282 | - **openWorldHint**: `true` if the tool interacts with external entities 283 | 284 | Refer to `main.py` for complete tool annotations. 285 | 286 | ## Disclaimer 287 | 288 | **USE AT YOUR OWN RISK** 289 | 290 | Windows-MCP is provided "as is" without warranty of any kind. The maintainers are not responsible for: 291 | 292 | - Data loss or system damage caused by tool usage 293 | - Security breaches resulting from improper configuration 294 | - Actions performed by LLM agents using this MCP server 295 | - Compliance violations in regulated environments 296 | 297 | Users are solely responsible for: 298 | 299 | - Ensuring appropriate use in their environment 300 | - Implementing necessary security controls 301 | - Complying with applicable laws and regulations 302 | - Monitoring and auditing tool usage 303 | 304 | ## License 305 | 306 | This security policy is part of the Windows-MCP project, licensed under the MIT License. See [LICENSE](LICENSE.md) for details. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![MseeP.ai Security Assessment Badge](https://mseep.net/pr/cursortouch-windows-mcp-badge.png)](https://mseep.ai/app/cursortouch-windows-mcp) 2 | 3 |
4 |

🪟 Windows-MCP

5 | 6 | 7 | License 8 | 9 | Python 10 | Platform: Windows 7 to 11 11 | Last Commit 12 |
13 | 14 | Follow on Twitter 15 | 16 | 17 | Join us on Discord 18 | 19 | 20 |
21 | 22 |
23 | 24 | **Windows MCP** is a lightweight, open-source project that enables seamless integration between AI agents and the Windows operating system. Acting as an MCP server bridges the gap between LLMs and the Windows operating system, allowing agents to perform tasks such as **file navigation, application control, UI interaction, QA testing,** and more. 25 | 26 | mcp-name: io.github.CursorTouch/Windows-MCP 27 | 28 | ## Updates 29 | - Windows-MCP is now available on [PyPI](https://pypi.org/project/windows-mcp/) (thus supports `uvx`) 30 | - Windows-MCP is added to [MCP Registry](https://github.com/modelcontextprotocol/registry) 31 | - Try out 🪟[Windows-Use](https://github.com/CursorTouch/Windows-Use)!!, an agent built using Windows-MCP. 32 | - Windows-MCP is now featured as Desktop Extension in `Claude Desktop`. 33 | 34 | ### Supported Operating Systems 35 | 36 | - Windows 7 37 | - Windows 8, 8.1 38 | - Windows 10 39 | - Windows 11 40 | 41 | ## 🎥 Demos 42 | 43 | 44 | 45 | 46 | 47 | ## ✨ Key Features 48 | 49 | - **Seamless Windows Integration** 50 | Interacts natively with Windows UI elements, opens apps, controls windows, simulates user input, and more. 51 | 52 | - **Use Any LLM (Vision Optional)** 53 | Unlike many automation tools, Windows MCP doesn't rely on any traditional computer vision techniques or specific fine-tuned models; it works with any LLMs, reducing complexity and setup time. 54 | 55 | - **Rich Toolset for UI Automation** 56 | Includes tools for basic keyboard, mouse operation and capturing window/UI state. 57 | 58 | - **Lightweight & Open-Source** 59 | Minimal dependencies and easy setup with full source code available under MIT license. 60 | 61 | - **Customizable & Extendable** 62 | Easily adapt or extend tools to suit your unique automation or AI integration needs. 63 | 64 | - **Real-Time Interaction** 65 | Typical latency between actions (e.g., from one mouse click to the next) ranges from **0.7 to 2.5 secs**, and may slightly vary based on the number of active applications and system load, also the inferencing speed of the llm. 66 | 67 | - **DOM Mode for Browser Automation** 68 | Special `use_dom=True` mode for State-Tool that focuses exclusively on web page content, filtering out browser UI elements for cleaner, more efficient web automation. 69 | 70 | ## 🛠️Installation 71 | 72 | ### Prerequisites 73 | 74 | - Python 3.13+ 75 | - UV (Package Manager) from Astra, install with `pip install uv` or `curl -LsSf https://astral.sh/uv/install.sh | sh` 76 | - `English` as the default language in Windows highly preferred or disable the `App-Tool` in the MCP Server for Windows with other languages. 77 | 78 |
79 | Install in Claude Desktop 80 | 81 | 1. Install [Claude Desktop](https://claude.ai/download) and 82 | 83 | ```shell 84 | npm install -g @anthropic-ai/mcpb 85 | ``` 86 | 87 | 88 | 2. Configure the extension: 89 | 90 | **Option A: Install from PyPI (Recommended)** 91 | 92 | Use `uvx` to run the latest version directly from PyPI. 93 | 94 | Add this to your `claude_desktop_config.json`: 95 | ```json 96 | { 97 | "mcpServers": { 98 | "windows-mcp": { 99 | "command": "uvx", 100 | "args": [ 101 | "windows-mcp" 102 | ] 103 | } 104 | } 105 | } 106 | ``` 107 | 108 | **Option B: Install from Source** 109 | 110 | 1. Clone the repository: 111 | ```shell 112 | git clone https://github.com/CursorTouch/Windows-MCP.git 113 | cd Windows-MCP 114 | ``` 115 | 116 | 2. Add this to your `claude_desktop_config.json`: 117 | ```json 118 | { 119 | "mcpServers": { 120 | "windows-mcp": { 121 | "command": "uv", 122 | "args": [ 123 | "--directory", 124 | "", 125 | "run", 126 | "windows-mcp" 127 | ] 128 | } 129 | } 130 | } 131 | ``` 132 | 133 | 134 | 135 | 3. Open Claude Desktop and enjoy! 🥳 136 | 137 | 138 | 5. Enjoy 🥳. 139 | 140 | For additional Claude Desktop integration troubleshooting, see the [MCP documentation](https://modelcontextprotocol.io/quickstart/server#claude-for-desktop-integration-issues). The documentation includes helpful tips for checking logs and resolving common issues. 141 |
142 | 143 |
144 | Install in Perplexity Desktop 145 | 146 | 1. Install [Perplexity Desktop](https://apps.microsoft.com/detail/xp8jnqfbqh6pvf): 147 | 148 | 2. Clone the repository. 149 | 150 | ```shell 151 | git clone https://github.com/CursorTouch/Windows-MCP.git 152 | 153 | cd Windows-MCP 154 | ``` 155 | 156 | 3. Open Perplexity Desktop: 157 | 158 | Go to `Settings->Connectors->Add Connector->Advanced` 159 | 160 | 4. Enter the name as `Windows-MCP`, then paste the following JSON in the text area. 161 | 162 | 163 | **Option A: Install from PyPI (Recommended)** 164 | 165 | ```json 166 | { 167 | "command": "uvx", 168 | "args": [ 169 | "windows-mcp" 170 | ] 171 | } 172 | ``` 173 | 174 | **Option B: Install from Source** 175 | 176 | ```json 177 | { 178 | "command": "uv", 179 | "args": [ 180 | "--directory", 181 | "", 182 | "run", 183 | "windows-mcp" 184 | ] 185 | } 186 | ``` 187 | 188 | 189 | 5. Click `Save` and Enjoy 🥳. 190 | 191 | For additional Claude Desktop integration troubleshooting, see the [Perplexity MCP Support](https://www.perplexity.ai/help-center/en/articles/11502712-local-and-remote-mcps-for-perplexity). The documentation includes helpful tips for checking logs and resolving common issues. 192 |
193 | 194 |
195 | Install in Gemini CLI 196 | 197 | 1. Install Gemini CLI: 198 | 199 | ```shell 200 | npm install -g @google/gemini-cli 201 | ``` 202 | 203 | 204 | 2. Configure the server in `%USERPROFILE%/.gemini/settings.json`: 205 | 206 | 207 | 3. Navigate to `%USERPROFILE%/.gemini` in File Explorer and open `settings.json`. 208 | 209 | 4. Add the `windows-mcp` config in the `settings.json` and save it. 210 | 211 | ```json 212 | { 213 | "theme": "Default", 214 | ... 215 | "mcpServers": { 216 | "windows-mcp": { 217 | "command": "uvx", 218 | "args": [ 219 | "windows-mcp" 220 | ] 221 | } 222 | } 223 | } 224 | ``` 225 | *Note: To run from source, replace the command with `uv` and args with `["--directory", "", "run", "windows-mcp"]`.* 226 | 227 | 228 | 5. Rerun Gemini CLI in terminal. Enjoy 🥳 229 |
230 | 231 |
232 | Install in Qwen Code 233 | 1. Install Qwen Code: 234 | 235 | ```shell 236 | npm install -g @qwen-code/qwen-code@latest 237 | ``` 238 | 239 | 2. Configure the server in `%USERPROFILE%/.qwen/settings.json`: 240 | 241 | 242 | 3. Navigate to `%USERPROFILE%/.qwen/settings.json`. 243 | 244 | 4. Add the `windows-mcp` config in the `settings.json` and save it. 245 | 246 | ```json 247 | { 248 | "mcpServers": { 249 | "windows-mcp": { 250 | "command": "uvx", 251 | "args": [ 252 | "windows-mcp" 253 | ] 254 | } 255 | } 256 | } 257 | ``` 258 | *Note: To run from source, replace the command with `uv` and args with `["--directory", "", "run", "windows-mcp"]`.* 259 | 260 | 261 | 5. Rerun Qwen Code in terminal. Enjoy 🥳 262 |
263 | 264 |
265 | Install in Codex CLI 266 | 1. Install Codex CLI: 267 | 268 | ```shell 269 | npm install -g @openai/codex 270 | ``` 271 | 272 | 2. Configure the server in `%USERPROFILE%/.codex/config.toml`: 273 | 274 | 3. Navigate to `%USERPROFILE%/.codex/config.toml`. 275 | 276 | 4. Add the `windows-mcp` config in the `config.toml` and save it. 277 | 278 | ```toml 279 | [mcp_servers.windows-mcp] 280 | command="uvx" 281 | args=[ 282 | "windows-mcp" 283 | ] 284 | ``` 285 | *Note: To run from source, replace the command with `uv` and args with `["--directory", "", "run", "windows-mcp"]`.* 286 | 287 | 288 | 5. Rerun Codex CLI in terminal. Enjoy 🥳 289 |
290 | 291 | --- 292 | 293 | ## 🔨MCP Tools 294 | 295 | MCP Client can access the following tools to interact with Windows: 296 | 297 | - `Click-Tool`: Click on the screen at the given coordinates. 298 | - `Type-Tool`: Type text on an element (optionally clears existing text). 299 | - `Scroll-Tool`: Scroll vertically or horizontally on the window or specific regions. 300 | - `Drag-Tool`: Drag from one point to another. 301 | - `Move-Tool`: Move mouse pointer. 302 | - `Shortcut-Tool`: Press keyboard shortcuts (`Ctrl+c`, `Alt+Tab`, etc). 303 | - `Wait-Tool`: Pause for a defined duration. 304 | - `State-Tool`: Combined snapshot of default language, browser, active apps and interactive, textual and scrollable elements along with screenshot of the desktop. Supports `use_dom=True` for browser content extraction (web page elements only) and `use_vision=True` for including screenshots. 305 | - `App-Tool`: To launch an application from the start menu, resize or move the window and switch between apps. 306 | - `Shell-Tool`: To execute PowerShell commands. 307 | - `Scrape-Tool`: To scrape the entire webpage for information. 308 | 309 | ## 🤝 Connect with Us 310 | Stay updated and join our community: 311 | 312 | - 📢 Follow us on [X](https://x.com/CursorTouch) for the latest news and updates 313 | 314 | - 💬 Join our [Discord Community](https://discord.com/invite/Aue9Yj2VzS) 315 | 316 | ## Star History 317 | 318 | [![Star History Chart](https://api.star-history.com/svg?repos=CursorTouch/Windows-MCP&type=Date)](https://www.star-history.com/#CursorTouch/Windows-MCP&Date) 319 | 320 | ## ⚠️Caution 321 | 322 | This MCP interacts directly with your Windows operating system to perform actions. Use with caution and avoid deploying it in environments where such risks cannot be tolerated. 323 | 324 | ## 🔒 Security 325 | 326 | **Important**: Windows-MCP operates with full system access and can perform irreversible operations. Please review our comprehensive security guidelines before deployment. 327 | 328 | For detailed security information, including: 329 | - Tool-specific risk assessments 330 | - Deployment recommendations 331 | - Vulnerability reporting procedures 332 | - Compliance and auditing guidelines 333 | 334 | Please read our [Security Policy](SECURITY.md). 335 | 336 | ## 📊 Telemetry 337 | 338 | Windows-MCP collects usage data to help improve the MCP server. No personal information, no tool arguments, no outputs are tracked. 339 | 340 | To disable telemetry, add the following to your MCP client configuration: 341 | 342 | ```json 343 | { 344 | "mcpServers": { 345 | "windows-mcp": { 346 | "command": "uvx", 347 | "args": [ 348 | "windows-mcp" 349 | ], 350 | "env": { 351 | "ANONYMIZED_TELEMETRY": "false" 352 | } 353 | } 354 | } 355 | } 356 | ``` 357 | 358 | ## 📝 Limitations 359 | 360 | - Selecting specific sections of the text in a paragraph, as the MCP is relying on a11y tree. (⌛ Working on it.) 361 | - `Type-Tool` is meant for typing text, not programming in IDE because of it types program as a whole in a file. (⌛ Working on it.) 362 | - This MCP server can't be used to play video games 🎮. 363 | 364 | ## 🪪 License 365 | 366 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 367 | 368 | ## 🙏 Acknowledgements 369 | 370 | Windows-MCP makes use of several excellent open-source projects that power its Windows automation features: 371 | 372 | - [UIAutomation](https://github.com/yinkaisheng/Python-UIAutomation-for-Windows) 373 | 374 | - [PyAutoGUI](https://github.com/asweigart/pyautogui) 375 | 376 | Huge thanks to the maintainers and contributors of these libraries for their outstanding work and open-source spirit. 377 | 378 | ## 🤝Contributing 379 | 380 | Contributions are welcome! Please see [CONTRIBUTING](CONTRIBUTING) for setup instructions and development guidelines. 381 | 382 | Made with ❤️ by [CursorTouch](https://github.com/CursorTouch) 383 | 384 | ## Citation 385 | 386 | ```bibtex 387 | @software{ 388 | author = {CursorTouch}, 389 | title = {Windows-MCP: Lightweight open-source project for integrating LLM agents with Windows}, 390 | year = {2024}, 391 | publisher = {GitHub}, 392 | url={https://github.com/CursorTouch/Windows-MCP} 393 | } 394 | ``` 395 | 396 | -------------------------------------------------------------------------------- /src/windows_mcp/__main__.py: -------------------------------------------------------------------------------- 1 | from windows_mcp.analytics import PostHogAnalytics, with_analytics 2 | from live_inspect.watch_cursor import WatchCursor 3 | from windows_mcp.desktop.service import Desktop 4 | from contextlib import asynccontextmanager 5 | from fastmcp.utilities.types import Image 6 | from mcp.types import ToolAnnotations 7 | from typing import Literal, Optional 8 | from humancursor import SystemCursor 9 | from fastmcp import FastMCP, Context 10 | from dotenv import load_dotenv 11 | from textwrap import dedent 12 | import pyautogui as pg 13 | import asyncio 14 | import click 15 | import os 16 | 17 | load_dotenv() 18 | 19 | pg.FAILSAFE=False 20 | pg.PAUSE=1.0 21 | 22 | desktop=Desktop() 23 | cursor=SystemCursor() 24 | watch_cursor=WatchCursor() 25 | windows_version=desktop.get_windows_version() 26 | default_language=desktop.get_default_language() 27 | screen_width,screen_height=desktop.get_resolution() 28 | 29 | instructions=dedent(f''' 30 | Windows MCP server provides tools to interact directly with the {windows_version} desktop, 31 | thus enabling to operate the desktop on the user's behalf. 32 | ''') 33 | 34 | # Initialize analytics at module level to be used in decorators 35 | if os.getenv("ANONYMIZED_TELEMETRY", "true").lower() == "false": 36 | analytics = None 37 | else: 38 | analytics = PostHogAnalytics() 39 | 40 | @asynccontextmanager 41 | async def lifespan(app: FastMCP): 42 | """Runs initialization code before the server starts and cleanup code after it shuts down.""" 43 | try: 44 | watch_cursor.start() 45 | await asyncio.sleep(1) # Simulate startup latency 46 | yield 47 | finally: 48 | watch_cursor.stop() 49 | if analytics: 50 | await analytics.close() 51 | 52 | mcp=FastMCP(name='windows-mcp',instructions=instructions,lifespan=lifespan) 53 | 54 | @mcp.tool( 55 | name="App-Tool", 56 | description="Manages Windows applications with three modes: 'launch' (start app by name), 'resize' (set window position/size using window_loc=[x,y] and window_size=[width,height]), 'switch' (activate app by name). Essential for application lifecycle management.", 57 | annotations=ToolAnnotations( 58 | title="App Tool", 59 | readOnlyHint=False, 60 | destructiveHint=True, 61 | idempotentHint=False, 62 | openWorldHint=False 63 | ) 64 | ) 65 | @with_analytics(analytics, "App-Tool") 66 | def app_tool(mode:Literal['launch','resize','switch'],name:str|None=None,window_loc:list[int]|None=None,window_size:list[int]|None=None, ctx: Context = None): 67 | return desktop.app(mode,name,window_loc,window_size) 68 | 69 | @mcp.tool( 70 | name='Powershell-Tool', 71 | description='Execute PowerShell commands directly on the Windows system and return output with status code. Supports all PowerShell cmdlets, scripts, and system commands. Use for file operations, system queries, and administrative tasks.', 72 | annotations=ToolAnnotations( 73 | title="Powershell Tool", 74 | readOnlyHint=False, 75 | destructiveHint=True, 76 | idempotentHint=False, 77 | openWorldHint=True 78 | ) 79 | ) 80 | @with_analytics(analytics, "Powershell-Tool") 81 | def powershell_tool(command: str, ctx: Context = None) -> str: 82 | response,status_code=desktop.execute_command(command) 83 | return f'Response: {response}\nStatus Code: {status_code}' 84 | 85 | @mcp.tool( 86 | name='State-Tool', 87 | description='Captures complete desktop state including: system language, focused/opened apps, interactive elements (buttons, text fields, links, menus with coordinates), and scrollable areas. Set use_vision=True to include screenshot. Set use_dom=True for browser content to get web page elements instead of browser UI. Always call this first to understand the current desktop state before taking actions.', 88 | annotations=ToolAnnotations( 89 | title="State Tool", 90 | readOnlyHint=True, 91 | destructiveHint=False, 92 | idempotentHint=True, 93 | openWorldHint=False 94 | ) 95 | ) 96 | @with_analytics(analytics, "State-Tool") 97 | def state_tool(use_vision:bool=False,use_dom:bool=False, ctx: Context = None): 98 | # Calculate scale factor to cap resolution at 1080p (1920x1080) 99 | max_width, max_height = 1920, 1080 100 | scale_width = max_width / screen_width if screen_width > max_width else 1.0 101 | scale_height = max_height / screen_height if screen_height > max_height else 1.0 102 | scale = min(scale_width, scale_height) # Use the smaller scale to ensure both dimensions fit 103 | 104 | desktop_state=desktop.get_state(use_vision=use_vision,use_dom=use_dom,as_bytes=True,scale=scale) 105 | interactive_elements=desktop_state.tree_state.interactive_elements_to_string() 106 | scrollable_elements=desktop_state.tree_state.scrollable_elements_to_string() 107 | apps=desktop_state.apps_to_string() 108 | active_app=desktop_state.active_app_to_string() 109 | return [dedent(f''' 110 | Default Language of User: 111 | {default_language} with encoding: {desktop.encoding} 112 | 113 | Focused App: 114 | {active_app} 115 | 116 | Opened Apps: 117 | {apps} 118 | 119 | List of Interactive Elements: 120 | {interactive_elements or 'No interactive elements found.'} 121 | 122 | List of Scrollable Elements: 123 | {scrollable_elements or 'No scrollable elements found.'} 124 | ''')]+([Image(data=desktop_state.screenshot,format='png')] if use_vision else []) 125 | 126 | @mcp.tool( 127 | name='Click-Tool', 128 | description='Performs mouse clicks at specified coordinates [x, y]. Supports button types: left (default), right (context menu), middle. Supports clicks: 1 (single), 2 (double), 3 (triple). Always use coordinates from State-Tool output to ensure accuracy.', 129 | annotations=ToolAnnotations( 130 | title="Click Tool", 131 | readOnlyHint=False, 132 | destructiveHint=True, 133 | idempotentHint=False, 134 | openWorldHint=False 135 | ) 136 | ) 137 | @with_analytics(analytics, "Click-Tool") 138 | def click_tool(loc:list[int],button:Literal['left','right','middle']='left',clicks:int=1, ctx: Context = None)->str: 139 | if len(loc) != 2: 140 | raise ValueError("Location must be a list of exactly 2 integers [x, y]") 141 | x,y=loc[0],loc[1] 142 | desktop.click(loc=loc,button=button,clicks=clicks) 143 | num_clicks={1:'Single',2:'Double',3:'Triple'} 144 | return f'{num_clicks.get(clicks)} {button} clicked at ({x},{y}).' 145 | 146 | @mcp.tool( 147 | name='Type-Tool', 148 | description='Types text at specified coordinates [x, y]. Set clear=True to clear existing text first (Ctrl+A then type), clear=False to append. Set press_enter=True to submit after typing. Always click on the target input field first to ensure focus.', 149 | annotations=ToolAnnotations( 150 | title="Type Tool", 151 | readOnlyHint=False, 152 | destructiveHint=True, 153 | idempotentHint=False, 154 | openWorldHint=False 155 | ) 156 | ) 157 | @with_analytics(analytics, "Type-Tool") 158 | def type_tool(loc:list[int],text:str,clear:bool=False,press_enter:bool=False, ctx: Context = None)->str: 159 | if len(loc) != 2: 160 | raise ValueError("Location must be a list of exactly 2 integers [x, y]") 161 | x,y=loc[0],loc[1] 162 | desktop.type(loc=loc,text=text,clear=clear,press_enter=press_enter) 163 | return f'Typed {text} at ({x},{y}).' 164 | 165 | @mcp.tool( 166 | name='Scroll-Tool', 167 | description='Scrolls at coordinates [x, y] or current mouse position if loc=None. Type: vertical (default) or horizontal. Direction: up/down for vertical, left/right for horizontal. wheel_times controls amount (1 wheel ≈ 3-5 lines). Use for navigating long content, lists, and web pages.', 168 | annotations=ToolAnnotations( 169 | title="Scroll Tool", 170 | readOnlyHint=False, 171 | destructiveHint=False, 172 | idempotentHint=True, 173 | openWorldHint=False 174 | ) 175 | ) 176 | @with_analytics(analytics, "Scroll-Tool") 177 | def scroll_tool(loc:list[int]=None,type:Literal['horizontal','vertical']='vertical',direction:Literal['up','down','left','right']='down',wheel_times:int=1, ctx: Context = None)->str: 178 | if loc and len(loc) != 2: 179 | raise ValueError("Location must be a list of exactly 2 integers [x, y]") 180 | response=desktop.scroll(loc,type,direction,wheel_times) 181 | if response: 182 | return response 183 | return f'Scrolled {type} {direction} by {wheel_times} wheel times'+f' at ({loc[0]},{loc[1]}).' if loc else '' 184 | 185 | @mcp.tool( 186 | name='Drag-Tool', 187 | description='Performs drag-and-drop from current mouse position to destination coordinates [x, y]. Click or move to source position first, then call this tool with target coordinates. Use for moving files, reordering items, resizing windows, or any drag-drop UI interactions.', 188 | annotations=ToolAnnotations( 189 | title="Drag Tool", 190 | readOnlyHint=False, 191 | destructiveHint=True, 192 | idempotentHint=False, 193 | openWorldHint=False 194 | ) 195 | ) 196 | @with_analytics(analytics, "Drag-Tool") 197 | def drag_tool(to_loc:list[int], ctx: Context = None)->str: 198 | if len(to_loc) != 2: 199 | raise ValueError("to_loc must be a list of exactly 2 integers [x, y]") 200 | desktop.drag(to_loc) 201 | x2,y2=to_loc[0],to_loc[1] 202 | return f'Dragged the element to ({x2},{y2}).' 203 | 204 | @mcp.tool( 205 | name='Move-Tool', 206 | description='Moves mouse cursor to coordinates [x, y] without clicking. Use for hovering to reveal tooltips/menus, positioning cursor before drag operations, or triggering hover-based UI changes. Does not interact with elements.', 207 | annotations=ToolAnnotations( 208 | title="Move Tool", 209 | readOnlyHint=False, 210 | destructiveHint=False, 211 | idempotentHint=True, 212 | openWorldHint=False 213 | ) 214 | ) 215 | @with_analytics(analytics, "Move-Tool") 216 | def move_tool(to_loc:list[int], ctx: Context = None)->str: 217 | if len(to_loc) != 2: 218 | raise ValueError("to_loc must be a list of exactly 2 integers [x, y]") 219 | x,y=to_loc[0],to_loc[1] 220 | desktop.move(to_loc) 221 | return f'Moved the mouse pointer to ({x},{y}).' 222 | 223 | @mcp.tool( 224 | name='Shortcut-Tool', 225 | description='Executes keyboard shortcuts using key combinations separated by +. Examples: "ctrl+c" (copy), "ctrl+v" (paste), "alt+tab" (switch apps), "win+r" (Run dialog), "win" (Start menu), "ctrl+shift+esc" (Task Manager). Use for quick actions and system commands.', 226 | annotations=ToolAnnotations( 227 | title="Shortcut Tool", 228 | readOnlyHint=False, 229 | destructiveHint=True, 230 | idempotentHint=False, 231 | openWorldHint=False 232 | ) 233 | ) 234 | @with_analytics(analytics, "Shortcut-Tool") 235 | def shortcut_tool(shortcut:str, ctx: Context = None): 236 | desktop.shortcut(shortcut) 237 | return f"Pressed {shortcut}." 238 | 239 | @mcp.tool( 240 | name='Wait-Tool', 241 | description='Pauses execution for specified duration in seconds. Use when waiting for: applications to launch/load, UI animations to complete, page content to render, dialogs to appear, or between rapid actions. Helps ensure UI is ready before next interaction.', 242 | annotations=ToolAnnotations( 243 | title="Wait Tool", 244 | readOnlyHint=True, 245 | destructiveHint=False, 246 | idempotentHint=True, 247 | openWorldHint=False 248 | ) 249 | ) 250 | @with_analytics(analytics, "Wait-Tool") 251 | def wait_tool(duration:int, ctx: Context = None)->str: 252 | pg.sleep(duration) 253 | return f'Waited for {duration} seconds.' 254 | 255 | @mcp.tool( 256 | name='Scrape-Tool', 257 | description='Fetch content from a URL or the active browser tab. By default (use_dom=False), performs a lightweight HTTP request to the URL and returns markdown content of complete webpage. Note: Some websites may block automated HTTP requests. If this fails, open the page in a browser and retry with use_dom=True to extract visible text from the active tab\'s DOM within the viewport.', 258 | annotations=ToolAnnotations( 259 | title="Scrape Tool", 260 | readOnlyHint=True, 261 | destructiveHint=False, 262 | idempotentHint=True, 263 | openWorldHint=True 264 | ) 265 | ) 266 | @with_analytics(analytics, "Scrape-Tool") 267 | def scrape_tool(url:str,use_dom:bool=False, ctx: Context = None)->str: 268 | if not use_dom: 269 | content=desktop.scrape(url) 270 | return f'URL:{url}\nContent:\n{content}' 271 | 272 | desktop_state=desktop.get_state(use_vision=False,use_dom=use_dom) 273 | tree_state=desktop_state.tree_state 274 | if not tree_state.dom_info: 275 | return f'No DOM information found. Please open {url} in browser first.' 276 | dom_info=tree_state.dom_info 277 | vertical_scroll_percent=dom_info.vertical_scroll_percent 278 | content='\n'.join([node.text for node in tree_state.dom_informative_nodes]) 279 | header_status = "Reached top" if vertical_scroll_percent <= 0 else "Scroll up to see more" 280 | footer_status = "Reached bottom" if vertical_scroll_percent >= 100 else "Scroll down to see more" 281 | return f'URL:{url}\nContent:\n[{header_status}]\n{content}\n[{footer_status}]' 282 | 283 | 284 | @click.command() 285 | @click.option( 286 | "--transport", 287 | help="The transport layer used by the MCP server.", 288 | type=click.Choice(['stdio','sse','streamable-http']), 289 | default='stdio' 290 | ) 291 | @click.option( 292 | "--host", 293 | help="Host to bind the SSE/Streamable HTTP server.", 294 | default="localhost", 295 | type=str, 296 | show_default=True 297 | ) 298 | @click.option( 299 | "--port", 300 | help="Port to bind the SSE/Streamable HTTP server.", 301 | default=8000, 302 | type=int, 303 | show_default=True 304 | ) 305 | def main(transport, host, port): 306 | if transport=='stdio': 307 | mcp.run() 308 | else: 309 | mcp.run(transport=transport,host=host,port=port) 310 | 311 | if __name__ == "__main__": 312 | main() 313 | -------------------------------------------------------------------------------- /src/windows_mcp/desktop/service.py: -------------------------------------------------------------------------------- 1 | from windows_mcp.desktop.config import BROWSER_NAMES, PROCESS_PER_MONITOR_DPI_AWARE 2 | from windows_mcp.desktop.views import DesktopState, App, Size, Status 3 | from windows_mcp.tree.service import Tree 4 | from locale import getpreferredencoding 5 | from contextlib import contextmanager 6 | from typing import Optional,Literal 7 | from markdownify import markdownify 8 | from fuzzywuzzy import process 9 | from psutil import Process 10 | from time import sleep 11 | from PIL import Image 12 | import win32process 13 | import subprocess 14 | import win32gui 15 | import win32con 16 | import requests 17 | import logging 18 | import base64 19 | import ctypes 20 | import csv 21 | import re 22 | import os 23 | import io 24 | 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(logging.INFO) 27 | handler = logging.StreamHandler() 28 | formatter = logging.Formatter('[%(levelname)s] %(message)s') 29 | handler.setFormatter(formatter) 30 | logger.addHandler(handler) 31 | 32 | try: 33 | ctypes.windll.shcore.SetProcessDpiAwareness(PROCESS_PER_MONITOR_DPI_AWARE) 34 | except Exception: 35 | ctypes.windll.user32.SetProcessDPIAware() 36 | 37 | import uiautomation as uia 38 | import pyautogui as pg 39 | 40 | pg.FAILSAFE=False 41 | pg.PAUSE=1.0 42 | 43 | class Desktop: 44 | def __init__(self): 45 | self.encoding=getpreferredencoding() 46 | self.tree=Tree(self) 47 | self.desktop_state=None 48 | 49 | def get_resolution(self)->tuple[int,int]: 50 | return pg.size() 51 | 52 | def get_state(self,use_vision:bool=False,use_dom:bool=False,as_bytes:bool=False,scale:float=1.0)->DesktopState: 53 | sleep(0.1) 54 | apps=self.get_apps() 55 | active_app=self.get_active_app() 56 | if active_app is not None and active_app in apps: 57 | apps.remove(active_app) 58 | logger.debug(f"Active app: {active_app}") 59 | logger.debug(f"Apps: {apps}") 60 | tree_state=self.tree.get_state(active_app,apps,use_dom=use_dom) 61 | if use_vision: 62 | screenshot=self.tree.get_annotated_screenshot(tree_state.interactive_nodes,scale=scale) 63 | if as_bytes: 64 | bytes_io=io.BytesIO() 65 | screenshot.save(bytes_io,format='PNG') 66 | screenshot=bytes_io.getvalue() 67 | else: 68 | screenshot=None 69 | self.desktop_state=DesktopState(apps= apps,active_app=active_app,screenshot=screenshot,tree_state=tree_state) 70 | return self.desktop_state 71 | 72 | def get_window_element_from_element(self,element:uia.Control)->uia.Control|None: 73 | while element is not None: 74 | if uia.IsTopLevelWindow(element.NativeWindowHandle): 75 | return element 76 | element = element.GetParentControl() 77 | return None 78 | 79 | def get_active_app(self)->App|None: 80 | try: 81 | handle=uia.GetForegroundWindow() 82 | for app in self.get_apps(): 83 | if app.handle!=handle: 84 | continue 85 | return app 86 | except Exception as ex: 87 | logger.error(f"Error in get_active_app: {ex}") 88 | return None 89 | 90 | def get_app_status(self,control:uia.Control)->Status: 91 | if uia.IsIconic(control.NativeWindowHandle): 92 | return Status.MINIMIZED 93 | elif uia.IsZoomed(control.NativeWindowHandle): 94 | return Status.MAXIMIZED 95 | elif uia.IsWindowVisible(control.NativeWindowHandle): 96 | return Status.NORMAL 97 | else: 98 | return Status.HIDDEN 99 | 100 | def get_cursor_location(self)->tuple[int,int]: 101 | position=pg.position() 102 | return (position.x,position.y) 103 | 104 | def get_element_under_cursor(self)->uia.Control: 105 | return uia.ControlFromCursor() 106 | 107 | def get_apps_from_start_menu(self)->dict[str,str]: 108 | command='Get-StartApps | ConvertTo-Csv -NoTypeInformation' 109 | apps_info,_=self.execute_command(command) 110 | reader=csv.DictReader(io.StringIO(apps_info)) 111 | return {row.get('Name').lower():row.get('AppID') for row in reader} 112 | 113 | def execute_command(self,command:str)->tuple[str,int]: 114 | try: 115 | encoded = base64.b64encode(command.encode("utf-16le")).decode("ascii") 116 | result = subprocess.run( 117 | ['powershell', '-NoProfile', '-EncodedCommand', encoded], 118 | capture_output=True, 119 | errors='ignore', 120 | timeout=25, 121 | cwd=os.path.expanduser(path='~') 122 | ) 123 | stdout=result.stdout 124 | stderr=result.stderr 125 | return (stdout or stderr,result.returncode) 126 | except subprocess.TimeoutExpired: 127 | return ('Command execution timed out', 1) 128 | except Exception as e: 129 | return ('Command execution failed', 1) 130 | 131 | def is_app_browser(self,node:uia.Control): 132 | process=Process(node.ProcessId) 133 | return process.name() in BROWSER_NAMES 134 | 135 | def get_default_language(self)->str: 136 | command="Get-Culture | Select-Object Name,DisplayName | ConvertTo-Csv -NoTypeInformation" 137 | response,_=self.execute_command(command) 138 | reader=csv.DictReader(io.StringIO(response)) 139 | return "".join([row.get('DisplayName') for row in reader]) 140 | 141 | def resize_app(self,size:tuple[int,int]=None,loc:tuple[int,int]=None)->tuple[str,int]: 142 | active_app=self.desktop_state.active_app 143 | if active_app is None: 144 | return "No active app found",1 145 | if active_app.status==Status.MINIMIZED: 146 | return f"{active_app.name} is minimized",1 147 | elif active_app.status==Status.MAXIMIZED: 148 | return f"{active_app.name} is maximized",1 149 | else: 150 | app_control=uia.ControlFromHandle(active_app.handle) 151 | if loc is None: 152 | x=app_control.BoundingRectangle.left 153 | y=app_control.BoundingRectangle.top 154 | loc=(x,y) 155 | if size is None: 156 | width=app_control.BoundingRectangle.width() 157 | height=app_control.BoundingRectangle.height() 158 | size=(width,height) 159 | x,y=loc 160 | width,height=size 161 | app_control.MoveWindow(x,y,width,height) 162 | return (f'{active_app.name} resized to {width}x{height} at {x},{y}.',0) 163 | 164 | def is_app_running(self,name:str)->bool: 165 | apps={app.name:app for app in self.get_apps()} 166 | return process.extractOne(name,list(apps.keys()),score_cutoff=60) is not None 167 | 168 | def app(self,mode:Literal['launch','switch','resize'],name:Optional[str]=None,loc:Optional[tuple[int,int]]=None,size:Optional[tuple[int,int]]=None): 169 | match mode: 170 | case 'launch': 171 | response,status=self.launch_app(name) 172 | sleep(1.25) 173 | if status!=0: 174 | return response 175 | consecutive_waits=10 176 | for _ in range(consecutive_waits): 177 | if not self.is_app_running(name): 178 | sleep(1.25) 179 | else: 180 | return f'{name.title()} launched.' 181 | return f'Launching {name.title()} wait for it to come load.' 182 | case 'resize': 183 | response,status=self.resize_app(size=size,loc=loc) 184 | if status!=0: 185 | return response 186 | else: 187 | return response 188 | case 'switch': 189 | response,status=self.switch_app(name) 190 | if status!=0: 191 | return response 192 | else: 193 | return response 194 | 195 | def launch_app(self,name:str)->tuple[str,int]: 196 | apps_map=self.get_apps_from_start_menu() 197 | matched_app=process.extractOne(name,apps_map.keys(),score_cutoff=70) 198 | if matched_app is None: 199 | return (f'{name.title()} not found in start menu.',1) 200 | app_name,_=matched_app 201 | appid=apps_map.get(app_name) 202 | if appid is None: 203 | return (f'{name.title()} not found in start menu.',1) 204 | if appid.endswith('.exe'): 205 | command=f"Start-Process '{appid}'" 206 | else: 207 | command=f"Start-Process shell:AppsFolder\\{appid}" 208 | response,status=self.execute_command(command) 209 | return response,status 210 | 211 | def switch_app(self,name:str): 212 | apps={app.name:app for app in [self.desktop_state.active_app]+self.desktop_state.apps if app is not None} 213 | matched_app:Optional[tuple[str,float]]=process.extractOne(name,list(apps.keys()),score_cutoff=70) 214 | if matched_app is None: 215 | return (f'Application {name.title()} not found.',1) 216 | app_name,_=matched_app 217 | app=apps.get(app_name) 218 | target_handle=app.handle 219 | 220 | if uia.IsIconic(target_handle): 221 | uia.ShowWindow(target_handle, win32con.SW_RESTORE) 222 | content=f'{app_name.title()} restored from Minimized state.' 223 | else: 224 | self.bring_window_to_top(target_handle) 225 | content=f'Switched to {app_name.title()} window.' 226 | return content,0 227 | 228 | def bring_window_to_top(self,target_handle:int): 229 | foreground_handle=win32gui.GetForegroundWindow() 230 | foreground_thread,_=win32process.GetWindowThreadProcessId(foreground_handle) 231 | target_thread,_=win32process.GetWindowThreadProcessId(target_handle) 232 | try: 233 | ctypes.windll.user32.AllowSetForegroundWindow(-1) 234 | win32process.AttachThreadInput(foreground_thread,target_thread,True) 235 | win32gui.SetForegroundWindow(target_handle) 236 | win32gui.BringWindowToTop(target_handle) 237 | except Exception as e: 238 | logger.error(f'Failed to bring window to top: {e}') 239 | finally: 240 | win32process.AttachThreadInput(foreground_thread,target_thread,False) 241 | 242 | def get_element_handle_from_label(self,label:int)->uia.Control: 243 | tree_state=self.desktop_state.tree_state 244 | element_node=tree_state.interactive_nodes[label] 245 | xpath=element_node.xpath 246 | element_handle=self.get_element_from_xpath(xpath) 247 | return element_handle 248 | 249 | def get_coordinates_from_label(self,label:int)->tuple[int,int]: 250 | element_handle=self.get_element_handle_from_label(label) 251 | bounding_rectangle=element_handle.BoundingRectangle 252 | return bounding_rectangle.xcenter(),bounding_rectangle.ycenter() 253 | 254 | def click(self,loc:tuple[int,int],button:str='left',clicks:int=2): 255 | x,y=loc 256 | pg.click(x,y,button=button,clicks=clicks,duration=0.1) 257 | 258 | def type(self,loc:tuple[int,int],text:str,caret_position:Literal['start','end','none']='none',clear:Literal['true','false']='false',press_enter:Literal['true','false']='false'): 259 | x,y=loc 260 | pg.leftClick(x,y) 261 | if caret_position == 'start': 262 | pg.press('home') 263 | elif caret_position == 'end': 264 | pg.press('end') 265 | else: 266 | pass 267 | if clear=='true': 268 | pg.sleep(0.5) 269 | pg.hotkey('ctrl','a') 270 | pg.press('backspace') 271 | pg.typewrite(text,interval=0.02) 272 | if press_enter=='true': 273 | pg.press('enter') 274 | 275 | def scroll(self,loc:tuple[int,int]=None,type:Literal['horizontal','vertical']='vertical',direction:Literal['up','down','left','right']='down',wheel_times:int=1)->str|None: 276 | if loc: 277 | self.move(loc) 278 | match type: 279 | case 'vertical': 280 | match direction: 281 | case 'up': 282 | uia.WheelUp(wheel_times) 283 | case 'down': 284 | uia.WheelDown(wheel_times) 285 | case _: 286 | return 'Invalid direction. Use "up" or "down".' 287 | case 'horizontal': 288 | match direction: 289 | case 'left': 290 | pg.keyDown('Shift') 291 | pg.sleep(0.05) 292 | uia.WheelUp(wheel_times) 293 | pg.sleep(0.05) 294 | pg.keyUp('Shift') 295 | case 'right': 296 | pg.keyDown('Shift') 297 | pg.sleep(0.05) 298 | uia.WheelDown(wheel_times) 299 | pg.sleep(0.05) 300 | pg.keyUp('Shift') 301 | case _: 302 | return 'Invalid direction. Use "left" or "right".' 303 | case _: 304 | return 'Invalid type. Use "horizontal" or "vertical".' 305 | return None 306 | 307 | def drag(self,loc:tuple[int,int]): 308 | x,y=loc 309 | pg.sleep(0.5) 310 | pg.dragTo(x,y,duration=0.6) 311 | 312 | def move(self,loc:tuple[int,int]): 313 | x,y=loc 314 | pg.moveTo(x,y,duration=0.1) 315 | 316 | def shortcut(self,shortcut:str): 317 | shortcut=shortcut.split('+') 318 | if len(shortcut)>1: 319 | pg.hotkey(*shortcut) 320 | else: 321 | pg.press(''.join(shortcut)) 322 | 323 | def multi_select(self,press_ctrl:Literal['true','false']='false',elements:list[tuple[int,int]|int]=[]): 324 | if press_ctrl=='true': 325 | pg.keyDown('ctrl') 326 | for element in elements: 327 | x,y=element 328 | pg.click(x,y,duration=0.2) 329 | pg.sleep(0.5) 330 | pg.keyUp('ctrl') 331 | 332 | def multi_edit(self,elements:list[tuple[int,int,str]|tuple[int,str]]): 333 | for element in elements: 334 | x,y,text=element 335 | self.type((x,y),text=text,clear='true') 336 | 337 | def scrape(self,url:str)->str: 338 | response=requests.get(url,timeout=10) 339 | html=response.text 340 | content=markdownify(html=html) 341 | return content 342 | 343 | def get_app_size(self,control:uia.Control): 344 | window=control.BoundingRectangle 345 | if window.isempty(): 346 | return Size(width=0,height=0) 347 | return Size(width=window.width(),height=window.height()) 348 | 349 | def is_app_visible(self,app)->bool: 350 | is_minimized=self.get_app_status(app)!=Status.MINIMIZED 351 | size=self.get_app_size(app) 352 | area=size.width*size.height 353 | is_overlay=self.is_overlay_app(app) 354 | return not is_overlay and is_minimized and area>10 355 | 356 | def is_overlay_app(self,element:uia.Control) -> bool: 357 | no_children = len(element.GetChildren()) == 0 358 | is_name = "Overlay" in element.Name.strip() 359 | return no_children or is_name 360 | 361 | def get_apps(self) -> list[App]: 362 | try: 363 | desktop = uia.GetRootControl() # Get the desktop control 364 | children = desktop.GetChildren() 365 | apps = [] 366 | for depth, child in enumerate(children): 367 | if isinstance(child,(uia.WindowControl,uia.PaneControl)): 368 | window_pattern=child.GetPattern(uia.PatternId.WindowPattern) 369 | if (window_pattern is None): 370 | continue 371 | if window_pattern.CanMinimize and window_pattern.CanMaximize: 372 | status = self.get_app_status(child) 373 | size=self.get_app_size(child) 374 | apps.append(App(**{ 375 | "name":child.Name, 376 | "depth":depth, 377 | "status":status, 378 | "size":size, 379 | "handle":child.NativeWindowHandle, 380 | "process_id":child.ProcessId 381 | })) 382 | except Exception as ex: 383 | logger.error(f"Error in get_apps: {ex}") 384 | apps = [] 385 | return apps 386 | 387 | def get_xpath_from_element(self,element:uia.Control): 388 | current=element 389 | if current is None: 390 | return "" 391 | path_parts=[] 392 | while current is not None: 393 | parent=current.GetParentControl() 394 | if parent is None: 395 | # we are at the root node 396 | path_parts.append(f'{current.ControlTypeName}') 397 | break 398 | children=parent.GetChildren() 399 | same_type_children=["-".join(map(lambda x:str(x),child.GetRuntimeId())) for child in children if child.ControlType==current.ControlType] 400 | index=same_type_children.index("-".join(map(lambda x:str(x),current.GetRuntimeId()))) 401 | if same_type_children: 402 | path_parts.append(f'{current.ControlTypeName}[{index+1}]') 403 | else: 404 | path_parts.append(f'{current.ControlTypeName}') 405 | current=parent 406 | path_parts.reverse() 407 | xpath="/".join(path_parts) 408 | return xpath 409 | 410 | def get_element_from_xpath(self,xpath:str)->uia.Control: 411 | pattern = re.compile(r'(\w+)(?:\[(\d+)\])?') 412 | parts=xpath.split("/") 413 | root=uia.GetRootControl() 414 | element=root 415 | for part in parts[1:]: 416 | match=pattern.fullmatch(part) 417 | if match is None: 418 | continue 419 | control_type, index=match.groups() 420 | index=int(index) if index else None 421 | children=element.GetChildren() 422 | same_type_children=list(filter(lambda x:x.ControlTypeName==control_type,children)) 423 | if index: 424 | element=same_type_children[index-1] 425 | else: 426 | element=same_type_children[0] 427 | return element 428 | 429 | def get_windows_version(self)->str: 430 | response,status=self.execute_command("(Get-CimInstance Win32_OperatingSystem).Caption") 431 | if status==0: 432 | return response.strip() 433 | return "Windows" 434 | 435 | def get_user_account_type(self)->str: 436 | response,status=self.execute_command("(Get-LocalUser -Name $env:USERNAME).PrincipalSource") 437 | return "Local Account" if response.strip()=='Local' else "Microsoft Account" if status==0 else "Local Account" 438 | 439 | def get_dpi_scaling(self): 440 | user32 = ctypes.windll.user32 441 | dpi = user32.GetDpiForSystem() 442 | return dpi / 96.0 443 | 444 | def get_screen_size(self)->Size: 445 | width, height = uia.GetScreenSize() 446 | return Size(width=width,height=height) 447 | 448 | def get_screenshot(self)->Image.Image: 449 | return pg.screenshot() 450 | 451 | @contextmanager 452 | def auto_minimize(self): 453 | try: 454 | handle = uia.GetForegroundWindow() 455 | uia.ShowWindow(handle, win32con.SW_MINIMIZE) 456 | yield 457 | finally: 458 | uia.ShowWindow(handle, win32con.SW_RESTORE) -------------------------------------------------------------------------------- /src/windows_mcp/tree/service.py: -------------------------------------------------------------------------------- 1 | from windows_mcp.tree.config import INTERACTIVE_CONTROL_TYPE_NAMES,DOCUMENT_CONTROL_TYPE_NAMES,INFORMATIVE_CONTROL_TYPE_NAMES, DEFAULT_ACTIONS, THREAD_MAX_RETRIES 2 | from windows_mcp.tree.views import TreeElementNode, ScrollElementNode, TextElementNode, Center, BoundingBox, TreeState, DOMInfo 3 | from uiautomation import Control,ImageControl,ScrollPattern,WindowControl,Rect,GetRootControl,PatternId 4 | from concurrent.futures import ThreadPoolExecutor, as_completed 5 | from windows_mcp.tree.utils import random_point_within_bounding_box 6 | from PIL import Image, ImageFont, ImageDraw 7 | from typing import TYPE_CHECKING,Optional 8 | from windows_mcp.desktop.views import App 9 | from time import sleep 10 | import logging 11 | import random 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | handler = logging.StreamHandler() 16 | formatter = logging.Formatter('[%(levelname)s] %(message)s') 17 | handler.setFormatter(formatter) 18 | logger.addHandler(handler) 19 | 20 | if TYPE_CHECKING: 21 | from windows_mcp.desktop.service import Desktop 22 | 23 | class Tree: 24 | def __init__(self,desktop:'Desktop'): 25 | self.desktop=desktop 26 | self.screen_size=self.desktop.get_screen_size() 27 | self.dom_info:Optional[DOMInfo]=None 28 | self.dom_bounding_box:BoundingBox=None 29 | self.screen_box=BoundingBox( 30 | top=0, left=0, bottom=self.screen_size.height, right=self.screen_size.width, 31 | width=self.screen_size.width, height=self.screen_size.height 32 | ) 33 | 34 | def get_state(self,active_app:App,other_apps:list[App],use_dom:bool=False)->TreeState: 35 | root=GetRootControl() 36 | other_apps_handle=set(map(lambda other_app: other_app.handle,other_apps)) 37 | apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,root.GetChildren())) 38 | del other_apps_handle 39 | if active_app: 40 | apps=list(filter(lambda app:app.ClassName!='Progman',apps)) 41 | interactive_nodes,scrollable_nodes,dom_informative_nodes=self.get_appwise_nodes(apps=apps,use_dom=use_dom) 42 | return TreeState(dom_info=self.dom_info,interactive_nodes=interactive_nodes,scrollable_nodes=scrollable_nodes,dom_informative_nodes=dom_informative_nodes) 43 | 44 | def get_appwise_nodes(self,apps:list[Control],use_dom:bool=False)-> tuple[list[TreeElementNode],list[ScrollElementNode],list[TextElementNode]]: 45 | interactive_nodes, scrollable_nodes,dom_informative_nodes = [], [], [] 46 | with ThreadPoolExecutor() as executor: 47 | retry_counts = {app: 0 for app in apps} 48 | future_to_app = { 49 | executor.submit( 50 | self.get_nodes, app, 51 | self.desktop.is_app_browser(app), 52 | use_dom 53 | ): app 54 | for app in apps 55 | } 56 | while future_to_app: # keep running until no pending futures 57 | for future in as_completed(list(future_to_app)): 58 | app = future_to_app.pop(future) # remove completed future 59 | try: 60 | result = future.result() 61 | if result: 62 | element_nodes, scroll_nodes,informative_nodes = result 63 | interactive_nodes.extend(element_nodes) 64 | scrollable_nodes.extend(scroll_nodes) 65 | dom_informative_nodes.extend(informative_nodes) 66 | except Exception as e: 67 | retry_counts[app] += 1 68 | logger.debug(f"Error in processing node {app.Name}, retry attempt {retry_counts[app]}\nError: {e}") 69 | if retry_counts[app] < THREAD_MAX_RETRIES: 70 | logger.debug(f"Retrying {app.Name} for the {retry_counts[app]}th time") 71 | new_future = executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app),use_dom) 72 | future_to_app[new_future] = app 73 | else: 74 | logger.error(f"Task failed completely for {app.Name} after {THREAD_MAX_RETRIES} retries") 75 | return interactive_nodes,scrollable_nodes,dom_informative_nodes 76 | 77 | def iou_bounding_box(self,window_box: Rect,element_box: Rect,) -> BoundingBox: 78 | # Step 1: Intersection of element and window (existing logic) 79 | intersection_left = max(window_box.left, element_box.left) 80 | intersection_top = max(window_box.top, element_box.top) 81 | intersection_right = min(window_box.right, element_box.right) 82 | intersection_bottom = min(window_box.bottom, element_box.bottom) 83 | 84 | # Step 2: Clamp to screen boundaries (new addition) 85 | intersection_left = max(self.screen_box.left, intersection_left) 86 | intersection_top = max(self.screen_box.top, intersection_top) 87 | intersection_right = min(self.screen_box.right, intersection_right) 88 | intersection_bottom = min(self.screen_box.bottom, intersection_bottom) 89 | 90 | # Step 3: Validate intersection 91 | if (intersection_right > intersection_left and intersection_bottom > intersection_top): 92 | bounding_box = BoundingBox( 93 | left=intersection_left, 94 | top=intersection_top, 95 | right=intersection_right, 96 | bottom=intersection_bottom, 97 | width=intersection_right - intersection_left, 98 | height=intersection_bottom - intersection_top 99 | ) 100 | else: 101 | # No valid visible intersection (either outside window or screen) 102 | bounding_box = BoundingBox( 103 | left=0, 104 | top=0, 105 | right=0, 106 | bottom=0, 107 | width=0, 108 | height=0 109 | ) 110 | return bounding_box 111 | 112 | def get_nodes(self, node: Control, is_browser:bool=False,use_dom:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]: 113 | window_bounding_box=node.BoundingRectangle 114 | 115 | def is_element_visible(node:Control,threshold:int=0): 116 | is_control=node.IsControlElement 117 | box=node.BoundingRectangle 118 | if box.isempty(): 119 | return False 120 | width=box.width() 121 | height=box.height() 122 | area=width*height 123 | is_offscreen=(not node.IsOffscreen) or node.ControlTypeName in ['EditControl'] 124 | return area > threshold and is_offscreen and is_control 125 | 126 | def is_element_enabled(node:Control): 127 | try: 128 | return node.IsEnabled 129 | except Exception: 130 | return False 131 | 132 | def is_default_action(node:Control): 133 | legacy_pattern=node.GetLegacyIAccessiblePattern() 134 | default_action=legacy_pattern.DefaultAction.title() 135 | if default_action in DEFAULT_ACTIONS: 136 | return True 137 | return False 138 | 139 | def is_element_image(node:Control): 140 | if isinstance(node,ImageControl): 141 | if node.LocalizedControlType=='graphic' or not node.IsKeyboardFocusable: 142 | return True 143 | return False 144 | 145 | def is_element_text(node:Control): 146 | try: 147 | if node.ControlTypeName in INFORMATIVE_CONTROL_TYPE_NAMES: 148 | if is_element_visible(node) and is_element_enabled(node) and not is_element_image(node): 149 | return True 150 | except Exception: 151 | return False 152 | return False 153 | 154 | def is_window_modal(node:WindowControl): 155 | try: 156 | window_pattern=node.GetWindowPattern() 157 | return window_pattern.IsModal 158 | except Exception: 159 | return False 160 | 161 | def is_keyboard_focusable(node:Control): 162 | try: 163 | if node.ControlTypeName in set(['EditControl','ButtonControl','CheckBoxControl','RadioButtonControl','TabItemControl']): 164 | return True 165 | return node.IsKeyboardFocusable 166 | except Exception: 167 | return False 168 | 169 | def element_has_child_element(node:Control,control_type:str,child_control_type:str): 170 | if node.LocalizedControlType==control_type: 171 | first_child=node.GetFirstChildControl() 172 | if first_child is None: 173 | return False 174 | return first_child.LocalizedControlType==child_control_type 175 | 176 | def group_has_no_name(node:Control): 177 | try: 178 | if node.ControlTypeName=='GroupControl': 179 | if not node.Name.strip(): 180 | return True 181 | return False 182 | except Exception: 183 | return False 184 | 185 | def is_element_scrollable(node:Control): 186 | try: 187 | if (node.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES|INFORMATIVE_CONTROL_TYPE_NAMES) or node.IsOffscreen: 188 | return False 189 | scroll_pattern:ScrollPattern=node.GetPattern(PatternId.ScrollPattern) 190 | if scroll_pattern is None: 191 | return False 192 | return scroll_pattern.VerticallyScrollable 193 | except Exception: 194 | return False 195 | 196 | def is_element_interactive(node:Control): 197 | try: 198 | if is_browser and node.ControlTypeName in set(['DataItemControl','ListItemControl']) and not is_keyboard_focusable(node): 199 | return False 200 | elif not is_browser and node.ControlTypeName=="ImageControl" and is_keyboard_focusable(node): 201 | return True 202 | elif node.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES|DOCUMENT_CONTROL_TYPE_NAMES: 203 | return is_element_visible(node) and is_element_enabled(node) and (not is_element_image(node) or is_keyboard_focusable(node)) 204 | elif node.ControlTypeName=='GroupControl': 205 | if is_browser: 206 | return is_element_visible(node) and is_element_enabled(node) and (is_default_action(node) or is_keyboard_focusable(node)) 207 | # else: 208 | # return is_element_visible and is_element_enabled(node) and is_default_action(node) 209 | except Exception: 210 | return False 211 | return False 212 | 213 | def dom_correction(node:Control): 214 | if element_has_child_element(node,'list item','link') or element_has_child_element(node,'item','link'): 215 | dom_interactive_nodes.pop() 216 | return None 217 | elif node.ControlTypeName=='GroupControl': 218 | dom_interactive_nodes.pop() 219 | if is_keyboard_focusable(node): 220 | child=node 221 | try: 222 | while child.GetFirstChildControl() is not None: 223 | if child.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES: 224 | return None 225 | child=child.GetFirstChildControl() 226 | except Exception: 227 | return None 228 | if child.ControlTypeName!='TextControl': 229 | return None 230 | legacy_pattern=node.GetLegacyIAccessiblePattern() 231 | value=legacy_pattern.Value 232 | element_bounding_box = node.BoundingRectangle 233 | bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box) 234 | center = bounding_box.get_center() 235 | is_focused=node.HasKeyboardFocus 236 | dom_interactive_nodes.append(TreeElementNode(**{ 237 | 'name':child.Name.strip(), 238 | 'control_type':node.LocalizedControlType, 239 | 'value':value, 240 | 'shortcut':node.AcceleratorKey, 241 | 'bounding_box':bounding_box, 242 | 'xpath':'', 243 | 'center':center, 244 | 'app_name':app_name, 245 | 'is_focused':is_focused 246 | })) 247 | elif element_has_child_element(node,'link','heading'): 248 | dom_interactive_nodes.pop() 249 | node=node.GetFirstChildControl() 250 | control_type='link' 251 | legacy_pattern=node.GetLegacyIAccessiblePattern() 252 | value=legacy_pattern.Value 253 | element_bounding_box = node.BoundingRectangle 254 | bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box) 255 | center = bounding_box.get_center() 256 | is_focused=node.HasKeyboardFocus 257 | dom_interactive_nodes.append(TreeElementNode(**{ 258 | 'name':node.Name.strip(), 259 | 'control_type':control_type, 260 | 'value':node.Name.strip(), 261 | 'shortcut':node.AcceleratorKey, 262 | 'bounding_box':bounding_box, 263 | 'xpath':'', 264 | 'center':center, 265 | 'app_name':app_name, 266 | 'is_focused':is_focused 267 | })) 268 | 269 | def tree_traversal(node: Control,is_dom:bool=False,is_dialog:bool=False): 270 | # Checks to skip the nodes that are not interactive 271 | if node.IsOffscreen and (node.ControlTypeName not in set(["GroupControl","EditControl","TitleBarControl"])) and node.ClassName not in set(["Popup","Windows.UI.Core.CoreComponentInputSource"]): 272 | return None 273 | 274 | if is_element_scrollable(node): 275 | scroll_pattern:ScrollPattern=node.GetPattern(PatternId.ScrollPattern) 276 | box = node.BoundingRectangle 277 | # Get the center 278 | x,y=random_point_within_bounding_box(node=node,scale_factor=0.8) 279 | center = Center(x=x,y=y) 280 | scrollable_nodes.append(ScrollElementNode(**{ 281 | 'name':node.Name.strip() or node.AutomationId or node.LocalizedControlType.capitalize() or "''", 282 | 'app_name':app_name, 283 | 'control_type':node.LocalizedControlType.title(), 284 | 'bounding_box':BoundingBox(**{ 285 | 'left':box.left, 286 | 'top':box.top, 287 | 'right':box.right, 288 | 'bottom':box.bottom, 289 | 'width':box.width(), 290 | 'height':box.height() 291 | }), 292 | 'center':center, 293 | 'xpath':'', 294 | 'horizontal_scrollable':scroll_pattern.HorizontallyScrollable, 295 | 'horizontal_scroll_percent':scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0, 296 | 'vertical_scrollable':scroll_pattern.VerticallyScrollable, 297 | 'vertical_scroll_percent':scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0, 298 | 'is_focused':node.HasKeyboardFocus 299 | })) 300 | 301 | if is_element_interactive(node): 302 | legacy_pattern=node.GetLegacyIAccessiblePattern() 303 | value=legacy_pattern.Value.strip() if legacy_pattern.Value is not None else "" 304 | is_focused=node.HasKeyboardFocus 305 | name=node.Name.strip() 306 | element_bounding_box = node.BoundingRectangle 307 | if is_browser and is_dom: 308 | bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box) 309 | center = bounding_box.get_center() 310 | tree_node=TreeElementNode(**{ 311 | 'name':name, 312 | 'control_type':node.LocalizedControlType.title(), 313 | 'value':value, 314 | 'shortcut':node.AcceleratorKey, 315 | 'bounding_box':bounding_box, 316 | 'center':center, 317 | 'xpath':'', 318 | 'app_name':app_name, 319 | 'is_focused':is_focused 320 | }) 321 | dom_interactive_nodes.append(tree_node) 322 | dom_correction(node=node) 323 | else: 324 | bounding_box=self.iou_bounding_box(window_bounding_box,element_bounding_box) 325 | center = bounding_box.get_center() 326 | tree_node=TreeElementNode(**{ 327 | 'name':name, 328 | 'control_type':node.LocalizedControlType.title(), 329 | 'value':value, 330 | 'shortcut':node.AcceleratorKey, 331 | 'bounding_box':bounding_box, 332 | 'center':center, 333 | 'xpath':'', 334 | 'app_name':app_name, 335 | 'is_focused':is_focused 336 | }) 337 | interactive_nodes.append(tree_node) 338 | elif is_element_text(node): 339 | dom_informative_nodes.append(TextElementNode( 340 | text=node.Name.strip(), 341 | )) 342 | 343 | children=node.GetChildren() 344 | 345 | # Recursively traverse the tree the right to left for normal apps and for DOM traverse from left to right 346 | for child in (children if is_dom else children[::-1]): 347 | # Incrementally building the xpath 348 | 349 | # Check if the child is a DOM element 350 | if is_browser and child.AutomationId == "RootWebArea": 351 | bounding_box=child.BoundingRectangle 352 | self.dom_bounding_box=BoundingBox(left=bounding_box.left,top=bounding_box.top, 353 | right=bounding_box.right,bottom=bounding_box.bottom,width=bounding_box.width(), 354 | height=bounding_box.height()) 355 | scroll_pattern=child.GetPattern(PatternId.ScrollPattern) 356 | self.dom_info=DOMInfo( 357 | horizontal_scrollable=scroll_pattern.HorizontallyScrollable, 358 | horizontal_scroll_percent=scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0, 359 | vertical_scrollable=scroll_pattern.VerticallyScrollable, 360 | vertical_scroll_percent=scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0 361 | ) 362 | # enter DOM subtree 363 | tree_traversal(child, is_dom=True, is_dialog=is_dialog) 364 | # Check if the child is a dialog 365 | elif isinstance(child,WindowControl): 366 | if not child.IsOffscreen: 367 | if is_dom: 368 | bounding_box=child.BoundingRectangle 369 | if bounding_box.width() > 0.8*self.dom_bounding_box.width: 370 | # Because this window element covers the majority of the screen 371 | dom_interactive_nodes.clear() 372 | else: 373 | if is_window_modal(child): 374 | # Because this window element is modal 375 | interactive_nodes.clear() 376 | # enter dialog subtree 377 | tree_traversal(child, is_dom=is_dom, is_dialog=True) 378 | else: 379 | # normal non-dialog children 380 | tree_traversal(child, is_dom=is_dom, is_dialog=is_dialog) 381 | 382 | interactive_nodes, dom_interactive_nodes, scrollable_nodes, dom_informative_nodes = [], [], [], [] 383 | app_name=node.Name.strip() 384 | match node.ClassName: 385 | case "Progman": 386 | app_name="Desktop" 387 | case 'Shell_TrayWnd'|'Shell_SecondaryTrayWnd': 388 | app_name="Taskbar" 389 | case 'Microsoft.UI.Content.PopupWindowSiteBridge': 390 | app_name="Context Menu" 391 | case _: 392 | pass 393 | tree_traversal(node,is_dom=False,is_dialog=False) 394 | 395 | logger.debug(f'Interactive nodes:{len(interactive_nodes)}') 396 | logger.debug(f'DOM interactive nodes:{len(dom_interactive_nodes)}') 397 | logger.debug(f'Scrollable nodes:{len(scrollable_nodes)}') 398 | 399 | if use_dom: 400 | if is_browser: 401 | return (dom_interactive_nodes,scrollable_nodes,dom_informative_nodes) 402 | else: 403 | return ([],[],[]) 404 | else: 405 | return (interactive_nodes+dom_interactive_nodes,scrollable_nodes,dom_informative_nodes) 406 | 407 | def get_annotated_screenshot(self, nodes: list[TreeElementNode],scale:float=1.0) -> Image.Image: 408 | screenshot = self.desktop.get_screenshot() 409 | sleep(0.10) 410 | 411 | original_width = screenshot.width 412 | original_height = screenshot.height 413 | 414 | scaled_width = int(original_width * scale) 415 | scaled_height = int(original_height * scale) 416 | screenshot = screenshot.resize((scaled_width, scaled_height), Image.Resampling.LANCZOS) 417 | 418 | # Add padding 419 | padding = 5 420 | width = int(screenshot.width + (1.5 * padding)) 421 | height = int(screenshot.height + (1.5 * padding)) 422 | padded_screenshot = Image.new("RGB", (width, height), color=(255, 255, 255)) 423 | padded_screenshot.paste(screenshot, (padding, padding)) 424 | 425 | draw = ImageDraw.Draw(padded_screenshot) 426 | font_size = 12 427 | try: 428 | font = ImageFont.truetype('arial.ttf', font_size) 429 | except IOError: 430 | font = ImageFont.load_default() 431 | 432 | def get_random_color(): 433 | return "#{:06x}".format(random.randint(0, 0xFFFFFF)) 434 | 435 | def draw_annotation(label, node: TreeElementNode): 436 | box = node.bounding_box 437 | color = get_random_color() 438 | 439 | # Scale and pad the bounding box coordinates 440 | adjusted_box = ( 441 | int(box.left * scale) + padding, 442 | int(box.top * scale) + padding, 443 | int(box.right * scale) + padding, 444 | int(box.bottom * scale) + padding 445 | ) 446 | # Draw bounding box 447 | draw.rectangle(adjusted_box, outline=color, width=2) 448 | 449 | # Label dimensions 450 | label_width = draw.textlength(str(label), font=font) 451 | label_height = font_size 452 | left, top, right, bottom = adjusted_box 453 | 454 | # Label position above bounding box 455 | label_x1 = right - label_width 456 | label_y1 = top - label_height - 4 457 | label_x2 = label_x1 + label_width 458 | label_y2 = label_y1 + label_height + 4 459 | 460 | # Draw label background and text 461 | draw.rectangle([(label_x1, label_y1), (label_x2, label_y2)], fill=color) 462 | draw.text((label_x1 + 2, label_y1 + 2), str(label), fill=(255, 255, 255), font=font) 463 | 464 | # Draw annotations in parallel 465 | with ThreadPoolExecutor() as executor: 466 | executor.map(draw_annotation, range(len(nodes)), nodes) 467 | return padded_screenshot --------------------------------------------------------------------------------