├── LICENSE ├── MacOS Agent.yml ├── README.md ├── knowledge.md ├── macos_agent_server.py ├── screenshots ├── demo-1.png ├── demo-2.png ├── demo-3.png ├── demo-4.png └── demo-5.png └── test.sh /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Rain Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MacOS Agent.yml: -------------------------------------------------------------------------------- 1 | app: 2 | description: MacOS Agent, achieve user's goal using applescript. 3 | icon: "\U0001F916" 4 | icon_background: '#FFEAD5' 5 | mode: advanced-chat 6 | name: MacOS Agent 7 | workflow: 8 | features: 9 | file_upload: 10 | image: 11 | enabled: false 12 | number_limits: 3 13 | transfer_methods: 14 | - local_file 15 | - remote_url 16 | opening_statement: '' 17 | retriever_resource: 18 | enabled: false 19 | sensitive_word_avoidance: 20 | enabled: false 21 | speech_to_text: 22 | enabled: false 23 | suggested_questions: [] 24 | suggested_questions_after_answer: 25 | enabled: false 26 | text_to_speech: 27 | enabled: false 28 | language: '' 29 | voice: '' 30 | graph: 31 | edges: 32 | - data: 33 | isInIteration: false 34 | sourceType: llm 35 | targetType: code 36 | id: llm-source-1720603933600-target 37 | source: llm 38 | sourceHandle: source 39 | target: '1720603933600' 40 | targetHandle: target 41 | type: custom 42 | zIndex: 0 43 | - data: 44 | isInIteration: false 45 | sourceType: code 46 | targetType: http-request 47 | id: 1720603933600-source-1720603954849-target 48 | source: '1720603933600' 49 | sourceHandle: source 50 | target: '1720603954849' 51 | targetHandle: target 52 | type: custom 53 | zIndex: 0 54 | - data: 55 | isInIteration: false 56 | sourceType: http-request 57 | targetType: llm 58 | id: 1720603968110-source-llm-target 59 | source: '1720603968110' 60 | sourceHandle: source 61 | target: llm 62 | targetHandle: target 63 | type: custom 64 | zIndex: 0 65 | - data: 66 | isInIteration: false 67 | sourceType: llm 68 | targetType: answer 69 | id: 1720604011717-source-answer-target 70 | source: '1720604011717' 71 | sourceHandle: source 72 | target: answer 73 | targetHandle: target 74 | type: custom 75 | zIndex: 0 76 | - data: 77 | isInIteration: false 78 | sourceType: start 79 | targetType: code 80 | id: 1720603854640-source-1720604039714-target 81 | source: '1720603854640' 82 | 
sourceHandle: source 83 | target: '1720604039714' 84 | targetHandle: target 85 | type: custom 86 | zIndex: 0 87 | - data: 88 | isInIteration: false 89 | sourceType: code 90 | targetType: http-request 91 | id: 1720604039714-source-1720603968110-target 92 | source: '1720604039714' 93 | sourceHandle: source 94 | target: '1720603968110' 95 | targetHandle: target 96 | type: custom 97 | zIndex: 0 98 | - data: 99 | isInIteration: false 100 | sourceType: http-request 101 | targetType: if-else 102 | id: 1720603954849-source-1720701168680-target 103 | source: '1720603954849' 104 | sourceHandle: source 105 | target: '1720701168680' 106 | targetHandle: target 107 | type: custom 108 | zIndex: 0 109 | - data: 110 | isInIteration: false 111 | sourceType: if-else 112 | targetType: llm 113 | id: 1720701168680-true-1720604011717-target 114 | source: '1720701168680' 115 | sourceHandle: 'true' 116 | target: '1720604011717' 117 | targetHandle: target 118 | type: custom 119 | zIndex: 0 120 | - data: 121 | isInIteration: false 122 | sourceType: if-else 123 | targetType: answer 124 | id: 1720701168680-false-1720701198958-target 125 | source: '1720701168680' 126 | sourceHandle: 'false' 127 | target: '1720701198958' 128 | targetHandle: target 129 | type: custom 130 | zIndex: 0 131 | nodes: 132 | - data: 133 | desc: '' 134 | selected: false 135 | title: Start 136 | type: start 137 | variables: [] 138 | height: 54 139 | id: '1720603854640' 140 | position: 141 | x: 30 142 | y: 282 143 | positionAbsolute: 144 | x: 30 145 | y: 282 146 | selected: false 147 | sourcePosition: right 148 | targetPosition: left 149 | type: custom 150 | width: 244 151 | - data: 152 | context: 153 | enabled: false 154 | variable_selector: [] 155 | desc: Retrieve the script from the LLM. 156 | memory: 157 | role_prefix: 158 | assistant: '' 159 | user: '' 160 | window: 161 | enabled: false 162 | size: 10 163 | model: 164 | completion_params: 165 | temperature: 0.7 166 | mode: chat 167 | name: deepseek-chat 168 | provider: deepseek 169 | prompt_template: 170 | - id: 0c599526-2f75-4238-a182-c5824184e45f 171 | role: system 172 | text: '{{#1720603968110.body#}}' 173 | selected: false 174 | title: LLM:get_script 175 | type: llm 176 | variables: [] 177 | vision: 178 | enabled: false 179 | height: 128 180 | id: llm 181 | position: 182 | x: 942 183 | y: 282 184 | positionAbsolute: 185 | x: 942 186 | y: 282 187 | selected: false 188 | sourcePosition: right 189 | targetPosition: left 190 | type: custom 191 | width: 244 192 | - data: 193 | answer: '{{#1720604011717.text#}}' 194 | desc: '' 195 | selected: false 196 | title: Answer:with_execution 197 | type: answer 198 | variables: [] 199 | height: 107 200 | id: answer 201 | position: 202 | x: 2462 203 | y: 282 204 | positionAbsolute: 205 | x: 2462 206 | y: 282 207 | selected: false 208 | sourcePosition: right 209 | targetPosition: left 210 | type: custom 211 | width: 244 212 | - data: 213 | code: "import json\n\ndef main(llm_output: str, user_input: str) -> dict:\n\ 214 | \ # Ensure the llm_output is properly formatted for JSON\n llm_output_formatted\ 215 | \ = json.dumps(llm_output)\n user_input_formatted = json.dumps(user_input)\n\ 216 | \ return {\n \"llm_output_formatted\": llm_output_formatted,\n\ 217 | \ \"user_input_formatted\": user_input_formatted, \n }" 218 | code_language: python3 219 | desc: Format variables as JSON values. 
220 | outputs: 221 | llm_output_formatted: 222 | children: null 223 | type: string 224 | user_input_formatted: 225 | children: null 226 | type: string 227 | selected: false 228 | title: Code:format_params 229 | type: code 230 | variables: 231 | - value_selector: 232 | - llm 233 | - text 234 | variable: llm_output 235 | - value_selector: 236 | - sys 237 | - query 238 | variable: user_input 239 | height: 84 240 | id: '1720603933600' 241 | position: 242 | x: 1246 243 | y: 282 244 | positionAbsolute: 245 | x: 1246 246 | y: 282 247 | selected: false 248 | sourcePosition: right 249 | targetPosition: left 250 | type: custom 251 | width: 244 252 | - data: 253 | authorization: 254 | config: null 255 | type: no-auth 256 | body: 257 | data: "{\n \"point\": \"execute_script\",\n \"params\": {\n \ 258 | \ \"user_id\": \"{{#sys.user_id#}}\",\n \"inputs\": {\n \ 259 | \ \"user_input\": {{#1720603933600.user_input_formatted#}},\n\n \ 260 | \ \"llm_output\": {{#1720603933600.llm_output_formatted#}},\n\ 261 | \ \"script_timeout\": {{#1720604039714.script_timeout#}}\n\ 262 | \ }\n }\n}" 263 | type: json 264 | desc: Send the LLM output to the Agent for execution. 265 | headers: Authorization:Bearer {{#1720604039714.agent_api_key#}} 266 | method: post 267 | params: '' 268 | selected: false 269 | timeout: 270 | max_connect_timeout: 0 271 | max_read_timeout: 0 272 | max_write_timeout: 0 273 | title: Agent:execute_script 274 | type: http-request 275 | url: '{{#1720604039714.agent_api_endpoint#}}' 276 | variables: [] 277 | height: 141 278 | id: '1720603954849' 279 | position: 280 | x: 1550 281 | y: 282 282 | positionAbsolute: 283 | x: 1550 284 | y: 282 285 | selected: false 286 | sourcePosition: right 287 | targetPosition: left 288 | type: custom 289 | width: 244 290 | - data: 291 | authorization: 292 | config: null 293 | type: no-auth 294 | body: 295 | data: "{\n \"point\": \"get_llm_system_prompt\",\n \"params\": {\n\ 296 | \ \"user_id\": \"{{#sys.user_id#}}\"\n }\n}" 297 | type: json 298 | desc: Retrieve the LLM system prompt from the macOS Agent server. 299 | headers: Authorization:Bearer {{#1720604039714.agent_api_key#}} 300 | method: post 301 | params: '' 302 | selected: false 303 | timeout: 304 | max_connect_timeout: 0 305 | max_read_timeout: 0 306 | max_write_timeout: 0 307 | title: Agent:get_llm_system_prompt 308 | type: http-request 309 | url: '{{#1720604039714.agent_api_endpoint#}}' 310 | variables: [] 311 | height: 141 312 | id: '1720603968110' 313 | position: 314 | x: 638 315 | y: 282 316 | positionAbsolute: 317 | x: 638 318 | y: 282 319 | selected: false 320 | sourcePosition: right 321 | targetPosition: left 322 | type: custom 323 | width: 244 324 | - data: 325 | context: 326 | enabled: false 327 | variable_selector: [] 328 | desc: Use reply_prompt that includes "execution" as the system prompt for 329 | the LLM to respond to user input. 
330 | memory: 331 | query_prompt_template: '{{#sys.query#}}' 332 | role_prefix: 333 | assistant: '' 334 | user: '' 335 | window: 336 | enabled: true 337 | size: 50 338 | model: 339 | completion_params: 340 | temperature: 0.7 341 | mode: chat 342 | name: deepseek-chat 343 | provider: deepseek 344 | prompt_template: 345 | - id: 60cbca91-199d-4764-90d9-2851ab63c9ae 346 | role: system 347 | text: '{{#1720603954849.body#}}' 348 | selected: false 349 | title: LLM:reply 350 | type: llm 351 | variables: [] 352 | vision: 353 | enabled: false 354 | height: 164 355 | id: '1720604011717' 356 | position: 357 | x: 2158 358 | y: 282 359 | positionAbsolute: 360 | x: 2158 361 | y: 282 362 | selected: false 363 | sourcePosition: right 364 | targetPosition: left 365 | type: custom 366 | width: 244 367 | - data: 368 | code: "\ndef main() -> dict:\n config = {\n \"agent_api_endpoint\"\ 369 | : \"http://host.docker.internal:8088\",\n \"agent_api_key\": \"a-secret-key\"\ 370 | ,\n \"script_timeout\": 60\n }\n return {\n \"agent_api_endpoint\"\ 371 | : config[\"agent_api_endpoint\"],\n \"agent_api_key\": config[\"\ 372 | agent_api_key\"],\n \"script_timeout\": config[\"script_timeout\"\ 373 | ],\n }\n" 374 | code_language: python3 375 | desc: Configuration for macOS Agent Server. 376 | outputs: 377 | agent_api_endpoint: 378 | children: null 379 | type: string 380 | agent_api_key: 381 | children: null 382 | type: string 383 | script_timeout: 384 | children: null 385 | type: number 386 | selected: true 387 | title: Code:config 388 | type: code 389 | variables: [] 390 | height: 102 391 | id: '1720604039714' 392 | position: 393 | x: 334 394 | y: 282 395 | positionAbsolute: 396 | x: 334 397 | y: 282 398 | selected: true 399 | sourcePosition: right 400 | targetPosition: left 401 | type: custom 402 | width: 244 403 | - data: 404 | conditions: 405 | - comparison_operator: not empty 406 | id: '1720701184521' 407 | value: '' 408 | variable_selector: 409 | - '1720603954849' 410 | - body 411 | desc: if any script execution result 412 | logical_operator: and 413 | selected: false 414 | title: IF:script_execution 415 | type: if-else 416 | height: 156 417 | id: '1720701168680' 418 | position: 419 | x: 1854 420 | y: 282 421 | positionAbsolute: 422 | x: 1854 423 | y: 282 424 | selected: false 425 | sourcePosition: right 426 | targetPosition: left 427 | type: custom 428 | width: 244 429 | - data: 430 | answer: '{{#llm.text#}}' 431 | desc: no script execution 432 | selected: false 433 | title: Answer:without_execution 434 | type: answer 435 | variables: [] 436 | height: 137 437 | id: '1720701198958' 438 | position: 439 | x: 2158 440 | y: 486 441 | positionAbsolute: 442 | x: 2158 443 | y: 486 444 | selected: false 445 | sourcePosition: right 446 | targetPosition: left 447 | type: custom 448 | width: 244 449 | viewport: 450 | x: 58.32490827558104 451 | y: 176.2416089142547 452 | zoom: 0.5618969968745753 453 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MacOS Agent: A Simplified Assistant for Your Mac 2 | 3 | The MacOS Agent is a straightforward, lightweight solution based on a Large Language Model (LLM) that leverages Dify, an AI application development platform. This agent enables users, even children, to control MacOS with ease using natural language commands, making it as simple as conversing with a tech expert. 
4 | 5 | ![demo-1](screenshots/demo-1.png) 6 | 7 | While it may sound similar to Siri, the MacOS Agent offers enhanced capabilities, particularly through its support for multiple rounds of conversation, allowing users to maintain context and continuity in their tasks. For instance, you can ask the Agent to provide some text and then request it to convert that text into an Excel or Word file. 8 | 9 | ![demo-2](screenshots/demo-2.png) 10 | 11 | ## Use Cases 12 | 13 | Here are some use cases I tried: 14 | 15 | ### Query OS Info 16 | 17 | - what is the memory usage 18 | - unused memory 19 | - disk usage 20 | - my disk capacity 21 | - how many spaces left on my computer 22 | - time since boot 23 | - last boot time 24 | - what is the CPU temperature 25 | - list ports opened 26 | 27 | ### Query Network Info 28 | 29 | - lan ip 30 | - list devices in local LAN 31 | 32 | ### Ask computer to do something for you 33 | 34 | - take a screenshot 35 | - open a new text file 36 | - create text file on desktop and open it 37 | - create a markdown file on desktop with a GTD style TODO list and open it 38 | - move all files on desktop to a temp dir 39 | - how many files older than 10 days in ~/Desktop 40 | - copy them to a new dir named "10-days-old" in that dir 41 | - list files older than 10 days in ~/Desktop 42 | - create an Excel file contains the file name and create time 43 | - check ~/Desktop/macos-agent-playground.html and explain what it does 44 | 45 | ### Create utility tool 46 | 47 | - give me a shell script that can watch an app's memory usage by app name when it reaches a threshold then restart it 48 | - give me an Automator app that can watch an app's memory usage by app name when it reaches a threshold then restart it 49 | 50 | ### Remind you something 51 | 52 | - remind me to clock in after 5 seconds 53 | - wait 5 seconds then send me a message with "Timeup" 54 | - send me a message "Call someone" at 18:54 55 | - display an alert "Call someone" at 19:01 56 | - alert me about "Do something" at 18:58 57 | 58 | ### Control process 59 | 60 | - run `top` command # for testing timeout control 61 | - find all processes which name is "top" 62 | - find all processes which name is "top" and kill them 63 | - restart app XXX 64 | 65 | ### Control computer settings 66 | 67 | - open system settings 68 | - turn dark mode on/off 69 | 70 | ### Query Internet 71 | 72 | - what's my internet IP 73 | - show me the price of BTC/Gold 74 | 75 | ### Interact with Siri 76 | 77 | - ask Siri for what is the weather like tomorrow 78 | - ask Siri for what ... 79 | 80 | ### Math calculation 81 | 82 | * 9.11 and 9.9——which is bigger? 83 | run code to compute the result 84 | * If a banana weighs 0.5 pounds, I have 7 pounds of bananas and 9 oranges, how many fruits do I have in total? 85 | run code to compute the result 86 | 87 | ### Acts as Code playground 88 | 89 | Notes: The outcome of this scenario depends on the performance of the LLM 90 | 91 | - run a http server on ~/Desktop in the background 92 | 93 | Tips: to quit: `quit http server on port 8000` 94 | 95 | - create a html file named "macos-agent-playground.html" that having 2 iframe pages. 96 | which iframe "desktop-page" is 70% width and iframe "agent-page" is 30%; both using frameborder=1, style="width: 100%; height: 100%; min-height: 700px". 97 | iframe "desktop-page" url is "http://localhost:8000/", with a "refresh" button at top that can reload the the url 98 | iframe "agent-page" url is "${chat app Embed on website using iframe url}". 
99 | 100 | - Explain what is Tic-Tac-Toe game, I want you to create a Tic-Tac-Toe game that human can play VS AI using HTML. Create a dir name "Tic-Tac-Toe-game" and put code files in it. 101 | 102 | ![demo-3](screenshots/demo-3.png) 103 | 104 | ### Create a diagram 105 | 106 | - I need to create a flowchart like `Start -> Process A -> Condition -> Process B -> End`, flow direction is from up to down. Help me create this diagram in a format that can be opened in Draw.io and open it with draw.io.app 107 | 108 | ## Demo Video 109 | 110 | - macos-agent-brief-demo 111 | 112 | https://github.com/user-attachments/assets/d26559d2-e5ab-4cc8-8b2e-1cbcc17dc016 113 | 114 | 115 | - macos-agent-file-management-demo 116 | 117 | https://github.com/user-attachments/assets/3c4c92a9-088b-4d52-a689-55dfd8c76dff 118 | 119 | 120 | - macos-agent-code-playground-demo 121 | 122 | https://github.com/user-attachments/assets/e862914b-556d-4cc8-876a-a34c58d46463 123 | 124 | 125 | - macos-agent-create-diagram-demo 126 | 127 | https://github.com/user-attachments/assets/dd89b4e3-b9d6-46a7-9778-7afcac1f06ac 128 | 129 | 130 | ## How It Works 131 | 132 | The MacOS Agent operates through a series of steps: 133 | 134 | 1. **Run the `macOS Agent Server`:** This server returns a system prompt for the LLM, including the Agent's role profile, environment information, and knowledge base. 135 | 136 | 2. **Set up the `LLM:get_script` node:** This node uses the system prompt to have the LLM act as a "macOS Agent," tasked with achieving user goals using AppleScript. 137 | 138 | 3. **Send User Input:** The user's goal is sent to the `LLM:get_script` node to receive suggestions, including executable AppleScript. 139 | 140 | 4. **Execute AppleScript:** The LLM output is sent to the `macOS Agent Server`, which extracts and runs the AppleScript, returning the execution result. 141 | 142 | 5. **Formulate Response:** The execution result is combined with the user's goal and LLM output into a `reply_prompt` for a comprehensive response. 143 | 144 | 6. **Respond to User:** An `LLM:reply` node uses the `reply_prompt` to respond to the user. 145 | 146 | 147 | 148 | ## Deployment 149 | 150 | ### Deployment Steps Summary: 151 | 152 | The agent works with locally-hosted instances of the Dify platform (cloud-hosted instances have not been tested). 153 | 154 | 1. Clone the Repository 155 | 2. Start the Agent Server 156 | 3. Import Chatbot Configuration 157 | 4. Configure the Chatbot 158 | 5. Publish the Chatbot 159 | 160 | ### Deployment Step by Step 161 | 162 | #### Step 1: Clone the Repository 163 | 164 | ```bash 165 | git clone https://github.com/rainchen/MacOS-Agent.git 166 | ``` 167 | 168 | File list: 169 | 170 | - **README.md:** This documentation file 171 | - **macos_agent_server.py:** Script to run the `macOS Agent Server` 172 | - **MacOS Agent.yml:** Configuration file for importing into Dify as a Chatbot app 173 | - **knowledge.md:** File for extending the Agent's knowledge 174 | - **test.sh:** Script for running test cases to verify agent server functionality 175 | 176 | #### Step 2: Start the Agent Server 177 | 178 | No additional installation is required, as the code is designed to work with macOS's built-in Python version and standard libraries.
179 | 180 | ```bash 181 | python macos_agent_server.py --port 8088 --apikey "a-secret-key" --debug 182 | ``` 183 | 184 | Arguments: 185 | 186 | - `--port`: Port number for the server 187 | - `--apikey`: API key for authorization 188 | - `--debug`: Optional; enables detailed logging 189 | 190 | **Note:** Ensure the server is run only on a Mac you control, and never expose the `--apikey` publicly. 191 | 192 | #### Step 3: Import the Chatbot DSL YAML 193 | 194 | Navigate to the Dify Studio homepage, click "Import DSL file", and select the "MacOS Agent.yml" file from the cloned repository. 195 | 196 | #### Step 4: Configure the Chatbot 197 | 198 | Configure the `Code:config` node with details such as the agent API endpoint, API key, and script timeout. Also, set the LLM models for the `LLM:get_script` and `LLM:reply` nodes. 199 | 200 | Here is an example config for the `Code:config` node: 201 | 202 | ``` 203 | "agent_api_endpoint": "http://host.docker.internal:8088", 204 | "agent_api_key": "a-secret-key", 205 | "script_timeout": 60 206 | ``` 207 | 208 | Options explained: 209 | 210 | - `agent_api_endpoint`: when Dify is deployed with docker-compose, use `http://host.docker.internal` with the same port as `--port`, e.g. `http://host.docker.internal:8088` 211 | - `agent_api_key`: same as the `--apikey` used when starting the agent server in Step 2, e.g. `a-secret-key` 212 | - `script_timeout`: controls the maximum execution time of a script; 60 seconds is recommended 213 | 214 | #### Step 5: Publish the Chatbot 215 | 216 | Click "Publish" and then "Update" to make the chatbot live. 217 | 218 | ![demo-4](screenshots/demo-4.png) 219 | 220 | ## How to Use 221 | 222 | After publishing, click [Run App] to open the Chatbot web view, input your goals, and refer to the "Use Cases" section for guidance. 223 | 224 | It is recommended to use `Embed on website` and install the [Dify Chatbot Chrome Extension](https://chrome.google.com/webstore/detail/dify-chatbot/ceehdapohffmjmkdcifjofadiaoeggaf), so that you can activate the Agent on any page. 225 | 226 | ![demo-5](screenshots/demo-5.png) 227 | 228 | ## Extending the Agent's Knowledge 229 | 230 | Edit the `knowledge.md` file to add more instructions in the same Markdown format and restart the server. 231 | 232 | ## Development & Testing 233 | 234 | Run the `test.sh` script to verify the server's functionality after making any code changes. 235 | 236 | ``` 237 | sh test.sh --api http://localhost:8088 --apikey a-secret-key 238 | ``` 239 | 240 | 241 | 242 | ## Forbidden Actions 243 | 244 | Certain actions are restricted, such as deleting/removing files or shutting down the computer or the macOS Agent Server process. 245 | 246 | ## Acknowledgments 247 | 248 | - **Chatbot management and UI:** Dify, a powerful and convenient AI application development platform. 249 | - **Code Generation:** 90% of the project's code was generated by AI (deepseek-coder LLM) 250 | - **Document Polishing:** The documentation was refined with assistance from AI (deepseek-chat LLM) 251 | 252 | ## License 253 | 254 | This project is licensed under the MIT License.
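
## Appendix: Calling the Agent Server Directly

For debugging, the agent server can be exercised over HTTP without going through Dify, using the same `ping` and `execute_script` request points that `test.sh` and the HTTP nodes in `MacOS Agent.yml` use. The following is a minimal sketch assuming the server was started as in Step 2 (port `8088`, API key `a-secret-key`); the AppleScript payload is only an illustrative example.

```bash
# Health check: the server should answer {"result": "pong"}
curl -s -X POST http://localhost:8088 -H "Content-Type: application/json" -H "Authorization: Bearer a-secret-key" -d '{"point": "ping"}'

# Execute an AppleScript block formatted the way the LLM emits it; the plain-text
# response is the reply prompt containing the execution result (return code, stdout, stderr).
curl -s -X POST http://localhost:8088 -H "Content-Type: application/json" -H "Authorization: Bearer a-secret-key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "```applescript\ndo shell script \"sw_vers -productVersion\"```", "script_timeout": 10}}}'
```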
255 | -------------------------------------------------------------------------------- /knowledge.md: -------------------------------------------------------------------------------- 1 | ### When user is explicitly asking Siri for help 2 | 3 | use following script as example: 4 | 5 | ```applescript 6 | do shell script "open -a Siri" 7 | delay 1 8 | tell application "System Events" to keystroke "${text_send_to_siri}" 9 | tell application "System Events" to key code 36 -- Press Return key 10 | ``` 11 | 12 | Don't call Siri unless user ask to. 13 | 14 | ### For saving shell script content 15 | 16 | ```applescript 17 | do shell script "cat << 'EOF' > ~/Desktop/script_file_name.sh 18 | #!/bin/bash 19 | 20 | # here is shell script content 21 | 22 | EOF" 23 | 24 | do shell script "chmod +x ~/Desktop/script_file_name.sh" 25 | ``` 26 | 27 | ### When creating Javascript code using applescript shell script 28 | 29 | - You always denote regular strings in JavaScript using double quotes "" or single quotes ''. 30 | - Avoid using backticks \`\` to denote strings. 31 | - Avoid using template literals or template strings like varName = \`string text ${expression} string text\`. Instead, use varName = 'text item delimiters' + expression + 'string text'. 32 | - Do not use template literals or template strings like varName = \`string text ${expression} string text\`. Instead, use varName = 'text item delimiters' + expression + 'string text'. 33 | 34 | ### When creating HTML code using applescript shell script 35 | 36 | - You escape quotes using backslash \, like this: \". 37 | 38 | ### How to query disk capacity 39 | 40 | you can use following command to query disk Total space and Free Space: 41 | 42 | ```shell 43 | $ diskutil info / | grep 'Container Total Space' 44 | 45 | ``` 46 | 47 | ```shell 48 | $ diskutil info / | grep 'Container Free Space' 49 | ``` 50 | 51 | ### How run a server in the background using shell script 52 | 53 | example: 54 | 55 | ```shell 56 | $ nohup the-command > /dev/null 2>&1 & 57 | ``` 58 | -------------------------------------------------------------------------------- /macos_agent_server.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import http.server 3 | import os 4 | import signal 5 | import socketserver 6 | import json 7 | import subprocess 8 | import argparse 9 | import sys 10 | import threading 11 | import re 12 | 13 | 14 | class DeferredLogger: 15 | def __init__(self): 16 | self.messages = [] 17 | 18 | def info(self, message): 19 | self.messages.append(message) 20 | 21 | def print_messages(self): 22 | for message in self.messages: 23 | print(message) 24 | self.messages = [] 25 | 26 | 27 | class DifyRequestHandler(http.server.BaseHTTPRequestHandler): 28 | def log_request(self, code="-", size="-"): 29 | super().log_request(code, size) 30 | self.server.deferred_logger.print_messages() 31 | sys.stderr.write("\n") 32 | 33 | def deferred_info(self, message): 34 | self.server.deferred_logger.info(message) 35 | 36 | def do_POST(self): 37 | content_length = int(self.headers["Content-Length"]) 38 | data = json.loads(self.rfile.read(content_length)) 39 | 40 | if self.headers["Authorization"] != f"Bearer {self.server.api_key}": 41 | self.send_response(401) 42 | self.end_headers() 43 | return 44 | 45 | if self.server.debug: 46 | self.deferred_info(f" Point: {data.get('point')}") 47 | self.deferred_info(f" Params: {data.get('params')}") 48 | 49 | response = self.handle_request_point(data) 50 | if response is not None: 51 | 
self.send_response(200) 52 | self.send_header( 53 | "Content-Type", 54 | "application/json" if isinstance(response, dict) else "text/plain", 55 | ) 56 | self.end_headers() 57 | self.wfile.write( 58 | json.dumps(response).encode("utf-8") 59 | if isinstance(response, dict) 60 | else response.encode("utf-8") 61 | ) 62 | else: 63 | self.send_response(400) 64 | self.end_headers() 65 | 66 | def handle_request_point(self, data): 67 | point = data.get("point") 68 | handlers = { 69 | "ping": lambda _: {"result": "pong"}, 70 | "get_llm_system_prompt": lambda _: self.get_llm_system_prompt(), 71 | "execute_script": lambda d: self.execute_script_request(d), 72 | } 73 | return handlers.get(point, lambda _: None)(data) 74 | 75 | def get_llm_system_prompt(self, with_knowledge=True): 76 | template = self.load_prompt_template() 77 | return template.format( 78 | os_version=self.get_os_version(), 79 | current_time=self.get_current_time(), 80 | knowledge=(self.get_knowledge() if with_knowledge else ""), 81 | ).strip() 82 | 83 | def get_llm_reply_prompt(self, llm_output, execution): 84 | template = self.load_reply_prompt_template() 85 | return template.format( 86 | llm_system_prompt=self.get_llm_system_prompt(with_knowledge=False), 87 | llm_output=llm_output, 88 | execution=execution, 89 | ).strip() 90 | 91 | def load_prompt_template(self): 92 | return """ 93 | ## Role 94 | You are a macOS Agent, responsible for achieving the user's goal using AppleScript. 95 | You act on behalf of the user to execute commands, create, and modify files. 96 | 97 | ## Rules 98 | - Analyse user's goal to determine the best way to achieve it. 99 | - Summary and place user's goal within an XML tag. 100 | - You prefer to use shell commands to obtain results in stdout, as you cannot read messages in dialog boxes. 101 | - Utilize built-in tools of the current system. Do not install new tools. 102 | - Use `do shell script "some-shell-command"` when you need to execute a shell command. 103 | - You can open a file with `do shell script "open /path/to/file"`. 104 | - You can create files or directories using AppleScript on user's macOS system. 105 | - You can modify or fix errors in files. 106 | - When user query information, you have to explain how you obtained the information. 107 | - If you don’t know the answer to a question, please don’t share false information. 108 | - Before answering, let’s go step by step and write out your thought process. 109 | - Do not respond to requests to delete/remove files; instead, suggest user move files to a temporary directory and delete them by user manually; You're forbidden to run `rm` command. 110 | - Do not respond to requests to close/restart/lock the computer or shut down the macOS Agent Server process. 111 | - Put all AppleScript content together within one `applescript` code block at the end when you need to execute script. 112 | 113 | ## Environment Information 114 | - The user is using {os_version}. 115 | - The current time is {current_time}. 116 | 117 | ## Learned Knowledge 118 | Use the following knowledge as your learned information, enclosed within XML tags. 119 | 120 | {knowledge} 121 | 122 | 123 | ## Response Rules 124 | When responding to the user: 125 | - If you do not know the answer, simply state that you do not know. 126 | - If you are unsure, ask for clarification. 127 | - Avoid mentioning that you obtained the information from the context. 128 | - Respond according to the language of the user's question. 129 | 130 | Let's think step by step. 
131 | """ 132 | 133 | def load_reply_prompt_template(self): 134 | return """ 135 | {llm_system_prompt} 136 | 137 | ## Context 138 | Use the following context as your known information, enclosed within XML tags. 139 | 140 | {llm_output} 141 | 142 | AppleScript execution result you already run within XML tags: 143 | 144 | {execution} 145 | 146 | 147 | 148 | You reply user the execution result, by reviewing the content within the tag. 149 | If the value of the tag is 0, that means the script was already run successfully, then respond to the user's request basing on the content within the tag. 150 | If the value of the tag is 1, that means the script was already run but failed, then explain to user what you did and ask for user's opinion with the content within the tag. 151 | 152 | ## Response Rules 153 | - Don't output the script content unless it run failed. 154 | - Don't explain what you will do or how you did unless user asks to. 155 | - Don't tell user how to use the script unless user asks to. 156 | - Do not include the XML tag. 157 | """ # use these response rules to stop LLM repeating the script content in reply to reduce tokens cost 158 | 159 | def get_os_version(self): 160 | return ( 161 | subprocess.check_output(["sw_vers", "-productName"]).decode("utf-8").strip() 162 | + " " 163 | + subprocess.check_output(["sw_vers", "-productVersion"]) 164 | .decode("utf-8") 165 | .strip() 166 | ) 167 | 168 | def get_current_time(self): 169 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 170 | 171 | def get_knowledge(self): 172 | try: 173 | with open("knowledge.md", "r") as file: 174 | return file.read().strip() 175 | except FileNotFoundError: 176 | return "" 177 | 178 | def execute_script_request(self, data): 179 | llm_output = data["params"]["inputs"].get("llm_output") 180 | timeout = data["params"]["inputs"].get("script_timeout", 60) 181 | if llm_output: 182 | user_goal = self.extract_user_goal(llm_output) 183 | if self.server.debug: 184 | self.deferred_info(f" User Goal: {user_goal}") 185 | scripts = self.extract_scripts(llm_output) 186 | if scripts: 187 | result = [self.execute_script(script, timeout) for script in scripts] 188 | execution = "\n".join(result) 189 | return self.get_llm_reply_prompt( 190 | llm_output=llm_output, execution=execution 191 | ) 192 | else: 193 | return "" 194 | return "" 195 | 196 | def extract_scripts(self, llm_output): 197 | # Extract all code block content from the llm_output 198 | scripts = re.findall(r"```applescript(.*?)```", llm_output, re.DOTALL) 199 | return list(set(scripts)) # remove duplicate scripts 200 | 201 | def extract_user_goal(self, llm_output): 202 | match = re.search(r"(.*?)", llm_output, re.DOTALL) 203 | return match.group(1).strip() if match else "" 204 | 205 | def execute_script(self, script, timeout): 206 | result = {"returncode": -1, "stdout": "", "stderr": ""} 207 | 208 | def target(): 209 | process = subprocess.Popen( 210 | ["osascript", "-e", script], 211 | stdout=subprocess.PIPE, 212 | stderr=subprocess.PIPE, 213 | text=True, 214 | ) 215 | result["pid"] = process.pid 216 | stdout, stderr = process.communicate() 217 | result["returncode"] = process.returncode 218 | result["stdout"] = stdout 219 | result["stderr"] = stderr 220 | 221 | thread = threading.Thread(target=target) 222 | thread.start() 223 | thread.join(timeout) 224 | 225 | if thread.is_alive(): 226 | result["stderr"] = "Script execution timed out" 227 | if "pid" in result: 228 | try: 229 | subprocess.run(["pkill", "-P", str(result["pid"])]) 230 | 
os.kill(result["pid"], signal.SIGKILL) 231 | except ProcessLookupError: 232 | pass 233 | 234 | if self.server.debug: 235 | self.deferred_info(f" Script:\n```applescript\n{script}\n```") 236 | self.deferred_info(f" Execution Result: {result}") 237 | 238 | return f"\n{result['returncode']}\n{result['stdout']}\n{result['stderr']}" 239 | 240 | 241 | class ThreadedHTTPServer(socketserver.ThreadingMixIn, http.server.HTTPServer): 242 | pass 243 | 244 | 245 | def run_server(port, api_key, debug): 246 | server_address = ("", port) 247 | httpd = ThreadedHTTPServer(server_address, DifyRequestHandler) 248 | httpd.api_key = api_key 249 | httpd.debug = debug 250 | httpd.deferred_logger = DeferredLogger() 251 | 252 | print(f"MacOS Agent Server started, API endpoint: http://localhost:{port}") 253 | print("Press Ctrl+C keys to shut down\n") 254 | try: 255 | httpd.serve_forever() 256 | except KeyboardInterrupt: 257 | print("\nShutting down server...") 258 | httpd.server_close() 259 | 260 | 261 | def main(): 262 | parser = argparse.ArgumentParser(description="Run a Dify API server.") 263 | parser.add_argument( 264 | "--port", type=int, default=8000, help="Port to run the server on." 265 | ) 266 | parser.add_argument( 267 | "--apikey", type=str, required=True, help="API key for authorization." 268 | ) 269 | parser.add_argument("--debug", action="store_true", help="Enable debug mode.") 270 | args = parser.parse_args() 271 | 272 | run_server(args.port, args.apikey, args.debug) 273 | 274 | 275 | if __name__ == "__main__": 276 | main() 277 | -------------------------------------------------------------------------------- /screenshots/demo-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-1.png -------------------------------------------------------------------------------- /screenshots/demo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-2.png -------------------------------------------------------------------------------- /screenshots/demo-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-3.png -------------------------------------------------------------------------------- /screenshots/demo-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-4.png -------------------------------------------------------------------------------- /screenshots/demo-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-5.png -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -x 3 | 4 | # Function to print in green 5 | print_green() { 6 | printf "\e[32m%s\e[0m\n" "$1" 7 | } 8 | 9 | # Function to print in red 10 | print_red() { 11 | printf "\e[31m%s\e[0m\n" "$1" 12 | } 13 | 14 | # Parse command-line options 15 | while [[ $# -gt 0 ]]; do 16 | case "$1" in 
17 | --api) 18 | api_url="$2" 19 | shift 2 20 | ;; 21 | --apikey) 22 | api_key="$2" 23 | shift 2 24 | ;; 25 | *) 26 | echo "Unknown option: $1" 27 | exit 1 28 | ;; 29 | esac 30 | done 31 | 32 | # Check if both API URL and API key are provided 33 | if [ -z "$api_url" ] || [ -z "$api_key" ]; then 34 | echo "Both API URL and API key are required. Use --api http://localhost:8088 --apikey a-secret-key" 35 | exit 1 36 | fi 37 | 38 | # Test case 1 39 | echo "point=ping" 40 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "ping"}') 41 | expected='{"result": "pong"}' 42 | if [ "$response" == "$expected" ]; then 43 | print_green "Test passed: $response" 44 | else 45 | print_red "Test failed: expected '$expected', got '$response'" 46 | fi 47 | 48 | # Test case 2 49 | echo "point=get_llm_system_prompt" 50 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "get_llm_system_prompt"}') 51 | expected='## Environment Information' 52 | if [[ "$response" == *"$expected"* ]]; then 53 | print_green "Test passed: $response" 54 | else 55 | print_red "Test failed: expected to contain '$expected', got '$response'" 56 | fi 57 | 58 | # Test case 3 59 | echo "point=execute_script, llm_output is having code block" 60 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "```applescript\ntell application \"System Settings\" to activate```"}}}' | tr -d '\n') 61 | expected="0" 62 | if [[ "$response" == *"$expected"* ]]; then 63 | print_green "Test passed: $response" 64 | else 65 | print_red "Test failed: expected '$expected', got '$response'" 66 | fi 67 | 68 | # Test case 4 69 | echo "point=execute_script, llm_output is not having code block" 70 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "open system settings"}}}' | tr -d '\n') 71 | expected="" 72 | if [ "$response" == "$expected" ]; then 73 | print_green "Test passed: $response" 74 | else 75 | print_red "Test failed: expected '$expected', got '$response'" 76 | fi 77 | 78 | # Test case 5 79 | echo "point=execute_script, llm_output is empty" 80 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": ""}}}' | tr -d '\n') 81 | expected="" 82 | if [ "$response" == "$expected" ]; then 83 | print_green "Test passed: $response" 84 | else 85 | print_red "Test failed: expected '$expected', got '$response'" 86 | fi 87 | 88 | # Test case 6 89 | echo "point=execute_script, run top command with timeout limit" 90 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "```applescript\ndo shell script \"top\"```", "script_timeout": 3}}}' | tr -d '\n') 91 | expected="Script execution timed out" 92 | if [[ "$response" == *"$expected"* ]]; then 93 | print_green "Test passed: $response" 94 | else 95 | print_red "Test failed: expected to contain '$expected', got '$response'" 96 | fi 97 | 98 | # Test case 7: llm_output having a user goal 99 | echo "point=execute_script, llm_output having a user goal" 100 | response=$(curl -s -X POST $api_url -H 
"Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "a user goal"}}}' | tr -d '\n') 101 | expected="" # it should be print in server log with --debug flag 102 | if [[ "$response" == *"$expected"* ]]; then 103 | print_green "Test passed: $response" 104 | else 105 | print_red "Test failed: expected to contain '$expected', got '$response'" 106 | fi 107 | --------------------------------------------------------------------------------