├── LICENSE
├── MacOS Agent.yml
├── README.md
├── knowledge.md
├── macos_agent_server.py
├── screenshots
│   ├── demo-1.png
│   ├── demo-2.png
│   ├── demo-3.png
│   ├── demo-4.png
│   └── demo-5.png
└── test.sh
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Rain Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MacOS Agent.yml:
--------------------------------------------------------------------------------
1 | app:
2 | description: MacOS Agent, achieves the user's goal using AppleScript.
3 | icon: "\U0001F916"
4 | icon_background: '#FFEAD5'
5 | mode: advanced-chat
6 | name: MacOS Agent
7 | workflow:
8 | features:
9 | file_upload:
10 | image:
11 | enabled: false
12 | number_limits: 3
13 | transfer_methods:
14 | - local_file
15 | - remote_url
16 | opening_statement: ''
17 | retriever_resource:
18 | enabled: false
19 | sensitive_word_avoidance:
20 | enabled: false
21 | speech_to_text:
22 | enabled: false
23 | suggested_questions: []
24 | suggested_questions_after_answer:
25 | enabled: false
26 | text_to_speech:
27 | enabled: false
28 | language: ''
29 | voice: ''
30 | graph:
31 | edges:
32 | - data:
33 | isInIteration: false
34 | sourceType: llm
35 | targetType: code
36 | id: llm-source-1720603933600-target
37 | source: llm
38 | sourceHandle: source
39 | target: '1720603933600'
40 | targetHandle: target
41 | type: custom
42 | zIndex: 0
43 | - data:
44 | isInIteration: false
45 | sourceType: code
46 | targetType: http-request
47 | id: 1720603933600-source-1720603954849-target
48 | source: '1720603933600'
49 | sourceHandle: source
50 | target: '1720603954849'
51 | targetHandle: target
52 | type: custom
53 | zIndex: 0
54 | - data:
55 | isInIteration: false
56 | sourceType: http-request
57 | targetType: llm
58 | id: 1720603968110-source-llm-target
59 | source: '1720603968110'
60 | sourceHandle: source
61 | target: llm
62 | targetHandle: target
63 | type: custom
64 | zIndex: 0
65 | - data:
66 | isInIteration: false
67 | sourceType: llm
68 | targetType: answer
69 | id: 1720604011717-source-answer-target
70 | source: '1720604011717'
71 | sourceHandle: source
72 | target: answer
73 | targetHandle: target
74 | type: custom
75 | zIndex: 0
76 | - data:
77 | isInIteration: false
78 | sourceType: start
79 | targetType: code
80 | id: 1720603854640-source-1720604039714-target
81 | source: '1720603854640'
82 | sourceHandle: source
83 | target: '1720604039714'
84 | targetHandle: target
85 | type: custom
86 | zIndex: 0
87 | - data:
88 | isInIteration: false
89 | sourceType: code
90 | targetType: http-request
91 | id: 1720604039714-source-1720603968110-target
92 | source: '1720604039714'
93 | sourceHandle: source
94 | target: '1720603968110'
95 | targetHandle: target
96 | type: custom
97 | zIndex: 0
98 | - data:
99 | isInIteration: false
100 | sourceType: http-request
101 | targetType: if-else
102 | id: 1720603954849-source-1720701168680-target
103 | source: '1720603954849'
104 | sourceHandle: source
105 | target: '1720701168680'
106 | targetHandle: target
107 | type: custom
108 | zIndex: 0
109 | - data:
110 | isInIteration: false
111 | sourceType: if-else
112 | targetType: llm
113 | id: 1720701168680-true-1720604011717-target
114 | source: '1720701168680'
115 | sourceHandle: 'true'
116 | target: '1720604011717'
117 | targetHandle: target
118 | type: custom
119 | zIndex: 0
120 | - data:
121 | isInIteration: false
122 | sourceType: if-else
123 | targetType: answer
124 | id: 1720701168680-false-1720701198958-target
125 | source: '1720701168680'
126 | sourceHandle: 'false'
127 | target: '1720701198958'
128 | targetHandle: target
129 | type: custom
130 | zIndex: 0
131 | nodes:
132 | - data:
133 | desc: ''
134 | selected: false
135 | title: Start
136 | type: start
137 | variables: []
138 | height: 54
139 | id: '1720603854640'
140 | position:
141 | x: 30
142 | y: 282
143 | positionAbsolute:
144 | x: 30
145 | y: 282
146 | selected: false
147 | sourcePosition: right
148 | targetPosition: left
149 | type: custom
150 | width: 244
151 | - data:
152 | context:
153 | enabled: false
154 | variable_selector: []
155 | desc: Retrieve the script from the LLM.
156 | memory:
157 | role_prefix:
158 | assistant: ''
159 | user: ''
160 | window:
161 | enabled: false
162 | size: 10
163 | model:
164 | completion_params:
165 | temperature: 0.7
166 | mode: chat
167 | name: deepseek-chat
168 | provider: deepseek
169 | prompt_template:
170 | - id: 0c599526-2f75-4238-a182-c5824184e45f
171 | role: system
172 | text: '{{#1720603968110.body#}}'
173 | selected: false
174 | title: LLM:get_script
175 | type: llm
176 | variables: []
177 | vision:
178 | enabled: false
179 | height: 128
180 | id: llm
181 | position:
182 | x: 942
183 | y: 282
184 | positionAbsolute:
185 | x: 942
186 | y: 282
187 | selected: false
188 | sourcePosition: right
189 | targetPosition: left
190 | type: custom
191 | width: 244
192 | - data:
193 | answer: '{{#1720604011717.text#}}'
194 | desc: ''
195 | selected: false
196 | title: Answer:with_execution
197 | type: answer
198 | variables: []
199 | height: 107
200 | id: answer
201 | position:
202 | x: 2462
203 | y: 282
204 | positionAbsolute:
205 | x: 2462
206 | y: 282
207 | selected: false
208 | sourcePosition: right
209 | targetPosition: left
210 | type: custom
211 | width: 244
212 | - data:
213 | code: "import json\n\ndef main(llm_output: str, user_input: str) -> dict:\n\
214 | \ # Ensure the llm_output is properly formatted for JSON\n llm_output_formatted\
215 | \ = json.dumps(llm_output)\n user_input_formatted = json.dumps(user_input)\n\
216 | \ return {\n \"llm_output_formatted\": llm_output_formatted,\n\
217 | \ \"user_input_formatted\": user_input_formatted, \n }"
218 | code_language: python3
219 | desc: Format variables as JSON values.
220 | outputs:
221 | llm_output_formatted:
222 | children: null
223 | type: string
224 | user_input_formatted:
225 | children: null
226 | type: string
227 | selected: false
228 | title: Code:format_params
229 | type: code
230 | variables:
231 | - value_selector:
232 | - llm
233 | - text
234 | variable: llm_output
235 | - value_selector:
236 | - sys
237 | - query
238 | variable: user_input
239 | height: 84
240 | id: '1720603933600'
241 | position:
242 | x: 1246
243 | y: 282
244 | positionAbsolute:
245 | x: 1246
246 | y: 282
247 | selected: false
248 | sourcePosition: right
249 | targetPosition: left
250 | type: custom
251 | width: 244
252 | - data:
253 | authorization:
254 | config: null
255 | type: no-auth
256 | body:
257 | data: "{\n \"point\": \"execute_script\",\n \"params\": {\n \
258 | \ \"user_id\": \"{{#sys.user_id#}}\",\n \"inputs\": {\n \
259 | \ \"user_input\": {{#1720603933600.user_input_formatted#}},\n\n \
260 | \ \"llm_output\": {{#1720603933600.llm_output_formatted#}},\n\
261 | \ \"script_timeout\": {{#1720604039714.script_timeout#}}\n\
262 | \ }\n }\n}"
263 | type: json
264 | desc: Send the LLM output to the Agent for execution.
265 | headers: Authorization:Bearer {{#1720604039714.agent_api_key#}}
266 | method: post
267 | params: ''
268 | selected: false
269 | timeout:
270 | max_connect_timeout: 0
271 | max_read_timeout: 0
272 | max_write_timeout: 0
273 | title: Agent:execute_script
274 | type: http-request
275 | url: '{{#1720604039714.agent_api_endpoint#}}'
276 | variables: []
277 | height: 141
278 | id: '1720603954849'
279 | position:
280 | x: 1550
281 | y: 282
282 | positionAbsolute:
283 | x: 1550
284 | y: 282
285 | selected: false
286 | sourcePosition: right
287 | targetPosition: left
288 | type: custom
289 | width: 244
290 | - data:
291 | authorization:
292 | config: null
293 | type: no-auth
294 | body:
295 | data: "{\n \"point\": \"get_llm_system_prompt\",\n \"params\": {\n\
296 | \ \"user_id\": \"{{#sys.user_id#}}\"\n }\n}"
297 | type: json
298 | desc: Retrieve the LLM system prompt from the macOS Agent server.
299 | headers: Authorization:Bearer {{#1720604039714.agent_api_key#}}
300 | method: post
301 | params: ''
302 | selected: false
303 | timeout:
304 | max_connect_timeout: 0
305 | max_read_timeout: 0
306 | max_write_timeout: 0
307 | title: Agent:get_llm_system_prompt
308 | type: http-request
309 | url: '{{#1720604039714.agent_api_endpoint#}}'
310 | variables: []
311 | height: 141
312 | id: '1720603968110'
313 | position:
314 | x: 638
315 | y: 282
316 | positionAbsolute:
317 | x: 638
318 | y: 282
319 | selected: false
320 | sourcePosition: right
321 | targetPosition: left
322 | type: custom
323 | width: 244
324 | - data:
325 | context:
326 | enabled: false
327 | variable_selector: []
328 | desc: Use reply_prompt that includes "execution" as the system prompt for
329 | the LLM to respond to user input.
330 | memory:
331 | query_prompt_template: '{{#sys.query#}}'
332 | role_prefix:
333 | assistant: ''
334 | user: ''
335 | window:
336 | enabled: true
337 | size: 50
338 | model:
339 | completion_params:
340 | temperature: 0.7
341 | mode: chat
342 | name: deepseek-chat
343 | provider: deepseek
344 | prompt_template:
345 | - id: 60cbca91-199d-4764-90d9-2851ab63c9ae
346 | role: system
347 | text: '{{#1720603954849.body#}}'
348 | selected: false
349 | title: LLM:reply
350 | type: llm
351 | variables: []
352 | vision:
353 | enabled: false
354 | height: 164
355 | id: '1720604011717'
356 | position:
357 | x: 2158
358 | y: 282
359 | positionAbsolute:
360 | x: 2158
361 | y: 282
362 | selected: false
363 | sourcePosition: right
364 | targetPosition: left
365 | type: custom
366 | width: 244
367 | - data:
368 | code: "\ndef main() -> dict:\n config = {\n \"agent_api_endpoint\"\
369 | : \"http://host.docker.internal:8088\",\n \"agent_api_key\": \"a-secret-key\"\
370 | ,\n \"script_timeout\": 60\n }\n return {\n \"agent_api_endpoint\"\
371 | : config[\"agent_api_endpoint\"],\n \"agent_api_key\": config[\"\
372 | agent_api_key\"],\n \"script_timeout\": config[\"script_timeout\"\
373 | ],\n }\n"
374 | code_language: python3
375 | desc: Configuration for macOS Agent Server.
376 | outputs:
377 | agent_api_endpoint:
378 | children: null
379 | type: string
380 | agent_api_key:
381 | children: null
382 | type: string
383 | script_timeout:
384 | children: null
385 | type: number
386 | selected: true
387 | title: Code:config
388 | type: code
389 | variables: []
390 | height: 102
391 | id: '1720604039714'
392 | position:
393 | x: 334
394 | y: 282
395 | positionAbsolute:
396 | x: 334
397 | y: 282
398 | selected: true
399 | sourcePosition: right
400 | targetPosition: left
401 | type: custom
402 | width: 244
403 | - data:
404 | conditions:
405 | - comparison_operator: not empty
406 | id: '1720701184521'
407 | value: ''
408 | variable_selector:
409 | - '1720603954849'
410 | - body
411 | desc: if any script execution result
412 | logical_operator: and
413 | selected: false
414 | title: IF:script_execution
415 | type: if-else
416 | height: 156
417 | id: '1720701168680'
418 | position:
419 | x: 1854
420 | y: 282
421 | positionAbsolute:
422 | x: 1854
423 | y: 282
424 | selected: false
425 | sourcePosition: right
426 | targetPosition: left
427 | type: custom
428 | width: 244
429 | - data:
430 | answer: '{{#llm.text#}}'
431 | desc: no script execution
432 | selected: false
433 | title: Answer:without_execution
434 | type: answer
435 | variables: []
436 | height: 137
437 | id: '1720701198958'
438 | position:
439 | x: 2158
440 | y: 486
441 | positionAbsolute:
442 | x: 2158
443 | y: 486
444 | selected: false
445 | sourcePosition: right
446 | targetPosition: left
447 | type: custom
448 | width: 244
449 | viewport:
450 | x: 58.32490827558104
451 | y: 176.2416089142547
452 | zoom: 0.5618969968745753
453 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MacOS Agent: A Simplified Assistant for Your Mac
2 |
3 | The MacOS Agent is a straightforward, lightweight solution based on a Large Language Model (LLM) that leverages Dify, an AI application development platform. This agent enables users, even children, to control MacOS with ease using natural language commands, making it as simple as conversing with a tech expert.
4 |
5 | ![demo-1](screenshots/demo-1.png)
6 |
7 | While it may sound similar to Siri, the MacOS Agent offers enhanced capabilities, particularly through its support for multiple rounds of conversation, allowing users to maintain context and continuity in their tasks. For instance, you can ask the Agent to provide some text and then request it to convert that text into an Excel or Word file.
8 |
9 | ![demo-2](screenshots/demo-2.png)
10 |
11 | ## Use Cases
12 |
13 | Here are some use cases I tried:
14 |
15 | ### Query OS Info
16 |
17 | - what is the memory usage
18 | - unused memory
19 | - disk usage
20 | - my disk capacity
21 | - how many spaces left on my computer
22 | - time since boot
23 | - last boot time
24 | - what is the CPU temperature
25 | - list ports opened
26 |
27 | ### Query Network Info
28 |
29 | - lan ip
30 | - list devices in local LAN
31 |
32 | ### Ask computer to do something for you
33 |
34 | - take a screenshot
35 | - open a new text file
36 | - create text file on desktop and open it
37 | - create a markdown file on desktop with a GTD style TODO list and open it
38 | - move all files on desktop to a temp dir
39 | - how many files older than 10 days in ~/Desktop
40 | - copy them to a new dir named "10-days-old" in that dir
41 | - list files older than 10 days in ~/Desktop
42 | - create an Excel file contains the file name and create time
43 | - check ~/Desktop/macos-agent-playground.html and explain what it does
44 |
45 | ### Create utility tool
46 |
47 | - give me a shell script that can watch an app's memory usage by app name when it reaches a threshold then restart it
48 | - give me an Automator app that can watch an app's memory usage by app name when it reaches a threshold then restart it
49 |
50 | ### Remind you something
51 |
52 | - remind me to clock in after 5 seconds
53 | - wait 5 seconds then send me a message with "Timeup"
54 | - send me a message "Call someone" at 18:54
55 | - display an alert "Call someone" at 19:01
56 | - alert me about "Do something" at 18:58
57 |
58 | ### Control process
59 |
60 | - run `top` command # for testing timeout control
61 | - find all processes which name is "top"
62 | - find all processes which name is "top" and kill them
63 | - restart app XXX
64 |
65 | ### Control computer settings
66 |
67 | - open system settings
68 | - turn dark mode on/off
69 |
70 | ### Query Internet
71 |
72 | - what's my internet IP
73 | - show me the price of BTC/Gold
74 |
75 | ### Interact with Siri
76 |
77 | - ask Siri for what is the weather like tomorrow
78 | - ask Siri for what ...
79 |
80 | ### Math calculation
81 |
82 | * 9.11 and 9.9——which is bigger?
83 | run code to compute the result
84 | * If a banana weighs 0.5 pounds, I have 7 pounds of bananas and 9 oranges, how many fruits do I have in total?
85 | run code to compute the result
86 |
87 | ### Acts as Code playground
88 |
89 | Note: The outcome of this scenario depends on the capabilities of the LLM
90 |
91 | - run a http server on ~/Desktop in the background
92 |
93 | Tip: to stop the server, say `quit http server on port 8000`
94 |
95 | - create a html file named "macos-agent-playground.html" that has 2 iframe pages.
96 | which iframe "desktop-page" is 70% width and iframe "agent-page" is 30%; both using frameborder=1, style="width: 100%; height: 100%; min-height: 700px".
97 | iframe "desktop-page" url is "http://localhost:8000/", with a "refresh" button at top that can reload the url
98 | iframe "agent-page" url is "${chat app Embed on website using iframe url}".
99 |
100 | - Explain what is Tic-Tac-Toe game, I want you to create a Tic-Tac-Toe game that human can play VS AI using HTML. Create a dir name "Tic-Tac-Toe-game" and put code files in it.
101 |
102 | ![demo-3](screenshots/demo-3.png)
103 |
104 | ### Create a diagram
105 |
106 | - I need to create a flowchart like `Start -> Process A -> Condition -> Process B -> End`, flow direction is from top to bottom. Help me create this diagram in a format that can be opened in Draw.io and open it with draw.io.app
107 |
108 | ## Demo Video
109 |
110 | - macos-agent-brief-demo
111 |
112 | https://github.com/user-attachments/assets/d26559d2-e5ab-4cc8-8b2e-1cbcc17dc016
113 |
114 |
115 | - macos-agent-file-management-demo
116 |
117 | https://github.com/user-attachments/assets/3c4c92a9-088b-4d52-a689-55dfd8c76dff
118 |
119 |
120 | - macos-agent-code-playground-demo
121 |
122 | https://github.com/user-attachments/assets/e862914b-556d-4cc8-876a-a34c58d46463
123 |
124 |
125 | - macos-agent-create-diagram-demo
126 |
127 | https://github.com/user-attachments/assets/dd89b4e3-b9d6-46a7-9778-7afcac1f06ac
128 |
129 |
130 | ## How It Works
131 |
132 | The MacOS Agent operates through a series of steps:
133 |
134 | 1. **Run the `macOS Agent Server`:** This server returns a system prompt for the LLM, including the Agent's role profile, environment information, and knowledge base.
135 |
136 | 2. **Set up the `LLM:get_script` node:** This node uses the system prompt to have the LLM act as a "macOS Agent," tasked with achieving user goals using AppleScript.
137 |
138 | 3. **Send User Input:** The user's goal is sent to the `LLM:get_script` node to receive suggestions, including executable AppleScript.
139 |
140 | 4. **Execute AppleScript:** The LLM output is sent to the `macOS Agent Server`, which extracts and runs the AppleScript, returning the execution result.
141 |
142 | 5. **Formulate Response:** The execution result is combined with the user's goal and LLM output into a `reply_prompt` for a comprehensive response.
143 |
144 | 6. **Respond to User:** A `LLM:reply` node uses the `reply_prompt` to respond to the user.
145 |
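Concretely, the whole loop reduces to two HTTP calls against the agent server, which the workflow's HTTP-request nodes perform. The sketch below shows the same exchange from plain Python, assuming the server from Step 2 of the Deployment section is running on `localhost:8088` with the API key `a-secret-key`:

```python
# A minimal sketch of the two agent-server calls, using only the standard library.
import json
import urllib.request

API = "http://localhost:8088"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer a-secret-key",
}


def call(payload: dict) -> str:
    req = urllib.request.Request(
        API, data=json.dumps(payload).encode("utf-8"), headers=HEADERS, method="POST"
    )
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode("utf-8")


# Steps 1-2: fetch the system prompt used by the LLM:get_script node.
system_prompt = call({"point": "get_llm_system_prompt", "params": {"user_id": "demo"}})

# Steps 4-5: send an LLM output containing an applescript code block; the server
# executes it and returns the reply_prompt consumed by the LLM:reply node.
fence = "`" * 3
llm_output = f'{fence}applescript\ndo shell script "echo hello"\n{fence}'
reply_prompt = call({
    "point": "execute_script",
    "params": {"user_id": "demo", "inputs": {"llm_output": llm_output, "script_timeout": 10}},
})
print(reply_prompt)
```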
146 |
147 |
148 | ## Deployment
149 |
150 | ### Deployment Steps Summary:
151 |
152 | The agent works with locally-hosted instances of the Dify platform (the cloud-hosted version has not been tested).
153 |
154 | 1. Clone the Repository
155 | 2. Start the Agent Server
156 | 3. Import Chatbot Configuration
157 | 4. Configure the Chatbot
158 | 5. Publish the Chatbot
159 |
160 | ### Deployment Step by Step
161 |
162 | #### Step 1: Clone the Repository
163 |
164 | ```bash
165 | git clone https://github.com/rainchen/MacOS-Agent.git
166 | ```
167 |
168 | File list:
169 |
170 | - **README.md:** This documentation file
171 | - **macos_agent_server.py:** Script to run the `macOS Agent Server`
172 | - **MacOS Agent.yml:** Configuration file for importing into Dify as a Chatbot app
173 | - **knowledge.md:** File for extending the Agent's knowledge
174 | - **test.sh:** Script for running test cases to verify agent server functionality
175 |
176 | #### Step 2: Start the Agent Server
177 |
178 | No additional installations are required as the code is designed to work with MacOS's built-in Python version and standard libraries.
179 |
180 | ```bash
181 | python macos_agent_server.py --port 8088 --apikey "a-secret-key" --debug
182 | ```
183 |
184 | Arguments:
185 |
186 | - `--port`: Port number for the server
187 | - `--apikey`: API key for authorization
188 | - `--debug`: Optional; enables detailed logging
189 |
190 | **Note:** Only run the server on a Mac you are authorized to control, and never expose the `--apikey` publicly.
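
Before moving on, you can optionally confirm the server is reachable. A minimal check against the built-in `ping` point (the same request `test.sh` sends in its first test case), assuming the port and key from the command above, using only the bundled Python standard library:

```python
# Quick health check; the server should answer {"result": "pong"}.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8088",  # assumes --port 8088
    data=json.dumps({"point": "ping"}).encode("utf-8"),
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer a-secret-key",  # assumes --apikey "a-secret-key"
    },
    method="POST",
)
print(urllib.request.urlopen(req).read().decode("utf-8"))  # expect: {"result": "pong"}
```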
191 |
192 | #### Step 3: Import the Chatbot DSL YAML
193 |
194 | Navigate to the Dify Studio homepage, click "Import DSL file", and select the "MacOS Agent.yml" file from the cloned repository.
195 |
196 | #### Step 4: Configure the Chatbot
197 |
198 | Configure the `Code:config` node with details such as the agent API endpoint, API key, and script timeout. Also, set the LLM models for the `LLM:get_script` and `LLM:reply` nodes.
199 |
200 | Here is the example config for `Code:config` node:
201 |
202 | ```
203 | "agent_api_endpoint": "http://host.docker.internal:8088",
204 | "agent_api_key": "a-secret-key",
205 | "script_timeout": 60
206 | ```
207 |
208 | Options explained:
209 |
210 | - `agent_api_endpoint`: the agent server address reachable from Dify; when Dify is deployed using docker-compose, use `host.docker.internal` with the same port as `--port`, e.g. `http://host.docker.internal:8088`
211 | - `agent_api_key`: the same value as the `--apikey` used when starting the agent server, e.g. `a-secret-key`
212 | - `script_timeout`: the maximum execution time of a script, in seconds; 60 is recommended
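
For reference, the `Code:config` node shipped in `MacOS Agent.yml` is just a small Python function that returns these three values; edit the `config` dict to match your own setup:

```python
def main() -> dict:
    # Values here must match the running agent server (see Step 2).
    config = {
        "agent_api_endpoint": "http://host.docker.internal:8088",
        "agent_api_key": "a-secret-key",
        "script_timeout": 60,
    }
    return {
        "agent_api_endpoint": config["agent_api_endpoint"],
        "agent_api_key": config["agent_api_key"],
        "script_timeout": config["script_timeout"],
    }
```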
213 |
214 | #### Step 5: Publish the Chatbot
215 |
216 | Click "Publish" and then "Update" to make the chatbot live.
217 |
218 | ![demo-4](screenshots/demo-4.png)
219 |
220 | ## How to Use
221 |
222 | After publishing, click [Run App] to open the Chatbot web view, input your goals, and refer to the "Use Cases" section for guidance.
223 |
224 | It is recommended to use `Embed on website` and install the [Dify Chatbot Chrome Extension](https://chrome.google.com/webstore/detail/dify-chatbot/ceehdapohffmjmkdcifjofadiaoeggaf), so that you can activate the Agent on any page.
225 |
226 | ![demo-5](screenshots/demo-5.png)
227 |
228 | ## Extending Agent's Knowledge
229 |
230 | Edit the `knowledge.md` file to add more instructions in the same Markdown format and restart the server.
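
New entries follow the same pattern as the existing ones: a short `###` heading describing when the rule applies, followed by the instruction or snippet the LLM should use. A purely hypothetical entry, just to illustrate the shape:

```markdown
### How to check battery status

you can use the following command to query the battery level and charging state:

`pmset -g batt`
```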
231 |
232 | ## Development & Testing
233 |
234 | Run the `test.sh` script to verify the server's functionality after making any code changes.
235 |
236 | ```
237 | sh test.sh --api http://localhost:8088 --apikey a-secret-key
238 | ```
239 |
240 |
241 |
242 | ## Forbidden Actions
243 |
244 | Certain actions are restricted, such as deleting/removing files, shutting down the computer, or terminating the macOS Agent Server process.
245 |
246 | ## Acknowledgments
247 |
248 | - **Chatbot management and UI:** Dify, a powerful and convenient AI application development platform.
249 | - **Code Generation:** 90% of the project's code was generated by AI (deepseek-coder LLM).
250 | - **Document Polishing:** The documentation was refined with assistance from AI (deepseek-chat LLM).
251 |
252 | ## License
253 |
254 | This project is licensed under the MIT License.
255 |
--------------------------------------------------------------------------------
/knowledge.md:
--------------------------------------------------------------------------------
1 | ### When user is explicitly asking Siri for help
2 |
3 | use the following script as an example:
4 |
5 | ```applescript
6 | do shell script "open -a Siri"
7 | delay 1
8 | tell application "System Events" to keystroke "${text_send_to_siri}"
9 | tell application "System Events" to key code 36 -- Press Return key
10 | ```
11 |
12 | Don't call Siri unless the user asks to.
13 |
14 | ### For saving shell script content
15 |
16 | ```applescript
17 | do shell script "cat << 'EOF' > ~/Desktop/script_file_name.sh
18 | #!/bin/bash
19 |
20 | # here is shell script content
21 |
22 | EOF"
23 |
24 | do shell script "chmod +x ~/Desktop/script_file_name.sh"
25 | ```
26 |
27 | ### When creating JavaScript code using an AppleScript shell script
28 |
29 | - You always denote regular strings in JavaScript using double quotes "" or single quotes ''.
30 | - Avoid using backticks \`\` to denote strings.
31 | - Do not use template literals or template strings like varName = \`string text ${expression} string text\`. Instead, use varName = 'string text' + expression + 'string text'.
32 |
33 |
34 | ### When creating HTML code using applescript shell script
35 |
36 | - You escape quotes using backslash \, like this: \".
37 |
38 | ### How to query disk capacity
39 |
40 | you can use the following commands to query the disk's total space and free space:
41 |
42 | ```shell
43 | $ diskutil info / | grep 'Container Total Space'
44 |
45 | ```
46 |
47 | ```shell
48 | $ diskutil info / | grep 'Container Free Space'
49 | ```
50 |
51 | ### How to run a server in the background using a shell script
52 |
53 | example:
54 |
55 | ```shell
56 | $ nohup the-command > /dev/null 2>&1 &
57 | ```
58 |
--------------------------------------------------------------------------------
/macos_agent_server.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import http.server
3 | import os
4 | import signal
5 | import socketserver
6 | import json
7 | import subprocess
8 | import argparse
9 | import sys
10 | import threading
11 | import re
12 |
13 |
14 | class DeferredLogger:
15 | def __init__(self):
16 | self.messages = []
17 |
18 | def info(self, message):
19 | self.messages.append(message)
20 |
21 | def print_messages(self):
22 | for message in self.messages:
23 | print(message)
24 | self.messages = []
25 |
26 |
27 | class DifyRequestHandler(http.server.BaseHTTPRequestHandler):
28 | def log_request(self, code="-", size="-"):
29 | super().log_request(code, size)
30 | self.server.deferred_logger.print_messages()
31 | sys.stderr.write("\n")
32 |
33 | def deferred_info(self, message):
34 | self.server.deferred_logger.info(message)
35 |
36 | def do_POST(self):
37 | content_length = int(self.headers["Content-Length"])
38 | data = json.loads(self.rfile.read(content_length))
39 |
40 | if self.headers["Authorization"] != f"Bearer {self.server.api_key}":
41 | self.send_response(401)
42 | self.end_headers()
43 | return
44 |
45 | if self.server.debug:
46 | self.deferred_info(f" Point: {data.get('point')}")
47 | self.deferred_info(f" Params: {data.get('params')}")
48 |
49 | response = self.handle_request_point(data)
50 | if response is not None:
51 | self.send_response(200)
52 | self.send_header(
53 | "Content-Type",
54 | "application/json" if isinstance(response, dict) else "text/plain",
55 | )
56 | self.end_headers()
57 | self.wfile.write(
58 | json.dumps(response).encode("utf-8")
59 | if isinstance(response, dict)
60 | else response.encode("utf-8")
61 | )
62 | else:
63 | self.send_response(400)
64 | self.end_headers()
65 |
66 | def handle_request_point(self, data):
67 | point = data.get("point")
68 | handlers = {
69 | "ping": lambda _: {"result": "pong"},
70 | "get_llm_system_prompt": lambda _: self.get_llm_system_prompt(),
71 | "execute_script": lambda d: self.execute_script_request(d),
72 | }
73 | return handlers.get(point, lambda _: None)(data)
74 |
75 | def get_llm_system_prompt(self, with_knowledge=True):
76 | template = self.load_prompt_template()
77 | return template.format(
78 | os_version=self.get_os_version(),
79 | current_time=self.get_current_time(),
80 | knowledge=(self.get_knowledge() if with_knowledge else ""),
81 | ).strip()
82 |
83 | def get_llm_reply_prompt(self, llm_output, execution):
84 | template = self.load_reply_prompt_template()
85 | return template.format(
86 | llm_system_prompt=self.get_llm_system_prompt(with_knowledge=False),
87 | llm_output=llm_output,
88 | execution=execution,
89 | ).strip()
90 |
91 | def load_prompt_template(self):
92 | return """
93 | ## Role
94 | You are a macOS Agent, responsible for achieving the user's goal using AppleScript.
95 | You act on behalf of the user to execute commands, create, and modify files.
96 |
97 | ## Rules
98 | - Analyse the user's goal to determine the best way to achieve it.
99 | - Summarise and place the user's goal within <user_goal></user_goal> XML tags.
100 | - You prefer to use shell commands to obtain results in stdout, as you cannot read messages in dialog boxes.
101 | - Utilize built-in tools of the current system. Do not install new tools.
102 | - Use `do shell script "some-shell-command"` when you need to execute a shell command.
103 | - You can open a file with `do shell script "open /path/to/file"`.
104 | - You can create files or directories using AppleScript on user's macOS system.
105 | - You can modify or fix errors in files.
106 | - When the user queries information, you have to explain how you obtained the information.
107 | - If you don’t know the answer to a question, please don’t share false information.
108 | - Before answering, let’s go step by step and write out your thought process.
109 | - Do not respond to requests to delete/remove files; instead, suggest the user move the files to a temporary directory and delete them manually. You are forbidden to run the `rm` command.
110 | - Do not respond to requests to close/restart/lock the computer or shut down the macOS Agent Server process.
111 | - Put all AppleScript content together within one `applescript` code block at the end when you need to execute a script.
112 |
113 | ## Environment Information
114 | - The user is using {os_version}.
115 | - The current time is {current_time}.
116 |
117 | ## Learned Knowledge
118 | Use the following knowledge as your learned information, enclosed within <knowledge></knowledge> XML tags.
119 | <knowledge>
120 | {knowledge}
121 | </knowledge>
122 |
123 | ## Response Rules
124 | When responding to the user:
125 | - If you do not know the answer, simply state that you do not know.
126 | - If you are unsure, ask for clarification.
127 | - Avoid mentioning that you obtained the information from the context.
128 | - Respond according to the language of the user's question.
129 |
130 | Let's think step by step.
131 | """
132 |
133 | def load_reply_prompt_template(self):
134 | return """
135 | {llm_system_prompt}
136 |
137 | ## Context
138 | Use the following context as your known information, enclosed within <context></context> XML tags.
139 | <context>
140 | {llm_output}
141 | </context>
142 | AppleScript execution results that you already ran, each enclosed within <execution></execution> XML tags:
143 |
144 | {execution}
145 |
146 |
147 |
148 | Reply to the user with the execution result by reviewing the content within the <execution> tags.
149 | If the value of the <returncode> tag is 0, the script already ran successfully; respond to the user's request based on the content within the <stdout> tag.
150 | If the value of the <returncode> tag is 1, the script ran but failed; explain to the user what you did and ask for the user's opinion based on the content within the <stderr> tag.
151 |
152 | ## Response Rules
153 | - Don't output the script content unless it failed to run.
154 | - Don't explain what you will do or how you did it unless the user asks.
155 | - Don't tell the user how to use the script unless the user asks.
156 | - Do not include the XML tags in your reply.
157 | """  # these response rules stop the LLM from repeating the script content in its reply, to reduce token cost
158 |
159 | def get_os_version(self):
160 | return (
161 | subprocess.check_output(["sw_vers", "-productName"]).decode("utf-8").strip()
162 | + " "
163 | + subprocess.check_output(["sw_vers", "-productVersion"])
164 | .decode("utf-8")
165 | .strip()
166 | )
167 |
168 | def get_current_time(self):
169 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
170 |
171 | def get_knowledge(self):
172 | try:
173 | with open("knowledge.md", "r") as file:
174 | return file.read().strip()
175 | except FileNotFoundError:
176 | return ""
177 |
178 | def execute_script_request(self, data):
179 | llm_output = data["params"]["inputs"].get("llm_output")
180 | timeout = data["params"]["inputs"].get("script_timeout", 60)
181 | if llm_output:
182 | user_goal = self.extract_user_goal(llm_output)
183 | if self.server.debug:
184 | self.deferred_info(f" User Goal: {user_goal}")
185 | scripts = self.extract_scripts(llm_output)
186 | if scripts:
187 | result = [self.execute_script(script, timeout) for script in scripts]
188 | execution = "\n".join(result)
189 | return self.get_llm_reply_prompt(
190 | llm_output=llm_output, execution=execution
191 | )
192 | else:
193 | return ""
194 | return ""
195 |
196 | def extract_scripts(self, llm_output):
197 | # Extract all code block content from the llm_output
198 | scripts = re.findall(r"```applescript(.*?)```", llm_output, re.DOTALL)
199 | return list(set(scripts)) # remove duplicate scripts
200 |
201 | def extract_user_goal(self, llm_output):
202 | match = re.search(r"<user_goal>(.*?)</user_goal>", llm_output, re.DOTALL)
203 | return match.group(1).strip() if match else ""
204 |
205 | def execute_script(self, script, timeout):
206 | result = {"returncode": -1, "stdout": "", "stderr": ""}
207 |
208 | def target():
209 | process = subprocess.Popen(
210 | ["osascript", "-e", script],
211 | stdout=subprocess.PIPE,
212 | stderr=subprocess.PIPE,
213 | text=True,
214 | )
215 | result["pid"] = process.pid
216 | stdout, stderr = process.communicate()
217 | result["returncode"] = process.returncode
218 | result["stdout"] = stdout
219 | result["stderr"] = stderr
220 |
221 | thread = threading.Thread(target=target)
222 | thread.start()
223 | thread.join(timeout)
224 |
225 | if thread.is_alive():
226 | result["stderr"] = "Script execution timed out"
227 | if "pid" in result:
228 | try:
229 | subprocess.run(["pkill", "-P", str(result["pid"])])
230 | os.kill(result["pid"], signal.SIGKILL)
231 | except ProcessLookupError:
232 | pass
233 |
234 | if self.server.debug:
235 | self.deferred_info(f" Script:\n```applescript\n{script}\n```")
236 | self.deferred_info(f" Execution Result: {result}")
237 |
238 | return f"<execution>\n<returncode>{result['returncode']}</returncode>\n<stdout>{result['stdout']}</stdout>\n<stderr>{result['stderr']}</stderr>\n</execution>"
239 |
240 |
241 | class ThreadedHTTPServer(socketserver.ThreadingMixIn, http.server.HTTPServer):
242 | pass
243 |
244 |
245 | def run_server(port, api_key, debug):
246 | server_address = ("", port)
247 | httpd = ThreadedHTTPServer(server_address, DifyRequestHandler)
248 | httpd.api_key = api_key
249 | httpd.debug = debug
250 | httpd.deferred_logger = DeferredLogger()
251 |
252 | print(f"MacOS Agent Server started, API endpoint: http://localhost:{port}")
253 | print("Press Ctrl+C keys to shut down\n")
254 | try:
255 | httpd.serve_forever()
256 | except KeyboardInterrupt:
257 | print("\nShutting down server...")
258 | httpd.server_close()
259 |
260 |
261 | def main():
262 | parser = argparse.ArgumentParser(description="Run a Dify API server.")
263 | parser.add_argument(
264 | "--port", type=int, default=8000, help="Port to run the server on."
265 | )
266 | parser.add_argument(
267 | "--apikey", type=str, required=True, help="API key for authorization."
268 | )
269 | parser.add_argument("--debug", action="store_true", help="Enable debug mode.")
270 | args = parser.parse_args()
271 |
272 | run_server(args.port, args.apikey, args.debug)
273 |
274 |
275 | if __name__ == "__main__":
276 | main()
277 |
--------------------------------------------------------------------------------
/screenshots/demo-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-1.png
--------------------------------------------------------------------------------
/screenshots/demo-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-2.png
--------------------------------------------------------------------------------
/screenshots/demo-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-3.png
--------------------------------------------------------------------------------
/screenshots/demo-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-4.png
--------------------------------------------------------------------------------
/screenshots/demo-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rainchen/MacOS-Agent/4279d548432c5ff96c1b8bb8248df6afc81fd719/screenshots/demo-5.png
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # set -x
3 |
4 | # Function to print in green
5 | print_green() {
6 | printf "\e[32m%s\e[0m\n" "$1"
7 | }
8 |
9 | # Function to print in red
10 | print_red() {
11 | printf "\e[31m%s\e[0m\n" "$1"
12 | }
13 |
14 | # Parse command-line options
15 | while [[ $# -gt 0 ]]; do
16 | case "$1" in
17 | --api)
18 | api_url="$2"
19 | shift 2
20 | ;;
21 | --apikey)
22 | api_key="$2"
23 | shift 2
24 | ;;
25 | *)
26 | echo "Unknown option: $1"
27 | exit 1
28 | ;;
29 | esac
30 | done
31 |
32 | # Check if both API URL and API key are provided
33 | if [ -z "$api_url" ] || [ -z "$api_key" ]; then
34 | echo "Both API URL and API key are required. Use --api http://localhost:8088 --apikey a-secret-key"
35 | exit 1
36 | fi
37 |
38 | # Test case 1
39 | echo "point=ping"
40 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "ping"}')
41 | expected='{"result": "pong"}'
42 | if [ "$response" == "$expected" ]; then
43 | print_green "Test passed: $response"
44 | else
45 | print_red "Test failed: expected '$expected', got '$response'"
46 | fi
47 |
48 | # Test case 2
49 | echo "point=get_llm_system_prompt"
50 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "get_llm_system_prompt"}')
51 | expected='## Environment Information'
52 | if [[ "$response" == *"$expected"* ]]; then
53 | print_green "Test passed: $response"
54 | else
55 | print_red "Test failed: expected to contain '$expected', got '$response'"
56 | fi
57 |
58 | # Test case 3
59 | echo "point=execute_script, llm_output is having code block"
60 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "```applescript\ntell application \"System Settings\" to activate```"}}}' | tr -d '\n')
61 | expected="0"
62 | if [[ "$response" == *"$expected"* ]]; then
63 | print_green "Test passed: $response"
64 | else
65 | print_red "Test failed: expected '$expected', got '$response'"
66 | fi
67 |
68 | # Test case 4
69 | echo "point=execute_script, llm_output is not having code block"
70 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "open system settings"}}}' | tr -d '\n')
71 | expected=""
72 | if [ "$response" == "$expected" ]; then
73 | print_green "Test passed: $response"
74 | else
75 | print_red "Test failed: expected '$expected', got '$response'"
76 | fi
77 |
78 | # Test case 5
79 | echo "point=execute_script, llm_output is empty"
80 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": ""}}}' | tr -d '\n')
81 | expected=""
82 | if [ "$response" == "$expected" ]; then
83 | print_green "Test passed: $response"
84 | else
85 | print_red "Test failed: expected '$expected', got '$response'"
86 | fi
87 |
88 | # Test case 6
89 | echo "point=execute_script, run top command with timeout limit"
90 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "```applescript\ndo shell script \"top\"```", "script_timeout": 3}}}' | tr -d '\n')
91 | expected="Script execution timed out"
92 | if [[ "$response" == *"$expected"* ]]; then
93 | print_green "Test passed: $response"
94 | else
95 | print_red "Test failed: expected to contain '$expected', got '$response'"
96 | fi
97 |
98 | # Test case 7: llm_output having a user goal
99 | echo "point=execute_script, llm_output having a user goal"
100 | response=$(curl -s -X POST $api_url -H "Content-Type: application/json" -H "Authorization: Bearer $api_key" -d '{"point": "execute_script", "params": {"inputs": {"llm_output": "<user_goal>a user goal</user_goal>"}}}' | tr -d '\n')
101 | expected="" # the user goal should be printed in the server log when --debug is enabled
102 | if [[ "$response" == *"$expected"* ]]; then
103 | print_green "Test passed: $response"
104 | else
105 | print_red "Test failed: expected to contain '$expected', got '$response'"
106 | fi
107 |
--------------------------------------------------------------------------------