├── LICENSE
├── README.md
└── natbot.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Nat Friedman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # natbot
2 |
3 | Drive a browser with GPT-3
4 |
5 | Here's a demo: https://twitter.com/natfriedman/status/1575631194032549888
6 |
7 | Lots of ideas for improvement:
8 | - Better prompt
9 | - Prompt chaining
10 | - Make a recorder to collect human feedback and do better few-shot
11 | - Better DOM serialization
12 | - Let the agent use multiple tabs and switch between them
13 |
14 | Improvements welcome!
15 |
--------------------------------------------------------------------------------
/natbot.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # natbot.py
4 | #
5 | # Set OPENAI_API_KEY to your API key, and then run this from a terminal.
6 | #
7 |
8 | from playwright.sync_api import sync_playwright
9 | import time
10 | from sys import argv, exit, platform
11 | import openai
12 | import os
13 |
# Quiet mode suppresses the page dump printed on every loop iteration. It is
# enabled only when the FIRST command-line argument is -q or --quiet.
quiet = len(argv) >= 2 and argv[1] in ("-q", "--quiet")
if quiet:
    print(
        "Running in quiet mode (HTML and other content hidden); \n"
        + "exercise caution when running suggested commands."
    )
22 |
# Few-shot prompt sent to the completion model. get_gpt_command() fills in the
# $browser_content, $objective, $url and $previous_command placeholders with
# plain str.replace calls.
#
# NOTE(review): the element markup in the format description and in the three
# examples had been stripped from this string, leaving bare text and blank
# lines. It is rebuilt here in the `<name id=N ...>text</name>` /
# `<name id=N .../>` shape that Crawler.crawl() produces, with ids chosen so
# the example commands (TYPESUBMIT 8 / 8 / 12) point at the input elements.
# Labels of elements whose text was lost are a best-effort reconstruction --
# confirm against the upstream prompt.
prompt_template = """
You are an agent controlling a browser. You are given:

(1) an objective that you are trying to achieve
(2) the URL of your current web page
(3) a simplified text description of what's visible in the browser window (more on that below)

You can issue these commands:
SCROLL UP - scroll up one page
SCROLL DOWN - scroll down one page
CLICK X - click on a given element. You can only click on links, buttons, and inputs!
TYPE X "TEXT" - type the specified text into the input with id X
TYPESUBMIT X "TEXT" - same as TYPE above, except then it presses ENTER to submit the form

The format of the browser content is highly simplified; all formatting elements are stripped.
Interactive elements such as links, inputs, buttons are represented like this:

<link id=1>text</link>
<button id=2>text</button>
<input id=3>text</input>

Images are rendered as their alt text like this:

<img id=4 alt=""/>

Based on your given objective, issue whatever command you believe will get you closest to achieving your goal.
You always start on Google; you should submit a search query to Google that will take you to the best page for
achieving your objective. And then interact with that page to achieve your objective.

If you find yourself on Google and there are no search results displayed yet, you should probably issue a command
like "TYPESUBMIT 7 "search query"" to get to a more useful page.

Then, if you find yourself on a Google search results page, you might issue the command "CLICK 24" to click
on the first link in the search results. (If your previous command was a TYPESUBMIT your next command should
probably be a CLICK.)

Don't try to interact with elements that you can't see.

Here are some examples:

EXAMPLE 1:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<text id=18>Settings</text>
------------------
OBJECTIVE: Find a 2 bedroom house for sale in Anchorage AK for under $750k
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "anchorage redfin"
==================================================

EXAMPLE 2:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<text id=18>Settings</text>
------------------
OBJECTIVE: Make a reservation for 4 at Dorsia at 8pm
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "dorsia nyc opentable"
==================================================

EXAMPLE 3:
==================================================
CURRENT BROWSER CONTENT:
------------------
<button id=1>For Businesses</button>
<button id=2>Mobile</button>
<button id=3>Help</button>
<button id=4 alt="Language Picker">EN</button>
<link id=5>OpenTable logo</link>
<button id=6 alt="search">Search</button>
<text id=7>Find your table for any occasion</text>
<button id=8>(Date selector)</button>
<text id=9>Sep 28, 2022</text>
<text id=10>7:00 PM</text>
<text id=11>2 people</text>
<input id=12 alt="Location, Restaurant, or Cuisine"></input>
<button id=13>Let's go</button>
<text id=14>It looks like you're in Peninsula. Not correct?</text>
<button id=15>Get current location</button>
<button id=16>Next</button>
------------------
OBJECTIVE: Make a reservation for 4 for dinner at Dorsia in New York City at 8pm
CURRENT URL: https://www.opentable.com/
YOUR COMMAND:
TYPESUBMIT 12 "dorsia new york city"
==================================================

The current browser content, objective, and current URL follow. Reply with your next command to the browser.

CURRENT BROWSER CONTENT:
------------------
$browser_content
------------------

OBJECTIVE: $objective
CURRENT URL: $url
PREVIOUS COMMAND: $previous_command
YOUR COMMAND:
"""
160 |
# Node names that Crawler.crawl() skips outright when building the page
# description.
black_listed_elements = {
    "html", "head", "title", "meta", "iframe", "body",
    "script", "style", "path", "svg", "br", "::marker",
}


class Crawler:
    """Own a visible Chromium page and describe it as numbered text elements.

    crawl() snapshots the DOM through a CDP session and returns one markup
    line per element of interest; click() and type() then act on the ids that
    the most recent crawl() assigned.
    """

    def __init__(self):
        """Launch a headed Chromium browser and open a 1280x1080 page."""
        self.browser = (
            sync_playwright()
            .start()
            .chromium.launch(
                headless=False,
            )
        )

        self.page = self.browser.new_page()
        self.page.set_viewport_size({"width": 1280, "height": 1080})
        # Created here as well as in go_to_page() so that click() before the
        # first navigation reports "Could not find element" instead of raising
        # AttributeError.
        self.page_element_buffer = {}

    def go_to_page(self, url):
        """Navigate to *url* (``http://`` is prepended when no scheme is given).

        Also opens the CDP session used by crawl() and resets the id buffer.
        """
        self.page.goto(url=url if "://" in url else "http://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page_element_buffer = {}

    def scroll(self, direction):
        """Scroll one window height; *direction* is "up" or "down".

        Any other value is ignored.
        """
        if direction == "up":
            sign = "-"
        elif direction == "down":
            sign = "+"
        else:
            return
        self.page.evaluate(
            "(document.scrollingElement || document.body).scrollTop = "
            "(document.scrollingElement || document.body).scrollTop "
            f"{sign} window.innerHeight;"
        )

    def click(self, id):
        """Click the centre of the element that the last crawl() numbered *id*.

        *id* may be an int or a numeric string. Unknown or non-numeric ids
        print "Could not find element" and do nothing else.
        """
        # Strip target= from every link first so a click never opens a new tab;
        # the crawler only ever watches self.page.
        js = """
        links = document.getElementsByTagName("a");
        for (var i = 0; i < links.length; i++) {
            links[i].removeAttribute("target");
        }
        """
        self.page.evaluate(js)

        try:
            element = self.page_element_buffer.get(int(id))
        except (TypeError, ValueError):
            # The id usually comes from model output and may not be a number.
            element = None

        if element:
            self.page.mouse.click(element.get("center_x"), element.get("center_y"))
        else:
            print("Could not find element")

    def type(self, id, text):
        """Click element *id* to focus it, then type *text* on the keyboard."""
        self.click(id)
        self.page.keyboard.type(text)

    def enter(self):
        """Press the Enter key."""
        self.page.keyboard.press("Enter")

    def crawl(self):
        """Snapshot the page and return its elements of interest as markup.

        Returns:
            A list of strings, one per retained element, shaped
            ``<name id=N meta>text</name>`` or ``<name id=N meta/>`` when the
            element has no text. ``name`` is one of link, input, img, button
            or text.

        Side effects:
            Rebuilds ``self.page_element_buffer`` (id -> element record) and
            prints the parsing time.
        """
        page = self.page
        # Ids are reassigned from 0 on every crawl. Drop the previous entries
        # so a stale id can never resolve to an element that is no longer
        # listed.
        page_element_buffer = self.page_element_buffer
        page_element_buffer.clear()
        start = time.time()

        device_pixel_ratio = page.evaluate("window.devicePixelRatio")
        if platform == "darwin" and device_pixel_ratio == 1:  # lies
            device_pixel_ratio = 2

        win_upper_bound = page.evaluate("window.pageYOffset")
        win_left_bound = page.evaluate("window.pageXOffset")
        # NOTE(review): this sizes the "viewport" from the screen, not from
        # window.innerWidth/innerHeight -- confirm that is intended.
        win_width = page.evaluate("window.screen.width")
        win_height = page.evaluate("window.screen.height")
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height

        tree = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings = tree["strings"]
        document = tree["documents"][0]
        nodes = document["nodes"]
        backend_node_id = nodes["backendNodeId"]
        attributes = nodes["attributes"]
        node_value = nodes["nodeValue"]
        parent = nodes["parentIndex"]
        node_names = nodes["nodeName"]
        is_clickable = set(nodes["isClickable"]["index"])

        input_value = nodes["inputValue"]
        input_value_index = input_value["index"]
        input_value_values = input_value["value"]

        layout = document["layout"]
        layout_node_index = layout["nodeIndex"]
        bounds = layout["bounds"]

        # node index -> position in the layout arrays (first occurrence wins,
        # matching list.index). Built once instead of scanning per node.
        layout_position = {}
        for position, laid_out_index in enumerate(layout_node_index):
            layout_position.setdefault(laid_out_index, position)

        # Text and attributes found inside a link/button, keyed by the
        # str(node index) of that link/button.
        child_nodes = {}
        elements_in_view_port = []

        # str(node index) -> (is inside a matching tag, index of that tag).
        # "-1" is the parent index of root nodes.
        anchor_ancestry = {"-1": (False, None)}
        button_ancestry = {"-1": (False, None)}

        def convert_name(node_name, has_click_handler):
            """Map a DOM node name to the element name used in the output."""
            if node_name == "a":
                return "link"
            if node_name == "input":
                return "input"
            if node_name == "img":
                return "img"
            if (
                node_name == "button" or has_click_handler
            ):  # found pages that needed this quirk
                return "button"
            return "text"

        def find_attributes(attributes, keys):
            """Return {key: value} for the requested *keys* present in the
            flat [key_index, value_index, ...] list *attributes*.

            The first occurrence of a key wins; *keys* is not modified.
            """
            wanted = set(keys)
            values = {}

            for key_index, value_index in zip(*(iter(attributes),) * 2):
                if value_index < 0:
                    continue
                key = strings[key_index]
                if key in wanted and key not in values:
                    values[key] = strings[value_index]
                    if len(values) == len(wanted):
                        break

            return values

        def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
            """Record in *hash_tree* whether node *node_id* is, or sits inside,
            a *tag* element, and return that (flag, tag node index) pair.

            Missing parents are resolved recursively first.
            """
            parent_id_str = str(parent_id)
            if parent_id_str not in hash_tree:
                parent_name = strings[node_names[parent_id]].lower()
                grand_parent_id = parent[parent_id]

                add_to_hash_tree(
                    hash_tree, tag, parent_id, parent_name, grand_parent_id
                )

            is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]

            # even if the anchor is nested in another anchor, we set the "root"
            # for all descendants to be ::Self
            if node_name == tag:
                value = (True, node_id)
            elif is_parent_desc_anchor:
                # reuse the parent's anchor_id (which could be much higher in
                # the tree)
                value = (True, anchor_id)
            else:
                # not a descendant of an anchor, most likely it will become
                # text, an interactive element or discarded
                value = (False, None)

            hash_tree[str(node_id)] = value

            return value

        for index, node_name_index in enumerate(node_names):
            node_parent = parent[index]
            node_name = strings[node_name_index].lower()

            # Ancestry is recorded for every node, even ones skipped below,
            # so that later descendants can look their parents up.
            in_anchor, anchor_id = add_to_hash_tree(
                anchor_ancestry, "a", index, node_name, node_parent
            )
            in_button, button_id = add_to_hash_tree(
                button_ancestry, "button", index, node_name, node_parent
            )

            cursor = layout_position.get(index)
            if cursor is None:
                continue  # the node has no entry in the layout arrays

            if node_name in black_listed_elements:
                continue

            x, y, width, height = (
                value / device_pixel_ratio for value in bounds[cursor]
            )

            elem_left_bound = x
            elem_top_bound = y
            elem_right_bound = x + width
            elem_lower_bound = y + height

            partially_is_in_viewport = (
                elem_left_bound < win_right_bound
                and elem_right_bound >= win_left_bound
                and elem_top_bound < win_lower_bound
                and elem_lower_bound >= win_upper_bound
            )

            if not partially_is_in_viewport:
                continue

            meta_data = []

            # inefficient to grab the same set of keys for kinds of objects
            # but its fine for now
            element_attributes = find_attributes(
                attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]
            )

            # Content inside a link or button is folded into that link/button
            # rather than listed as its own element. A link takes precedence.
            ancestor_exception = in_anchor or in_button
            if in_anchor:
                ancestor_node = child_nodes.setdefault(str(anchor_id), [])
            elif in_button:
                ancestor_node = child_nodes.setdefault(str(button_id), [])
            else:
                ancestor_node = None

            if node_name == "#text" and ancestor_exception:
                text = strings[node_value[index]]
                if text == "|" or text == "•":
                    continue
                ancestor_node.append({"type": "text", "value": text})
            else:
                if (
                    node_name == "input" and element_attributes.get("type") == "submit"
                ) or node_name == "button":
                    node_name = "button"
                    # prevent [button ... (button)..]
                    element_attributes.pop("type", None)

                for key, value in element_attributes.items():
                    if ancestor_exception:
                        ancestor_node.append(
                            {"type": "attribute", "key": key, "value": value}
                        )
                    else:
                        meta_data.append(value)

            element_node_value = None

            if node_value[index] >= 0:
                element_node_value = strings[node_value[index]]
                # commonly used as a separator, does not add much context -
                # lets save ourselves some token space
                if element_node_value == "|":
                    continue
            elif node_name == "input" and index in input_value_index:
                text_index = input_value_values[input_value_index.index(index)]
                if text_index >= 0:
                    element_node_value = strings[text_index]

            # remove redundant elements: anything inside a link/button has
            # already been merged into it above
            if ancestor_exception and (node_name != "a" and node_name != "button"):
                continue

            # NOTE(review): bounds are compared with the page scroll offsets
            # above, i.e. treated as document coordinates, yet click() hands
            # these centres straight to page.mouse.click(). If that call takes
            # viewport-relative coordinates, clicks after scrolling will land
            # off target -- confirm before relying on it.
            elements_in_view_port.append(
                {
                    "node_index": str(index),
                    "backend_node_id": backend_node_id[index],
                    "node_name": node_name,
                    "node_value": element_node_value,
                    "node_meta": meta_data,
                    "is_clickable": index in is_clickable,
                    "origin_x": int(x),
                    "origin_y": int(y),
                    "center_x": int(x + (width / 2)),
                    "center_y": int(y + (height / 2)),
                }
            )

        # lets filter further to remove anything that does not hold any text
        # nor has click handlers + merge text from leaf #text nodes with the
        # parent
        elements_of_interest = []
        id_counter = 0

        for element in elements_in_view_port:
            node_index = element.get("node_index")
            node_name = element.get("node_name")
            element_value = element.get("node_value")
            element_clickable = element.get("is_clickable")
            meta_data = element.get("node_meta")

            inner_text = f"{element_value} " if element_value else ""
            meta = ""

            for child in child_nodes.get(node_index, ()):
                entry_value = child.get("value")
                if child.get("type") == "attribute":
                    entry_key = child.get("key")
                    meta_data.append(f'{entry_key}="{entry_value}"')
                else:
                    inner_text += f"{entry_value} "

            if meta_data:
                meta = " " + " ".join(meta_data)

            inner_text = inner_text.strip()

            converted_node_name = convert_name(node_name, element_clickable)

            # not very elegant, more like a placeholder: drop elements with no
            # text unless they are links, inputs, images or buttons with meta
            if (
                (converted_node_name != "button" or meta == "")
                and converted_node_name not in ("link", "input", "img", "textarea")
                and inner_text == ""
            ):
                continue

            page_element_buffer[id_counter] = element

            if inner_text != "":
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}>"
                    f"{inner_text}</{converted_node_name}>"
                )
            else:
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}/>"
                )
            id_counter += 1

        print("Parsing time: {:0.2f} seconds".format(time.time() - start))
        return elements_of_interest
541 |
def print_help():
    """Print the single-letter commands accepted by the interactive loop."""
    print(
        "(g) to visit url\n(u) scroll up\n(d) scroll down\n(c) to click\n(t) to type\n" +
        "(h) to view commands again\n(r/enter) to run suggested command\n(o) change objective"
    )


def get_gpt_command(objective, url, previous_command, browser_content):
    """Fill in prompt_template and return the model's raw suggested command.

    The URL is cut to 100 characters and the browser content to 4500
    characters before substitution. Only the first returned choice is used.
    """
    prompt = prompt_template
    prompt = prompt.replace("$objective", objective)
    prompt = prompt.replace("$url", url[:100])
    prompt = prompt.replace("$previous_command", previous_command)
    # Substituted last so page text containing another placeholder name is
    # never itself substituted.
    prompt = prompt.replace("$browser_content", browser_content[:4500])
    response = openai.Completion.create(
        model="text-davinci-002",
        prompt=prompt,
        temperature=0.5,
        best_of=10,
        n=3,
        max_tokens=50,
    )
    return response.choices[0].text


def parse_command(cmd):
    """Parse the first line of *cmd* into an ``(action, target, text)`` tuple.

    Returns one of:
        ("SCROLL", "up" | "down", None)
        ("CLICK", id, None)
        ("TYPE" | "TYPESUBMIT", id, text)
        (None, None, None) when the line is not a usable command, including
        CLICK/TYPE with no element id.

    *text* loses one pair of surrounding double quotes when both are present;
    unquoted text is returned as-is rather than having its first and last
    characters cut off.
    """
    invalid = (None, None, None)
    first_line = cmd.split("\n")[0].strip()

    if first_line.startswith("SCROLL UP"):
        return ("SCROLL", "up", None)
    if first_line.startswith("SCROLL DOWN"):
        return ("SCROLL", "down", None)

    if first_line.startswith("CLICK"):
        # Anything after a comma is ignored (e.g. a trailing explanation).
        parts = first_line.split(",")[0].split()
        if len(parts) < 2:
            return invalid
        return ("CLICK", parts[1], None)

    if first_line.startswith("TYPE"):
        parts = first_line.split(None, 2)
        if len(parts) < 2:
            return invalid
        text = parts[2] if len(parts) > 2 else ""
        if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
            text = text[1:-1]
        action = "TYPESUBMIT" if first_line.startswith("TYPESUBMIT") else "TYPE"
        return (action, parts[1], text)

    return invalid


def run_cmd(cmd, crawler, settle_seconds=2):
    """Execute one model-suggested command against *crawler*.

    Args:
        cmd: the command text; only its first line is used.
        crawler: object providing scroll(direction), click(id), type(id, text).
        settle_seconds: pause after the command so the page can settle.

    Unrecognised commands are reported and otherwise ignored.
    """
    action, target, text = parse_command(cmd)

    if action == "SCROLL":
        crawler.scroll(target)
    elif action == "CLICK":
        crawler.click(target)
    elif action == "TYPE":
        crawler.type(target, text)
    elif action == "TYPESUBMIT":
        # The trailing newline is typed as Enter and submits the form.
        crawler.type(target, text + "\n")
    else:
        print(f"Ignoring unrecognised command: {cmd!r}")

    time.sleep(settle_seconds)


def main():
    """Run the interactive loop: crawl, ask the model, let the user decide."""
    crawler = Crawler()
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    objective = "Make a reservation for 2 at 7pm at bistro vida in menlo park"
    print("\nWelcome to natbot! What is your objective?")
    i = input()
    if len(i) > 0:
        objective = i

    gpt_cmd = ""
    crawler.go_to_page("google.com")
    try:
        while True:
            browser_content = "\n".join(crawler.crawl())
            prev_cmd = gpt_cmd
            gpt_cmd = get_gpt_command(
                objective, crawler.page.url, prev_cmd, browser_content
            ).strip()

            if not quiet:
                print("URL: " + crawler.page.url)
                print("Objective: " + objective)
                print("----------------\n" + browser_content + "\n----------------\n")
            if len(gpt_cmd) > 0:
                print("Suggested command: " + gpt_cmd)

            command = input()
            if command == "r" or command == "":
                run_cmd(gpt_cmd, crawler)
            elif command == "g":
                url = input("URL:")
                crawler.go_to_page(url)
            elif command == "u":
                crawler.scroll("up")
                time.sleep(1)
            elif command == "d":
                crawler.scroll("down")
                time.sleep(1)
            elif command == "c":
                id = input("id:")
                crawler.click(id)
                time.sleep(1)
            elif command == "t":
                id = input("id:")
                text = input("text:")
                crawler.type(id, text)
                time.sleep(1)
            elif command == "o":
                objective = input("Objective:")
            else:
                print_help()
    except KeyboardInterrupt:
        print("\n[!] Ctrl+C detected, exiting gracefully.")
        exit(0)


if __name__ == "__main__":
    main()
640 |
--------------------------------------------------------------------------------