├── LICENSE ├── README.md └── natbot.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Nat Friedman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # natbot 2 | 3 | Drive a browser with GPT-3 4 | 5 | Here's a demo: https://twitter.com/natfriedman/status/1575631194032549888 6 | 7 | Lots of ideas for improvement: 8 | - Better prompt 9 | - Prompt chaining 10 | - Make a recorder to collect human feedback and do better few-shot 11 | - Better DOM serialization 12 | - Let the agent use multiple tabs and switch between them 13 | 14 | Improvements welcome! 
#!/usr/bin/env python3
#
# natbot.py
#
# Set OPENAI_API_KEY to your API key, and then run this from a terminal.
#

import os
import time
from string import Template
from sys import argv, exit, platform

import openai
from playwright.sync_api import sync_playwright

# "-q" / "--quiet" as the first argument hides the page dump printed each turn.
quiet = False
if len(argv) >= 2:
    if argv[1] == '-q' or argv[1] == '--quiet':
        quiet = True
        print(
            "Running in quiet mode (HTML and other content hidden); \n"
            + "exercise caution when running suggested commands."
        )

# Few-shot prompt sent to the completion model. The $objective, $url,
# $previous_command and $browser_content placeholders are filled in by
# get_gpt_command().
#
# NOTE(review): the element markup inside this template (<link ...>,
# <button ...>, <input ...>, <img ...>) was missing from the copy of the file
# this was restored from. It has been reconstructed to match the format that
# Crawler.crawl() emits and the ids referenced by the example commands
# (TYPESUBMIT 8 / TYPESUBMIT 12) -- confirm the exact wording against the
# upstream repository.
prompt_template = """
You are an agent controlling a browser. You are given:

    (1) an objective that you are trying to achieve
    (2) the URL of your current web page
    (3) a simplified text description of what's visible in the browser window (more on that below)

You can issue these commands:
    SCROLL UP - scroll up one page
    SCROLL DOWN - scroll down one page
    CLICK X - click on a given element. You can only click on links, buttons, and inputs!
    TYPE X "TEXT" - type the specified text into the input with id X
    TYPESUBMIT X "TEXT" - same as TYPE above, except then it presses ENTER to submit the form

The format of the browser content is highly simplified; all formatting elements are stripped.
Interactive elements such as links, inputs, buttons are represented like this:

    <link id=1>text</link>
    <button id=2>text</button>
    <input id=3>text</input>

Images are rendered as their alt text like this:

    <img id=4 alt=""/>

Based on your given objective, issue whatever command you believe will get you closest to achieving your goal.
You always start on Google; you should submit a search query to Google that will take you to the best page for
achieving your objective. And then interact with that page to achieve your objective.

If you find yourself on Google and there are no search results displayed yet, you should probably issue a command
like "TYPESUBMIT 7 "search query"" to get to a more useful page.

Then, if you find yourself on a Google search results page, you might issue the command "CLICK 24" to click
on the first link in the search results. (If your previous command was a TYPESUBMIT your next command should
probably be a CLICK.)

Don't try to interact with elements that you can't see.

Here are some examples:

EXAMPLE 1:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<link id=18>Settings</link>
------------------
OBJECTIVE: Find a 2 bedroom house for sale in Anchorage AK for under $750k
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "anchorage redfin"
==================================================

EXAMPLE 2:
==================================================
CURRENT BROWSER CONTENT:
------------------
<link id=1>About</link>
<link id=2>Store</link>
<link id=3>Gmail</link>
<link id=4>Images</link>
<link id=5>(Google apps)</link>
<link id=6>Sign in</link>
<img id=7 alt="(Google)"/>
<input id=8 alt="Search"></input>
<button id=9>(Search by voice)</button>
<button id=10>(Google Search)</button>
<button id=11>(I'm Feeling Lucky)</button>
<link id=12>Advertising</link>
<link id=13>Business</link>
<link id=14>How Search works</link>
<link id=15>Carbon neutral since 2007</link>
<link id=16>Privacy</link>
<link id=17>Terms</link>
<link id=18>Settings</link>
------------------
OBJECTIVE: Make a reservation for 4 at Dorsia at 8pm
CURRENT URL: https://www.google.com/
YOUR COMMAND:
TYPESUBMIT 8 "dorsia nyc opentable"
==================================================

EXAMPLE 3:
==================================================
CURRENT BROWSER CONTENT:
------------------
<button id=1>For Businesses</button>
<button id=2>Mobile</button>
<button id=3>Help</button>
<button id=4 alt="Language Picker">EN</button>
<link id=5>OpenTable logo</link>
<button id=6 alt="search">Search</button>
<text id=7>Find your table for any occasion</text>
<button id=8>(Date selector)</button>
<text id=9>Sep 28, 2022</text>
<text id=10>7:00 PM</text>
<text id=11>2 people</text>
<input id=12 alt="Location, Restaurant, or Cuisine"></input>
<button id=13>Let's go</button>
<text id=14>It looks like you're in Peninsula. Not correct?</text>
<button id=15>Get current location</button>
<button id=16>Next</button>
------------------
OBJECTIVE: Make a reservation for 4 for dinner at Dorsia in New York City at 8pm
CURRENT URL: https://www.opentable.com/
YOUR COMMAND:
TYPESUBMIT 12 "dorsia new york city"
==================================================

The current browser content, objective, and current URL follow. Reply with your next command to the browser.

CURRENT BROWSER CONTENT:
------------------
$browser_content
------------------

OBJECTIVE: $objective
CURRENT URL: $url
PREVIOUS COMMAND: $previous_command
YOUR COMMAND:
"""

# Node names that never show up in the serialized page.
black_listed_elements = {
    "html", "head", "title", "meta", "iframe", "body",
    "script", "style", "path", "svg", "br", "::marker",
}


class Crawler:
    """Drive a visible Chromium page and serialize what is on screen.

    ``crawl()`` turns the current page into a list of pseudo-HTML lines, each
    carrying a numeric id; ``click()`` and ``type()`` accept those ids.
    """

    def __init__(self):
        self.browser = (
            sync_playwright()
            .start()
            .chromium.launch(
                headless=False,
            )
        )

        self.page = self.browser.new_page()
        self.page.set_viewport_size({"width": 1280, "height": 1080})

        # Created up front so crawl()/click() work even before go_to_page().
        self.client = None
        self.page_element_buffer = {}

    def go_to_page(self, url):
        """Navigate to ``url`` (``http://`` is assumed when no scheme is given)."""
        self.page.goto(url=url if "://" in url else "http://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page_element_buffer = {}

    def scroll(self, direction):
        """Scroll one window height ``"up"`` or ``"down"``; other values are ignored."""
        if direction == "up":
            self.page.evaluate(
                "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop - window.innerHeight;"
            )
        elif direction == "down":
            self.page.evaluate(
                "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop + window.innerHeight;"
            )

    def click(self, id):
        """Click the centre of the element that the last crawl labelled ``id``.

        Returns True when a click was issued, False when ``id`` is not a
        number or is unknown (in which case "Could not find element" is
        printed).
        """
        # Inject javascript into the page which removes the target= attribute from all links
        js = """
        links = document.getElementsByTagName("a");
        for (var i = 0; i < links.length; i++) {
            links[i].removeAttribute("target");
        }
        """
        self.page.evaluate(js)

        try:
            element = self.page_element_buffer.get(int(id))
        except (TypeError, ValueError):
            # The id comes from model output or the keyboard; it may not be numeric.
            element = None

        if not element:
            print("Could not find element")
            return False

        # crawl() stores document coordinates (it compares them against
        # window.pageXOffset/pageYOffset), whereas mouse.click() expects
        # viewport coordinates, so subtract the current scroll offset.
        scroll_x = self.page.evaluate("window.scrollX")
        scroll_y = self.page.evaluate("window.scrollY")
        x = element.get("center_x") - scroll_x
        y = element.get("center_y") - scroll_y

        self.page.mouse.click(x, y)
        return True

    def type(self, id, text):
        """Click element ``id`` and type ``text`` into it.

        Nothing is typed when the element cannot be found. Returns True when
        the text was typed.
        """
        if not self.click(id):
            return False
        self.page.keyboard.type(text)
        return True

    def enter(self):
        """Press the Enter key."""
        self.page.keyboard.press("Enter")

    def crawl(self):
        """Serialize the visible part of the page.

        Takes a ``DOMSnapshot.captureSnapshot`` of the page over CDP, keeps
        the nodes whose layout box intersects the window, folds the text and
        attributes found under links and buttons into that link/button, and
        returns one pseudo-HTML string per remaining element. The id embedded
        in each string is the key under which the element is stored in
        ``self.page_element_buffer``.
        """
        page = self.page
        if self.client is None:
            self.client = page.context.new_cdp_session(page)

        # Ids are reassigned from 0 on every crawl; drop the previous page's
        # entries so a stale id can never resolve to old coordinates.
        page_element_buffer = self.page_element_buffer
        page_element_buffer.clear()
        start = time.time()

        device_pixel_ratio = page.evaluate("window.devicePixelRatio")
        if platform == "darwin" and device_pixel_ratio == 1:  # lies
            device_pixel_ratio = 2

        # NOTE(review): the window size is taken from window.screen, not from
        # window.innerWidth/innerHeight, so elements outside the 1280x1080
        # viewport can still count as visible -- confirm this is intended.
        win_upper_bound = page.evaluate("window.pageYOffset")
        win_left_bound = page.evaluate("window.pageXOffset")
        win_width = page.evaluate("window.screen.width")
        win_height = page.evaluate("window.screen.height")
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height

        tree = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings = tree["strings"]
        document = tree["documents"][0]
        nodes = document["nodes"]
        backend_node_id = nodes["backendNodeId"]
        attributes = nodes["attributes"]
        node_value = nodes["nodeValue"]
        parent = nodes["parentIndex"]
        node_names = nodes["nodeName"]
        is_clickable = set(nodes["isClickable"]["index"])

        input_value = nodes["inputValue"]
        input_value_index = input_value["index"]
        input_value_values = input_value["value"]

        layout = document["layout"]
        layout_node_index = layout["nodeIndex"]
        bounds = layout["bounds"]

        # node index -> position in the layout arrays (first occurrence wins,
        # like list.index), built once instead of scanning the list per node.
        layout_cursor_by_node = {}
        for layout_cursor, layout_node in enumerate(layout_node_index):
            layout_cursor_by_node.setdefault(layout_node, layout_cursor)

        # node index -> index into `strings` of the input's current value.
        input_text_by_node = {}
        for position, input_node in enumerate(input_value_index):
            input_text_by_node.setdefault(input_node, input_value_values[position])

        child_nodes = {}
        elements_in_view_port = []

        # str(node index) -> (is inside an <a>/<button>, index of that ancestor)
        anchor_ancestry = {"-1": (False, None)}
        button_ancestry = {"-1": (False, None)}

        def convert_name(node_name, has_click_handler):
            if node_name == "a":
                return "link"
            if node_name == "input":
                return "input"
            if node_name == "img":
                return "img"
            if (
                node_name == "button" or has_click_handler
            ):  # found pages that needed this quirk
                return "button"
            return "text"

        def find_attributes(attributes, keys):
            # `attributes` is a flat [key, value, key, value, ...] list of
            # indexes into `strings`; return the requested keys that are set.
            values = {}
            remaining = set(keys)

            for key_index, value_index in zip(*(iter(attributes),) * 2):
                if value_index < 0:
                    continue
                key = strings[key_index]

                if key in remaining:
                    values[key] = strings[value_index]
                    remaining.discard(key)
                    if not remaining:
                        break

            return values

        def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
            parent_id_str = str(parent_id)
            if parent_id_str not in hash_tree:
                parent_name = strings[node_names[parent_id]].lower()
                grand_parent_id = parent[parent_id]

                add_to_hash_tree(
                    hash_tree, tag, parent_id, parent_name, grand_parent_id
                )

            is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]

            # even if the anchor is nested in another anchor, we set the "root" for all descendants to be ::Self
            if node_name == tag:
                value = (True, node_id)
            elif (
                is_parent_desc_anchor
            ):  # reuse the parent's anchor_id (which could be much higher in the tree)
                value = (True, anchor_id)
            else:
                value = (
                    False,
                    None,
                )  # not a descendant of an anchor, most likely it will become text, an interactive element or discarded

            hash_tree[str(node_id)] = value

            return value

        for index, node_name_index in enumerate(node_names):
            node_parent = parent[index]
            node_name = strings[node_name_index].lower()

            # Ancestry is recorded for every node, including ones skipped
            # below, so that later descendants can look their parents up.
            is_ancestor_of_anchor, anchor_id = add_to_hash_tree(
                anchor_ancestry, "a", index, node_name, node_parent
            )

            is_ancestor_of_button, button_id = add_to_hash_tree(
                button_ancestry, "button", index, node_name, node_parent
            )

            cursor = layout_cursor_by_node.get(index)
            if cursor is None:
                # Node has no layout box.
                continue

            if node_name in black_listed_elements:
                continue

            x, y, width, height = bounds[cursor]
            x /= device_pixel_ratio
            y /= device_pixel_ratio
            width /= device_pixel_ratio
            height /= device_pixel_ratio

            elem_left_bound = x
            elem_top_bound = y
            elem_right_bound = x + width
            elem_lower_bound = y + height

            partially_is_in_viewport = (
                elem_left_bound < win_right_bound
                and elem_right_bound >= win_left_bound
                and elem_top_bound < win_lower_bound
                and elem_lower_bound >= win_upper_bound
            )

            if not partially_is_in_viewport:
                continue

            meta_data = []

            # inefficient to grab the same set of keys for kinds of objects but its fine for now
            element_attributes = find_attributes(
                attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]
            )

            ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button
            if not ancestor_exception:
                ancestor_node = None
            else:
                ancestor_node_key = str(
                    anchor_id if is_ancestor_of_anchor else button_id
                )
                ancestor_node = child_nodes.setdefault(ancestor_node_key, [])

            if node_name == "#text" and ancestor_exception:
                text = strings[node_value[index]]
                if text == "|" or text == "•":
                    continue
                ancestor_node.append({"type": "text", "value": text})
            else:
                if (
                    node_name == "input" and element_attributes.get("type") == "submit"
                ) or node_name == "button":
                    node_name = "button"
                    element_attributes.pop(
                        "type", None
                    )  # prevent [button ... (button)..]

                for key in element_attributes:
                    if ancestor_exception:
                        ancestor_node.append({
                            "type": "attribute",
                            "key": key,
                            "value": element_attributes[key],
                        })
                    else:
                        meta_data.append(element_attributes[key])

            element_node_value = None

            if node_value[index] >= 0:
                element_node_value = strings[node_value[index]]
                if element_node_value == "|":  # commonly used as a separator, does not add much context - lets save ourselves some token space
                    continue
            elif node_name == "input" and index in input_text_by_node:
                text_index = input_text_by_node[index]
                if text_index >= 0:
                    element_node_value = strings[text_index]

            # remove redundant elements
            if ancestor_exception and (node_name != "a" and node_name != "button"):
                continue

            elements_in_view_port.append(
                {
                    "node_index": str(index),
                    "backend_node_id": backend_node_id[index],
                    "node_name": node_name,
                    "node_value": element_node_value,
                    "node_meta": meta_data,
                    "is_clickable": index in is_clickable,
                    "origin_x": int(x),
                    "origin_y": int(y),
                    "center_x": int(x + (width / 2)),
                    "center_y": int(y + (height / 2)),
                }
            )

        # lets filter further to remove anything that does not hold any text nor has click handlers + merge text from leaf#text nodes with the parent
        elements_of_interest = []
        id_counter = 0

        for element in elements_in_view_port:
            node_index = element.get("node_index")
            node_name = element.get("node_name")
            element_value = element.get("node_value")
            element_is_clickable = element.get("is_clickable")
            meta_data = element.get("node_meta")

            inner_text = f"{element_value} " if element_value else ""
            meta = ""

            # Text and attributes collected from the descendants of this
            # link/button (keyed by its node index) are merged into it.
            for child in child_nodes.get(node_index, []):
                entry_type = child.get('type')
                entry_value = child.get('value')

                if entry_type == "attribute":
                    entry_key = child.get('key')
                    meta_data.append(f'{entry_key}="{entry_value}"')
                else:
                    inner_text += f"{entry_value} "

            if meta_data:
                meta_string = " ".join(meta_data)
                meta = f" {meta_string}"

            inner_text = inner_text.strip()

            converted_node_name = convert_name(node_name, element_is_clickable)

            # not very elegant, more like a placeholder
            always_kept = converted_node_name in ("link", "input", "img") or (
                converted_node_name == "button" and meta != ""
            )
            if not always_kept and inner_text == "":
                continue

            page_element_buffer[id_counter] = element

            if inner_text != "":
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}>{inner_text}</{converted_node_name}>"
                )
            else:
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}/>"
                )
            id_counter += 1

        print("Parsing time: {:0.2f} seconds".format(time.time() - start))
        return elements_of_interest


if __name__ == "__main__":
    _crawler = Crawler()
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    def print_help():
        """Print the interactive key bindings."""
        print(
            "(g) to visit url\n(u) scroll up\n(d) scroll down\n(c) to click\n(t) to type\n" +
            "(h) to view commands again\n(r/enter) to run suggested command\n(o) change objective"
        )

    def get_gpt_command(objective, url, previous_command, browser_content):
        """Fill in the prompt and return the model's raw completion text.

        The URL is cut to 100 characters and the page dump to 4500.
        """
        # One substitution pass: placeholder-looking text inside the objective
        # or page content is not expanded again, and "$750k" is left alone.
        prompt = Template(prompt_template).safe_substitute(
            objective=objective,
            url=url[:100],
            previous_command=previous_command,
            browser_content=browser_content[:4500],
        )
        response = openai.Completion.create(model="text-davinci-002", prompt=prompt, temperature=0.5, best_of=10, n=3, max_tokens=50)
        return response.choices[0].text

    def run_cmd(cmd):
        """Execute the first line of ``cmd`` against the browser, then pause 2 s.

        Recognized forms: SCROLL UP, SCROLL DOWN, CLICK X, TYPE X "TEXT",
        TYPESUBMIT X "TEXT". A CLICK/TYPE without an id is reported and
        skipped; anything else is ignored.
        """
        cmd = cmd.split("\n")[0].strip()

        if cmd.startswith("SCROLL UP"):
            _crawler.scroll("up")
        elif cmd.startswith("SCROLL DOWN"):
            _crawler.scroll("down")
        elif cmd.startswith("CLICK"):
            parts = cmd.split(",")[0].split()
            if len(parts) < 2:
                print("Could not parse command: " + cmd)
            else:
                _crawler.click(parts[1])
        elif cmd.startswith("TYPE"):
            parts = cmd.split(" ", 2)
            if len(parts) < 2:
                print("Could not parse command: " + cmd)
            else:
                element_id = parts[1]
                text = parts[2] if len(parts) > 2 else ""
                # Strip one pair of surrounding double quotes, only if present.
                if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
                    text = text[1:-1]

                typed = _crawler.type(element_id, text)
                if typed and cmd.startswith("TYPESUBMIT"):
                    _crawler.enter()

        time.sleep(2)

    objective = "Make a reservation for 2 at 7pm at bistro vida in menlo park"
    print("\nWelcome to natbot! What is your objective?")
    i = input()
    if i:
        objective = i

    gpt_cmd = ""
    prev_cmd = ""
    _crawler.go_to_page("google.com")
    try:
        while True:
            browser_content = "\n".join(_crawler.crawl())
            prev_cmd = gpt_cmd
            gpt_cmd = get_gpt_command(objective, _crawler.page.url, prev_cmd, browser_content)
            gpt_cmd = gpt_cmd.strip()

            if not quiet:
                print("URL: " + _crawler.page.url)
                print("Objective: " + objective)
                print("----------------\n" + browser_content + "\n----------------\n")
            if gpt_cmd:
                print("Suggested command: " + gpt_cmd)

            command = input()
            if command == "r" or command == "":
                run_cmd(gpt_cmd)
            elif command == "g":
                url = input("URL:")
                _crawler.go_to_page(url)
            elif command == "u":
                _crawler.scroll("up")
                time.sleep(1)
            elif command == "d":
                _crawler.scroll("down")
                time.sleep(1)
            elif command == "c":
                element_id = input("id:")
                _crawler.click(element_id)
                time.sleep(1)
            elif command == "t":
                element_id = input("id:")
                text = input("text:")
                _crawler.type(element_id, text)
                time.sleep(1)
            elif command == "o":
                objective = input("Objective:")
            else:
                print_help()
    except KeyboardInterrupt:
        print("\n[!] Ctrl+C detected, exiting gracefully.")
        exit(0)