├── .env.example ├── .gitignore ├── tsconfig.json ├── package.json ├── slack-manifest.json ├── app └── api │ ├── agent │ ├── request-types.ts │ ├── base-playwright.ts │ ├── types.ts │ ├── agent.ts │ └── browserbase.ts │ ├── demo │ ├── util.ts │ └── route.ts │ └── slack │ ├── route.ts │ └── operator.ts └── README.md /.env.example: -------------------------------------------------------------------------------- 1 | # Slack Bot Token (xoxb-...) 2 | SLACK_BOT_TOKEN= 3 | 4 | # Slack Signing Secret 5 | SLACK_SIGNING_SECRET= 6 | 7 | # Slack Bot User ID (starts with U) 8 | SLACK_BOT_USER_ID= 9 | 10 | # Browserbase Configuration 11 | BROWSERBASE_API_KEY= 12 | BROWSERBASE_PROJECT_ID= 13 | 14 | # OpenAI API Key 15 | OPENAI_API_KEY= 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies 2 | /node_modules 3 | /.pnp 4 | .pnp.js 5 | 6 | # testing 7 | /coverage 8 | 9 | # next.js 10 | /.next/ 11 | /out/ 12 | /build 13 | 14 | # production 15 | /build 16 | /dist 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env*.local 29 | .env 30 | 31 | # vercel 32 | .vercel 33 | 34 | # typescript 35 | *.tsbuildinfo 36 | next-env.d.ts 37 | 38 | # IDE 39 | .idea/ 40 | .vscode/ 41 | *.swp 42 | *.swo -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "noEmit": true, 9 | "esModuleInterop": true, 10 | "module": "esnext", 11 | "moduleResolution": "bundler", 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "jsx": "preserve", 15 | "incremental": 
true, 16 | "plugins": [ 17 | { 18 | "name": "next" 19 | } 20 | ], 21 | "paths": { 22 | "@/*": ["./*"] 23 | } 24 | }, 25 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 26 | "exclude": ["node_modules"] 27 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "slack-operator", 3 | "version": "1.0.0", 4 | "description": "A TypeScript-based Slack Operator bot built with Next.js and deployed on Vercel", 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@ai-sdk/openai": "^1.1.12", 13 | "@browserbasehq/sdk": "^2.3.0", 14 | "@slack/bolt": "^3.17.1", 15 | "@slack/web-api": "^6.11.2", 16 | "@vercel/blob": "^0.27.2", 17 | "@vercel/functions": "^2.0.0", 18 | "ai": "^4.1.41", 19 | "axios": "^1.8.3", 20 | "axios-retry": "^4.5.0", 21 | "dotenv": "^16.4.7", 22 | "next": "14.1.0", 23 | "openai": "^4.87.3", 24 | "react": "18.2.0", 25 | "react-dom": "18.2.0" 26 | }, 27 | "devDependencies": { 28 | "@types/node": "^20.11.19", 29 | "@types/react": "18.2.0", 30 | "@types/react-dom": "18.2.0", 31 | "eslint": "^8.56.0", 32 | "eslint-config-next": "^14.1.0", 33 | "typescript": "^5.3.3" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /slack-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_information": { 3 | "name": "Slack Operator", 4 | "description": "A Computer Use Slack bot built with Next.js", 5 | "background_color": "#4A154B" 6 | }, 7 | "features": { 8 | "bot_user": { 9 | "display_name": "Slack Operator", 10 | "always_online": true 11 | } 12 | }, 13 | "oauth_config": { 14 | "scopes": { 15 | "bot": [ 16 | "chat:write", 17 | "app_mentions:read", 18 | "channels:history", 19 | "channels:read", 20 | 
"groups:history", 21 | "groups:read", 22 | "im:history", 23 | "im:read", 24 | "files:write" 25 | ] 26 | } 27 | }, 28 | "settings": { 29 | "event_subscriptions": { 30 | "request_url": "https://your-vercel-deployment-url/api/slack", 31 | "bot_events": [ 32 | "message.channels", 33 | "message.groups", 34 | "message.im" 35 | ] 36 | }, 37 | "interactivity": { 38 | "is_enabled": true, 39 | "request_url": "https://your-vercel-deployment-url/api/slack" 40 | }, 41 | "org_deploy_enabled": false, 42 | "socket_mode_enabled": false, 43 | "token_rotation_enabled": false 44 | } 45 | } -------------------------------------------------------------------------------- /app/api/agent/request-types.ts: -------------------------------------------------------------------------------- 1 | // types.ts 2 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 3 | type Json = Record; 4 | 5 | 6 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 7 | interface ResponseOptions { 8 | model: string; 9 | previous_response_id?: string; 10 | input: string | Json[]; 11 | include?: string[]; 12 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 13 | tools?: any[]; 14 | metadata?: Json; 15 | temperature?: number; 16 | top_p?: number; 17 | parallel_tool_calls?: boolean; 18 | stream?: boolean; 19 | response_format?: Json; 20 | tool_choice?: Json; 21 | truncation?: string; 22 | } 23 | 24 | interface ComputerCall { 25 | type: 'computer_call'; 26 | id: string; 27 | action: { 28 | type: string; 29 | x?: number; 30 | y?: number; 31 | text?: string; 32 | keys?: string[]; 33 | scroll_x?: number; 34 | scroll_y?: number; 35 | }; 36 | } 37 | 38 | interface FunctionCall { 39 | type: 'function_call'; 40 | id: string; 41 | name: string; 42 | arguments: string; 43 | } 44 | 45 | interface OutputText { 46 | type: 'output_text'; 47 | text: string; 48 | } 49 | 50 | interface Message { 51 | type: 'message'; 52 | content: [OutputText]; 53 | } 54 | 55 | // eslint-disable-next-line 
/** Browserbase regions a browser session can be created in. */
type BrowserbaseRegion =
  | "us-west-2"
  | "us-east-1"
  | "eu-central-1"
  | "ap-southeast-1";

/** Region returned whenever the timezone is missing, unknown, or unmapped. */
const DEFAULT_REGION: BrowserbaseRegion = "us-west-2";

// Exact timezone names that are routed to us-east-1 instead of the
// "America" prefix default below.
const exactTimezoneMap: Record<string, BrowserbaseRegion> = {
  "America/New_York": "us-east-1",
  "America/Detroit": "us-east-1",
  "America/Toronto": "us-east-1",
  "America/Montreal": "us-east-1",
  "America/Boston": "us-east-1",
  "America/Chicago": "us-east-1",
};

// Prefix-based region mapping (the part of the zone name before the first "/")
const prefixToRegion: Record<string, BrowserbaseRegion> = {
  America: "us-west-2",
  US: "us-west-2",
  Canada: "us-west-2",
  Europe: "eu-central-1",
  Africa: "eu-central-1",
  Asia: "ap-southeast-1",
  Australia: "ap-southeast-1",
  Pacific: "ap-southeast-1",
};

// Offset ranges to regions (inclusive bounds, in hours relative to UTC)
const offsetRanges: {
  min: number;
  max: number;
  region: BrowserbaseRegion;
}[] = [
  { min: -24, max: -4, region: "us-west-2" }, // UTC-24 to UTC-4
  { min: -3, max: 4, region: "eu-central-1" }, // UTC-3 to UTC+4
  { min: 5, max: 24, region: "ap-southeast-1" }, // UTC+5 to UTC+24
];

/**
 * Looks up `key` among the table's own entries only.
 *
 * The `in` operator also matches inherited members such as "constructor" or
 * "toString", which would hand a function back to the caller as a "region".
 */
function lookupRegion(
  table: Record<string, BrowserbaseRegion>,
  key: string
): BrowserbaseRegion | undefined {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}

/**
 * Returns the UTC offset of `timezone` at instant `at`, in hours
 * (negative west of UTC, fractional for half-hour zones).
 *
 * The instant is rendered as wall-clock fields in the target zone, those
 * fields are re-read as if they were UTC, and the difference to the real
 * instant is the zone's offset.
 *
 * @throws RangeError if `timezone` is not a zone name known to Intl.
 */
function utcOffsetHours(timezone: string, at: Date): number {
  const parts = new Intl.DateTimeFormat("en-US", {
    timeZone: timezone,
    hour12: false,
    year: "numeric",
    month: "numeric",
    day: "numeric",
    hour: "numeric",
    minute: "numeric",
    second: "numeric",
  }).formatToParts(at);

  const field = (type: Intl.DateTimeFormatPartTypes): number =>
    Number(parts.find((part) => part.type === type)?.value);

  const wallClockAsUtc = Date.UTC(
    field("year"),
    field("month") - 1,
    field("day"),
    field("hour") % 24, // some engines render midnight as "24" with hour12: false
    field("minute"),
    field("second")
  );

  // The formatted fields carry no milliseconds, so drop them from `at` too.
  const instantSeconds = Math.floor(at.getTime() / 1000) * 1000;
  return (wallClockAsUtc - instantSeconds) / (1000 * 60 * 60);
}

/**
 * Picks the Browserbase region to use for a given IANA timezone name.
 *
 * Resolution order:
 *  1. exact zone name (`exactTimezoneMap`),
 *  2. zone prefix before the first "/" (`prefixToRegion`),
 *  3. the zone's current UTC offset (`offsetRanges`).
 *
 * @param timezone IANA timezone name, e.g. "Europe/Berlin". Optional.
 * @returns The matching region, or "us-west-2" when the timezone is missing,
 *          not recognised by Intl, or falls outside every offset range.
 */
export function getClosestRegion(timezone?: string): BrowserbaseRegion {
  try {
    if (!timezone) {
      return DEFAULT_REGION; // Default if no timezone provided
    }

    // Check exact matches first
    const exactMatch = lookupRegion(exactTimezoneMap, timezone);
    if (exactMatch) {
      return exactMatch;
    }

    // Check prefix matches
    const prefixMatch = lookupRegion(prefixToRegion, timezone.split("/")[0]);
    if (prefixMatch) {
      return prefixMatch;
    }

    // Use offset-based fallback
    const hourOffset = utcOffsetHours(timezone, new Date());
    const matchingRange = offsetRanges.find(
      (range) => hourOffset >= range.min && hourOffset <= range.max
    );

    return matchingRange?.region ?? DEFAULT_REGION;
  } catch {
    // Intl throws RangeError for unknown zone names; fall back to the default.
    return DEFAULT_REGION;
  }
}
/**
 * Demo endpoint: runs the agent once against a fresh Browserbase session.
 *
 * Expects a JSON body with a truthy `goal` field. Responds with
 * `{ result }` on success, 400 when `goal` is missing, and 500 on any
 * other failure. The Browserbase session created for the request is asked
 * to release in `finally`, whether the agent loop succeeded or threw.
 */
export async function POST(req: Request) {
  let sessionId: string | undefined;
  try {
    const body = await req.json();

    if (!body.goal) {
      return NextResponse.json(
        { error: "Missing required field: goal" },
        { status: 400 }
      );
    }

    // Get the closest browser region based on the server's timezone
    const region = getClosestRegion(
      Intl.DateTimeFormat().resolvedOptions().timeZone
    );

    // Create a new Browserbase session
    const session = await browserbase.sessions.create({
      projectId: process.env.BROWSERBASE_PROJECT_ID!,
      keepAlive: true,
      proxies: false,
      region,
      browserSettings: {
        viewport: {
          width: 1024,
          height: 768,
        },
        blockAds: true,
      },
      timeout: 3600,
    });

    // Record the id immediately so the `finally` block can release the
    // session even if anything below throws. Without this assignment the
    // cleanup never runs and the keep-alive session lingers until timeout.
    sessionId = session.id;

    const computer = new BrowserbaseBrowser(1024, 768, session.id);

    // Set the last argument to true to enable more verbose logging
    const agent = new Agent("computer-use-preview", computer, false);

    // Run the agent loop to completion; the demo endpoint is synchronous
    // and passes no Slack client, channel, or thread timestamp.
    const result = await runAgentLoop(
      computer,
      agent,
      body.goal,
      session.id,
      undefined,
      undefined,
      undefined
    );

    return NextResponse.json({ result });
  } catch (error) {
    console.error("Error handling demo request:", error);
    return NextResponse.json(
      { error: "Internal Server Error" },
      { status: 500 }
    );
  } finally {
    if (sessionId) {
      try {
        await browserbase.sessions.update(sessionId, {
          status: "REQUEST_RELEASE",
          projectId: process.env.BROWSERBASE_PROJECT_ID!,
        });
      } catch (releaseError) {
        // A failed release must not replace the response computed above.
        console.error("Error releasing Browserbase session:", releaseError);
      }
    }
  }
}
**State Management**: 31 | - For Slack interactions, the agent maintains conversation state using Vercel's blob storage to support continuous interactions 32 | - Each session uses a dedicated browser instance that persists throughout the task 33 | 34 | 5. **Regional Optimization**: 35 | - The system automatically selects the closest browser region based on the server's timezone for optimal performance 36 | 37 | ## Running locally (without Slack) 38 | 39 | You can test the functionality directly using the demo API endpoint without setting up Slack integration: 40 | 41 | 1. Clone this repository 42 | 43 | 2. Install dependencies: 44 | ```bash 45 | npm install 46 | ``` 47 | 48 | 3. Copy `.env.example` to `.env.local` and add your API keys. You can get a Browserbase API key and project ID from the [Browserbase Dashboard](https://www.browserbase.com/overview). 49 | ``` 50 | BROWSERBASE_API_KEY=your-browserbase-api-key 51 | BROWSERBASE_PROJECT_ID=your-browserbase-project-id 52 | OPENAI_API_KEY=your-openai-api-key 53 | ``` 54 | 55 | 4. Run the development server: 56 | ```bash 57 | npm run dev 58 | ``` 59 | 60 | 5. Send a POST request to `/api/demo` with your goal: 61 | ```bash 62 | curl -X POST http://localhost:3000/api/demo \ 63 | -H "Content-Type: application/json" \ 64 | -d '{"goal": "What is the weather in San Francisco?"}' 65 | ``` 66 | 67 | Note: The demo endpoint does not support follow-up questions with the agent, because that requires persistent state to be saved between steps. 68 | 69 | 70 | ## Deploying to Production 71 | 72 | ### 1. Deploy to Vercel 73 | 1. 
[Click here](https://vercel.com/new/clone?repository-url=https%3A%2F%2Fgithub.com%2Fbrowserbase%2Fslack-operator&env=BROWSERBASE_API_KEY,BROWSERBASE_PROJECT_ID,OPENAI_API_KEY&envDescription=You'll%20need%20these%20variables%20to%20deploy%20this.%20To%20integrate%20Slack%2C%20you'll%20also%20need%20SLACK_BOT_TOKEN%2C%20SLACK_SIGNING_SECRET%2C%20and%20SLACK_BOT_USER_ID.%20&envLink=https%3A%2F%2Fgithub.com%2Fbrowserbase%2Fslack-operator%23deploying-to-production) to create a pre-configured Vercel project 74 | 2. Once it's deployed, you'll need to enable [blob storage](https://vercel.com/docs/vercel-blob) and [fluid compute](https://vercel.com/docs/functions/fluid-compute) to enable state persistence and long-running tasks. After enabling Fluid Compute, raise `maxDuration` in `slack/route.ts` from 60 to 800 seconds. 75 | 3. You can test it's working by sending a POST request to `/api/demo` with your goal. 76 | ```bash 77 | curl -X POST https://YOUR_VERCEL_URL/api/demo \ 78 | -H "Content-Type: application/json" \ 79 | -d '{"goal": "What is the weather in San Francisco?"}' 80 | ``` 81 | 4. Once Slack integration is set up, you'll need to configure the environment variables in Vercel as described in the next steps. 82 | 83 | ### 2. Create Slack App 84 | 1. Go to https://api.slack.com/apps and click "Create New App" 85 | 2. Choose "From an app manifest" 86 | 3. Select your workspace and paste the contents of `slack-manifest.json` 87 | 4. Replace `https://your-vercel-deployment-url` in the manifest with your actual Vercel deployment URL 88 | 5. Review and create the app 89 | 90 | ### 3. Install Slack App to Workspace 91 | 1. Go to "Install App" in the sidebar 92 | 2. Click "Install to Workspace" and authorize the app 93 | 94 | ### 4. Configure Environment Variables 95 | 1. Go to your Slack App's "Basic Information" page and copy the "Signing Secret" 96 | 2. Go to "OAuth & Permissions" and copy the "Bot User OAuth Token" 97 | 3. 
In your Vercel project settings, add these environment variables: 98 | - `SLACK_BOT_TOKEN`: Your Bot User OAuth Token (starts with xoxb-) 99 | - Go to "OAuth & Permissions" and copy the "Bot User OAuth Token" 100 | - `SLACK_SIGNING_SECRET`: Your Signing Secret 101 | - Go to your Slack App's "Basic Information" page 102 | - `SLACK_BOT_USER_ID`: Your Bot User ID (starts with U) 103 | - You can get that from the Slack API. You'll need the user_id from the response 104 | ``` 105 | curl -X POST "https://slack.com/api/auth.test" -H "Authorization: Bearer xoxb-your-slack-bot-token" 106 | ``` 107 | - `BROWSERBASE_API_KEY`: Your Browserbase API Key 108 | - `BROWSERBASE_PROJECT_ID`: Your Browserbase Project ID 109 | - `OPENAI_API_KEY`: Your OpenAI API Key 110 | 4. Make sure to enable Fluid Compute and update the timeout in `slack/route.ts` to 800 seconds! Otherwise the agent will timeout while working. 111 | 5. Redeploy your Vercel project for the changes to take effect 112 | -------------------------------------------------------------------------------- /app/api/agent/base-playwright.ts: -------------------------------------------------------------------------------- 1 | import { Browser, Page } from 'playwright'; 2 | 3 | // Optional: key mapping if your model uses "CUA" style keys 4 | const CUA_KEY_TO_PLAYWRIGHT_KEY: Record = { 5 | "/": "Divide", 6 | "\\": "Backslash", 7 | "alt": "Alt", 8 | "arrowdown": "ArrowDown", 9 | "arrowleft": "ArrowLeft", 10 | "arrowright": "ArrowRight", 11 | "arrowup": "ArrowUp", 12 | "up": "ArrowUp", 13 | "down": "ArrowDown", 14 | "left": "ArrowLeft", 15 | "right": "ArrowRight", 16 | "backspace": "Backspace", 17 | "capslock": "CapsLock", 18 | "cmd": "Meta", 19 | "command": "Meta", 20 | "ctrl": "Control", 21 | "control": "Control", 22 | "delete": "Delete", 23 | "end": "End", 24 | "enter": "Enter", 25 | "esc": "Escape", 26 | "home": "Home", 27 | "insert": "Insert", 28 | "option": "Alt", 29 | "pagedown": "PageDown", 30 | "pageup": "PageUp", 31 | 
/**
 * Abstract base for Playwright-based computers:
 *
 * - Subclasses override `_getBrowserAndPage()` to do local or remote connection,
 *   returning [Browser, Page].
 * - This base class handles context creation (`connect`/`disconnect`),
 *   plus standard "Computer" actions like click, scroll, etc.
 * - We also have extra browser actions: `goto(url)` and `back()`.
 *
 * Every action except `wait` throws `Error("Page not initialized")` when
 * called before `connect()` has stored a page.
 */
export abstract class BasePlaywrightComputer {
  environment: Environment = "browser";
  dimensions: [number, number] = [1024, 768];

  protected _browser: Browser | null = null;
  protected _page: Page | null = null;

  constructor() {
    this._browser = null;
    this._page = null;
  }

  /**
   * Obtains the browser and page from the subclass hook and stores them.
   * @returns This instance, so calls can be chained.
   */
  async connect(): Promise<this> {
    const [browser, page] = await this._getBrowserAndPage();
    this._browser = browser;
    this._page = page;
    return this;
  }

  /** Closes the browser if one was connected; a no-op otherwise. */
  async disconnect(): Promise<void> {
    if (this._browser) {
      await this._browser.close();
    }
  }

  // --- Common "Computer" actions ---

  /**
   * Captures only the viewport (not the full page).
   * @returns The screenshot bytes encoded as base64.
   */
  async screenshot(): Promise<string> {
    if (!this._page) throw new Error("Page not initialized");
    const buffer = await this._page.screenshot({ fullPage: false });
    return buffer.toString('base64');
  }

  /**
   * Performs a click-type action at (x, y).
   *
   * @param button "left" | "right" | "middle" click at the coordinates;
   *   "back" / "forward" navigate the page history instead of clicking;
   *   "wheel" dispatches a wheel event using x/y as the deltas (behavior
   *   kept from the original implementation).
   * @param x X coordinate; numeric strings are parsed as base-10 integers.
   * @param y Y coordinate; numeric strings are parsed as base-10 integers.
   * @throws Error if either coordinate does not parse to a number.
   */
  async click(button: string = "left", x: number | string, y: number | string): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    const parsedX = typeof x === 'string' ? parseInt(x, 10) : x;
    const parsedY = typeof y === 'string' ? parseInt(y, 10) : y;
    if (isNaN(parsedX) || isNaN(parsedY)) {
      throw new Error(`Invalid x or y coordinate: x=${x}, y=${y}`);
    }

    switch (button) {
      case "back":
        // Not a Playwright mouse button; treat as browser history navigation.
        await this.back();
        return;
      case "forward":
        await this._page.goForward({ waitUntil: "domcontentloaded" });
        return;
      case "wheel":
        await this._page.mouse.wheel(parsedX, parsedY);
        return;
      default:
        await this._page.mouse.click(parsedX, parsedY, { button: button as "left" | "right" | "middle" });
    }
  }

  /** Double-clicks at (x, y). */
  async double_click(x: number, y: number): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.mouse.dblclick(x, y);
  }

  /**
   * Scrolls by (scrollX, scrollY) with the pointer positioned at (x, y).
   *
   * The pointer is moved first because the wheel event is dispatched at the
   * current mouse position; moving afterwards would scroll whatever element
   * the pointer happened to be over from the previous action.
   */
  async scroll(x: number, y: number, scrollX: number, scrollY: number): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.mouse.move(x, y);
    await this._page.mouse.wheel(scrollX, scrollY);
  }

  /** Types `text` through the keyboard, one character at a time. */
  async type(text: string): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.keyboard.type(text);
  }

  /** Sleeps for `ms` milliseconds (default 5000). Does not need a page. */
  async wait(ms: number = 5000): Promise<void> {
    await new Promise(resolve => setTimeout(resolve, ms));
  }

  /** Moves the pointer to (x, y). */
  async move(x: number, y: number): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.mouse.move(x, y);
  }

  /**
   * Presses a key or key combination.
   *
   * Key names are matched case-insensitively against `HOTKEYS` and
   * `CUA_KEY_TO_PLAYWRIGHT_KEY`; unmapped names are passed through unchanged.
   * All leading modifier keys (Alt/Control/Shift/Meta) are held down while
   * the remaining keys are pressed in order, then released in reverse order.
   * If the list does not start with a modifier, the keys are pressed one
   * after another. An empty list is a no-op.
   */
  async keypress(keys: string[]): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    if (!keys || keys.length === 0) return;

    const keyboard = this._page.keyboard;
    const modifierKeys = new Set(["Alt", "Control", "Shift", "Meta"]);

    const resolvedKeys = keys.map((key) => {
      const lowered = key.toLowerCase();
      return HOTKEYS[lowered] || CUA_KEY_TO_PLAYWRIGHT_KEY[lowered] || key;
    });

    // Count how many keys at the front of the list are modifiers.
    let modifierCount = 0;
    while (
      modifierCount < resolvedKeys.length &&
      modifierKeys.has(resolvedKeys[modifierCount])
    ) {
      modifierCount++;
    }

    const held: string[] = [];
    try {
      for (const modifier of resolvedKeys.slice(0, modifierCount)) {
        await keyboard.down(modifier);
        held.push(modifier);
      }
      for (const key of resolvedKeys.slice(modifierCount)) {
        await keyboard.press(key);
      }
    } finally {
      // Always release what was pressed so a failed press cannot leave a
      // modifier stuck down for subsequent actions.
      for (const modifier of held.reverse()) {
        await keyboard.up(modifier);
      }
    }
  }

  /**
   * Drags along `path`: presses the mouse button at the first point, moves
   * through the remaining points in order, then releases. An empty path is
   * a no-op.
   */
  async drag(path: {x: number, y: number}[]): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    if (!path.length) return;

    await this._page.mouse.move(path[0].x, path[0].y);
    await this._page.mouse.down();

    for (let i = 1; i < path.length; i++) {
      await this._page.mouse.move(path[i].x, path[i].y);
    }

    await this._page.mouse.up();
  }

  // --- Extra browser-oriented actions ---

  /** Navigates to `url` and resolves once the DOM content has loaded. */
  async goto(url: string): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.goto(url, { waitUntil: "domcontentloaded" });
  }

  /** Navigates one step back in history, waiting for DOM content to load. */
  async back(): Promise<void> {
    if (!this._page) throw new Error("Page not initialized");
    await this._page.goBack({ waitUntil: "domcontentloaded" });
  }

  // --- Subclass hook ---

  /** Establishes the connection and returns the browser and page to drive. */
  protected abstract _getBrowserAndPage(): Promise<[Browser, Page]>;
}
Tool = FunctionTool | ComputerTool; 30 | 31 | export type ComputerTool = { 32 | type: "computer-preview"; 33 | display_width: number; 34 | display_height: number; 35 | environment: "mac" | "windows" | "linux" | "browser"; 36 | }; 37 | 38 | export type FunctionTool = { 39 | type: "function"; 40 | name: string; 41 | description: string | null; 42 | parameters: object; 43 | strict: boolean; 44 | }; 45 | 46 | export type Item = Message | FunctionToolCall | ComputerToolCall | Reasoning; 47 | 48 | export type Message = { 49 | id: string; 50 | type: "message"; 51 | role: "user" | "assistant" | "developer" | "system"; 52 | content: Content[]; 53 | }; 54 | 55 | export type Reasoning = { 56 | id: string; 57 | type: "reasoning"; 58 | summary: { 59 | type: "summary_text"; 60 | text: string; 61 | }[]; 62 | }; 63 | 64 | export type FunctionToolCall = { 65 | type: "function_call"; 66 | id: string; 67 | call_id: string; 68 | name: string; 69 | arguments: string; 70 | output: Content[] | null; 71 | }; 72 | 73 | export type ComputerAction = 74 | | Click 75 | | DoubleClick 76 | | Drag 77 | | Screenshot 78 | | KeyPress 79 | | Move 80 | | Scroll 81 | | Type 82 | | Wait; 83 | 84 | export type ComputerToolCall = { 85 | type: "computer_call"; 86 | id: string; 87 | call_id: string; 88 | action: ComputerAction; 89 | pending_safety_checks: SafetyCheck[]; 90 | }; 91 | 92 | export type Click = { 93 | type: "click"; 94 | button: "left" | "right" | "wheel" | "back" | "forward"; 95 | x: number; 96 | y: number; 97 | }; 98 | 99 | export type DoubleClick = { 100 | type: "double_click"; 101 | x: number; 102 | y: number; 103 | }; 104 | 105 | export type Scroll = { 106 | type: "scroll"; 107 | x: number; 108 | y: number; 109 | scroll_x: number; 110 | scroll_y: number; 111 | }; 112 | 113 | export type Type = { 114 | type: "type"; 115 | text: string; 116 | }; 117 | 118 | export type Wait = { 119 | type: "wait"; 120 | }; 121 | 122 | export type KeyPress = { 123 | type: "keypress"; 124 | keys: string[]; 125 
| }; 126 | 127 | export type Drag = { 128 | type: "drag"; 129 | path: { 130 | x: number; 131 | y: number; 132 | }[]; 133 | }; 134 | 135 | export type Screenshot = { 136 | type: "screenshot"; 137 | }; 138 | 139 | export type Move = { 140 | type: "move"; 141 | x: number; 142 | y: number; 143 | }; 144 | 145 | export type SafetyCheck = { 146 | id: string; 147 | code: string; 148 | message: string; 149 | }; 150 | 151 | export type InputContent = InputText | InputImage | InputFile; 152 | 153 | export type OutputContent = OutputText | Refusal; 154 | 155 | export type Content = InputContent | OutputContent | Reasoning; 156 | 157 | export type InputText = { 158 | type: "input_text"; 159 | text: string; 160 | }; 161 | 162 | export type OutputText = { 163 | type: "output_text"; 164 | text: string; 165 | logprobs?: LogProb[] | null; 166 | annotations: Annotation[]; 167 | }; 168 | 169 | export type Refusal = { 170 | type: "refusal"; 171 | refusal: string; 172 | }; 173 | 174 | export type InputImage = { 175 | type: "input_image"; 176 | image_url?: string; 177 | file_id?: string; 178 | detail: "high" | "low" | "auto"; 179 | }; 180 | 181 | export type InputFile = { 182 | type: "input_file"; 183 | file_id: string | null; 184 | filename: string | null; 185 | file_data: string | null; 186 | }; 187 | 188 | export type LogProb = { 189 | token: string; 190 | logprob: number; 191 | bytes: number[]; 192 | top_logprobs?: LogProb[]; 193 | }; 194 | 195 | export type FileCitation = { 196 | type: "file_citation"; 197 | index: number; 198 | file_id: string; 199 | filename: string; 200 | }; 201 | 202 | export type FilePath = { 203 | type: "file_path"; 204 | file_id: string; 205 | index: number; 206 | }; 207 | 208 | export type Annotation = FileCitation | FilePath; 209 | 210 | export type RequestOptions = { 211 | model: string; 212 | input?: string | InputItem[]; 213 | previous_response_id?: string; 214 | include?: Includable[]; 215 | tools?: Tool[]; 216 | 217 | metadata?: Record; 218 | 
tool_choice?: 219 | | "none" 220 | | "auto" // default 221 | | "required" 222 | | { type: "file_search" } 223 | | { type: "computer" } 224 | | { type: "function"; name: string }; 225 | text?: { 226 | format?: 227 | | { type: "text" } // default 228 | | { type: "json_object" } 229 | | { 230 | type: "json_schema"; 231 | schema: object; 232 | name: string; 233 | description?: string; 234 | strict?: boolean; // default true 235 | }; 236 | }; 237 | temperature?: number; // default 1 238 | top_p?: number; // default 1 239 | truncation?: "auto" | "disabled"; 240 | parallel_tool_calls?: boolean; // default true 241 | stream?: boolean; 242 | reasoning?: { generate_summary?: "concise" }; 243 | }; 244 | 245 | export type Response = { 246 | id: string; 247 | object: "response"; 248 | created_at: number; 249 | completed_at: number | null; 250 | error: Error | null; 251 | model: string; 252 | tools: Tool[]; 253 | tool_choice: 254 | | "none" 255 | | "auto" 256 | | "required" 257 | | { type: "file_search" } 258 | | { type: "code_interpreter" } 259 | | { type: "function"; name: string }; 260 | text: { 261 | response_format: 262 | | { type: "text" } // default 263 | | { type: "json_object" } 264 | | { 265 | type: "json_schema"; 266 | schema: object; 267 | name: string; 268 | description?: string; 269 | strict: boolean | null; 270 | }; 271 | }; 272 | previous_response_id: string | null; 273 | output: Item[]; 274 | metadata: Record; 275 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 276 | usage: any | null; 277 | }; 278 | -------------------------------------------------------------------------------- /app/api/slack/route.ts: -------------------------------------------------------------------------------- 1 | import { WebClient } from "@slack/web-api"; 2 | import { NextResponse } from "next/server"; 3 | import { Browserbase } from "@browserbasehq/sdk"; 4 | import { getState, runAgentLoop } from "./operator"; 5 | import { waitUntil } from '@vercel/functions'; 6 | 
import { BrowserbaseBrowser } from "../agent/browserbase";
import { Agent } from "../agent/agent";

// Set the default to 60 seconds. This is not enough!
// Once you enable Fluid Compute, you can set this to 800 seconds.
export const maxDuration: number = 60;

/** Throws at module load when a required environment variable is missing. */
const validateEnvironment = () => {
  const required = [
    "BROWSERBASE_API_KEY",
    "BROWSERBASE_PROJECT_ID",
    "OPENAI_API_KEY",
  ];
  for (const name of required) {
    if (!process.env[name]) {
      throw new Error(`${name} is not set`);
    }
  }
};

validateEnvironment();

// Initialize clients
const slack = new WebClient(process.env.SLACK_BOT_TOKEN);
const browserbase = new Browserbase({
  apiKey: process.env.BROWSERBASE_API_KEY,
});

/** Strips every character except word characters, whitespace and `-` from a Slack timestamp. */
const sanitizeTs = (ts: string | undefined) => ts?.replace(/[^\w\s-]/g, "");

/** Builds the Browserbase session query that matches the session created for a Slack message. */
const sessionQueryFor = (ts: string | undefined) =>
  `user_metadata['messageTs']:'${sanitizeTs(ts)}'`;

/** Creates the browser wrapper and agent bound to an existing Browserbase session. */
const createOperator = (sessionId: string) => {
  const computer = new BrowserbaseBrowser(1024, 768, sessionId, "us-west-2", false);
  const agent = new Agent("computer-use-preview", computer, false);
  return { computer, agent };
};

/** Answers Slack's `url_verification` handshake; returns null for any other payload. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const handleUrlVerification = (body: any) => {
  if (body.type === "url_verification") {
    return NextResponse.json({ challenge: body.challenge });
  }
  return null;
};

/**
 * Creates a Browserbase session tagged with the Slack message it belongs to,
 * then runs the agent loop for `goal` and reports into the message's thread.
 */
const createSession = async (channel: string, ts: string, userId: string, goal: string) => {
  const session = await browserbase.sessions.create({
    projectId: process.env.BROWSERBASE_PROJECT_ID!,
    keepAlive: true,
    proxies: false,
    browserSettings: {
      viewport: {
        width: 1024,
        height: 768,
      },
      blockAds: true,
    },
    userMetadata: {
      slackChannel: channel,
      // Stored sanitized so later lookups can query by the same value.
      messageTs: sanitizeTs(ts),
      userId: userId,
    },
    timeout: 3600,
  });

  const { computer, agent } = createOperator(session.id);

  if (maxDuration === 60) {
    await slack.chat.postMessage({
      channel: channel,
      text: `⚠️ The default timeout is 60 seconds. Please enable Fluid Compute and update the timeout in slack/route.ts to 800 seconds.`,
      thread_ts: ts,
    });
  }

  await runAgentLoop(computer, agent, goal, session.id, slack, channel, ts);
};

/**
 * Starts a new operator session when a top-level, non-bot message mentions the
 * bot user. Does nothing if a session already exists for that message.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const handleNewMessage = async (event: any) => {
  const mention = `<@${process.env.SLACK_BOT_USER_ID}>`;
  const isNewMention =
    !event.thread_ts &&
    !event.bot_id &&
    event.user !== process.env.SLACK_BOT_USER_ID &&
    event?.text?.includes(mention);
  if (!isNewMention) return;

  // Check for existing sessions
  const existingSessions = await browserbase.sessions.list({
    q: sessionQueryFor(event.ts),
  });
  if (existingSessions.length > 0) {
    console.log("Found existing session:", existingSessions[0].id);
    return;
  }

  const goal = event.text.replace(mention, "").trim();
  await createSession(event.channel, event.ts, event.user, goal);
};

/**
 * Handles a human reply inside a thread that has an operator session:
 * "stop" releases the session, otherwise the reply resumes the saved agent
 * state, or the debug URL is posted when no state has been saved.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const handleThreadReply = async (event: any) => {
  if (!event.thread_ts || event.bot_id || event.user === process.env.SLACK_BOT_USER_ID) {
    return;
  }

  // Message events are not guaranteed to carry text; ignore replies without any
  // instead of crashing on `undefined.toLowerCase()`.
  const text: string = typeof event.text === "string" ? event.text : "";
  if (!text.trim()) return;

  const sessions = await browserbase.sessions.list({
    q: sessionQueryFor(event.thread_ts),
  });
  if (sessions.length === 0) return;

  const session = sessions[0];

  if (text.toLowerCase().includes('stop')) {
    await browserbase.sessions.update(session.id, {
      status: "REQUEST_RELEASE",
      projectId: process.env.BROWSERBASE_PROJECT_ID!,
    });
    await slack.chat.postMessage({
      channel: event.channel,
      text: `Browser session stopped successfully.`,
      thread_ts: event.thread_ts,
    });
    return;
  }

  const savedState = await getState(session.id);
  if (!savedState) {
    const { debuggerUrl } = await browserbase.sessions.debug(session.id);
    await slack.chat.postMessage({
      channel: event.channel,
      text: `Found running operator session! Debug URL: ${debuggerUrl}`,
      thread_ts: event.thread_ts,
    });
    return;
  }

  await slack.chat.postMessage({
    channel: event.channel,
    text: `👍 Got your response! Continuing with the task...`,
    thread_ts: event.thread_ts,
  });

  const { computer, agent } = createOperator(session.id);
  await runAgentLoop(
    computer,
    agent,
    "",
    session.id,
    slack,
    event.channel,
    event.thread_ts,
    savedState,
    text
  );
};

/**
 * Creates a watchdog that rejects shortly before the function's `maxDuration`
 * elapses, after warning the Slack thread.
 *
 * @returns `promise`, which only ever rejects, and `cancel`, which must be
 * called once the guarded work settles. Without cancelling, the timer would
 * still fire on a warm instance and post a false timeout warning.
 */
const createTimeoutPromise = (channel: string, threadTs: string) => {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const promise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error('Processing timeout'));
    }, maxDuration * 1000 - 5000); // 5 seconds before timeout so we have time to gracefully stop the session
  }).catch(async (error) => {
    await slack.chat.postMessage({
      channel: channel,
      thread_ts: threadTs,
      text: "⚠️ Function timed out while working. Please enable Fluid Compute and maxDuration in slack/route.ts to 800 seconds.",
    });
    throw error;
  });

  const cancel = () => {
    if (timer !== undefined) {
      clearTimeout(timer);
      timer = undefined;
    }
  };

  return { promise, cancel };
};

/**
 * Slack Events API endpoint. Acknowledges immediately and continues processing
 * message events in the background through `waitUntil`.
 *
 * NOTE(review): the request signature is not verified here although
 * SLACK_SIGNING_SECRET is listed in .env.example; consider validating
 * `x-slack-signature` before trusting the payload.
 */
export async function POST(req: Request) {
  try {
    const body = await req.json();

    // Handle URL verification
    const urlVerificationResponse = handleUrlVerification(body);
    if (urlVerificationResponse) return urlVerificationResponse;

    // Return immediate response for all other requests
    const response = NextResponse.json({ ok: true });

    if (body.type === "event_callback") {
      const event = body.event;

      if (event?.type === "message" && !event.bot_id) {
        const threadTs = event.thread_ts || event.ts;
        const timeout = createTimeoutPromise(event.channel, threadTs);

        const work = (async () => {
          try {
            await handleNewMessage(event);
            await handleThreadReply(event);
          } catch (err) {
            console.error("Error processing Slack message:", err);
            await slack.chat.postMessage({
              channel: event.channel,
              thread_ts: threadTs,
              text: "⚠️ There was an error processing your request. Please try again or check the session status.",
            });
            throw err;
          } finally {
            // Work is done (either way): stop the watchdog.
            timeout.cancel();
          }
        })();

        waitUntil(Promise.race([work, timeout.promise]));
      }
    }

    return response;
  } catch (error) {
    console.error("Error handling Slack event:", error);
    return NextResponse.json(
      { error: "Internal Server Error" },
      { status: 500 }
    );
  }
}

// --------------------------------------------------------------------------------
// /app/api/agent/agent.ts:
// --------------------------------------------------------------------------------
import { BrowserbaseBrowser } from "./browserbase";
import OpenAI from "openai";
import {
  InputItem,
  Item,
  Message,
  FunctionToolCall,
  ComputerToolCall,
  ComputerCallOutput,
  FunctionOutput,
  Tool,
  RequestOptions,
} from "./types";
import { AxiosError } from "axios";
import type { AxiosInstance } from "axios";
import axios from "axios";
import axiosRetry from "axios-retry";

type AcknowledgeSafetyCheckCallback = (message: string) => boolean;

/** The fields of a Responses API reply that this class reads. */
type ResponsePayload = { id: string; output: Item[] };

/** View of the computer used to look up action methods by name. */
type MethodTable = Record<string, unknown>;

/**
 * Drives a computer-use model: sends requests to the OpenAI Responses endpoint
 * and applies the returned computer/function calls to a `BrowserbaseBrowser`.
 */
export class Agent {
  private client: OpenAI;
  private model: string;
  private computer: BrowserbaseBrowser;
  private tools: Tool[];
  private printSteps: boolean = true;
  private acknowledgeSafetyCheckCallback: AcknowledgeSafetyCheckCallback;
  private http: AxiosInstance;
  public lastResponseId: string | undefined = undefined;

  /**
   * @param model - Model name sent with every request.
   * @param computer - Browser the returned actions are applied to.
   * @param printSteps - When true, requests, responses and actions are logged.
   * @param acknowledgeSafetyCheckCallback - Called with each pending safety
   *   check message; returning false aborts the action.
   */
  constructor(
    model: string = "computer-use-preview",
    computer: BrowserbaseBrowser,
    printSteps: boolean = false,
    acknowledgeSafetyCheckCallback: AcknowledgeSafetyCheckCallback = () => true
  ) {
    this.client = new OpenAI();
    this.model = model;
    this.computer = computer;
    this.printSteps = printSteps;
    this.acknowledgeSafetyCheckCallback = acknowledgeSafetyCheckCallback;

    // Configure retry behavior once, on a private axios instance. Registering
    // on the shared default instance for every request would stack
    // interceptors and apply this policy to unrelated axios callers.
    this.http = axios.create();
    axiosRetry(this.http, {
      retries: 3,
      retryDelay: axiosRetry.exponentialDelay,
      retryCondition: (error: AxiosError): boolean => {
        return (
          axiosRetry.isNetworkOrIdempotentRequestError(error) ||
          (error.response?.status ? error.response.status >= 500 : false) ||
          (error.message.includes("Timeout"))
        );
      },
    });

    this.tools = [
      {
        type: "computer-preview",
        display_width: computer.dimensions[0],
        display_height: computer.dimensions[1],
        environment: computer.environment,
      },
      {
        type: "function",
        name: "goto",
        description: "Go to a specific URL.",
        parameters: {
          type: "object",
          properties: {
            url: {
              type: "string",
              description: "Fully qualified URL to navigate to.",
            },
          },
          additionalProperties: false,
          required: ["url"],
        },
        strict: false,
      },
      {
        type: "function",
        name: "back",
        description: "Go back to the previous page.",
        parameters: {},
        strict: false,
      }
    ];
  }

  /**
   * POSTs `options` to the Responses endpoint and returns the parsed body.
   * Failures are logged with status and response body, then rethrown.
   */
  private async createResponse(options: RequestOptions): Promise<ResponsePayload> {
    const url = "https://api.openai.com/v1/responses";
    const headers: Record<string, string> = {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      "Content-Type": "application/json",
      "Openai-beta": "responses=v1",
    };

    const openaiOrg = process.env.OPENAI_ORG;
    if (openaiOrg) {
      headers["Openai-Organization"] = openaiOrg;
    }

    try {
      const response = await this.http.post<ResponsePayload>(url, options, { headers });
      return response.data;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        // Serialize the body explicitly; interpolating an object would print
        // "[object Object]".
        const detail =
          error.response?.data !== undefined
            ? JSON.stringify(error.response.data)
            : error.message;
        console.error(`Error: ${error.response?.status ?? "no response"} ${detail}`);
      } else {
        console.error("Error:", error);
      }
      throw error;
    }
  }

  /**
   * Requests the model's next output for `inputItems`.
   *
   * @param previousResponseId - When set, the request is chained to that
   *   response and carries the navigation instructions.
   * @returns The output items and the id of the new response.
   */
  async getAction(
    inputItems: InputItem[],
    previousResponseId: string | undefined
  ): Promise<{
    output: Item[];
    responseId: string;
  }> {
    const request = {
      model: this.model,
      input: inputItems,
      tools: this.tools,
      truncation: "auto" as const,
      parallel_tool_calls: false,
      reasoning: { generate_summary: "concise" as const },
      // NOTE(review): instructions are only sent on follow-up requests, never
      // on the first one — confirm this is intended.
      ...(previousResponseId
        ? { previous_response_id: previousResponseId, instructions: "Use the web browser to complete the task. If needed, always use the additional back() and goto() functions to navigate the browser. If you get stuck, trying going to Google." }
        : {}),
    };

    if (this.printSteps) {
      console.log("request", request);
    }
    const response = await this.createResponse(request);

    if (this.printSteps) {
      console.log("response", response);
    }

    return {
      output: response.output,
      responseId: response.id,
    };
  }

  /**
   * Executes every computer and function call in `output`, in order, and
   * returns their results. Other item types (e.g. messages) are skipped.
   */
  async takeAction(
    output: Item[]
  ): Promise<(Message | ComputerCallOutput | FunctionOutput)[]> {
    const results: (Message | ComputerCallOutput | FunctionOutput)[] = [];
    // Run one call at a time: they all drive the same page, so starting them
    // together would let them race each other.
    for (const item of output) {
      if (item.type === "computer_call") {
        results.push(await this.takeComputerAction(item as ComputerToolCall));
      } else if (item.type === "function_call") {
        results.push(await this.takeFunctionAction(item as FunctionToolCall));
      }
    }
    return results;
  }

  /** Logs the first content part of a message when `printSteps` is on; returns it unchanged. */
  async takeMessageAction(messageItem: Message): Promise<Message> {
    if (this.printSteps && messageItem.content?.[0]) {
      console.log(messageItem.content[0]);
    }
    return messageItem;
  }

  /**
   * Performs one computer action by calling the computer method named after
   * `action.type`, then returns a screenshot as the call output.
   *
   * @throws Error when the computer is missing, a pending safety check is not
   *   acknowledged, or the action type has no matching method.
   */
  async takeComputerAction(
    computerItem: ComputerToolCall
  ): Promise<ComputerCallOutput> {
    const action = computerItem.action;
    const actionType = action.type;
    const actionArgs = Object.fromEntries(
      Object.entries(action).filter(([key]) => key !== "type")
    );

    if (this.printSteps) {
      console.log(`${actionType}(${JSON.stringify(actionArgs)})`);
    }

    if (!this.computer) {
      throw new Error("Computer not initialized");
    }

    // Handle safety checks before acting, so an unacknowledged check really
    // prevents the action from running.
    const pendingChecks = computerItem.pending_safety_checks || [];
    for (const check of pendingChecks) {
      const message = check.message;
      if (!this.acknowledgeSafetyCheckCallback(message)) {
        throw new Error(
          `Safety check failed: ${message}. Cannot continue with unacknowledged safety checks.`
        );
      }
    }

    const method = (this.computer as unknown as MethodTable)[actionType];
    if (typeof method !== "function") {
      throw new Error(`Unsupported computer action: ${actionType}`);
    }
    // Arguments are passed positionally, in the order the action's fields
    // appear on the object.
    await (method as (...args: unknown[]) => unknown).apply(
      this.computer,
      Object.values(actionArgs)
    );

    const screenshot = await this.computer.screenshot();

    return {
      type: "computer_call_output",
      call_id: computerItem.call_id,
      acknowledged_safety_checks: pendingChecks,
      output: {
        type: "input_image",
        image_url: `data:image/png;base64,${screenshot}`,
      },
    };
  }

  /**
   * Performs one function call by invoking the computer method with the same
   * name, if there is one. The reported output is always "success".
   */
  async takeFunctionAction(
    functionItem: FunctionToolCall
  ): Promise<FunctionOutput> {
    const name = functionItem.name;
    // Tolerate empty or non-object argument payloads (e.g. a zero-argument
    // function) instead of failing in JSON.parse or Object.values.
    const rawArgs = functionItem.arguments?.trim();
    const parsed: unknown = rawArgs ? JSON.parse(rawArgs) : {};
    const args: Record<string, unknown> =
      parsed !== null && typeof parsed === "object"
        ? (parsed as Record<string, unknown>)
        : {};
    if (this.printSteps) {
      console.log(`${name}(${JSON.stringify(args)})`);
    }

    const method = this.computer
      ? (this.computer as unknown as MethodTable)[name]
      : undefined;
    if (typeof method === "function") {
      await (method as (...args: unknown[]) => unknown).apply(
        this.computer,
        Object.values(args)
      );
    }

    return {
      type: "function_call_output",
      call_id: functionItem.call_id,
      output: "success", // hard-coded output for demo
    };
  }
}

// --------------------------------------------------------------------------------
// /app/api/slack/operator.ts:
// --------------------------------------------------------------------------------
import { WebClient } from "@slack/web-api";
import { openai } from "@ai-sdk/openai";
import { CoreMessage, generateObject } from "ai";
import { z } from "zod";
import { put, list } from "@vercel/blob";
import {
  ComputerCallOutput,
  ComputerToolCall,
  Item,
  Reasoning,
} from "../agent/types";
import { BrowserbaseBrowser } from "../agent/browserbase";
import { Agent } from "../agent/agent";
// Define state type
export interface AgentState {
  goal: string;
  currentStep: {
    output: Item[];
    responseId: string;
  };
}

interface OutputText {
  type: 'output_text';
  text: string;
}

interface Message {
  type: 'message';
  content: [OutputText];
}

/** One model turn: the output items and the id of the response that produced them. */
type AgentStep = AgentState["currentStep"];

// Helper functions for state management

/**
 * Stores `state` as a JSON blob named after the session and returns its URL.
 * A random suffix is added to each upload, so every save creates a new blob.
 * NOTE(review): the blob is uploaded with `access: "public"`.
 */
export async function saveState(sessionId: string, state: AgentState) {
  const { url } = await put(
    `agent-${sessionId}-state.json`,
    JSON.stringify(state),
    { access: "public", addRandomSuffix: true }
  );
  return url;
}

/**
 * Loads the most recently uploaded state blob for the session.
 *
 * @returns The parsed state, or null when none exists or loading fails
 *   (failures are logged).
 */
export async function getState(sessionId: string): Promise<AgentState | null> {
  try {
    const { blobs } = await list({ prefix: `agent-${sessionId}-state` });
    if (blobs.length === 0) return null;

    // get the most recently created blob
    const mostRecentBlob = blobs.reduce((latest, blob) =>
      blob.uploadedAt.getTime() > latest.uploadedAt.getTime() ? blob : latest
    );

    const response = await fetch(mostRecentBlob.url);
    if (!response.ok) {
      throw new Error(`Unexpected status ${response.status} fetching saved state`);
    }
    const text = await response.text();
    return JSON.parse(text) as AgentState;
  } catch (error) {
    console.error("[getState] Error retrieving state:", error);
    return null;
  }
}

/**
 * Asks gpt-4o for the best URL to start from for `goal`, with a 5 second
 * limit. Falls back to Google when the call fails or times out.
 */
async function selectStartingUrl(goal: string): Promise<{ url: string }> {
  const message: CoreMessage = {
    role: "user",
    content: [
      {
        type: "text",
        text: `Given the goal: "${goal}", determine the best URL to start from.
Choose from:
1. A relevant search engine (Google, Bing, etc.)
2. A direct URL if you're confident about the target website
3. Any other appropriate starting point

Return a URL that would be most effective for achieving this goal.`,
      },
    ],
  };

  // Initialize OpenAI client
  const LLMClient = openai("gpt-4o");

  try {
    const result = await generateObject({
      model: LLMClient,
      schema: z.object({
        url: z.string().url(),
        reasoning: z.string(),
      }),
      abortSignal: AbortSignal.timeout(5000),
      messages: [message],
    });
    return result.object;
  } catch (error) {
    // Not every failure is a timeout; keep the cause in the log.
    console.error(
      "OpenAI error or timeout when generating starting URL, falling back to Google:",
      error
    );
    return { url: "https://www.google.com" };
  }
}

/** Connects the browser and applies every action contained in `step`. */
async function execute(
  computer: BrowserbaseBrowser,
  agent: Agent,
  step: { output: Item[] }
) {
  await computer.connect();
  return agent.takeAction(step.output);
}

const isReasoningOnly = (step: AgentStep) =>
  step.output.length == 1 && step.output[0].type === "reasoning";

/**
 * Requests the next step from the model. Screenshot requests are answered
 * inline, and reasoning-only replies are fed back until the model returns
 * something else.
 */
async function generate(
  computer: BrowserbaseBrowser,
  agent: Agent,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  input: any,
  responseId: string | undefined
): Promise<AgentStep> {
  let result = await agent.getAction(input, responseId);

  // If there's a screenshot returned, just handle it right here so we don't have to make a round trip.
  const computerCall = result.output.find(
    (item) => item.type === "computer_call"
  ) as ComputerToolCall | undefined;
  if (computerCall?.action.type === "screenshot") {
    await computer.connect();

    const screenshotAction = await agent.takeAction(result.output);
    result = await agent.getAction(
      screenshotAction.filter((item) => item.type != "message"),
      result.responseId
    );
  }

  // If the generated action is only reasoning, let's request a real action.
  while (isReasoningOnly(result)) {
    result = await agent.getAction([(result.output[0] as Reasoning)], result.responseId);
  }

  return result;
}

/**
 * Runs the agent until the model replies with a message.
 *
 * With `slack`, `channel` and `threadTs` all set, progress (reasoning,
 * screenshots, the final message) is posted to that thread and the state is
 * saved so a thread reply can resume it. Otherwise progress is logged to the
 * console.
 *
 * @param savedState - Resume from this state instead of starting from `goal`.
 * @param userResponse - Reply to feed to the model before resuming.
 * @returns The final step when reporting to the console; undefined when
 *   reporting to Slack.
 */
export async function runAgentLoop(
  computer: BrowserbaseBrowser,
  agent: Agent,
  goal: string,
  sessionId: string,
  slack?: WebClient,
  channel?: string,
  threadTs?: string,
  savedState?: AgentState,
  userResponse?: string
) {
  // Initialize state from saved state if it exists
  let currentStep: AgentStep | null = savedState?.currentStep ?? null;

  // If we have saved state, skip URL selection
  if (!savedState) {
    if (slack && channel && threadTs) {
      await slack.chat.postMessage({
        channel: channel,
        text: `🤖 Operator: Starting up to complete the task!\n\nYou can follow along at https://www.browserbase.com/sessions/${sessionId}`,
        thread_ts: threadTs,
      });
    } else {
      console.log(
        `🤖 Operator: Starting up to complete the task! You can follow along at https://www.browserbase.com/sessions/${sessionId}`
      );
    }
    await computer.connect();

    const startingUrl = await selectStartingUrl(goal);

    await computer.goto(startingUrl.url);
    // Initialize the agent with the first step
    currentStep = await agent.getAction([
      {
        role: "user",
        content: goal,
      },
    ], undefined);
  }

  if (userResponse && currentStep) {
    const lastMessage = currentStep.output.find(
      (item) => item.type === "message"
    ) as Message | undefined;
    currentStep = await generate(
      computer,
      agent,
      [
        {
          role: "assistant",
          content: lastMessage?.content[0].text ?? "",
        },
        {
          role: "user",
          content: userResponse,
        },
      ],
      currentStep.responseId
    );
  }

  while (currentStep) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const reasoning = currentStep.output.find((item: any) => item.type === "reasoning") as
      | Reasoning
      | undefined;
    // A reasoning item can come without summary text; only report when present.
    const reasoningText = reasoning?.summary?.[0]?.text;
    if (reasoning && reasoningText !== undefined) {
      if (slack && channel && threadTs) {
        await slack.chat.postMessage({
          channel: channel,
          text: `🧠 Reasoning: ${reasoningText}`,
          thread_ts: threadTs,
        });
      } else {
        console.log(`🧠 Reasoning: ${reasoningText}`);
      }
    }

    if (!slack) {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const action = (currentStep.output.find((item: any) => item.type === "computer_call") as
        | ComputerToolCall
        | undefined)?.action;
      console.log(`🖥️ Action: ${JSON.stringify(action)}`);
    }

    // Perform the last step
    const nextOutput = await execute(computer, agent, currentStep);

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const computerCall = nextOutput.find((item: any) => item.type === "computer_call_output") as
      | ComputerCallOutput
      | undefined;
    if (reasoning && computerCall && slack && channel && threadTs) {
      await slack.files.uploadV2({
        channel_id: channel,
        thread_ts: threadTs,
        file: Buffer.from(
          computerCall.output.image_url.replace(
            /^data:image\/\w+;base64,/,
            ""
          ),
          "base64"
        ),
        filename: "screenshot.png",
      });
    }

    // Get next step
    const nextStep = await generate(
      computer,
      agent,
      nextOutput,
      currentStep.responseId
    );

    currentStep = nextStep;

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const message = nextStep.output.find((item: any) => item.type === "message") as
      | Message
      | undefined;
    if (!message) continue;

    if (slack && channel && threadTs) {
      // Persist first so a thread reply can resume from this step.
      await saveState(sessionId, {
        goal: goal,
        currentStep: nextStep,
      });
      const screenshot = await computer.screenshot();
      await slack.files.uploadV2({
        channel_id: channel,
        thread_ts: threadTs,
        file: Buffer.from(
          screenshot.replace(/^data:image\/\w+;base64,/, ""),
          "base64"
        ),
        filename: "screenshot.png",
      });
      await slack.chat.postMessage({
        channel: channel,
        text: `🤖 Operator: ${message.content[0].text}\n\n You can control the browser if needed at https://www.browserbase.com/sessions/${sessionId}`,
        thread_ts: threadTs,
      });
    } else {
      console.log(`🤖 Operator: ${message.content[0].text}`);
      return currentStep;
    }
    // The model is waiting for input: end the loop.
    currentStep = null;
  }
}

// --------------------------------------------------------------------------------
// /app/api/agent/browserbase.ts:
// --------------------------------------------------------------------------------
import * as dotenv from "dotenv";
import { Browser, Page, chromium } from "playwright";
import { BasePlaywrightComputer } from "./base-playwright";
import Browserbase from "@browserbasehq/sdk";
import { SessionCreateResponse } from "@browserbasehq/sdk/resources/sessions/sessions.mjs";
import axios from "axios";

dotenv.config();

// Define a custom type that
// includes all necessary properties
interface BrowserbaseSession extends SessionCreateResponse {
  connectUrl: string;
}

// Define the type for session creation parameters
interface SessionCreateParams {
  projectId: string;
  browserSettings: {
    viewport: {
      width: number;
      height: number;
    };
    blockAds: boolean;
  };
  region: "us-west-2" | "us-east-1" | "eu-central-1" | "ap-southeast-1";
  proxies: boolean;
  keepAlive: boolean;
}

/**
 * Browserbase is a headless browser platform that offers a remote browser API. You can use it to control thousands of browsers from anywhere.
 * With Browserbase, you can watch and control a browser in real-time, record and replay sessions, and use built-in proxies for more reliable browsing.
 * You can find more information about Browserbase at https://docs.browserbase.com/ or view our OpenAI CUA Quickstart at https://docs.browserbase.com/integrations/openai-cua/introduction.
 */
export class BrowserbaseBrowser extends BasePlaywrightComputer {
  private bb: Browserbase;
  private projectId: string;
  private session: BrowserbaseSession | null = null;
  private region: string;
  private proxy: boolean;
  private sessionId: string | null;

  /**
   * Initialize the Browserbase instance. Additional configuration options for features such as persistent cookies, ad blockers, file downloads and more can be found in the Browserbase API documentation: https://docs.browserbase.com/reference/api/create-a-session
   *
   * @param width - The width of the browser viewport. Default is 1024.
   * @param height - The height of the browser viewport. Default is 768.
   * @param sessionId - Optional. If provided, use an existing session instead of creating a new one.
   * @param region - The region for the Browserbase session. Default is "us-west-2". Pick a region close to you for better performance. https://docs.browserbase.com/guides/multi-region
   * @param proxy - Whether to use a proxy for the session. Default is false. Turn on proxies if your browsing is frequently interrupted. https://docs.browserbase.com/features/proxies
   */
  constructor(
    width: number = 1024,
    height: number = 768,
    sessionId: string | null = null,
    region: string = "us-west-2",
    proxy: boolean = false
  ) {
    super();
    this.bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY });
    this.projectId = process.env.BROWSERBASE_PROJECT_ID!;
    this.session = null;
    this.dimensions = [width, height];
    this.sessionId = sessionId;
    this.region = region;
    this.proxy = proxy;
  }

  /**
   * Create a Browserbase session and connect to it, or connect to an existing session if a session ID is provided.
   *
   * @returns A tuple containing the connected browser and page objects.
   * @throws Error when no session could be obtained or the session has no connect URL.
   */
  protected async _getBrowserAndPage(): Promise<[Browser, Page]> {
    if (this.sessionId) {
      // TODO: replace with this when we ship connectUrl via session GET to the SDK
      const response = await axios.get(
        `https://api.browserbase.com/v1/sessions/${this.sessionId}`,
        {
          headers: {
            "X-BB-API-Key": process.env.BROWSERBASE_API_KEY,
          },
        }
      );
      this.session = {
        connectUrl: response.data.connectUrl,
      } as unknown as BrowserbaseSession;
    } else {
      // Create a new session on Browserbase with specified parameters
      const [width, height] = this.dimensions;
      const sessionParams: SessionCreateParams = {
        projectId: this.projectId,
        browserSettings: {
          blockAds: true,
          viewport: {
            width,
            height,
          },
        },
        region: this.region as SessionCreateParams["region"],
        proxies: this.proxy,
        keepAlive: true,
      };

      this.session = (await this.bb.sessions.create(
        sessionParams
      )) as unknown as BrowserbaseSession;
    }

    // Check for a missing session before reading any of its fields.
    if (!this.session) {
      throw new Error("Failed to create or retrieve session");
    }

    if (!this.session.connectUrl) {
      throw new Error("Browserbase session has terminated.");
    }

    // Connect to the remote session
    const browser = await chromium.connectOverCDP(this.session.connectUrl, {
      timeout: 1000 * 60,
    });
    const context = browser.contexts()[0];
    if (!context) {
      throw new Error("Browserbase session exposes no browser context");
    }
    // Reuse the first open page, or open one when the context has none.
    const page = context.pages()[0] ?? (await context.newPage());

    // Keep everything in one tab: when another page opens, load its URL in the
    // main page and close the new one.
    // NOTE(review): this reads newPage.url() as soon as the event fires —
    // confirm the URL is already final at that point.
    context.on("page", (newPage) => {
      page.goto(newPage.url(), { waitUntil: "domcontentloaded" }).catch((error) => {
        console.error("Error navigating to page:", error);
      });
      newPage.close().catch((error) => {
        console.error("Error closing page:", error);
      });
    });

    // Draw a visible cursor in the document currently loaded in the page.
    // This is best effort and not awaited: a failure only logs a warning.
    page.evaluate(() => {
      const CURSOR_ID = '__cursor__';

      // Check if cursor element already exists
      if (document.getElementById(CURSOR_ID)) return;

      const cursor = document.createElement('div');
      cursor.id = CURSOR_ID;
      Object.assign(cursor.style, {
        position: 'fixed',
        top: '0px',
        left: '0px',
        width: '20px',
        height: '20px',
        // The original image markup was missing from this copy of the file;
        // this is a simple arrow. Named colours avoid '#' in the data URI.
        backgroundImage: 'url("data:image/svg+xml;utf8,%3Csvg xmlns=\'http://www.w3.org/2000/svg\' viewBox=\'0 0 20 20\'%3E%3Cpath d=\'M2 2 L2 16 L6 12 L9 18 L11 17 L8 11 L14 11 Z\' fill=\'black\' stroke=\'white\'/%3E%3C/svg%3E")',
        backgroundSize: 'cover',
        pointerEvents: 'none',
        zIndex: '99999',
        transform: 'translate(-2px, -2px)',
      });

      document.body.appendChild(cursor);

      const follow = (e: MouseEvent) => {
        cursor.style.top = `${e.clientY}px`;
        cursor.style.left = `${e.clientX}px`;
      };
      document.addEventListener("mousemove", follow);
      document.addEventListener("mousedown", follow);
    }).catch((error) => {
      console.warn("Cursor overlay injection failed:", error);
    });

    // Only navigate to Google if it's a new session
    if (!this.sessionId) {
      await page.goto("https://www.google.com");
    }

    return [browser, page];
  }

  /**
   * Intentionally a no-op: the page, browser and session are left open.
   */
  async disconnect(): Promise<void> {
    /*if (this._page) {
      await this._page.close();
    }
    if (this._browser) {
      await this._browser.close();
    }

    if (this.session) {
      console.log(`Session completed. View replay at https://browserbase.com/sessions/${this.session.id}`);
    }*/
  }

  /**
   * Capture a screenshot of the current viewport using CDP, falling back to
   * Playwright's own screenshot when the CDP call fails.
   *
   * @returns A base64 encoded string of the PNG screenshot (no data-URI prefix).
   * @throws Error when the page has not been initialized.
   */
  async screenshot(): Promise<string> {
    if (!this._page) {
      throw new Error("Page not initialized");
    }

    try {
      // Get CDP session from the page
      const cdpSession = await this._page.context().newCDPSession(this._page);

      // Capture screenshot using CDP
      const { data } = await cdpSession.send("Page.captureScreenshot", {
        format: "png",
        fromSurface: true,
      });

      return data; // CDP already returns base64 encoded string
    } catch (error) {
      console.warn(
        "CDP screenshot failed, falling back to standard screenshot:",
        error
      );
      // Fall back to standard Playwright screenshot
      const buffer = await this._page.screenshot({ type: "png" });
      return buffer.toString("base64");
    }
  }

  /**
   * Refresh the current page.
   *
   * @throws Error when the page has not been initialized.
   */
  async refresh(): Promise<void> {
    if (!this._page) {
      throw new Error("Page not initialized");
    }

    await this._page.reload();
  }

  /**
   * Get the URLs of all tabs in the page's context, followed by the URL of
   * the current tab as the last element.
   *
   * @throws Error when the page has not been initialized.
   */
  async listTabs(): Promise<string[]> {
    if (!this._page) {
      throw new Error("Page not initialized");
    }

    const tabUrls = this._page.context().pages().map((tab) => tab.url());
    const currentTab = this._page.url();
    return [...tabUrls, currentTab];
  }

  /**
   * Change to the first tab whose URL equals `tabUrl` and make it the active page.
   *
   * @throws Error when the page has not been initialized or no tab matches.
   */
  async changeTab(tabUrl: string): Promise<void> {
    if (!this._page) {
      throw new Error("Page not initialized");
    }

    const tab = this._page.context().pages().find((t) => t.url() === tabUrl);
    if (!tab) {
      throw new Error(`Tab with URL ${tabUrl} not found`);
    }
    await tab.bringToFront();
    this._page = tab;
  }
}

// --------------------------------------------------------------------------------