├── .gitignore ├── LICENSE ├── README.md ├── assets ├── flight-schedule.gif └── hyperagent-banner.png ├── cli.sh ├── eslint.config.mjs ├── evals ├── WebVoyager_data.jsonl └── WebVoyager_reference.json ├── examples ├── browser-providers │ └── hyperbrowser.ts ├── custom-tool │ ├── search │ │ └── exa.ts │ └── wikipedia-random-article │ │ └── run-custom-tool.ts ├── llms │ ├── anthropic.ts │ └── openai.ts ├── mcp │ ├── google-sheets │ │ ├── best-buy-reviews.ts │ │ ├── car-price-comparison.ts │ │ └── most-populated-states.ts │ ├── notion │ │ └── create-shopping-list.ts │ └── weather │ │ ├── get-weather-alert.ts │ │ └── servers │ │ └── weather-server.js ├── output-to-schema │ └── output-to-schema.ts └── simple │ └── add-to-amazon-cart.ts ├── package.json ├── scripts ├── run-webvoyager-eval.ts ├── test-async.ts ├── test-extract.ts ├── test-page-ai.ts ├── test-variables.ts └── test.ts ├── src ├── agent │ ├── actions │ │ ├── click-element.ts │ │ ├── complete-validator.ts │ │ ├── complete-with-output-schema.ts │ │ ├── complete.ts │ │ ├── extract.ts │ │ ├── go-to-url.ts │ │ ├── index.ts │ │ ├── input-text.ts │ │ ├── key-press.ts │ │ ├── page-back.ts │ │ ├── page-forward.ts │ │ ├── pdf.ts │ │ ├── refresh-page.ts │ │ ├── scroll.ts │ │ ├── select-option.ts │ │ ├── thinking.ts │ │ └── utils.ts │ ├── error.ts │ ├── index.ts │ ├── llms │ │ └── structured-output.ts │ ├── mcp │ │ └── client.ts │ ├── messages │ │ ├── builder.ts │ │ ├── examples-actions.ts │ │ ├── input-format.ts │ │ ├── output-format.ts │ │ ├── system-prompt.ts │ │ └── utils.ts │ └── tools │ │ ├── agent.ts │ │ └── types.ts ├── browser-providers │ ├── hyperbrowser.ts │ ├── index.ts │ └── local.ts ├── cli │ └── index.ts ├── context-providers │ └── dom │ │ ├── build-dom-view.ts │ │ ├── builder.ts │ │ ├── const.ts │ │ ├── elem-interactive.ts │ │ ├── find-interactive-elements.ts │ │ ├── get-css-path.ts │ │ ├── get-x-path.ts │ │ ├── highlight.ts │ │ ├── index.ts │ │ ├── inject │ │ ├── build-dom-view-script.js │ │ └── 
build-dom-view.ts │ │ ├── types.ts │ │ └── window-type.ts ├── custom-actions │ ├── index.ts │ └── user-interaction.ts ├── index.ts ├── types │ ├── agent │ │ ├── actions │ │ │ └── types.ts │ │ └── types.ts │ ├── browser-providers │ │ └── types.ts │ ├── config.ts │ └── index.ts └── utils │ ├── error-emitter.ts │ ├── html-to-markdown.ts │ ├── index.ts │ ├── retry.ts │ └── sleep.ts ├── tsconfig.json └── yarn.lock /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | tmp 10 | debug 11 | 12 | # Diagnostic reports (https://nodejs.org/api/report.html) 13 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 14 | 15 | # Runtime data 16 | pids 17 | *.pid 18 | *.seed 19 | *.pid.lock 20 | 21 | .DS_Store 22 | 23 | # Directory for instrumented libs generated by jscoverage/JSCover 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | coverage 28 | *.lcov 29 | 30 | # nyc test coverage 31 | .nyc_output 32 | 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 34 | .grunt 35 | 36 | # Bower dependency directory (https://bower.io/) 37 | bower_components 38 | 39 | # node-waf configuration 40 | .lock-wscript 41 | 42 | # Compiled binary addons (https://nodejs.org/api/addons.html) 43 | build/Release 44 | 45 | # Dependency directories 46 | node_modules/ 47 | jspm_packages/ 48 | 49 | # Snowpack dependency directory (https://snowpack.dev/) 50 | web_modules/ 51 | 52 | # TypeScript cache 53 | *.tsbuildinfo 54 | 55 | # Optional npm cache directory 56 | .npm 57 | 58 | # Optional eslint cache 59 | .eslintcache 60 | 61 | # Optional stylelint cache 62 | .stylelintcache 63 | 64 | # Microbundle cache 65 | .rpt2_cache/ 66 | .rts2_cache_cjs/ 67 | .rts2_cache_es/ 68 | .rts2_cache_umd/ 69 | 70 | # Optional REPL history 71 | .node_repl_history 72 | 73 | # Output of 'npm pack' 74 | 
*.tgz 75 | 76 | # Yarn Integrity file 77 | .yarn-integrity 78 | 79 | # dotenv environment variable files 80 | .env 81 | .env.development.local 82 | .env.test.local 83 | .env.production.local 84 | .env.local 85 | 86 | # parcel-bundler cache (https://parceljs.org/) 87 | .cache 88 | .parcel-cache 89 | 90 | # Next.js build output 91 | .next 92 | out 93 | 94 | # Nuxt.js build / generate output 95 | .nuxt 96 | dist 97 | 98 | # Gatsby files 99 | .cache/ 100 | # Comment in the public line in if your project uses Gatsby and not Next.js 101 | # https://nextjs.org/blog/next-9-1#public-directory-support 102 | # public 103 | 104 | # vuepress build output 105 | .vuepress/dist 106 | 107 | # vuepress v2.x temp and cache directory 108 | .temp 109 | .cache 110 | 111 | # Docusaurus cache and generated files 112 | .docusaurus 113 | 114 | # Serverless directories 115 | .serverless/ 116 | 117 | # FuseBox cache 118 | .fusebox/ 119 | 120 | # DynamoDB Local files 121 | .dynamodb/ 122 | 123 | # TernJS port file 124 | .tern-port 125 | 126 | # Stores VSCode versions used for testing VSCode extensions 127 | .vscode-test 128 | 129 | # yarn v2 130 | .yarn/cache 131 | .yarn/unplugged 132 | .yarn/build-state.yml 133 | .yarn/install-state.gz 134 | .pnp.* 135 | 136 | .ignore 137 | extensions 138 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (c) 2025 S2 Labs Inc. 5 | 6 | This program is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU Affero General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU Affero General Public License for more details. 15 | 16 | You should have received a copy of the GNU Affero General Public License 17 | along with this program. If not, see <https://www.gnu.org/licenses/>. 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Hyperagent Banner 3 | 4 |

5 | Intelligent Browser Automation with LLMs 6 |

7 | 8 |

9 | 10 | npm version 11 | 12 | 13 | license 14 | 15 | 16 | Discord 17 | 18 | 19 | X (formerly Twitter) Follow 20 | 21 |

22 |
23 | 24 | ## Overview 25 | 26 | Hyperagent is Playwright supercharged with AI. No more brittle scripts, just powerful natural language commands. 27 | Just looking for scalable headless browsers or scraping infra? Go to [Hyperbrowser](https://app.hyperbrowser.ai/) to get started for free! 28 | 29 | ### Features 30 | 31 | - 🤖 **AI Commands**: Simple APIs like `page.ai()`, `page.extract()` and `executeTask()` for any AI automation 32 | - ⚡ **Fallback to Regular Playwright**: Use regular Playwright when AI isn't needed 33 | - 🥷 **Stealth Mode** – Avoid detection with built-in anti-bot patches 34 | - ☁️ **Cloud Ready** – Instantly scale to hundreds of sessions via [Hyperbrowser](https://app.hyperbrowser.ai/) 35 | - 🔌 **MCP Client** – Connect to tools like Composio for full workflows (e.g. writing web data to Google Sheets) 36 | 37 | ## Quick Start 38 | 39 | ### Installation 40 | 41 | ```bash 42 | # Using npm 43 | npm install @hyperbrowser/agent 44 | 45 | # Using yarn 46 | yarn add @hyperbrowser/agent 47 | ``` 48 | 49 | ### CLI 50 | 51 | ```bash 52 | $ npx @hyperbrowser/agent -c "Find a route from Miami to New Orleans, and provide the detailed route information." 53 | ``` 54 | 55 |

56 | Hyperagent Demo 57 |

58 | 59 | The CLI supports options for debugging or using hyperbrowser instead of a local browser 60 | 61 | ```bash 62 | -d, --debug Enable debug mode 63 | -c, --command Command to run 64 | --hyperbrowser Use Hyperbrowser for the browser provider 65 | ``` 66 | 67 | ### Library 68 | 69 | ```typescript 70 | import { HyperAgent } from "@hyperbrowser/agent"; 71 | import { ChatOpenAI } from "@langchain/openai"; 72 | import { z } from "zod"; 73 | 74 | // Initialize the agent 75 | const agent = new HyperAgent({ 76 | llm: new ChatOpenAI({ 77 | openAIApiKey: process.env.OPENAI_API_KEY, 78 | modelName: "gpt-4o", 79 | }), 80 | }); 81 | 82 | // Execute a task 83 | const result = await agent.executeTask( 84 | "Navigate to amazon.com, search for 'laptop', and extract the prices of the first 5 results" 85 | ); 86 | console.log(result.output); 87 | 88 | // Use page.ai and page.extract 89 | const page = await agent.newPage(); 90 | await page.goto("https://flights.google.com", { waitUntil: "load" }); 91 | await page.ai("search for flights from Rio to LAX from July 16 to July 22"); 92 | const res = await page.extract( 93 | "give me the flight options", 94 | z.object({ 95 | flights: z.array( 96 | z.object({ 97 | price: z.number(), 98 | departure: z.string(), 99 | arrival: z.string(), 100 | }) 101 | ), 102 | }) 103 | ); 104 | console.log(res); 105 | 106 | // Clean up 107 | await agent.closeAgent(); 108 | ``` 109 | 110 | ## ☁️ Cloud 111 | 112 | You can scale HyperAgent with cloud headless browsers using Hyperbrowser 113 | 114 | 1. Get a free api key from [Hyperbrowser](https://app.hyperbrowser.ai/) 115 | 2. Add it to your env as `HYPERBROWSER_API_KEY` 116 | 3. 
Set your `browserProvider` to `"Hyperbrowser"` 117 | 118 | ```typescript 119 | const agent = new HyperAgent({ 120 | browserProvider: "Hyperbrowser", 121 | }); 122 | 123 | const response = await agent.executeTask( 124 | "Go to hackernews, and list me the 5 most recent article titles" 125 | ); 126 | 127 | console.log(response); 128 | await agent.closeAgent(); 129 | ``` 130 | 131 | ## Usage Guide 132 | 133 | ### Multi-Page Management 134 | 135 | ```typescript 136 | // Create and manage multiple pages 137 | const page1 = await agent.newPage(); 138 | const page2 = await agent.newPage(); 139 | 140 | // Execute tasks on specific pages 141 | const page1Response = await page1.ai( 142 | "Go to google.com/travel/explore and set the starting location to New York. Then, return to me the first recommended destination that shows up. Return to me only the name of the location." 143 | ); 144 | const page2Response = await page2.ai( 145 | `I want to plan a trip to ${page1Response.output}. Recommend me places to visit there.` 146 | ); 147 | 148 | console.log(page2Response.output); 149 | 150 | // Get all active pages 151 | const pages = await agent.getPages(); 152 | await agent.closeAgent(); 153 | ``` 154 | 155 | ## Customization 156 | 157 | ### Output Schema Definition 158 | 159 | HyperAgent can extract data in a specified schema. 
The schema can be passed in at a per-task level. 160 | 161 | ```typescript 162 | import { z } from "zod"; 163 | 164 | const agent = new HyperAgent(); 165 | const agentResponse = await agent.executeTask( 166 | "Navigate to imdb.com, search for 'The Matrix', and extract the director, release year, and rating", 167 | { 168 | outputSchema: z.object({ 169 | director: z.string().describe("The name of the movie director"), 170 | releaseYear: z.number().describe("The year the movie was released"), 171 | rating: z.string().describe("The IMDb rating of the movie"), 172 | }), 173 | } 174 | ); 175 | console.log(agentResponse.output); 176 | await agent.closeAgent(); 177 | ``` 178 | 179 | ```json 180 | { 181 | "director": "Lana Wachowski, Lilly Wachowski", 182 | "releaseYear": 1999, 183 | "rating": "8.7/10" 184 | } 185 | ``` 186 | 187 | ### Using Different LLM Providers 188 | 189 | Hyperagent supports multiple LLM providers. A provider can be anything that extends the LangChain `BaseChatModel` class. 190 | 191 | ```typescript 192 | // Using OpenAI 193 | const agent = new HyperAgent({ 194 | llm: new ChatOpenAI({ 195 | openAIApiKey: process.env.OPENAI_API_KEY, 196 | modelName: "gpt-4o", 197 | }), 198 | }); 199 | 200 | // Using Anthropic's Claude 201 | const agent = new HyperAgent({ 202 | llm: new ChatAnthropic({ 203 | anthropicApiKey: process.env.ANTHROPIC_API_KEY, 204 | modelName: "claude-3-7-sonnet-latest", 205 | }), 206 | }); 207 | ``` 208 | 209 | ### MCP Support 210 | 211 | HyperAgent is a fully functional MCP client. For best results, we recommend using 212 | `gpt-4o` as your LLM. 213 | 214 | Here is an example which reads from Wikipedia and inserts the information into a Google Sheet using the Composio Google Sheets MCP. 
For the full example, see [here](https://github.com/hyperbrowserai/HyperAgent/tree/main/examples/mcp/google-sheets/most-populated-states.ts) 215 | 216 | ```typescript 217 | const agent = new HyperAgent({ 218 | llm: llm, 219 | debug: true, 220 | }); 221 | 222 | await agent.initializeMCPClient({ 223 | servers: [ 224 | { 225 | command: "npx", 226 | args: [ 227 | "@composio/mcp@latest", 228 | "start", 229 | "--url", 230 | "https://mcp.composio.dev/googlesheets/...", 231 | ], 232 | env: { 233 | npm_config_yes: "true", 234 | }, 235 | }, 236 | ], 237 | }); 238 | 239 | const response = await agent.executeTask( 240 | "Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and get the data on the top 5 most populous states from the table. Then insert that data into a google sheet. You may need to first check if there is an active connection to google sheet, and if there isn't connect to it and present me with the link to sign in. " 241 | ); 242 | 243 | console.log(response); 244 | await agent.closeAgent(); 245 | ``` 246 | 247 | ### Custom Actions 248 | 249 | HyperAgent's capabilities can be extended with custom actions. Custom actions require 3 things: 250 | 251 | - type: Name of the action. Should be something descriptive about the action. 252 | - actionParams: A zod object describing the parameters that the action may consume. 253 | - run: A function that takes in a context, and the params for the action and produces a result based on the params. 254 | 255 | Here is an example that performs a search using Exa 256 | 257 | ```typescript 258 | const exaInstance = new Exa(process.env.EXA_API_KEY); 259 | 260 | export const RunSearchActionDefinition: AgentActionDefinition = { 261 | type: "perform_search", 262 | actionParams: z.object({ 263 | search: z 264 | .string() 265 | .describe( 266 | "The search query for something you want to search about. Keep the search query concise and to-the-point." 
267 | ), 268 | }).describe("Search and return the results for a given query."), 269 | run: async function ( 270 | ctx: ActionContext, 271 | params: z.infer 272 | ): Promise { 273 | const results = (await exaInstance.search(params.search, {})).results 274 | .map( 275 | (res) => 276 | `title: ${res.title} || url: ${res.url} || relevance: ${res.score}` 277 | ) 278 | .join("\n"); 279 | 280 | return { 281 | success: true, 282 | message: `Successfully performed search for query ${params.search}. Got results: \n${results}`, 283 | }; 284 | }, 285 | }; 286 | 287 | const agent = new HyperAgent({ 288 | customActions: [RunSearchActionDefinition], 289 | }); 290 | await agent.executeTask("Search about the news for today in New York"); 291 | ``` 292 | 293 | ## Contributing 294 | 295 | We welcome contributions to Hyperagent! Here's how you can help: 296 | 297 | 1. Fork the repository 298 | 2. Create your feature branch (`git checkout -b feature/AmazingFeature`) 299 | 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`) 300 | 4. Push to the branch (`git push origin feature/AmazingFeature`) 301 | 5. 
Open a Pull Request 302 | 303 | ## Support 304 | 305 | - 📚 [Documentation](https://docs.hyperbrowser.ai/hyperagent/about-hyperagent) 306 | - 💬 [Discord Community](https://discord.gg/zsYzsgVRjh) 307 | - 🐛 [Issue Tracker](https://github.com/hyperbrowserai/HyperAgent/issues) 308 | - 📧 [Email Support](mailto:info@hyperbrowser.ai) 309 | -------------------------------------------------------------------------------- /assets/flight-schedule.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperbrowserai/HyperAgent/138076315fc49580c6955f2de6ce231a490be394/assets/flight-schedule.gif -------------------------------------------------------------------------------- /assets/hyperagent-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyperbrowserai/HyperAgent/138076315fc49580c6955f2de6ce231a490be394/assets/hyperagent-banner.png -------------------------------------------------------------------------------- /cli.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_DIR="$(cd "$(dirname $(realpath "$0"))" && pwd)" 2 | NODE_OPTIONS="--no-deprecation" node "$SCRIPT_DIR/dist/cli/index.js" "$@" -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import typescriptEslint from "@typescript-eslint/eslint-plugin"; 2 | import tsParser from "@typescript-eslint/parser"; 3 | import path from "node:path"; 4 | import { fileURLToPath } from "node:url"; 5 | import js from "@eslint/js"; 6 | import { FlatCompat } from "@eslint/eslintrc"; 7 | 8 | const __filename = fileURLToPath(import.meta.url); 9 | const __dirname = path.dirname(__filename); 10 | const compat = new FlatCompat({ 11 | baseDirectory: __dirname, 12 | recommendedConfig: js.configs.recommended, 13 | allConfig: 
js.configs.all 14 | }); 15 | 16 | export default [ 17 | ...compat.extends("eslint:recommended", "plugin:@typescript-eslint/recommended", "prettier"), 18 | { 19 | plugins: { 20 | "@typescript-eslint": typescriptEslint, 21 | }, 22 | 23 | languageOptions: { 24 | parser: tsParser, 25 | }, 26 | }, 27 | ]; -------------------------------------------------------------------------------- /examples/browser-providers/hyperbrowser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Hyperbrowser Provider Example 3 | * 4 | * This example demonstrates how to configure and use HyperAgent with the Hyperbrowser 5 | * provider for web browsing tasks with proxy support. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a simple web search task that: 10 | * 1. Configures HyperAgent with Hyperbrowser-specific settings 11 | * 2. Enables proxy support for enhanced privacy and reliability 12 | * 3. Searches for and extracts specific information about a movie release date 13 | * 14 | * ## Prerequisites 15 | * 16 | * 1. Node.js environment 17 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 18 | * 19 | * ## Running the Example 20 | * 21 | * ```bash 22 | * yarn ts-node examples/browser-providers/hyperbrowser.ts 23 | * ``` 24 | */ 25 | 26 | import "dotenv/config"; 27 | import { HyperAgent } from "@hyperbrowser/agent"; 28 | import { ChatOpenAI } from "@langchain/openai"; 29 | import chalk from "chalk"; 30 | 31 | async function runEval() { 32 | const llm = new ChatOpenAI({ 33 | apiKey: process.env.OPENAI_API_KEY, 34 | model: "gpt-4o", 35 | }); 36 | 37 | const agent = new HyperAgent({ 38 | llm: llm, 39 | debug: true, 40 | browserProvider: "Hyperbrowser", 41 | hyperbrowserConfig: { 42 | hyperbrowserSessionOptions: { 43 | useProxy: true, 44 | }, 45 | }, 46 | }); 47 | const result = await agent.executeTask( 48 | "Find the initial release date for Guardians of the Galaxy Vol. 
3 the movie", 49 | { 50 | debugOnAgentOutput: (agentOutput) => { 51 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 52 | console.dir(agentOutput, { depth: null, colors: true }); 53 | console.log(chalk.cyan.bold("===============") + "\n"); 54 | }, 55 | onStep: (step) => { 56 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 57 | console.dir(step, { depth: null, colors: true }); 58 | console.log(chalk.cyan.bold("===============") + "\n"); 59 | }, 60 | } 61 | ); 62 | await agent.closeAgent(); 63 | console.log(chalk.green.bold("\nResult:")); 64 | console.log(chalk.white(result.output)); 65 | return result; 66 | } 67 | 68 | (async () => { 69 | await runEval(); 70 | })().catch((error) => { 71 | console.error(chalk.red("Error:"), error); 72 | process.exit(1); 73 | }); 74 | -------------------------------------------------------------------------------- /examples/custom-tool/search/exa.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Custom Search Tool Example with Exa 3 | * 4 | * This example demonstrates how to create and use a custom search tool with HyperAgent 5 | * using the Exa search API to perform web searches and process the results. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that showcases custom tool integration: 10 | * 1. Defines a custom search action using the Exa API 11 | * 2. Creates a schema for the search parameters using Zod 12 | * 3. Implements a search function that returns formatted results with titles, URLs, and relevance scores 13 | * 4. 
Demonstrates the tool usage with a complex travel planning task for Tokyo that: 14 | * - Searches for relevant information about Tokyo attractions 15 | * - Analyzes search results and filters for relevance 16 | * - Navigates to selected URLs to extract detailed information 17 | * - Compiles recommendations based on uniqueness and frequency 18 | * 19 | * ## Prerequisites 20 | * 21 | * 1. Node.js environment 22 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 23 | * 3. Exa API key set in your .env file (EXA_API_KEY) 24 | * 25 | * ## Custom Tool Configuration 26 | * 27 | * The example includes: 28 | * - Custom search action definition with Zod schema validation 29 | * - Integration with Exa search API 30 | * - Formatted result output with relevance scoring 31 | * 32 | * ## Running the Example 33 | * 34 | * ```bash 35 | * yarn ts-node -r tsconfig-paths/register examples/custom-tool/search/exa.ts 36 | * ``` 37 | * 38 | * ## Example Output 39 | * 40 | * The final output will include a detailed trip plan for Tokyo based on 41 | * searched and analyzed web content, with recommended places and their details. 42 | */ 43 | 44 | import "dotenv/config"; 45 | import HyperAgent from "@hyperbrowser/agent"; 46 | import { 47 | AgentActionDefinition, 48 | ActionContext, 49 | ActionOutput, 50 | } from "@hyperbrowser/agent/types"; 51 | import chalk from "chalk"; 52 | import { ChatOpenAI } from "@langchain/openai"; 53 | import Exa from "exa-js"; 54 | 55 | import * as z from "zod"; 56 | 57 | const exaInstance = new Exa(process.env.EXA_API_KEY); 58 | 59 | const searchSchema = z 60 | .object({ 61 | search: z 62 | .string() 63 | .describe( 64 | "The search query for something you want to search about. Keep the search query concise and to-the-point." 
65 | ), 66 | }) 67 | .describe("Search and return the results for a given query."); 68 | 69 | export const RunSearchActionDefinition: AgentActionDefinition = { 70 | type: "perform_search", 71 | actionParams: searchSchema, 72 | run: async function ( 73 | ctx: ActionContext, 74 | params: z.infer 75 | ): Promise { 76 | const results = (await exaInstance.search(params.search, {})).results 77 | .map( 78 | (res) => 79 | `title: ${res.title} || url: ${res.url} || relevance: ${res.score}` 80 | ) 81 | .join("\n"); 82 | 83 | return { 84 | success: true, 85 | message: `Succesfully performed search for query ${params.search}. Got results: \n${results}`, 86 | }; 87 | }, 88 | }; 89 | 90 | async function runEval() { 91 | console.log(chalk.cyan.bold("\n===== Running Custom Tool Example =====")); 92 | 93 | const llm = new ChatOpenAI({ 94 | apiKey: process.env.OPENAI_API_KEY, 95 | model: "gpt-4o", 96 | }); 97 | 98 | const agent = new HyperAgent({ 99 | llm: llm, 100 | debug: true, 101 | customActions: [RunSearchActionDefinition], 102 | }); 103 | 104 | const result = await agent.executeTask( 105 | `Make me a trip plan for Tokyo. 106 | Steps: 107 | 108 | - Peform search about the place and things to see there using the 'perform_search' tool. 109 | - Analyze part of the urls provided, filtering results for relevance, and information and collecting a subset of urls that you think warrant further examination. 110 | - For each page that you've 111 | - Navigate to that url 112 | - Extract information about trip recommendations 113 | - You must do this in order. Navigate to a single page, and then perform extraction on that page. Do not perform multiple navigations one after another. 114 | - Narrow down on places based on uniqueness, frequency of recommendation, and whatever else you feel is valuable. 
115 | - Return to me a list of places you recommend, and their details (if any)`, 116 | { 117 | debugOnAgentOutput: (agentOutput) => { 118 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 119 | console.dir(agentOutput, { depth: null, colors: true }); 120 | console.log(chalk.cyan.bold("===============") + "\n"); 121 | }, 122 | onStep: (step) => { 123 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 124 | console.dir(step, { depth: null, colors: true }); 125 | console.log(chalk.cyan.bold("===============") + "\n"); 126 | }, 127 | } 128 | ); 129 | await agent.closeAgent(); 130 | console.log(chalk.green.bold("\nResult:")); 131 | console.log(chalk.white(result.output)); 132 | return result; 133 | } 134 | 135 | (async () => { 136 | await runEval(); 137 | })().catch((error) => { 138 | console.error(chalk.red("Error:"), error); 139 | process.exit(1); 140 | }); 141 | -------------------------------------------------------------------------------- /examples/custom-tool/wikipedia-random-article/run-custom-tool.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Custom Wikipedia Random Article Tool Example 3 | * 4 | * This example demonstrates how to create a simple custom tool for HyperAgent 5 | * that navigates to random Wikipedia articles and extracts their content. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a straightforward task using a custom tool that: 10 | * 1. Defines a custom action to navigate to Wikipedia's random article page 11 | * 2. Retrieves the page title and URL 12 | * 3. Extracts and describes the content of the randomly selected article 13 | * 14 | * ## Prerequisites 15 | * 16 | * 1. Node.js environment 17 | * 2. 
OpenAI API key set in your .env file (OPENAI_API_KEY) 18 | * 19 | * ## Running the Example 20 | * 21 | * ```bash 22 | * yarn ts-node -r tsconfig-paths/register examples/custom-tool/wikipedia-random-article/run-custom-tool.ts 23 | * ``` 24 | */ 25 | 26 | import "dotenv/config"; 27 | import { HyperAgent } from "@hyperbrowser/agent"; 28 | import { 29 | AgentActionDefinition, 30 | ActionContext, 31 | ActionOutput, 32 | } from "@hyperbrowser/agent/types"; 33 | import chalk from "chalk"; 34 | import { ChatOpenAI } from "@langchain/openai"; 35 | 36 | import * as z from "zod"; 37 | 38 | export const GoToWikipediaActionDefinition: AgentActionDefinition = { 39 | type: "go_to_random_wikipedia_page", 40 | actionParams: z 41 | .object({}) 42 | .describe( 43 | "Navigate to a random wikipedia page and return the title and url of the page." 44 | ), 45 | run: async function (ctx: ActionContext): Promise { 46 | await ctx.page.goto("https://en.wikipedia.org/wiki/Special:Random", { 47 | waitUntil: "domcontentloaded", 48 | }); 49 | 50 | const url = ctx.page.url(); 51 | const title = await ctx.page.title(); 52 | return { 53 | success: true, 54 | message: `Succesfully navigated to URL: ${url} and title: ${title}`, 55 | }; 56 | }, 57 | }; 58 | 59 | async function runEval() { 60 | console.log(chalk.cyan.bold("\n===== Running Custom Tool Example =====")); 61 | 62 | const llm = new ChatOpenAI({ 63 | apiKey: process.env.OPENAI_API_KEY, 64 | model: "gpt-4o", 65 | }); 66 | 67 | const agent = new HyperAgent({ 68 | llm: llm, 69 | debug: true, 70 | customActions: [GoToWikipediaActionDefinition], 71 | }); 72 | 73 | const result = await agent.executeTask( 74 | "Navigate to a random wikipedia page, and describe to me the contents of that page.", 75 | { 76 | debugOnAgentOutput: (agentOutput) => { 77 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 78 | console.dir(agentOutput, { depth: null, colors: true }); 79 | console.log(chalk.cyan.bold("===============") + "\n"); 80 | }, 81 | 
onStep: (step) => { 82 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 83 | console.dir(step, { depth: null, colors: true }); 84 | console.log(chalk.cyan.bold("===============") + "\n"); 85 | }, 86 | } 87 | ); 88 | await agent.closeAgent(); 89 | console.log(chalk.green.bold("\nResult:")); 90 | console.log(chalk.white(result.output)); 91 | return result; 92 | } 93 | 94 | (async () => { 95 | await runEval(); 96 | })().catch((error) => { 97 | console.error(chalk.red("Error:"), error); 98 | process.exit(1); 99 | }); 100 | -------------------------------------------------------------------------------- /examples/llms/anthropic.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Anthropic LLM Integration Example 3 | * 4 | * This example demonstrates how to configure and use HyperAgent with Anthropic's 5 | * Claude language models for web automation tasks. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a web scraping task that: 10 | * 1. Configures HyperAgent with Anthropic's Claude 3.7 Sonnet model 11 | * 2. Navigates to Hacker News 12 | * 3. Searches for and extracts information about "Show HN" posts 13 | * 14 | * ## Prerequisites 15 | * 16 | * 1. Node.js environment 17 | * 2. Anthropic API key set in your .env file (ANTHROPIC_API_KEY) 18 | * 19 | * ## Running the Example 20 | * 21 | * ```bash 22 | * yarn ts-node -r tsconfig-paths/register examples/llms/anthropic.ts 23 | * ``` 24 | */ 25 | 26 | import "dotenv/config"; 27 | import HyperAgent from "@hyperbrowser/agent"; 28 | 29 | import chalk from "chalk"; 30 | import { ChatAnthropic } from "@langchain/anthropic"; 31 | 32 | const TASK = 33 | "Go to hackernews, and find if there's any SHOW HN post up there. 
If it is, then tell me the title of the post."; 34 | 35 | async function runEval() { 36 | const llm = new ChatAnthropic({ 37 | apiKey: process.env.ANTHROPIC_API_KEY, 38 | model: "claude-3-7-sonnet-latest", 39 | }); 40 | 41 | const agent = new HyperAgent({ 42 | llm: llm, 43 | }); 44 | 45 | console.log(`\n${chalk.green("Running agent with Claude Sonnet 3.7")}\n`); 46 | 47 | const result = await agent.executeTask(TASK, { 48 | debugOnAgentOutput: (agentOutput) => { 49 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 50 | console.dir(agentOutput, { depth: null, colors: true }); 51 | console.log(chalk.cyan.bold("===============") + "\n"); 52 | }, 53 | onStep: (step) => { 54 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 55 | console.dir(step, { depth: null, colors: true }); 56 | console.log(chalk.cyan.bold("===============") + "\n"); 57 | }, 58 | }); 59 | await agent.closeAgent(); 60 | console.log(chalk.green.bold("\nResult:")); 61 | console.log(chalk.white(result.output)); 62 | return result; 63 | } 64 | 65 | (async () => { 66 | await runEval(); 67 | })().catch((error) => { 68 | console.error(chalk.red("Error:"), error); 69 | process.exit(1); 70 | }); 71 | -------------------------------------------------------------------------------- /examples/llms/openai.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # OpenAI LLM Integration Example 3 | * 4 | * This example demonstrates how to configure and use HyperAgent with OpenAI's 5 | * language models for web automation tasks. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a web scraping task that: 10 | * 1. Configures HyperAgent with OpenAI's GPT-4o model 11 | * 2. Navigates to Hacker News 12 | * 3. Searches for and extracts information about "Show HN" posts 13 | * 14 | * ## Prerequisites 15 | * 16 | * 1. Node.js environment 17 | * 2. 
OpenAI API key set in your .env file (OPENAI_API_KEY) 18 | * 19 | * ## Running the Example 20 | * 21 | * ```bash 22 | * yarn ts-node -r tsconfig-paths/register examples/llms/openai.ts 23 | * ``` 24 | */ 25 | 26 | import "dotenv/config"; 27 | import HyperAgent from "@hyperbrowser/agent"; 28 | 29 | import chalk from "chalk"; 30 | import { ChatOpenAI } from "@langchain/openai"; 31 | 32 | const TASK = 33 | "Go to hackernews, and find if there's any SHOW HN post up there. If it is, then tell me the title of the post."; 34 | 35 | async function runEval() { 36 | const llm = new ChatOpenAI({ 37 | apiKey: process.env.OPENAI_API_KEY, 38 | model: "gpt-4o", 39 | }); 40 | 41 | const agent = new HyperAgent({ 42 | llm: llm, 43 | debug: true, 44 | }); 45 | 46 | console.log(`\n${chalk.green("Running agent with GPT-4o")}\n`); 47 | 48 | const result = await agent.executeTask(TASK, { 49 | debugOnAgentOutput: (agentOutput) => { 50 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 51 | console.dir(agentOutput, { depth: null, colors: true }); 52 | console.log(chalk.cyan.bold("===============") + "\n"); 53 | }, 54 | onStep: (step) => { 55 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 56 | console.dir(step, { depth: null, colors: true }); 57 | console.log(chalk.cyan.bold("===============") + "\n"); 58 | }, 59 | }); 60 | await agent.closeAgent(); 61 | console.log(chalk.green.bold("\nResult:")); 62 | console.log(chalk.white(result.output)); 63 | return result; 64 | } 65 | 66 | (async () => { 67 | await runEval(); 68 | })().catch((error) => { 69 | console.error(chalk.red("Error:"), error); 70 | process.exit(1); 71 | }); 72 | -------------------------------------------------------------------------------- /examples/mcp/google-sheets/best-buy-reviews.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Google Sheets MCP Server Example 3 | * 4 | * This example demonstrates how to use HyperAgent with the 
Composio Googlesheets MCP server 5 | * to connect to Google Sheets, create a new spreadsheet, and populate it with data scraped from the web. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that requires web browsing and Google Sheets integration: 10 | * 1. Checks if there is an active connection to Composio Googlesheets MCP server 11 | * 2. If no connection exists, initiates a connection and waits for the user to authenticate 12 | * 3. Creates a new spreadsheet titled "BestBuy Reviews" 13 | * 4. Navigates to BestBuy to gather data on the reviews for the MacBook Air M2 14 | * 5. Adds the data to the created spreadsheet 15 | * 16 | * ## Prerequisites 17 | * 18 | * 1. Node.js environment 19 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 20 | * 3. Need to have a Composio account, can sign up at https://app.composio.dev 21 | * - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets 22 | * - You will use the url to run the script, for example: 23 | * ``` 24 | * yarn ts-node -r tsconfig-paths/register examples/mcp/google-sheets/best-buy-reviews.ts <your-mcp-url> 25 | * ``` 26 | * - When running for the first time, there will be no active connection so you will need to login 27 | * with Google OAUTH at the link provided by the agent to authenticate 28 | * 29 | * ## MCP Server Configuration 30 | * 31 | * This example uses the Composio Googlesheets MCP server which provides tools for: 32 | * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets 33 | * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets 34 | * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet 35 | * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format 36 | * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet 37 | * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: 
Retrieves information about a spreadsheet 38 | * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in a spreadsheet 39 | * - `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet 40 | * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet 41 | * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet 42 | * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations 43 | * 44 | * ## Debugging and Monitoring 45 | * 46 | * The example includes callback functions to monitor: 47 | * - Agent output: Raw output from the LLM agent 48 | * - Step execution: Each step the agent takes during the task 49 | * 50 | * ## Running the Example 51 | * 52 | * ``` 53 | * yarn ts-node examples/mcp/google-sheets/best-buy-reviews.ts 54 | * ``` 55 | * 56 | * ## Example Output 57 | * 58 | * The final output will include confirmation that the agent has successfully created a new Google Sheet 59 | * and populated it with information about the reviews for the MacBook Air M2. 60 | */ 61 | 62 | import dotenv from "dotenv"; 63 | import chalk from "chalk"; 64 | import { ChatOpenAI } from "@langchain/openai"; 65 | import HyperAgent from "@hyperbrowser/agent"; 66 | 67 | dotenv.config(); 68 | 69 | const TASK = `1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection. 70 | 2. If there is an active connection, go to 4. Otherwise, go to 3. 71 | 3. Run GOOGLESHEETS_INITIATE_CONNECTION and output the the auth link to the user, then wait for the connection to be active. 72 | 4. Create a new spreadsheet titled "BestBuy Reviews". 73 | 5. Go to https://www.bestbuy.com/site/apple-macbook-air-13-inch-apple-m2-chip-built-for-apple-intelligence-16gb-memory-256gb-ssd-midnight/6602763.p?skuId=6602763 . 74 | 6. Scroll down until you see the "See All Customer Reviews" button and click on the button. 75 | 7. Once on the next page, get all the reviews from the first page. 
76 | 8. Add the reviews to the "BestBuy Reviews" spreadsheet. Include these columns and data for all the columns: Review Title, Rating, Review Text, Verified Purchase, and Review Date. 77 | Make sure that the data is well formatted and the columns are all there, make sure to not cut off any of the full review text, please include it all and to get all the reviews.`; 78 | 79 | async function run(mcpUrl: string) { 80 | console.log(chalk.cyan.bold("\n===== Running Task =====")); 81 | console.log(chalk.white(`Task: ${TASK}`)); 82 | console.log(chalk.cyan.bold("=======================\n")); 83 | 84 | console.log(chalk.yellow("Initializing OpenAI LLM...")); 85 | const llm = new ChatOpenAI({ 86 | apiKey: process.env.OPENAI_API_KEY, 87 | model: "gpt-4o", 88 | }); 89 | 90 | console.log(chalk.yellow("Creating HyperAgent...")); 91 | 92 | try { 93 | const agent = new HyperAgent({ 94 | llm: llm, 95 | debug: true, 96 | }); 97 | console.log(chalk.green("Agent created successfully")); 98 | 99 | console.log( 100 | chalk.yellow("Connecting to Composio Googlesheets MCP server...") 101 | ); 102 | await agent.initializeMCPClient({ 103 | servers: [ 104 | { 105 | command: "npx", 106 | args: ["@composio/mcp@latest", "start", "--url", mcpUrl], 107 | env: { 108 | npm_config_yes: "true", 109 | }, 110 | }, 111 | ], 112 | }); 113 | console.log( 114 | chalk.green( 115 | "Connected to Composio Googlesheets MCP server, executing task..." 
116 | ) 117 | ); 118 | 119 | const result = await agent.executeTask(TASK, { 120 | debugOnAgentOutput: (agentOutput) => { 121 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 122 | console.dir(agentOutput, { depth: null, colors: true }); 123 | console.log(chalk.cyan.bold("===============") + "\n"); 124 | }, 125 | onStep: (step) => { 126 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 127 | console.dir(step, { depth: null, colors: true }); 128 | console.log(chalk.cyan.bold("===============") + "\n"); 129 | }, 130 | }); 131 | 132 | await agent.closeAgent(); 133 | console.log(chalk.green.bold("\nResult:")); 134 | console.log(chalk.white(result.output)); 135 | return result; 136 | } catch (error) { 137 | console.error(chalk.red.bold("Error creating agent or executing task:")); 138 | console.error( 139 | chalk.red(error instanceof Error ? error.stack : String(error)) 140 | ); 141 | } 142 | } 143 | 144 | (async () => { 145 | try { 146 | if (process.argv.length < 3) { 147 | console.error( 148 | chalk.red("Error: Please provide your MCP URL as an argument") 149 | ); 150 | process.exit(1); 151 | } 152 | await run(process.argv[2]); 153 | } catch (error) { 154 | console.error(chalk.red("Error:"), error); 155 | process.exit(1); 156 | } 157 | })(); 158 | -------------------------------------------------------------------------------- /examples/mcp/google-sheets/car-price-comparison.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Car Price Comparison with Google Sheets MCP Server Example 3 | * 4 | * This example demonstrates how to use HyperAgent with the Composio Googlesheets MCP server 5 | * to connect to Google Sheets and populate it with car price comparison data scraped from multiple websites. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that requires web browsing and Google Sheets integration: 10 | * 1. 
Checks if there is an active connection to Composio Googlesheets MCP server 11 | * 2. If no connection exists, initiates a connection and waits for the user to authenticate 12 | * 3. Creates a new spreadsheet titled with the car name and current date 13 | * 4. Searches for the specified car (Toyota Corolla) on multiple car comparison sites: 14 | * - Carvana 15 | * - Carmax 16 | * 5. Collects the 5 cheapest listings from each site including details like price, mileage, model year, and trim 17 | * 6. Adds the collected data to the spreadsheet in a well-formatted manner 18 | * 19 | * ## Prerequisites 20 | * 21 | * 1. Node.js environment 22 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 23 | * 3. Need to have a Composio account, can sign up at https://app.composio.dev 24 | * - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets 25 | * - You will use the url to run the script, for example: 26 | * ``` 27 | * yarn ts-node examples/mcp/google-sheets/car-price-comparison.ts 28 | * ``` 29 | * - When running for the first time, there will be no active connection so you will need to login 30 | * with Google OAUTH at the link provided by the agent to authenticate 31 | * 32 | * ## MCP Server Configuration 33 | * 34 | * This example uses the Composio Googlesheets MCP server which provides tools for: 35 | * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets 36 | * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets 37 | * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet 38 | * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format 39 | * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet 40 | * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: Retrieves information about a spreadsheet 41 | * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in 
a spreadsheet 42 | * - `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet 43 | * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet 44 | * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet 45 | * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations 46 | * 47 | * ## Debugging and Monitoring 48 | * 49 | * The example includes callback functions to monitor: 50 | * - Agent output: Raw output from the LLM agent 51 | * - Step execution: Each step the agent takes during the task 52 | * 53 | * ## Running the Example 54 | * 55 | * ``` 56 | * yarn ts-node examples/mcp/google-sheets/car-price-comparison.ts 57 | * ``` 58 | * 59 | * ## Example Output 60 | * 61 | * The final output will include confirmation that the agent has successfully created a new Google Sheet 62 | * and populated it with information about the car prices from different websites. 63 | */ 64 | 65 | import dotenv from "dotenv"; 66 | import chalk from "chalk"; 67 | import { ChatOpenAI } from "@langchain/openai"; 68 | import HyperbrowserAgent from "@hyperbrowser/agent"; 69 | 70 | dotenv.config(); 71 | 72 | const CAR_NAME = "Toyota Corolla"; 73 | 74 | const TASK_STEPS = ` 75 | Your task is to search for a certain car, namely ${CAR_NAME} and compare it's prices across multiple car price comparison sites, namely 76 | - Carvana (https://www.carvana.com/) 77 | - Carmax (https://www.carmax.com) 78 | 79 | You will search for the results for the mentioned car on each of these websites, sort the results by lowest to highest, and then add the 5 cheapest results of each website to a google sheet. As much as possible, sort results using the websites own sort, and do not try to sort results by extraction. 80 | 81 | ## Google Sheet setup: 82 | 1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection. 83 | 2. If there is an active connection, go to 4. Otherwise, go to 3. 84 | 3. 
Run GOOGLESHEETS_INITIATE_CONNECTION and output the the auth link to the user, then wait for the connection to be active. 85 | 4. Create a new spreadsheet titled "${CAR_NAME} Comparison - {{CURRENT_DATE}}". 86 | 5. Get the results from each website, and insert the relevant data (like price, mileage, model year, model name/trim), along with the website source. 87 | 6. Add that information to the spreadsheet properly. 88 | Make sure that the data is well formatted and the columns are all there.`; 89 | 90 | async function run(mcpUrl: string) { 91 | console.log(chalk.cyan.bold("\n===== Running Task =====")); 92 | console.log(chalk.white(`Task: ${TASK_STEPS}`)); 93 | console.log(chalk.cyan.bold("=======================\n")); 94 | 95 | console.log(chalk.yellow("Initializing OpenAI LLM...")); 96 | const llm = new ChatOpenAI({ 97 | apiKey: process.env.OPENAI_API_KEY, 98 | model: "gpt-4o", 99 | }); 100 | 101 | console.log(chalk.yellow("Creating Hyperbrowser Agent...")); 102 | 103 | try { 104 | const agent = new HyperbrowserAgent({ 105 | llm: llm, 106 | debug: true, 107 | }); 108 | console.log(chalk.green("Agent created successfully")); 109 | 110 | console.log( 111 | chalk.yellow("Connecting to Composio Googlesheets MCP server...") 112 | ); 113 | await agent.initializeMCPClient({ 114 | servers: [ 115 | { 116 | command: "npx", 117 | args: ["@composio/mcp@latest", "start", "--url", mcpUrl], 118 | env: { 119 | npm_config_yes: "true", 120 | }, 121 | }, 122 | ], 123 | }); 124 | console.log( 125 | chalk.green( 126 | "Connected to Composio Googlesheets MCP server, executing task..." 
127 | ) 128 | ); 129 | 130 | const result = await agent.executeTask( 131 | `Your task is to look for a certain car on car comparisons website ${TASK_STEPS}`, 132 | { 133 | debugOnAgentOutput: (agentOutput) => { 134 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 135 | console.dir(agentOutput, { depth: null, colors: true }); 136 | console.log(chalk.cyan.bold("===============") + "\n"); 137 | }, 138 | onStep: (step) => { 139 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 140 | console.dir(step, { depth: null, colors: true }); 141 | console.log(chalk.cyan.bold("===============") + "\n"); 142 | }, 143 | } 144 | ); 145 | 146 | await agent.closeAgent(); 147 | console.log(chalk.green.bold("\nResult:")); 148 | console.log(chalk.white(result.output)); 149 | return result; 150 | } catch (error) { 151 | console.error(chalk.red.bold("Error creating agent or executing task:")); 152 | console.error( 153 | chalk.red(error instanceof Error ? error.stack : String(error)) 154 | ); 155 | } 156 | } 157 | 158 | (async () => { 159 | try { 160 | if (process.argv.length < 3) { 161 | console.error( 162 | chalk.red("Error: Please provide your MCP URL as an argument") 163 | ); 164 | process.exit(1); 165 | } 166 | await run(process.argv[2]); 167 | } catch (error) { 168 | console.error(chalk.red("Error:"), error); 169 | process.exit(1); 170 | } 171 | })(); 172 | -------------------------------------------------------------------------------- /examples/mcp/google-sheets/most-populated-states.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Google Sheets MCP Server Example 3 | * 4 | * This example demonstrates how to use HyperAgent with the Composio Googlesheets MCP server 5 | * to connect to Google Sheets, create a new spreadsheet, and populate it with data scraped from the web. 
6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that requires web browsing and Google Sheets integration: 10 | * 1. Checks if there is an active connection to Composio Googlesheets MCP server 11 | * 2. If no connection exists, initiates a connection and waits for the user to authenticate 12 | * 3. Creates a new spreadsheet titled "Most Populated States" 13 | * 4. Navigates to Wikipedia to gather data on the 5 most populous US states 14 | * 5. Adds the data to the created spreadsheet 15 | * 16 | * ## Prerequisites 17 | * 18 | * 1. Node.js environment 19 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 20 | * 3. Need to have a Composio account, can sign up at https://app.composio.dev 21 | * - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets 22 | * - You will use the url to run the script, for example: 23 | * ``` 24 | * yarn ts-node -r tsconfig-paths/register examples/mcp/google-sheets/most-populated-states.ts <your-mcp-url> 25 | * ``` 26 | * - When running for the first time, there will be no active connection so you will need to login 27 | * with Google OAUTH at the link provided by the agent to authenticate 28 | * 29 | * ## MCP Server Configuration 30 | * 31 | * This example uses the Composio Googlesheets MCP server which provides tools for: 32 | * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets 33 | * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets 34 | * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet 35 | * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format 36 | * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet 37 | * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: Retrieves information about a spreadsheet 38 | * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in a spreadsheet 39 | * 
- `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet 40 | * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet 41 | * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet 42 | * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations 43 | * 44 | * ## Debugging and Monitoring 45 | * 46 | * The example includes callback functions to monitor: 47 | * - Agent output: Raw output from the LLM agent 48 | * - Step execution: Each step the agent takes during the task 49 | * 50 | * ## Running the Example 51 | * 52 | * ``` 53 | * yarn ts-node examples/mcp/google-sheets/most-populated-states.ts 54 | * ``` 55 | * 56 | * ## Example Output 57 | * 58 | * The final output will include confirmation that the agent has successfully created a new Google Sheet 59 | * and populated it with information about the top 5 most populous US states. 60 | */ 61 | 62 | import dotenv from "dotenv"; 63 | import chalk from "chalk"; 64 | import { ChatOpenAI } from "@langchain/openai"; 65 | import HyperAgent from "@hyperbrowser/agent"; 66 | 67 | dotenv.config(); 68 | 69 | const TASK = `1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection. 70 | 2. If there is an active connection, go to 4. Otherwise, go to 3. 71 | 3. Run GOOGLESHEETS_INITIATE_CONNECTION and output the the auth link to the user, then wait for the connection to be active. 72 | 4. Create a new spreadsheet titled "Most Populated States". 73 | 5. Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and get the data on the top 5 most populous states from the table. 74 | 6. Add that information to the spreadsheet properly. 
75 | Make sure that the data is well formatted and the columns are all there.`; 76 | 77 | async function run(mcpUrl: string) { 78 | console.log(chalk.cyan.bold("\n===== Running Task =====")); 79 | console.log(chalk.white(`Task: ${TASK}`)); 80 | console.log(chalk.cyan.bold("=======================\n")); 81 | 82 | console.log(chalk.yellow("Initializing OpenAI LLM...")); 83 | const llm = new ChatOpenAI({ 84 | apiKey: process.env.OPENAI_API_KEY, 85 | model: "gpt-4o", 86 | }); 87 | 88 | console.log(chalk.yellow("Creating HyperAgent...")); 89 | 90 | try { 91 | const agent = new HyperAgent({ 92 | llm: llm, 93 | debug: true, 94 | }); 95 | console.log(chalk.green("Agent created successfully")); 96 | 97 | console.log( 98 | chalk.yellow("Connecting to Composio Googlesheets MCP server...") 99 | ); 100 | await agent.initializeMCPClient({ 101 | servers: [ 102 | { 103 | command: "npx", 104 | args: ["@composio/mcp@latest", "start", "--url", mcpUrl], 105 | env: { 106 | npm_config_yes: "true", 107 | }, 108 | }, 109 | ], 110 | }); 111 | console.log( 112 | chalk.green( 113 | "Connected to Composio Googlesheets MCP server, executing task..." 
114 | ) 115 | ); 116 | 117 | const result = await agent.executeTask(TASK, { 118 | debugOnAgentOutput: (agentOutput) => { 119 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 120 | console.dir(agentOutput, { depth: null, colors: true }); 121 | console.log(chalk.cyan.bold("===============") + "\n"); 122 | }, 123 | onStep: (step) => { 124 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 125 | console.dir(step, { depth: null, colors: true }); 126 | console.log(chalk.cyan.bold("===============") + "\n"); 127 | }, 128 | }); 129 | 130 | await agent.closeAgent(); 131 | console.log(chalk.green.bold("\nResult:")); 132 | console.log(chalk.white(result.output)); 133 | return result; 134 | } catch (error) { 135 | console.error(chalk.red.bold("Error creating agent or executing task:")); 136 | console.error( 137 | chalk.red(error instanceof Error ? error.stack : String(error)) 138 | ); 139 | } 140 | } 141 | 142 | (async () => { 143 | try { 144 | if (process.argv.length < 3) { 145 | console.error( 146 | chalk.red("Error: Please provide your MCP URL as an argument") 147 | ); 148 | process.exit(1); 149 | } 150 | await run(process.argv[2]); 151 | } catch (error) { 152 | console.error(chalk.red("Error:"), error); 153 | process.exit(1); 154 | } 155 | })(); 156 | -------------------------------------------------------------------------------- /examples/mcp/notion/create-shopping-list.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Notion MCP Server Example 3 | * 4 | * This example demonstrates how to use HyperAgent with the Composio Notion MCP server 5 | * to connect to Notion, create a new page, and populate it with ingredients for a recipe scraped from allrecipes. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that requires web browsing and Notion MCP: 10 | * 1. Checks if there is an active connection to Composio Notion MCP server 11 | * 2. 
If no connection exists, initiates a connection and waits for the user to authenticate 12 | * 3. Creates a new notion page titled "{{RECIPE}} ingredients" 13 | * 4. Navigates to allrecipes and finds a recipe matching the criteria 14 | * 5. Adds the ingredients to the created Notion page 15 | * 16 | * ## Prerequisites 17 | * 18 | * 1. Node.js environment 19 | * 2. OpenAI API key set in your .env file (OPENAI_API_KEY) 20 | * 3. Need to have a Composio account, can sign up at https://app.composio.dev 21 | * - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/notion 22 | * - You will use the url to run the script, for example: 23 | * ``` 24 | * yarn ts-node examples/mcp/notion/create-shopping-list.ts <your-mcp-url> 25 | * ``` 26 | * - When running for the first time, there will be no active connection so you will need to login 27 | * with Notion OAUTH at the link provided by the agent to authenticate 28 | * 29 | * ## MCP Server Configuration 30 | * 31 | * This example uses the Composio Notion MCP server which provides tools for a number of use cases. 
We will be using: 32 | * - `NOTION_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Notion 33 | * - `NOTION_INITIATE_CONNECTION`: Starts the authentication process for Notion 34 | * - `NOTION_ADD_PAGE_CONTENT`: Adds a single content block to a Notion page 35 | * - `NOTION_CREATE_PAGE`: Creates a new page in Notion 36 | * 37 | * ## Debugging and Monitoring 38 | * 39 | * The example includes callback functions to monitor: 40 | * - Agent output: Raw output from the LLM agent 41 | * - Step execution: Each step the agent takes during the task 42 | * 43 | * ## Running the Example 44 | * 45 | * ``` 46 | * yarn ts-node examples/mcp/notion/create-shopping-list.ts <your-mcp-url> 47 | * ``` 48 | * 49 | * ## Example Output 50 | * 51 | * The final output will include confirmation that the agent has successfully created a new Notion Page 52 | * and populated it with the ingredients for a recipe. 53 | */ 54 | 55 | import dotenv from "dotenv"; 56 | import chalk from "chalk"; 57 | import { ChatOpenAI } from "@langchain/openai"; 58 | import HyperbrowserAgent from "@hyperbrowser/agent"; 59 | 60 | dotenv.config(); 61 | 62 | const TASK = ` 63 | Go to allrecipes and find a suitable recipe for Salsa verde with more than 100 ratings. Then insert each ingredient into a notion page. Don't get the trivial ingredients like salt, water, or pepper. 64 | 65 | 66 | ## Steps to insert into a notion page: 67 | 68 | 1. Run NOTION_CHECK_ACTIVE_CONNECTION to check if there is an active connection. 69 | 2. If there is an active connection, go to 4. Otherwise, go to 3. 70 | 3. Run NOTION_INITIATE_CONNECTION and output the the auth link to the user, then wait for the connection to be active. 71 | 4. Create a new notion page title - {{RECIPE}} Ingredients 72 | 5. Go to allrecipes, find a suitable recipe for {{RECIPE}}, and get it's ingredients 73 | 6. 
For each ingredient, call NOTION_ADD_PAGE_CONTENT to insert a single ingredient 74 | 75 | Make sure that the data is well formatted and the columns are all there.`; 76 | 77 | async function run(mcpUrl: string) { 78 | console.log(chalk.cyan.bold("\n===== Running Task =====")); 79 | console.log(chalk.white(`Task: ${TASK}`)); 80 | console.log(chalk.cyan.bold("=======================\n")); 81 | 82 | console.log(chalk.yellow("Initializing OpenAI LLM...")); 83 | const llm = new ChatOpenAI({ 84 | apiKey: process.env.OPENAI_API_KEY, 85 | model: "gpt-4o", 86 | }); 87 | 88 | console.log(chalk.yellow("Creating Hyperbrowser Agent...")); 89 | 90 | try { 91 | const agent = new HyperbrowserAgent({ 92 | llm: llm, 93 | debug: true, 94 | }); 95 | console.log(chalk.green("Agent created successfully")); 96 | 97 | console.log( 98 | chalk.yellow("Connecting to Composio Notion MCP server...") 99 | ); 100 | await agent.initializeMCPClient({ 101 | servers: [ 102 | { 103 | command: "npx", 104 | args: ["@composio/mcp@latest", "start", "--url", mcpUrl], 105 | env: { 106 | npm_config_yes: "true", 107 | }, 108 | }, 109 | ], 110 | }); 111 | console.log( 112 | chalk.green( 113 | "Connected to Composio Notion MCP server, executing task..." 
114 | ) 115 | ); 116 | 117 | const result = await agent.executeTask(TASK, { 118 | debugOnAgentOutput: (agentOutput) => { 119 | console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT =====")); 120 | console.dir(agentOutput, { depth: null, colors: true }); 121 | console.log(chalk.cyan.bold("===============") + "\n"); 122 | }, 123 | onStep: (step) => { 124 | console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`)); 125 | console.dir(step, { depth: null, colors: true }); 126 | console.log(chalk.cyan.bold("===============") + "\n"); 127 | }, 128 | }); 129 | 130 | await agent.closeAgent(); 131 | console.log(chalk.green.bold("\nResult:")); 132 | console.log(chalk.white(result.output)); 133 | return result; 134 | } catch (error) { 135 | console.error(chalk.red.bold("Error creating agent or executing task:")); 136 | console.error( 137 | chalk.red(error instanceof Error ? error.stack : String(error)) 138 | ); 139 | } 140 | } 141 | 142 | (async () => { 143 | try { 144 | if (process.argv.length < 3) { 145 | console.error( 146 | chalk.red("Error: Please provide your MCP URL as an argument") 147 | ); 148 | process.exit(1); 149 | } 150 | await run(process.argv[2]); 151 | } catch (error) { 152 | console.error(chalk.red("Error:"), error); 153 | process.exit(1); 154 | } 155 | })(); 156 | -------------------------------------------------------------------------------- /examples/mcp/weather/get-weather-alert.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * # Weather MCP Server Example 3 | * 4 | * This example demonstrates how to use HyperAgent with a MCP (Model Context Protocol) server 5 | * to browse the web, extract information, and use that information to query a separate API service. 6 | * 7 | * ## What This Example Does 8 | * 9 | * The agent performs a multi-step task that requires web browsing and data extraction: 10 | * 1. Navigates to a Wikipedia page listing US states by population 11 | * 2. 
Identifies the most populated state 12 | * 3. Uses the custom weather MCP server to find weather alerts for that state 13 | * 14 | * ## Prerequisites 15 | * 16 | * - Node.js environment 17 | * - OpenAI API key set in your .env file (OPENAI_API_KEY) 18 | * 19 | * ## MCP Server Configuration 20 | * 21 | * This example uses a custom MCP server (weather-server.js) that provides tools for: 22 | * - `get-alerts`: Fetches weather alerts for a specific state from the National Weather Service API 23 | * - `get-forecast`: Retrieves weather forecasts for specific coordinates 24 | * 25 | * 26 | * ## Debugging and Monitoring 27 | * 28 | * The example includes callback functions to monitor: 29 | * - Agent output: Raw output from the LLM agent 30 | * - Step execution: Each step the agent takes during the task 31 | * 32 | * ## Running the Example 33 | * 34 | * ``` 35 | * yarn ts-node examples/mcp/weather/get-weather-alert.ts 36 | * ``` 37 | * 38 | * ## Example Output 39 | * 40 | * The final output will include the most populated US state and a list of current weather alerts for that state. 41 | */ 42 | 43 | import dotenv from "dotenv"; 44 | import chalk from "chalk"; 45 | import path from "path"; 46 | import { ChatOpenAI } from "@langchain/openai"; 47 | import HyperAgent from "@hyperbrowser/agent"; 48 | 49 | dotenv.config(); 50 | 51 | const TASK = `Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and find the most populated state. 
/**
 * Runs the weather-alert example end to end.
 *
 * Builds a GPT-4o backed HyperAgent, registers the local weather MCP server
 * (launched with `node servers/weather-server.js`), executes `TASK`, and
 * prints the agent's output.
 *
 * Errors are logged rather than rethrown, so this resolves to `undefined`
 * when agent creation, MCP initialisation, or the task itself fails.
 * The agent is closed on every path so a failed run does not leave the
 * browser session behind.
 *
 * @returns The task result, or `undefined` if an error was caught.
 */
async function run() {
  console.log(chalk.cyan.bold("\n===== Running Task ====="));
  console.log(chalk.white(`Task: ${TASK}`));
  console.log(chalk.cyan.bold("=======================\n"));

  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });

  const mcpServerPath = path.join(__dirname, "/servers/weather-server.js");

  console.log(chalk.yellow("Creating Hyperbrowser Agent..."));

  // Declared outside the try block so the finally clause can reach it.
  let agent: HyperAgent | undefined;
  try {
    agent = new HyperAgent({
      llm: llm,
      debug: true,
    });

    await agent.initializeMCPClient({
      servers: [
        {
          command: "node",
          args: [mcpServerPath],
        },
      ],
    });

    const result = await agent.executeTask(TASK, {
      debugOnAgentOutput: (agentOutput) => {
        console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
        console.dir(agentOutput, { depth: null, colors: true });
        console.log(chalk.cyan.bold("===============") + "\n");
      },
      onStep: (step) => {
        console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
        console.dir(step, { depth: null, colors: true });
        console.log(chalk.cyan.bold("===============") + "\n");
      },
    });

    console.log(chalk.green.bold("\nResult:"));
    console.log(chalk.white(result.output));
    return result;
  } catch (error) {
    console.error(chalk.red("Error creating agent or executing task:"));
    console.error(
      chalk.red(
        error instanceof Error ? (error.stack ?? error.message) : String(error)
      )
    );
  } finally {
    // Previously the agent was closed only on success, leaking it whenever
    // MCP initialisation or the task threw.
    if (agent) {
      try {
        await agent.closeAgent();
      } catch (closeError) {
        console.error(chalk.red("Error closing agent:"), closeError);
      }
    }
  }
}
/**
 * Renders a single NWS alert feature as a block of "Label: value" lines
 * terminated by a "---" separator.
 *
 * Missing or empty properties fall back to "Unknown" (or "No headline" for
 * the headline).
 *
 * @param {AlertFeature} feature
 * @returns {string}
 */
function formatAlert(feature) {
  const { event, areaDesc, severity, status, headline } = feature.properties;

  // [label, value, fallback] — kept in output order.
  const rows = [
    ["Event", event, "Unknown"],
    ["Area", areaDesc, "Unknown"],
    ["Severity", severity, "Unknown"],
    ["Status", status, "Unknown"],
    ["Headline", headline, "No headline"],
  ];

  const lines = rows.map(
    ([label, value, fallback]) => `${label}: ${value || fallback}`,
  );
  lines.push("---");
  return lines.join("\n");
}
// Registers the "get-forecast" tool: resolves the NWS grid point for the given
// coordinates, follows its `forecast` URL, and returns every forecast period
// as plain text. Each failure mode yields a text response instead of throwing.
server.tool(
  "get-forecast",
  "Get weather forecast for a location",
  {
    latitude: z.number().min(-90).max(90).describe("Latitude of the location"),
    longitude: z
      .number()
      .min(-180)
      .max(180)
      .describe("Longitude of the location"),
  },
  async ({ latitude, longitude }) => {
    // Wraps a message in the single-text-item response shape used by every
    // return path below.
    const textResult = (text) => ({ content: [{ type: "text", text }] });

    // Get grid point data
    const pointsUrl = `${NWS_API_BASE}/points/${latitude.toFixed(4)},${longitude.toFixed(4)}`;
    const pointsData = await makeNWSRequest(pointsUrl);
    if (!pointsData) {
      return textResult(
        `Failed to retrieve grid point data for coordinates: ${latitude}, ${longitude}. This location may not be supported by the NWS API (only US locations are supported).`,
      );
    }

    const forecastUrl = pointsData.properties?.forecast;
    if (!forecastUrl) {
      return textResult("Failed to get forecast URL from grid point data");
    }

    // Get forecast data
    const forecastData = await makeNWSRequest(forecastUrl);
    if (!forecastData) {
      return textResult("Failed to retrieve forecast data");
    }

    const periods = forecastData.properties?.periods || [];
    if (periods.length === 0) {
      return textResult("No forecast periods available");
    }

    // Format forecast periods
    const formattedForecast = periods.map((period) =>
      [
        `${period.name || "Unknown"}:`,
        // `??` rather than `||`: a temperature of 0 is a valid reading and
        // must not be reported as "Unknown".
        `Temperature: ${period.temperature ?? "Unknown"}°${period.temperatureUnit || "F"}`,
        `Wind: ${period.windSpeed || "Unknown"} ${period.windDirection || ""}`,
        `${period.shortForecast || "No forecast available"}`,
        "---",
      ].join("\n"),
    );

    return textResult(
      `Forecast for ${latitude}, ${longitude}:\n\n${formattedForecast.join("\n")}`,
    );
  },
);
/**
 * Runs the IMDb lookup task and prints the structured result.
 *
 * The task is executed with a Zod `outputSchema` (`director`, `releaseYear`,
 * `rating`), and every agent output and step is dumped to the console.
 * The agent is closed in a `finally` clause so it is released even when the
 * task rejects; the rejection itself still propagates to the caller.
 *
 * @returns The result object returned by `agent.executeTask`.
 */
async function runEval() {
  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });

  const agent = new HyperAgent({
    llm: llm,
    debug: true,
  });

  try {
    await sleep(1000);
    const result = await agent.executeTask(TASK, {
      debugOnAgentOutput: (agentOutput) => {
        console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
        console.dir(agentOutput, { depth: null, colors: true });
        console.log(chalk.cyan.bold("===============") + "\n");
      },
      onStep: (step) => {
        console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
        console.dir(step, { depth: null, colors: true });
        console.log(chalk.cyan.bold("===============") + "\n");
      },
      outputSchema: z.object({
        director: z.string().describe("The name of the movie director"),
        releaseYear: z.number().describe("The year the movie was released"),
        rating: z.string().describe("The IMDb rating of the movie"),
      }),
    });
    console.log(chalk.green.bold("\nResult:"));
    console.log(chalk.white(result.output));
    return result;
  } finally {
    // Previously skipped whenever executeTask threw, leaking the agent.
    await agent.closeAgent();
  }
}
/**
 * Runs the "add to Amazon cart" example task and prints the agent's output.
 *
 * Every agent output and step is dumped to the console while the task runs.
 * The agent is closed in a `finally` clause so it is released even when the
 * task rejects; the rejection itself still propagates to the caller.
 *
 * @returns The result object returned by `agent.executeTask`.
 */
async function runEval() {
  console.log(chalk.cyan.bold("\n===== Running Add to amazon Example ====="));

  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });

  const agent = new HyperAgent({
    llm: llm,
  });

  try {
    const result = await agent.executeTask(
      "Navigate to amazon.com, and add the one chip challenge to my cart. Add only the version containing a single item, not multiple items. Once you have added a single product, and do not get any sort of failure form that addition, finish up.",
      {
        debugOnAgentOutput: (agentOutput) => {
          console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
          console.dir(agentOutput, { depth: null, colors: true });
          console.log(chalk.cyan.bold("===============") + "\n");
        },
        onStep: (step) => {
          console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
          console.dir(step, { depth: null, colors: true });
          console.log(chalk.cyan.bold("===============") + "\n");
        },
      }
    );
    console.log(chalk.green.bold("\nResult:"));
    console.log(chalk.white(result.output));
    return result;
  } finally {
    // Previously skipped whenever executeTask threw, leaking the agent.
    await agent.closeAgent();
  }
}
"@hyperbrowser/agent", 3 | "version": "0.3.1", 4 | "description": "Hyperbrowsers Web Agent", 5 | "author": "", 6 | "main": "dist/index.js", 7 | "types": "dist/index.d.ts", 8 | "type": "commonjs", 9 | "license": "AGPL-3.0", 10 | "scripts": { 11 | "build": "rm -rf dist && tsc && tsc-alias && node -e \"require('fs').chmodSync('dist/cli/index.js', '755')\" && node -e \"require('fs').chmodSync('cli.sh', '755')\"", 12 | "build-dom-tree-script": "ts-node src/context-providers/dom/builder.ts", 13 | "lint": "eslint src/**/*.ts", 14 | "prepare": "yarn build", 15 | "test": "jest", 16 | "format": "prettier --write 'src/**/*.ts'", 17 | "cli": "yarn ts-node -r tsconfig-paths/register src/cli/index.ts", 18 | "example": "yarn ts-node -r tsconfig-paths/register" 19 | }, 20 | "bin": { 21 | "hyperagent-cli": "cli.sh" 22 | }, 23 | "files": [ 24 | "dist", 25 | "README.md", 26 | "LICENSE", 27 | "cli.sh" 28 | ], 29 | "keywords": [ 30 | "hyperbrowser", 31 | "browser", 32 | "automation", 33 | "webscraping", 34 | "webcrawling", 35 | "scraping", 36 | "crawling", 37 | "ai" 38 | ], 39 | "dependencies": { 40 | "@google/genai": "^0.8.0", 41 | "@hyperbrowser/sdk": "^0.46.0", 42 | "@inquirer/prompts": "^7.4.1", 43 | "@langchain/core": "^0.3.43", 44 | "@modelcontextprotocol/sdk": "^1.9.0", 45 | "@types/crypto-js": "^4.2.2", 46 | "boxen": "5.1.2", 47 | "chalk": "4.1.2", 48 | "commander": "^13.1.0", 49 | "crypto-js": "^4.2.0", 50 | "dotenv": "^16.4.5", 51 | "joplin-turndown-plugin-gfm": "^1.0.12", 52 | "langchain": "^0.3.19", 53 | "lodash": "^4.17.21", 54 | "minimatch": "^9.0.3", 55 | "ora": "5.4.1", 56 | "playwright": "npm:rebrowser-playwright@1.49.1", 57 | "readline": "^1.3.0", 58 | "sharp": "^0.34.1", 59 | "turndown": "^7.2.0", 60 | "zod": "^3.24.1", 61 | "zod-to-json-schema": "^3.24.1" 62 | }, 63 | "devDependencies": { 64 | "@langchain/anthropic": "^0.3.17", 65 | "@types/lodash": "^4.17.16", 66 | "@types/node": "^22.9.1", 67 | "@types/turndown": "^5.0.5", 68 | "@typescript-eslint/eslint-plugin": 
"^8.15.0", 69 | "@typescript-eslint/parser": "^8.15.0", 70 | "axios": "^1.8.4", 71 | "esbuild": "^0.25.2", 72 | "eslint": "^9.15.0", 73 | "eslint-config-prettier": "^9.1.0", 74 | "exa-js": "^1.5.13", 75 | "prettier": "^3.3.3", 76 | "ts-node": "^10.9.2", 77 | "tsc-alias": "^1.8.15", 78 | "tsconfig-paths": "^4.2.0", 79 | "tsx": "^4.19.3", 80 | "typescript": "^5.6.3" 81 | }, 82 | "exports": { 83 | ".": { 84 | "types": "./dist/index.d.ts", 85 | "default": "./dist/index.js" 86 | }, 87 | "./types": { 88 | "types": "./dist/types/index.d.ts", 89 | "default": "./dist/types/index.js" 90 | }, 91 | "./custom-actions": { 92 | "types": "./dist/custom-actions/index.d.ts", 93 | "default": "./dist/custom-actions/index.js" 94 | } 95 | }, 96 | "typesVersions": { 97 | "*": { 98 | ".": [ 99 | "./dist/index.d.ts" 100 | ], 101 | "types": [ 102 | "./dist/types/index.d.ts" 103 | ], 104 | "./custom-actions": [ 105 | "./dist/custom-actions/index.d.ts" 106 | ] 107 | } 108 | } 109 | } -------------------------------------------------------------------------------- /scripts/run-webvoyager-eval.ts: -------------------------------------------------------------------------------- 1 | import { HyperAgent } from "../src/agent"; 2 | import dotenv from "dotenv"; 3 | import chalk from "chalk"; 4 | import fs from "fs"; 5 | import path from "path"; 6 | import { sleep } from "../src/utils/sleep"; 7 | import { retry } from "../src/utils/retry"; 8 | import { ChatOpenAI } from "@langchain/openai"; 9 | import { z } from "zod"; 10 | import { minimatch } from "minimatch"; 11 | 12 | dotenv.config(); 13 | 14 | class Logger { 15 | private logStream: fs.WriteStream; 16 | private logToConsole: boolean; 17 | 18 | constructor(runId: string, evalId: string, logToConsole = false) { 19 | const logDir = path.join(__dirname, `../logs/${runId}/${evalId}`); 20 | if (!fs.existsSync(logDir)) { 21 | fs.mkdirSync(logDir, { recursive: true }); 22 | } 23 | const logPath = path.join(logDir, `webvoyager-eval.log`); 24 | 
/**
 * Loads the WebVoyager evaluation cases from `../evals/WebVoyager_data.jsonl`
 * (one JSON object per line).
 *
 * Blank lines, including the empty string produced by a trailing newline,
 * are skipped before parsing. Previously every line was passed to
 * `JSON.parse` first, so a blank line raised a SyntaxError.
 *
 * @returns The parsed evaluation cases, in file order.
 * @throws SyntaxError if a non-blank line is not valid JSON; the message
 *   names the offending line number.
 */
async function loadEvals(): Promise<WebVoyagerEval[]> {
  const evalPath = path.join(__dirname, "../evals/WebVoyager_data.jsonl");
  const fileContent = await fs.promises.readFile(evalPath, "utf-8");

  const result: WebVoyagerEval[] = [];
  fileContent.split("\n").forEach((rawLine, index) => {
    const line = rawLine.trim();
    if (!line) {
      return;
    }
    try {
      // The parsed shape is trusted, not validated, exactly as before.
      result.push(JSON.parse(line) as WebVoyagerEval);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(
        `Invalid JSON on line ${index + 1} of ${evalPath}: ${detail}`
      );
    }
  });
  return result;
}
Being somewhat lenient - if the main points are correct, minor differences in exact numbers or formatting are acceptable (especially stuff like ratings and reviews which may update over time) 155 | 156 | Respond in JSON format with { isCorrect: true | false, reason: string }`, 157 | }, 158 | { 159 | type: "image_url", 160 | image_url: { 161 | url: imageUrl, 162 | }, 163 | }, 164 | ], 165 | }, 166 | ]; 167 | 168 | const llm = new ChatOpenAI({ 169 | apiKey: process.env.OPENAI_API_KEY, 170 | model: "gpt-4o", 171 | }); 172 | return await llm 173 | .withStructuredOutput(AnswerEvaluationSchema) 174 | .invoke(messages); 175 | } 176 | 177 | async function runEvalHelper( 178 | agent: HyperAgent, 179 | eval_data: WebVoyagerEval, 180 | references: References, 181 | logger: Logger, 182 | runId: string 183 | ): Promise { 184 | logger.log("\n===== Running Eval ====="); 185 | logger.log(`\nID: ${eval_data.id}`); 186 | logger.log(`\nWebsite: ${eval_data.web_name}`); 187 | logger.log(`\nQuestion: ${eval_data.ques}`); 188 | logger.log("\n=======================\n"); 189 | 190 | const page = await agent.getCurrentPage(); 191 | await page.goto(eval_data.web, { 192 | waitUntil: "domcontentloaded", 193 | }); 194 | await sleep(1000); 195 | await page.reload({ waitUntil: "domcontentloaded" }); 196 | await sleep(1000); 197 | 198 | const result = await agent.executeTask(eval_data.ques, { 199 | maxSteps: 25, 200 | debugDir: path.join(__dirname, `../logs/${runId}/${eval_data.id}/debug`), 201 | debugOnAgentOutput: (agentOutput) => { 202 | logger.log("\n===== AGENT OUTPUT ====="); 203 | logger.logObject(agentOutput); 204 | logger.log("===============\n"); 205 | }, 206 | onStep: (step) => { 207 | logger.log(`\n===== STEP ${step.idx} =====`); 208 | logger.logObject(step); 209 | logger.log("===============\n"); 210 | }, 211 | }); 212 | if (!result.output) { 213 | throw new Error("No output from agent"); 214 | } 215 | 216 | logger.log(result.output || ""); 217 | 218 | // Take screenshot of final 
state 219 | const screenshotPath = path.join( 220 | __dirname, 221 | `../logs/${runId}/${eval_data.id}/final-state.png` 222 | ); 223 | await page.screenshot({ path: screenshotPath, fullPage: true }); 224 | await agent.closeAgent(); 225 | 226 | // Check against reference 227 | const websiteRefs = references[eval_data.web_name]; 228 | if (!websiteRefs) { 229 | logger.log("No references found for this website", "error"); 230 | return { 231 | id: eval_data.id, 232 | question: eval_data.ques, 233 | correct: false, 234 | reason: "No references found for this website", 235 | }; 236 | } 237 | const relevantRef = 238 | websiteRefs.answers[parseInt(eval_data.id.split("--")[1])]; 239 | if (!relevantRef?.ans) { 240 | logger.log("No reference found for this specific evaluation ID", "error"); 241 | return { 242 | id: eval_data.id, 243 | question: eval_data.ques, 244 | correct: false, 245 | reason: "No reference found for this specific evaluation ID", 246 | }; 247 | } 248 | 249 | logger.log("\nChecking against reference..."); 250 | try { 251 | const evaluation = await checkAnswerAgainstReference( 252 | result.output, 253 | relevantRef.ans, 254 | eval_data.ques, 255 | screenshotPath, 256 | relevantRef.notes 257 | ); 258 | logger.log( 259 | evaluation.isCorrect ? "✓ CORRECT" : "✗ INCORRECT", 260 | evaluation.isCorrect ? 
/**
 * Runs one WebVoyager evaluation with its own logger and agent, racing the
 * work against a 10-minute timeout.
 *
 * The evaluation is delegated to `runEvalHelper` through `retry`
 * (`retryCount: 3`). Any failure, including the timeout, becomes a failed
 * `EvalResult` rather than a rejection.
 *
 * @param eval_data  The evaluation case to run.
 * @param references Reference answers keyed by website name.
 * @param runId      Identifier of the current run, used for log paths.
 * @returns The evaluation outcome; `correct` is `false` on any error.
 */
const runEval = async (
  eval_data: WebVoyagerEval,
  references: References,
  runId: string
): Promise<EvalResult> => {
  const logger = new Logger(runId, eval_data.id);
  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });
  const agent = new HyperAgent({
    llm: llm,
    hyperbrowserConfig: {
      hyperbrowserSessionOptions: {
        screen: { width: 1500, height: 1500 },
      },
    },
    debug: true,
  });

  // Kept so the timer can be cancelled once the race settles. An uncleared
  // timer keeps the Node process alive for the full 10 minutes after the
  // last evaluation finishes.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  try {
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(
        () => reject(new Error("Evaluation timed out after 10 minutes")),
        10 * 60 * 1000
      );
    });
    return await Promise.race([
      retry({
        func: async () =>
          runEvalHelper(agent, eval_data, references, logger, runId),
        params: { retryCount: 3 },
      }),
      timeoutPromise,
    ]);
  } catch (error) {
    // A failure while closing must not replace the failed EvalResult below;
    // otherwise this function rejects and the result is lost.
    try {
      await agent.closeAgent();
    } catch (closeError) {
      logger.log(`Error closing agent: ${closeError}`, "error");
    }
    logger.log(`Error: ${error}`, "error");
    return {
      id: eval_data.id,
      question: eval_data.ques,
      correct: false,
      reason: `Error: ${error}`,
    };
  } finally {
    if (timeoutHandle !== undefined) {
      clearTimeout(timeoutHandle);
    }
    // NOTE(review): after a timeout the retried helper may still be running
    // and could log to this closed logger — confirm whether that is safe.
    logger.close();
  }
};
// Script entry point: loads the evals and references, optionally filters the
// evals by the glob given as the first CLI argument, runs them through
// runEvalsBatch, then writes logs/<runId>/summary.json and prints a report.
(async () => {
  let evals = await loadEvals();
  const references = await loadReferences();
  const targetId = process.argv[2];
  // ':' and '.' are replaced so the timestamp can be used as a directory name.
  const runId = new Date().toISOString().replace(/[:.]/g, "-");
  const logDir = path.join(__dirname, `../logs/${runId}`);
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
  }

  if (targetId) {
    evals = evals.filter((e) => minimatch(e.id, targetId));
    if (evals.length === 0) {
      console.log(
        chalk.red(`No evals found matching glob pattern: ${targetId}`)
      );
      process.exit(1);
    }
  }

  console.log(chalk.cyan(`Running ${evals.length} evaluations in parallel...`));
  const results = await runEvalsBatch(evals, references, runId);

  const totalEvals = results.length;
  const correctEvals = results.filter((r) => r.correct).length;
  // Guard against 0/0: with no collected results the rate used to be NaN,
  // which serialises to `null` in summary.json and prints as "NaN%".
  const successRate =
    totalEvals === 0 ? 0 : Math.round((correctEvals / totalEvals) * 100);

  const summary = {
    totalEvaluations: totalEvals,
    correctEvaluations: correctEvals,
    failedEvaluations: totalEvals - correctEvals,
    successRate,
    detailedResults: results.map((result) => ({
      id: result.id,
      status: result.correct ? "PASSED" : "FAILED",
      question: result.question,
      actual: result.actual,
      expected: result.expected,
      reason: result.reason || null,
      evaluationReason: result.evaluationReason || null,
      notes: result.notes || null,
    })),
  };
  const summaryPath = path.join(logDir, "summary.json");
  fs.writeFileSync(summaryPath, JSON.stringify(summary, null, 2));

  // Also log to console for visibility
  console.log(chalk.cyan("Evaluation results:"));
  console.log(chalk.white(`Total evaluations: ${totalEvals}`));
  console.log(chalk.green(`Correct: ${correctEvals} (${successRate}%)`));
  console.log(chalk.red(`Failed: ${totalEvals - correctEvals}`));
  console.log(chalk.white("\nDetailed results:"));
  results.forEach((result) => {
    console.log(
      `${result.correct ? chalk.green("✓ PASSED") : chalk.red("✗ FAILED")} Eval ID: ${result.id}${
        result.reason ? "\n " + chalk.red(result.reason) : ""
      }`
    );
  });
  console.log(chalk.green("\nAll evaluations completed!"));
})().catch((error) => {
  console.error(chalk.red("Error running evaluations:"), error);
  process.exit(1);
});
dotenv.config();

const agent = new HyperAgent();

/**
 * Opens two pages and starts one `ai(...)` task on each.
 *
 * Neither `ai(...)` call is awaited, so this function returns as soon as the
 * second task has been started.
 */
async function main() {
  const flightsPage = await agent.newPage();
  flightsPage.ai(
    "Go to https://flights.google.com and find a round-trip flight from Rio de Janeiro to Los Angeles, leaving on May 15, 2025, and returning on May 22, 2025, and select the option with the least carbon dioxide emissions."
  );

  const mapsPage = await agent.newPage();
  await mapsPage.goto("https://maps.google.com");
  mapsPage.ai("Find the nearest restaurant to the current page");
}

main();
dotenv.config();

const agent = new HyperAgent({
  debug: true,
});

/**
 * Prints `payload` at full depth between a cyan banner carrying `title` and a
 * closing cyan rule.
 */
const dumpSection = (title: string, payload: unknown): void => {
  console.log("\n" + chalk.cyan.bold(`===== ${title} =====`));
  console.dir(payload, { depth: null, colors: true });
  console.log(chalk.cyan.bold("===============") + "\n");
};

/**
 * Runs the flight-search task once, dumping every agent output and step as it
 * arrives, then prints the final `output` of the task.
 */
async function main() {
  const result = await agent.executeTask(
    "Go to https://flights.google.com and find a round-trip flight from Rio de Janeiro to Los Angeles, leaving on May 15, 2025, and returning on May 22, 2025, and select the option with the least carbon dioxide emissions.",
    {
      debugOnAgentOutput: (agentOutput) => {
        dumpSection("AGENT OUTPUT", agentOutput);
      },
      onStep: (step) => {
        dumpSection("STEP", step);
      },
    }
  );
  console.log(chalk.green.bold("\nResult:"));
  console.log(chalk.white(result.output));
}

main();
/**
 * Agent action that clicks the element registered under a numeric index.
 *
 * `run` looks up the locator for the index and returns a failure result when
 * there is no locator or the locator matches nothing. Otherwise it scrolls the
 * element into view, waits (each wait bounded by
 * `CLICK_CHECK_TIMEOUT_PERIOD`) for it to be visible, enabled and stable, and
 * finally issues a forced click. A wait that times out rejects, and that
 * rejection propagates to the caller.
 */
export const ClickElementActionDefinition: AgentActionDefinition = {
  type: "clickElement" as const,
  actionParams: ClickElementAction,

  async run(
    ctx: ActionContext,
    action: ClickElementActionType
  ): Promise<ActionOutput> {
    const target = getLocator(ctx, action.index);
    if (!target) {
      return { success: false, message: "Element not found" };
    }

    const matches = await target.count();
    if (!(matches > 0)) {
      return { success: false, message: "Element not found on page" };
    }

    const timeout = CLICK_CHECK_TIMEOUT_PERIOD;
    await target.scrollIntoViewIfNeeded({ timeout });

    // The three readiness checks are started together and awaited as a group.
    const visible = target.waitFor({ state: "visible", timeout });
    const enabled = waitForElementToBeEnabled(target, timeout);
    const stable = waitForElementToBeStable(target, timeout);
    await Promise.all([visible, enabled, stable]);

    await target.click({ force: true });
    return {
      success: true,
      message: `Clicked element with index ${action.index}`,
    };
  },

  pprintAction: (params: ClickElementActionType): string =>
    `Click element at index ${params.index}`,
};
/**
 * Repeatedly awaits `check`, sleeping 100 ms between attempts, until it
 * yields `true` or `timeout` milliseconds have elapsed.
 *
 * Unlike a bare `Promise.race` against `setTimeout`, this cleans up after
 * itself: the timer is cleared as soon as the race settles, and the polling
 * loop stops once the wait is over instead of polling forever in the
 * background.
 *
 * @param check Async predicate; resolving to `true` ends the wait.
 * @param timeout Maximum time to wait in milliseconds.
 * @param timeoutMessage Message of the `Error` used to reject on timeout.
 * @returns Promise that resolves when `check` succeeds, rejects with the
 *   timeout error, or rejects with whatever `check` throws.
 */
async function pollUntil(
  check: () => Promise<boolean>,
  timeout: number,
  timeoutMessage: string
): Promise<void> {
  let finished = false;
  let timer: ReturnType<typeof setTimeout> | undefined;

  const polling = (async () => {
    while (!finished) {
      if (await check()) {
        return;
      }
      await sleep(100);
    }
  })();

  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error(timeoutMessage)), timeout);
  });

  try {
    await Promise.race([polling, deadline]);
  } finally {
    // Stop the loop and release the timer whichever side won the race.
    finished = true;
    clearTimeout(timer);
  }
}

/**
 * Waits for an element to become enabled with a timeout
 * @param locator The Playwright locator to check
 * @param timeout Maximum time to wait in milliseconds
 * @returns Promise that resolves when element is enabled or rejects on timeout
 */
async function waitForElementToBeEnabled(
  locator: Locator,
  timeout: number = 5000
): Promise<void> {
  return pollUntil(
    () => locator.isEnabled(),
    timeout,
    "Timeout waiting for element to be enabled"
  );
}

/**
 * Waits for an element to become stable (not moving) with a timeout
 *
 * The element counts as stable once its bounding box is identical to the
 * previous sample for `MAX_STABLE_CHECKS` consecutive samples. Samples where
 * no bounding box is available are skipped without resetting the counter.
 *
 * @param locator The Playwright locator to check
 * @param timeout Maximum time to wait in milliseconds
 * @returns Promise that resolves when element is stable or rejects on timeout
 */
async function waitForElementToBeStable(
  locator: Locator,
  timeout: number = 5000
): Promise<void> {
  type Rect = { x: number; y: number; width: number; height: number };
  let previousRect: Rect | null = null;
  let stableCount = 0;

  const sampleIsStable = async (): Promise<boolean> => {
    const currentRect: Rect | null = await locator.boundingBox();
    if (!currentRect) {
      return false;
    }

    const last = previousRect;
    const unchanged =
      last !== null &&
      last.x === currentRect.x &&
      last.y === currentRect.y &&
      last.width === currentRect.width &&
      last.height === currentRect.height;

    if (unchanged) {
      stableCount++;
      if (stableCount >= MAX_STABLE_CHECKS) {
        return true;
      }
    } else {
      stableCount = 0;
    }

    previousRect = currentRect;
    return false;
  };

  return pollUntil(
    sampleIsStable,
    timeout,
    "Timeout waiting for element to be stable"
  );
}
/**
 * Builds the "complete" action for tasks that must answer in a caller-supplied
 * shape.
 *
 * The action's parameters are a `success` flag plus a nullable `outputSchema`
 * value described by the given zod object schema. `run` succeeds, returning
 * the value as `extract`, only when `success` is true and a value is present.
 * `completeAction` renders the value as pretty-printed JSON.
 *
 * @param outputSchema Zod object schema the final answer should fit.
 * @returns The action definition registered under the type "complete".
 */
export const generateCompleteActionWithOutputDefinition = (
  outputSchema: z.AnyZodObject
): AgentActionDefinition => {
  const successField = z
    .boolean()
    .describe("Whether the task was completed successfully.");
  const outputField = outputSchema
    .nullable()
    .describe(
      "The output model to return the response in. Given the previous data, try your best to fit the final response into the given schema."
    );
  const paramsSchema = z
    .object({ success: successField, outputSchema: outputField })
    .describe(
      "Complete the task. An output schema has been provided to you. Try your best to provide your response so that it fits the output schema provided."
    );

  type CompleteWithOutputParams = z.infer<typeof paramsSchema>;

  return {
    type: "complete" as const,
    actionParams: paramsSchema,
    run: async (
      ctx: ActionContext,
      actionParams: CompleteWithOutputParams
    ): Promise<ActionOutput> => {
      const produced = actionParams.success && actionParams.outputSchema;
      if (!produced) {
        return {
          success: false,
          message:
            "Could not complete task and/or could not extract response into output schema.",
        };
      }
      return {
        success: true,
        message: "The action generated an object",
        extract: actionParams.outputSchema,
      };
    },
    completeAction: async (params: CompleteWithOutputParams) =>
      JSON.stringify(params.outputSchema, null, 2),
  };
};
/**
 * Captures a PNG screenshot of `page` through a temporary CDP session and
 * returns it base64-encoded.
 *
 * The session is detached in a `finally` block so it is released even when
 * the capture fails, and the detach is awaited so it cannot surface later as
 * an unhandled rejection.
 */
async function capturePageScreenshot(
  page: ActionContext["page"]
): Promise<string> {
  const cdpSession = await page.context().newCDPSession(page);
  try {
    const screenshot = await cdpSession.send("Page.captureScreenshot");
    return screenshot.data;
  } finally {
    // Best effort: a failed detach must not mask the capture result or error.
    await cdpSession.detach().catch(() => undefined);
  }
}

/**
 * Agent action that asks the LLM to extract information from the current page.
 *
 * `run` converts the page HTML to markdown, takes a screenshot, trims the
 * markdown to a character budget derived from `ctx.tokenLimit`, and sends the
 * objective, the markdown and the screenshot to `ctx.llm`. When `ctx.debugDir`
 * is set, the screenshot and the trimmed markdown are also written there.
 * Every failure is reported as `{ success: false }` rather than thrown.
 */
export const ExtractActionDefinition: AgentActionDefinition = {
  type: "extract" as const,
  actionParams: ExtractAction,
  run: async (
    ctx: ActionContext,
    action: ExtractActionType
  ): Promise<ActionOutput> => {
    try {
      const content = await ctx.page.content();
      const markdown = await parseMarkdown(content);
      const objective = action.objective;

      // Take a screenshot of the page
      const screenshotData = await capturePageScreenshot(ctx.page);

      // Save screenshot to debug dir if exists
      if (ctx.debugDir) {
        fs.writeFileSync(
          `${ctx.debugDir}/extract-screenshot.png`,
          Buffer.from(screenshotData, "base64")
        );
      }

      // Trim markdown to stay within token limit
      // TODO: this is a hack, we should use a better token counting method
      const avgTokensPerChar = 0.75; // Conservative estimate of tokens per character
      const maxChars = Math.floor(ctx.tokenLimit / avgTokensPerChar);
      const trimmedMarkdown =
        markdown.length > maxChars
          ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
          : markdown;
      if (ctx.debugDir) {
        fs.writeFileSync(
          `${ctx.debugDir}/extract-markdown-content.md`,
          trimmedMarkdown
        );
      }

      const response = await ctx.llm.invoke([
        {
          role: "user",
          content: [
            {
              type: "text",
              text: `Extract the following information from the page according to this objective: "${objective}"\n\nPage content:\n${trimmedMarkdown}\nHere is a screenshot of the page:\n`,
            },
            {
              type: "image_url",
              image_url: {
                url: `data:image/png;base64,${screenshotData}`,
              },
            },
          ],
        },
      ]);
      // NOTE(review): `response.content` is interpolated as-is below; confirm
      // it is always a string for the configured LLM.
      if (response.content.length === 0) {
        return {
          success: false,
          message: `No content extracted from page.`,
        };
      }
      return {
        success: true,
        message: `Extracted content from page:\n${response.content}`,
      };
    } catch (error) {
      return {
        success: false,
        message: `Failed to extract content: ${error}`,
      };
    }
  },
  pprintAction: function (params: ExtractActionType): string {
    return `Extract content from page with objective: "${params.objective}"`;
  },
};
/**
 * Error type for an action type that has no entry in the action registry.
 * Having a dedicated class (with `name` set to "ActionNotFoundError") lets
 * callers tell this case apart from other errors.
 */
export class ActionNotFoundError extends Error {
  constructor(actionType: string) {
    const description = `Action type "${actionType}" not found in the action registry`;
    super(description);
    this.name = "ActionNotFoundError";

    // `captureStackTrace` only exists on V8; when present, use it so the
    // stack trace starts at the throw site rather than inside this constructor.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, ActionNotFoundError);
    }
  }
}
/**
 * Lookup table from lower-cased xdotool-style key names to Playwright key
 * names. Built once at module load instead of on every call, and stored in a
 * `Map` so that names such as "constructor" or "__proto__" cannot resolve to
 * members of `Object.prototype`.
 */
const XDOTOOL_TO_PLAYWRIGHT_KEYS: ReadonlyMap<string, string> = new Map(
  Object.entries({
    // Common / Basic Keys
    return: "Enter",
    enter: "Enter",
    tab: "Tab",
    backspace: "Backspace",
    up: "ArrowUp",
    down: "ArrowDown",
    left: "ArrowLeft",
    right: "ArrowRight",
    space: "Space",
    ctrl: "Control",
    control: "Control",
    alt: "Alt",
    shift: "Shift",
    meta: "Meta",
    command: "Meta",
    cmd: "Meta",
    windows: "Meta",
    esc: "Escape",
    escape: "Escape",
    // Numpad Keys
    kp_0: "Numpad0",
    kp_1: "Numpad1",
    kp_2: "Numpad2",
    kp_3: "Numpad3",
    kp_4: "Numpad4",
    kp_5: "Numpad5",
    kp_6: "Numpad6",
    kp_7: "Numpad7",
    kp_8: "Numpad8",
    kp_9: "Numpad9",
    // Numpad Operations
    kp_enter: "NumpadEnter",
    kp_multiply: "NumpadMultiply",
    kp_add: "NumpadAdd",
    kp_subtract: "NumpadSubtract",
    kp_decimal: "NumpadDecimal",
    kp_divide: "NumpadDivide",
    // Navigation
    page_down: "PageDown",
    page_up: "PageUp",
    home: "Home",
    end: "End",
    insert: "Insert",
    delete: "Delete",
    // Function Keys
    f1: "F1",
    f2: "F2",
    f3: "F3",
    f4: "F4",
    f5: "F5",
    f6: "F6",
    f7: "F7",
    f8: "F8",
    f9: "F9",
    f10: "F10",
    f11: "F11",
    f12: "F12",
    // Left/Right Variants
    shift_l: "ShiftLeft",
    shift_r: "ShiftRight",
    control_l: "ControlLeft",
    control_r: "ControlRight",
    alt_l: "AltLeft",
    alt_r: "AltRight",
    // Media Keys
    audiovolumemute: "AudioVolumeMute",
    audiovolumedown: "AudioVolumeDown",
    audiovolumeup: "AudioVolumeUp",
    // Additional Special Keys
    print: "PrintScreen",
    scroll_lock: "ScrollLock",
    pause: "Pause",
    menu: "ContextMenu",
  })
);

/**
 * Translates xdotool-like key strings to Playwright-compatible keys.
 * Reference: https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent/key/Key_Values
 *
 * Matching is case-insensitive and ignores surrounding whitespace, so the
 * pieces of a combination written as "ctrl + s" still resolve.
 *
 * @param key Key name as written by the caller, e.g. "Return", "KP_0", "a".
 * @returns The mapped Playwright key name, or `key` itself, unchanged, when
 *   the table has no entry for it.
 */
function translateKey(key: string): string {
  const normalized = key.trim().toLowerCase();
  return XDOTOOL_TO_PLAYWRIGHT_KEYS.get(normalized) ?? key;
}
/**
 * Agent action that moves the page one entry back in the browser history.
 *
 * `run` awaits `ctx.page.goBack()` and then always reports success.
 * NOTE(review): the value returned by `goBack()` is not inspected, so a page
 * with no history to go back to presumably still reports success. Confirm
 * whether that is intended.
 */
export const PageBackActionDefinition: AgentActionDefinition = {
  type: "pageBack" as const,
  actionParams: PageBackAction,

  async run(ctx: ActionContext) {
    await ctx.page.goBack();
    const outcome = {
      success: true,
      message: "Navigated back to the previous page",
    };
    return outcome;
  },

  pprintAction: (): string => "Navigate back to previous page",
};
/**
 * Downloads the PDF at `pdfUrl` using the page's browser context.
 *
 * A direct request is tried first, which works for direct PDF links. If that
 * does not answer with a PDF content type, the page navigates to the URL and
 * the matching PDF response is intercepted instead.
 *
 * @throws Whatever the underlying Playwright calls throw, for example on
 *   network errors or navigation/response timeouts.
 */
async function downloadPdf(
  page: ActionContext["page"],
  pdfUrl: string
): Promise<Buffer> {
  // Try direct request first (works for direct PDF links)
  const response = await page.request.get(pdfUrl);
  if (response.ok() && response.headers()["content-type"]?.includes("pdf")) {
    return Buffer.from(await response.body());
  }

  // Fallback: navigate and intercept response
  const [resp] = await Promise.all([
    page.waitForResponse(
      (r) => r.url() === pdfUrl && r.headers()["content-type"]?.includes("pdf")
    ),
    page.goto(pdfUrl, { waitUntil: "networkidle" }),
  ]);
  return Buffer.from(await resp.body());
}

/**
 * Agent action that downloads a PDF and asks Gemini a question about it.
 *
 * Every failure (missing `GEMINI_API_KEY`, download error, Gemini error) is
 * reported as `{ success: false, message }` instead of being thrown, so the
 * agent always receives a result it can act on.
 */
export const PDFActionDefinition: AgentActionDefinition = {
  type: "analyzePdf" as const,
  actionParams: PDFAction,
  run: async (ctx: ActionContext, action: PDFActionType) => {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      return {
        success: false,
        message: "GEMINI_API_KEY is not set; cannot analyze PDF.",
      };
    }
    const goog = new GoogleGenAI({ apiKey });
    const { pdfUrl, prompt } = action;

    let pdfBuffer: Buffer;
    try {
      pdfBuffer = await downloadPdf(ctx.page, pdfUrl);
    } catch (err) {
      return {
        success: false,
        message: `Failed to download PDF: ${err}`,
      };
    }

    try {
      const geminiResponse = await goog.models.generateContent({
        model: "gemini-2.5-pro-preview-03-25",
        contents: [
          { text: prompt },
          {
            inlineData: {
              mimeType: "application/pdf",
              data: pdfBuffer.toString("base64"),
            },
          },
        ],
      });
      return {
        success: true,
        message: geminiResponse.text || "No response text returned.",
      };
    } catch (err) {
      // Same result shape as a download failure, so callers handle both alike.
      return {
        success: false,
        message: `Failed to analyze PDF: ${err}`,
      };
    }
  },
  pprintAction: function (params: PDFActionType): string {
    return `Analyze PDF at URL: ${params.pdfUrl} with prompt: ${params.prompt}`;
  },
};
}; 18 | }, 19 | pprintAction: function(): string { 20 | return "Refresh current page"; 21 | }, 22 | }; 23 | -------------------------------------------------------------------------------- /src/agent/actions/scroll.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { ActionContext, AgentActionDefinition } from "@/types"; 3 | 4 | export const ScrollAction = z 5 | .object({ 6 | direction: z 7 | .enum(["up", "down", "left", "right"]) 8 | .describe("The direction to scroll."), 9 | }) 10 | .describe("Scroll in a specific direction in the browser"); 11 | 12 | export type ScrollActionType = z.infer; 13 | 14 | export const ScrollActionDefinition: AgentActionDefinition = { 15 | type: "scroll" as const, 16 | actionParams: ScrollAction, 17 | run: async (ctx: ActionContext, action: ScrollActionType) => { 18 | const { direction } = action; 19 | switch (direction) { 20 | case "up": 21 | await ctx.page.evaluate(() => window.scrollBy(0, -window.innerHeight)); 22 | break; 23 | case "down": 24 | await ctx.page.evaluate(() => window.scrollBy(0, window.innerHeight)); 25 | break; 26 | case "left": 27 | await ctx.page.evaluate(() => window.scrollBy(-window.innerWidth, 0)); 28 | break; 29 | case "right": 30 | await ctx.page.evaluate(() => window.scrollBy(window.innerWidth, 0)); 31 | break; 32 | } 33 | return { success: true, message: `Scrolled ${direction}` }; 34 | }, 35 | pprintAction: function(params: ScrollActionType): string { 36 | return `Scroll ${params.direction}`; 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/agent/actions/select-option.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { ActionContext, AgentActionDefinition } from "@/types"; 3 | import { getLocator } from "./utils"; 4 | 5 | export const SelectOptionAction = z 6 | .object({ 7 | index: z 8 | .number() 9 | 
.describe("The numeric index of the element to select an option."), 10 | text: z.string().describe("The text of the option to select."), 11 | }) 12 | .describe("Select an option from a dropdown element"); 13 | 14 | export type SelectOptionActionType = z.infer; 15 | 16 | export const SelectOptionActionDefinition: AgentActionDefinition = { 17 | type: "selectOption" as const, 18 | actionParams: SelectOptionAction, 19 | run: async (ctx: ActionContext, action: SelectOptionActionType) => { 20 | const { index, text } = action; 21 | const locator = getLocator(ctx, index); 22 | if (!locator) { 23 | return { success: false, message: "Element not found" }; 24 | } 25 | await locator.selectOption({ label: text }); 26 | return { 27 | success: true, 28 | message: `Selected option "${text}" from element with index ${index}`, 29 | }; 30 | }, 31 | pprintAction: function (params: SelectOptionActionType): string { 32 | return `Select option "${params.text}" from element at index ${params.index}`; 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/agent/actions/thinking.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { ActionContext, AgentActionDefinition } from "@/types"; 3 | 4 | export const ThinkingAction = z 5 | .object({ 6 | thought: z 7 | .string() 8 | .describe( 9 | "Think about what your current course of action, and your future steps, and what difficulties you might encounter, and how you'd tackle them." 10 | ), 11 | }) 12 | .describe( 13 | `Think about a course of action. Think what your current task is, what your next should be, and how you would possibly do that. 
This step is especially useful if performing a complex task, and/or working on a visually complex page (think nodes > 300).` 14 | ); 15 | 16 | export type ThinkingActionType = z.infer; 17 | 18 | export const ThinkingActionDefinition: AgentActionDefinition = { 19 | type: "thinkAction" as const, 20 | actionParams: ThinkingAction, 21 | run: async (ctx: ActionContext, action: ThinkingActionType) => { 22 | const { thought } = action; 23 | return { 24 | success: true, 25 | message: `A simple thought process about your next steps. You thought about: ${thought}`, 26 | }; 27 | }, 28 | pprintAction: function(params: ThinkingActionType): string { 29 | return `Think about: "${params.thought}"`; 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/agent/actions/utils.ts: -------------------------------------------------------------------------------- 1 | import { ActionContext } from "@hyperbrowser/agent/types"; 2 | 3 | export function getLocator(ctx: ActionContext, index: number) { 4 | const element = ctx.domState.elements.get(index); 5 | if (!element) { 6 | return null; 7 | } 8 | if (element.isUnderShadowRoot) { 9 | return ctx.page.locator(element.cssPath); 10 | } else { 11 | return ctx.page.locator(`xpath=${element.xpath}`); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/agent/error.ts: -------------------------------------------------------------------------------- 1 | export class HyperagentError extends Error { 2 | constructor( 3 | message: string, 4 | public statusCode?: number 5 | ) { 6 | super(`[Hyperagent]: ${message}`); 7 | this.name = "HyperagentError"; 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/agent/llms/structured-output.ts: -------------------------------------------------------------------------------- 1 | import { BaseChatModel } from "@langchain/core/language_models/chat_models"; 2 | 3 | 
/** 4 | * Determines the appropriate structured output method based on the LLM type 5 | * @param llm The language model instance 6 | * @returns The structured output method to use ("functionCalling" or "jsonMode") 7 | */ 8 | export function getStructuredOutputMethod(llm: BaseChatModel) { 9 | const modelName = llm.getName(); 10 | if (modelName === "ChatAnthropic") { 11 | return "functionCalling"; 12 | } else if (modelName === "ChatOpenAI") { 13 | return undefined; 14 | } 15 | 16 | // Default to functionCalling for other models 17 | return "functionCalling"; 18 | } 19 | -------------------------------------------------------------------------------- /src/agent/mcp/client.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { Client } from "@modelcontextprotocol/sdk/client/index.js"; 3 | import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; 4 | import { SSEClientTransport } from "@modelcontextprotocol/sdk/client/sse.js"; 5 | import { Tool } from "@modelcontextprotocol/sdk/types"; 6 | import { MCPServerConfig } from "@/types/config"; 7 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types"; 8 | import { v4 as uuidv4 } from "uuid"; 9 | 10 | interface ServerConnection { 11 | id: string; 12 | config: MCPServerConfig; 13 | client: Client; 14 | transport: StdioClientTransport | SSEClientTransport; 15 | tools: Map; 16 | actions: AgentActionDefinition[]; 17 | } 18 | 19 | class MCPClient { 20 | private servers: Map = new Map(); 21 | private debug: boolean; 22 | constructor(debug: boolean = false) { 23 | this.debug = debug; 24 | } 25 | 26 | /** 27 | * Connect to an MCP server and register its tools 28 | * @param serverConfig The server configuration 29 | * @returns List of action definitions provided by the server 30 | */ 31 | async connectToServer( 32 | serverConfig: MCPServerConfig 33 | ): Promise<{ serverId: string; actions: AgentActionDefinition[] }> { 
34 | try { 35 | // Generate or use provided server ID 36 | const serverId = serverConfig.id || uuidv4(); 37 | 38 | // Create transport for this server 39 | let transport; 40 | const connectionType = serverConfig?.connectionType || "stdio"; 41 | 42 | if (connectionType === "sse") { 43 | if (!serverConfig.sseUrl) { 44 | throw new Error("SSE URL is required for SSE connection type"); 45 | } 46 | 47 | if (this.debug) { 48 | console.log( 49 | `Establishing SSE connection to ${serverConfig.sseUrl}...` 50 | ); 51 | } 52 | 53 | transport = new SSEClientTransport( 54 | new URL(serverConfig.sseUrl), 55 | serverConfig.sseHeaders 56 | ? { 57 | requestInit: { 58 | headers: serverConfig.sseHeaders, 59 | }, 60 | } 61 | : undefined 62 | ); 63 | 64 | transport.onerror = (error: any) => { 65 | console.error(`SSE error: ${error.message}`); 66 | }; 67 | } else { 68 | if (!serverConfig.command) { 69 | throw new Error("Command is required for stdio connection type"); 70 | } 71 | 72 | transport = new StdioClientTransport({ 73 | command: serverConfig.command, 74 | args: serverConfig.args, 75 | env: { 76 | ...((process.env ?? {}) as Record), 77 | ...(serverConfig.env ?? {}), 78 | }, 79 | // Pipe stdin/stdout, ignore stderr 80 | stderr: this.debug ? 
"inherit" : "ignore", 81 | }); 82 | } 83 | 84 | const client = new Client({ 85 | name: `hyperagent-mcp-client-${serverId}`, 86 | version: "1.0.0", 87 | }); 88 | 89 | await client.connect(transport); 90 | 91 | const toolsResult = await client.listTools(); 92 | const toolsMap = new Map(); 93 | 94 | // Create actions for each tool 95 | const actions = toolsResult.tools 96 | .filter((tool) => { 97 | if ( 98 | serverConfig.includeTools && 99 | !serverConfig.includeTools.includes(tool.name) 100 | ) { 101 | return false; 102 | } 103 | if ( 104 | serverConfig.excludeTools && 105 | serverConfig.excludeTools.includes(tool.name) 106 | ) { 107 | return false; 108 | } 109 | return true; 110 | }) 111 | .map((tool) => { 112 | // Store tool reference for later use 113 | toolsMap.set(tool.name, tool); 114 | 115 | // Create action definition 116 | return { 117 | type: tool.name, 118 | actionParams: z 119 | .object({ 120 | params: z 121 | .string() 122 | .describe( 123 | `The stringified parameters to the ${tool.name} MCP tool. Here is the schema: ${JSON.stringify(tool.inputSchema)}` 124 | ), 125 | }) 126 | .describe(tool.description ?? ""), 127 | run: async ( 128 | ctx: ActionContext, 129 | action: any 130 | ): Promise => { 131 | if (!ctx.mcpClient) { 132 | throw new Error( 133 | "MCP client not available. Please ensure an MCP server is connected." 
134 | ); 135 | } 136 | 137 | const params = JSON.parse(action.params); 138 | const targetServerId = serverId; 139 | 140 | const result = await ctx.mcpClient.executeTool( 141 | tool.name, 142 | params, 143 | targetServerId 144 | ); 145 | 146 | return { 147 | success: true, 148 | message: `MCP tool ${tool.name} execution successful: ${JSON.stringify(result)}`, 149 | }; 150 | }, 151 | }; 152 | }); 153 | 154 | // Store server connection 155 | this.servers.set(serverId, { 156 | id: serverId, 157 | config: serverConfig, 158 | client, 159 | transport, 160 | tools: toolsMap, 161 | actions, 162 | }); 163 | if (this.debug) { 164 | console.log(`Connected to MCP server with ID: ${serverId}`); 165 | console.log("Added tools:", Array.from(toolsMap.keys())); 166 | } 167 | return { serverId, actions }; 168 | } catch (e) { 169 | console.error("Failed to connect to MCP server: ", e); 170 | throw e; 171 | } 172 | } 173 | 174 | /** 175 | * Execute a tool on a specific server 176 | * @param toolName The name of the tool to execute 177 | * @param parameters The parameters to pass to the tool 178 | * @param serverId The ID of the server to use (optional) 179 | * @returns The result of the tool execution 180 | */ 181 | async executeTool( 182 | toolName: string, 183 | parameters: Record, 184 | serverId?: string 185 | ): Promise { 186 | // If no server ID provided and only one server exists, use that one 187 | if (!serverId && this.servers.size === 1) { 188 | serverId = [...this.servers.keys()][0]; 189 | } 190 | 191 | // If no server ID provided and multiple servers exist, try to find one with the tool 192 | if (!serverId && this.servers.size > 1) { 193 | for (const [id, server] of this.servers.entries()) { 194 | if (server.tools.has(toolName)) { 195 | serverId = id; 196 | break; 197 | } 198 | } 199 | } 200 | 201 | if (!serverId || !this.servers.has(serverId)) { 202 | throw new Error(`No valid server found for tool ${toolName}`); 203 | } 204 | 205 | const server = 
this.servers.get(serverId); 206 | if (!server) { 207 | throw new Error(`Server with ID ${serverId} not found`); 208 | } 209 | 210 | try { 211 | const result = await server.client.callTool({ 212 | name: toolName, 213 | arguments: parameters, 214 | }); 215 | 216 | return result; 217 | } catch (e) { 218 | console.error( 219 | `Error executing tool ${toolName} on server ${serverId}:`, 220 | e 221 | ); 222 | throw e; 223 | } 224 | } 225 | 226 | /** 227 | * Get all registered action definitions from all connected servers 228 | * @returns Array of action definitions 229 | */ 230 | getAllActions(): AgentActionDefinition[] { 231 | const allActions: AgentActionDefinition[] = []; 232 | for (const server of this.servers.values()) { 233 | allActions.push(...server.actions); 234 | } 235 | return allActions; 236 | } 237 | 238 | /** 239 | * Get the IDs of all connected servers 240 | * @returns Array of server IDs 241 | */ 242 | getServerIds(): string[] { 243 | return [...this.servers.keys()]; 244 | } 245 | 246 | /** 247 | * Disconnect from a specific server 248 | * @param serverId The ID of the server to disconnect from 249 | */ 250 | async disconnectServer(serverId: string): Promise { 251 | const server = this.servers.get(serverId); 252 | if (server) { 253 | await server.transport.close(); 254 | this.servers.delete(serverId); 255 | if (this.debug) { 256 | console.log(`Disconnected from MCP server with ID: ${serverId}`); 257 | } 258 | } 259 | } 260 | 261 | /** 262 | * Disconnect from all servers 263 | */ 264 | async disconnect(): Promise { 265 | for (const serverId of this.servers.keys()) { 266 | await this.disconnectServer(serverId); 267 | } 268 | } 269 | 270 | /** 271 | * Check if a tool exists on any connected server 272 | * @param toolName The name of the tool to check 273 | * @returns Boolean indicating if the tool exists and the server ID it exists on 274 | */ 275 | hasTool(toolName: string): { exists: boolean; serverId?: string } { 276 | for (const [serverId, server] of 
this.servers.entries()) { 277 | if (server.tools.has(toolName)) { 278 | return { exists: true, serverId }; 279 | } 280 | } 281 | return { exists: false }; 282 | } 283 | 284 | /** 285 | * Get information about all connected servers 286 | * @returns Array of server information objects 287 | */ 288 | getServerInfo(): Array<{ 289 | id: string; 290 | toolCount: number; 291 | toolNames: string[]; 292 | }> { 293 | return Array.from(this.servers.entries()).map(([id, server]) => ({ 294 | id, 295 | toolCount: server.tools.size, 296 | toolNames: Array.from(server.tools.keys()), 297 | })); 298 | } 299 | 300 | /** 301 | * Check if any servers are connected 302 | * @returns Boolean indicating if any servers are connected 303 | */ 304 | hasConnections(): boolean { 305 | return this.servers.size > 0; 306 | } 307 | } 308 | 309 | export { MCPClient }; 310 | -------------------------------------------------------------------------------- /src/agent/messages/builder.ts: -------------------------------------------------------------------------------- 1 | import { AgentStep } from "@/types"; 2 | import { BaseMessageLike } from "@langchain/core/messages"; 3 | import { Page } from "playwright"; 4 | import { getScrollInfo } from "./utils"; 5 | import { retry } from "@/utils/retry"; 6 | import { DOMState } from "@/context-providers/dom/types"; 7 | import { HyperVariable } from "@/types/agent/types"; 8 | 9 | export const buildAgentStepMessages = async ( 10 | baseMessages: BaseMessageLike[], 11 | steps: AgentStep[], 12 | task: string, 13 | page: Page, 14 | domState: DOMState, 15 | screenshot: string, 16 | variables: HyperVariable[] 17 | ): Promise => { 18 | const messages = [...baseMessages]; 19 | 20 | // Add the final goal section 21 | messages.push({ 22 | role: "user", 23 | content: `=== Final Goal ===\n${task}\n`, 24 | }); 25 | 26 | // Add current URL section 27 | messages.push({ 28 | role: "user", 29 | content: `=== Current URL ===\n${page.url()}\n`, 30 | }); 31 | 32 | // Add variables 
section 33 | messages.push({ 34 | role: "user", 35 | content: `=== Variables ===\n${variables.map((v) => `<<${v.key}>> - ${v.description}`).join("\n")}\n`, 36 | }); 37 | 38 | // Add previous actions section if there are steps 39 | if (steps.length > 0) { 40 | messages.push({ 41 | role: "user", 42 | content: "=== Previous Actions ===\n", 43 | }); 44 | for (const step of steps) { 45 | messages.push({ 46 | role: "ai", 47 | content: JSON.stringify(step.agentOutput), 48 | }); 49 | for (const actionOutput of step.actionOutputs) { 50 | messages.push({ 51 | role: "user", 52 | content: actionOutput.extract 53 | ? `${actionOutput.message} :\n ${JSON.stringify(actionOutput.extract)}` 54 | : actionOutput.message, 55 | }); 56 | } 57 | } 58 | } 59 | 60 | // Add elements section with DOM tree 61 | messages.push({ 62 | role: "user", 63 | content: `=== Elements ===\n${domState.domState}\n`, 64 | }); 65 | 66 | // Add page screenshot section 67 | const scrollInfo = await retry({ func: () => getScrollInfo(page) }); 68 | messages.push({ 69 | role: "user", 70 | content: [ 71 | { 72 | type: "text", 73 | text: "=== Page Screenshot ===\n", 74 | }, 75 | { 76 | type: "image_url", 77 | image_url: { 78 | url: `data:image/png;base64,${screenshot}`, 79 | }, 80 | }, 81 | { 82 | type: "text", 83 | text: `=== Page State ===\nPixels above: ${scrollInfo[0]}\nPixels below: ${scrollInfo[1]}\n`, 84 | }, 85 | ], 86 | }); 87 | 88 | return messages; 89 | }; 90 | -------------------------------------------------------------------------------- /src/agent/messages/examples-actions.ts: -------------------------------------------------------------------------------- 1 | export const EXAMPLE_ACTIONS = `- Search: [ 2 | {"type": "textInput", "params": {"text": "search query"}}, 3 | {"type": "keyPress", "params": {"key": "Enter"}} 4 | ] 5 | - Clicking on an element: [ 6 | {"type": "clickElement", "params": {"index": 1}} 7 | ] 8 | - Extracting content (if your goal is to find any information on a page): [ 9 | 
{"type": "extractContent", "params": {"goal": "what specifically you need to extract"}} 10 | ] 11 | - Forms: [ 12 | {"type": "inputText", "params": {"index": 1, "text": "first name"}}, 13 | {"type": "inputText", "params": {"index": 2, "text": "last name"}}, 14 | {"type": "inputText", "params": {"index": 2, "text": "job title"}}, 15 | {"type": "clickElement", "params": {"index": 3}} 16 | ]`; 17 | -------------------------------------------------------------------------------- /src/agent/messages/input-format.ts: -------------------------------------------------------------------------------- 1 | export const INPUT_FORMAT = `=== Final Goal === 2 | [The final goal that needs to be accomplished] 3 | === Open Tabs === 4 | [The open tabs] 5 | === Current URL === 6 | [The current URL] 7 | === Variables === 8 | [Variables that can be used in the task] 9 | - Variables are referenced using <> syntax 10 | - Each variable has a name and description 11 | - Variables persist across actions and can be referenced in subsequent steps 12 | - Format: <> - {description} 13 | === Elements === 14 | [A list of the elements on the page in the following format] 15 | [index]value 16 | - type: HTML element type (button, input, etc.) 17 | - index: Numeric identifier for interaction 18 | - attributes: All HTML attributes of the element like type, name, value, class, etc. 
This can include: 19 | * Data attributes 20 | * ARIA attributes 21 | * Custom attributes 22 | * Any other valid HTML attributes 23 | * The attributes provide important context about the element's behavior, accessibility, and styling 24 | === Previous Actions === 25 | [The previous steps of the task] 26 | === Page Screenshot === 27 | - A screenshot of the current page with the interactive elements highlighted with their index 28 | === Page State === 29 | - Pixels below 30 | - Pixels above`; 31 | -------------------------------------------------------------------------------- /src/agent/messages/output-format.ts: -------------------------------------------------------------------------------- 1 | export const OUTPUT_FORMAT = `Your response MUST be in this exact format: 2 | { 3 | "thoughts": "Your thoughts on the task at hand, was the previous goal successful?", 4 | "memory": "Information that you need to remember to accomplish subsequent goals", 5 | "nextGoal": "The next goal you are trying to accomplish with the actions you have chosen", 6 | "actions": [ 7 | { 8 | "action": "The action you will take", 9 | "params": { 10 | ...Action Arguments... 11 | } 12 | } 13 | ] 14 | }` -------------------------------------------------------------------------------- /src/agent/messages/system-prompt.ts: -------------------------------------------------------------------------------- 1 | import { INPUT_FORMAT } from "./input-format"; 2 | import { OUTPUT_FORMAT } from "./output-format"; 3 | import { EXAMPLE_ACTIONS } from "./examples-actions"; 4 | 5 | const DATE_STRING = new Date().toLocaleString(undefined, { 6 | year: "numeric", 7 | month: "2-digit", 8 | day: "2-digit", 9 | weekday: "long", 10 | }); 11 | 12 | export const SYSTEM_PROMPT = `You are a smart and sophisticated agent that is designed to automate web browser interactions. 13 | You try to accomplish goals in a quick and concise manner. 
14 | Your goal is to accomplish the final goal following the rules by using the provided actions and breaking down the task into smaller steps. 15 | You are provided with a set of actions that you can use to accomplish the task. 16 | 17 | # World State 18 | The current Date is ${DATE_STRING}. The date format is MM/DD/YYYY. 19 | 20 | # Input Format 21 | ${INPUT_FORMAT} 22 | 23 | # Output Format 24 | ${OUTPUT_FORMAT} 25 | 26 | ## Action Rules: 27 | - You can run multiple actions in the output, they will be executed in the given order 28 | - If you do run multiple actions, sequence similar ones together for efficiency. 29 | - Do NOT run actions that change the page entirely, you will get the new DOM after those actions and you can run the next actions then. 30 | - Use a maximum of 25 actions per sequence. 31 | 32 | ## Action Execution: 33 | - Actions are executed in the given order 34 | - If the page changes after an action, the sequence is interrupted and you get the new state. 35 | 36 | ## Common action examples: 37 | ${EXAMPLE_ACTIONS} 38 | 39 | # Rules 40 | 1. FINAL GOAL COMPLETION: 41 | - Only use the "complete" action when you have fully accomplished everything specified in the task 42 | - The "complete" action must be the final action in your sequence 43 | - Before using "complete", verify you have gathered all requested information and met all task requirements 44 | - Include detailed results in the "complete" action's text parameter to show how you satisfied each requirement 45 | 46 | 2. Validation: 47 | - Before you finish up your task, call the taskCompleteValidation. It will double check your task and it's subtasks. That will be used to see if you're done with all tasks and subtasks of that at this point. You **MUST** run this before performing a tool call to the "complete" tool. 48 | 49 | # Guidelines 50 | 1. 
NAVIGATION 51 | - If no suitable elements exist, use other functions to complete the task 52 | - Use scroll to find elements you are looking for 53 | - If you want to research something, open a new tab instead of using the current tab 54 | 55 | 2. GETTING UNSTUCK 56 | - Avoid getting stuck in loops. 57 | * You know your previous actions, and you know your current state. Do not keep repeating yourself expecting something to change. 58 | - If stuck, try: 59 | * Going back to a previous page 60 | * Starting a new search 61 | * Opening a new tab 62 | * Using alternative navigation paths 63 | * Trying a different website or source 64 | * Use the thinking action to think about the task and how to accomplish it 65 | 66 | 3. SPECIAL CASES 67 | - Cookies: Either try accepting the banner or closing it 68 | - Captcha: First try to solve it, otherwise try to refresh the website, if that doesn't work, try a different method to accomplish the task 69 | 70 | 4. Form filling: 71 | - If your action sequence is interrupted after filling an input field, it likely means the page changed (e.g., autocomplete suggestions appeared). 72 | - When suggestions appear, select an appropriate one before continuing. Important thing to note with this, you should prioritize selecting the most specific/detailed option when hierarchical or nested options are available. 73 | - For date selection, use the calendar/date picker controls (usually arrows to navigate through the months and years) or type the date directly into the input field rather than scrolling. Ensure the dates selected are the correct ones. 74 | - After completing all form fields, remember to click the submit/search button to process the form. 75 | 76 | 5. 
For Date Pickers with Calendars: 77 | - First try to type the date directly into the input field and send the enter key press action 78 | * Be sure to send the enter key press action after typing the date, if you don't do that, the date will not be selected 79 | - If that doesn't work, use the right arrow key to navigate through months and years until finding the correct date 80 | * Be patient and persistent with calendar navigation - it may take multiple attempts to reach the target month/year 81 | * Verify the correct date is selected before proceeding 82 | 83 | 5. For Flight Search: 84 | - If you are typing in the where from, ALWAYS send an enter key press action after typing the value 85 | - If you are typing in the where to, ALWAYS send an enter key press action after typing the value 86 | 87 | 5. For flight sources and destinations: 88 | - Send enter key press action after typing the source or destination 89 | 90 | # Search Strategy 91 | When searching, follow these best practices: 92 | 93 | 1. Primary Search Method: 94 | - Use textInput action followed by keyPress action with 'Enter' 95 | - If unsuccessful, look for clickable 'Search' text or magnifying glass icon 96 | - Only click search elements that are marked as interactive 97 | 98 | 2. Query Construction: 99 | - Search Engines (Google, Bing): 100 | * Can handle complex, natural language queries 101 | * Example: "trending python repositories" or "wizards latest game score" 102 | 103 | - Specific Websites: 104 | * Use simpler, more targeted queries 105 | * Follow up with filters and sorting 106 | * Example on GitHub: Search "language:python", then sort by trending/stars 107 | * Example on ESPN: Search "wizards", navigate to team page, find latest score 108 | 109 | 3. 
Important Considerations: 110 | - For date-based queries, use current date: ${DATE_STRING} 111 | - Use relative dates only when explicitly requested 112 | - With autocomplete: 113 | * You can ignore suggestions and enter custom input 114 | * Verify suggested options match requirements before selecting 115 | 116 | 4. Search Refinement: 117 | - Use available filters and sort options 118 | - Consider in-memory filtering when site options are limited 119 | - Break down complex searches into smaller, manageable steps 120 | `; 121 | -------------------------------------------------------------------------------- /src/agent/messages/utils.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "playwright"; 2 | 3 | export const getScrollInfo = async (page: Page): Promise<[number, number]> => { 4 | const scrollY = (await page.evaluate("window.scrollY")) as number; 5 | const viewportHeight = (await page.evaluate("window.innerHeight")) as number; 6 | const totalHeight = (await page.evaluate( 7 | "document.documentElement.scrollHeight" 8 | )) as number; 9 | const pixelsAbove = scrollY; 10 | const pixelsBelow = totalHeight - (scrollY + viewportHeight); 11 | return [pixelsAbove, pixelsBelow]; 12 | }; 13 | -------------------------------------------------------------------------------- /src/agent/tools/agent.ts: -------------------------------------------------------------------------------- 1 | import { AgentStep } from "@/types/agent/types"; 2 | import fs from "fs"; 3 | 4 | import { 5 | ActionContext, 6 | ActionOutput, 7 | ActionType, 8 | AgentActionDefinition, 9 | } from "@/types"; 10 | import { getDom } from "@/context-providers/dom"; 11 | import { retry } from "@/utils/retry"; 12 | import { sleep } from "@/utils/sleep"; 13 | 14 | import { AgentOutputFn, endTaskStatuses } from "@hyperbrowser/agent/types"; 15 | import { 16 | TaskParams, 17 | TaskOutput, 18 | TaskState, 19 | TaskStatus, 20 | } from 
// ---- src/agent/tools/agent.ts (definitions) ----

/** Prefix of a PNG data URL; removed before the payload is decoded as base64. */
const PNG_DATA_URL_PREFIX = "data:image/png;base64,";

/**
 * Takes a screenshot of `page` and draws the base64-encoded PNG `overlay`
 * on top of it.
 *
 * @param page Playwright page to capture.
 * @param overlay Base64 PNG payload (no data-URL prefix).
 * @returns The composited image as a base64-encoded PNG.
 */
const compositeScreenshot = async (
  page: Page,
  overlay: string
): Promise<string> => {
  const screenshot = await page.screenshot();
  const composited = await sharp(screenshot)
    .composite([{ input: Buffer.from(overlay, "base64") }])
    .png()
    .toBuffer();
  return composited.toString("base64");
};

/**
 * Builds the zod union describing every action the LLM may emit.
 *
 * @param actions Registered action definitions.
 * @returns A union with one object schema per action.
 * @throws HyperagentError when fewer than two actions are registered,
 *   because `z.union` needs at least two members.
 */
const getActionSchema = (actions: Array<AgentActionDefinition>) => {
  const zodDefs = actions.map((action) =>
    z.object({
      type: z.nativeEnum([action.type] as unknown as z.EnumLike),
      params: action.actionParams,
      actionDescription: z
        .string()
        .describe(
          "Describe why you are performing this action and what you aim to perform with this action."
        ),
    })
  );
  if (zodDefs.length < 2) {
    throw new HyperagentError(
      `At least two actions are required to build the action schema, got ${zodDefs.length}`
    );
  }
  // Destructure instead of splice(): same members, no mutation of zodDefs.
  const [first, second, ...rest] = zodDefs;
  return z.union([first, second, ...rest]);
};

/**
 * Looks up the `run` handler registered for an action type.
 *
 * @throws ActionNotFoundError when no definition has the given type.
 */
const getActionHandler = (
  actions: Array<AgentActionDefinition>,
  type: string
) => {
  const definition = actions.find((candidate) => candidate.type === type);
  if (!definition) {
    throw new ActionNotFoundError(type);
  }
  return definition.run;
};

/**
 * Executes a single action and converts every failure into an unsuccessful
 * `ActionOutput` instead of letting it escape.
 *
 * The handler lookup happens inside the `try` so that an unknown action type
 * is reported as a failed action rather than rejecting the caller.
 */
const runAction = async (
  action: ActionType,
  domState: DOMState,
  page: Page,
  ctx: AgentCtx
): Promise<ActionOutput> => {
  const actionCtx: ActionContext = {
    domState,
    page,
    tokenLimit: ctx.tokenLimit,
    llm: ctx.llm,
    debugDir: ctx.debugDir,
    mcpClient: ctx.mcpClient || undefined,
    variables: Object.values(ctx.variables),
  };
  try {
    const actionHandler = getActionHandler(ctx.actions, action.type);
    return await actionHandler(actionCtx, action.params);
  } catch (error) {
    if (error instanceof ActionNotFoundError) {
      return {
        success: false,
        message: `Unknown action type: ${action.type}`,
      };
    }
    return {
      success: false,
      message: `Action ${action.type} failed: ${error}`,
    };
  }
};

/**
 * Drives one agent task: repeatedly captures the DOM state, asks the LLM for
 * the next actions, runs them and records the step, until the task reaches an
 * end status or `params.maxSteps` is hit (in which case it is cancelled).
 *
 * @param ctx Agent context (LLM, actions, variables, debug settings).
 * @param taskState Mutable state of the task; `status` and `steps` are updated.
 * @param params Optional callbacks, step limit and debug directory.
 * @returns The final status, all recorded steps and the output of the
 *   `complete` action (empty string when the task never completed).
 * @throws HyperagentError when `taskState` or `ctx.llm` is missing.
 */
export const runAgentTask = async (
  ctx: AgentCtx,
  taskState: TaskState,
  params?: TaskParams
): Promise<TaskOutput> => {
  // Validate before touching taskState.id; the original dereferenced first.
  if (!taskState) {
    throw new HyperagentError("Task state not found");
  }
  const taskId = taskState.id;
  const debugDir = params?.debugDir || `debug/${taskId}`;
  if (ctx.debug) {
    console.log(`Debugging task ${taskId} in ${debugDir}`);
  }

  taskState.status = TaskStatus.RUNNING as TaskStatus;
  if (!ctx.llm) {
    throw new HyperagentError("LLM not initialized");
  }
  const llmStructured = ctx.llm.withStructuredOutput(
    AgentOutputFn(getActionSchema(ctx.actions)),
    {
      method: getStructuredOutputMethod(ctx.llm),
    }
  );
  const baseMsgs = [{ role: "system", content: SYSTEM_PROMPT }];

  let output = "";
  const page = taskState.startingPage;
  let currStep = 0;
  while (true) {
    // Status checks
    if ((taskState.status as TaskStatus) === TaskStatus.PAUSED) {
      await sleep(100);
      continue;
    }
    if (endTaskStatuses.has(taskState.status)) {
      break;
    }
    if (params?.maxSteps && currStep >= params.maxSteps) {
      taskState.status = TaskStatus.CANCELLED;
      break;
    }
    const debugStepDir = `${debugDir}/step-${currStep}`;
    if (ctx.debug) {
      // recursive: true also creates debugDir itself.
      fs.mkdirSync(debugStepDir, { recursive: true });
    }

    // Get DOM state
    const domState = await retry({ func: () => getDom(page) });
    if (!domState) {
      console.log("no dom state, waiting 1 second.");
      await sleep(1000);
      continue;
    }

    const overlay = domState.screenshot.startsWith(PNG_DATA_URL_PREFIX)
      ? domState.screenshot.slice(PNG_DATA_URL_PREFIX.length)
      : domState.screenshot;
    const trimmedScreenshot = await compositeScreenshot(page, overlay);

    // Store DOM state for debugging
    if (ctx.debug) {
      fs.writeFileSync(`${debugStepDir}/elems.txt`, domState.domState);
      if (trimmedScreenshot) {
        fs.writeFileSync(
          `${debugStepDir}/screenshot.png`,
          Buffer.from(trimmedScreenshot, "base64")
        );
      }
    }

    // Build agent step messages
    const msgs = await buildAgentStepMessages(
      baseMsgs,
      taskState.steps,
      taskState.task,
      page,
      domState,
      trimmedScreenshot,
      Object.values(ctx.variables)
    );

    // Store agent step messages for debugging
    if (ctx.debug) {
      fs.writeFileSync(
        `${debugStepDir}/msgs.json`,
        JSON.stringify(msgs, null, 2)
      );
    }

    // Invoke LLM
    const agentOutput = await retry({
      func: () => llmStructured.invoke(msgs),
    });

    params?.debugOnAgentOutput?.(agentOutput);

    // The status may have changed while the LLM call was in flight.
    if ((taskState.status as TaskStatus) === TaskStatus.PAUSED) {
      await sleep(100);
      continue;
    }
    if (endTaskStatuses.has(taskState.status)) {
      break;
    }

    // Run actions
    const actionOutputs: ActionOutput[] = [];
    for (const action of agentOutput.actions) {
      if (action.type === "complete") {
        taskState.status = TaskStatus.COMPLETED;
        const completeDefinition = ctx.actions.find(
          (definition) => definition.type === "complete"
        );
        output =
          (await completeDefinition?.completeAction?.(action.params)) ??
          "No complete action found";
      }
      const actionOutput = await runAction(
        action as ActionType,
        domState,
        page,
        ctx
      );
      actionOutputs.push(actionOutput);
      await sleep(2000); // TODO: look at this - smarter page loading
    }
    const step: AgentStep = {
      idx: currStep,
      agentOutput: agentOutput,
      actionOutputs,
    };
    taskState.steps.push(step);
    await params?.onStep?.(step);
    currStep = currStep + 1;

    if (ctx.debug) {
      fs.writeFileSync(
        `${debugStepDir}/stepOutput.json`,
        JSON.stringify(step, null, 2)
      );
    }
  }

  const taskOutput: TaskOutput = {
    status: taskState.status,
    steps: taskState.steps,
    output,
  };
  if (ctx.debug) {
    fs.writeFileSync(
      `${debugDir}/taskOutput.json`,
      JSON.stringify(taskOutput, null, 2)
    );
  }
  await params?.onComplete?.(taskOutput);
  return taskOutput;
};

// ---- src/agent/tools/types.ts (definitions) ----

/** Shared context handed to the task runner and, in part, to every action. */
export interface AgentCtx {
  mcpClient?: MCPClient;
  debugDir?: string;
  debug?: boolean;
  // NOTE(review): type arguments were stripped from the dump; the key/value
  // types below are reconstructed from how the fields are used — confirm.
  variables: Record<string, HyperVariable>;
  actions: Array<AgentActionDefinition>;
  tokenLimit: number;
  llm: BaseChatModel;
}
// ---- src/browser-providers/hyperbrowser.ts (definitions) ----

/**
 * Browser provider backed by a remote Hyperbrowser session, reached over CDP.
 *
 * NOTE(review): the type arguments on `BrowserProvider` and `Omit` were
 * stripped from the dump and are reconstructed here from usage — confirm
 * against the base class declaration.
 */
export class HyperbrowserProvider extends BrowserProvider<SessionDetail> {
  browserConfig: Omit<ConnectOverCDPOptions, "endpointURL"> | undefined;
  sessionConfig: CreateSessionParams | undefined;
  config: HyperbrowserConfig | undefined;
  browser: Browser | undefined;
  session: SessionDetail | undefined;
  hbClient: Hyperbrowser | undefined;
  debug: boolean;

  constructor(params?: {
    debug?: boolean;
    browserConfig?: Omit<ConnectOverCDPOptions, "endpointURL">;
    sessionConfig?: CreateSessionParams;
    config?: HyperbrowserConfig;
  }) {
    super();
    this.debug = params?.debug ?? false;
    this.browserConfig = params?.browserConfig;
    this.sessionConfig = params?.sessionConfig;
    this.config = params?.config;
  }

  /**
   * Creates a Hyperbrowser session and connects Playwright to it.
   *
   * The client and session are stored before connecting so that `close()`
   * can still stop the session if the CDP connection fails.
   */
  async start(): Promise<Browser> {
    const client = new Hyperbrowser(this.config);
    const session = await client.sessions.create(this.sessionConfig);
    this.hbClient = client;
    this.session = session;
    this.browser = await chromium.connectOverCDP(
      session.wsEndpoint,
      this.browserConfig
    );

    if (this.debug) {
      console.log(
        "\nHyperbrowser session info:",
        {
          liveUrl: session.liveUrl,
          sessionID: session.id,
          infoUrl: session.sessionUrl,
        },
        "\n"
      );
    }

    return this.browser;
  }

  /**
   * Closes the browser connection and stops the remote session.
   *
   * The session is stopped in `finally` so a failing `browser.close()` does
   * not leave it running.
   */
  async close(): Promise<void> {
    try {
      await this.browser?.close();
    } finally {
      if (this.session) {
        await this.hbClient?.sessions.stop(this.session.id);
      }
    }
  }

  /** Returns the active session, or `null` when `start()` has not created one. */
  public getSession() {
    return this.session ?? null;
  }
}
// ---- src/browser-providers/local.ts (definitions) ----

/**
 * Launch options accepted by the local provider. `headless` and `channel`
 * are excluded because `start()` always sets them itself.
 *
 * NOTE(review): reconstructed from the stripped `Omit, "channel">` in the
 * dump; equivalent to the nested `Omit<Omit<LaunchOptions, "headless">,
 * "channel">` form — confirm.
 */
type LocalLaunchOptions = Omit<LaunchOptions, "headless" | "channel">;

/** Browser provider that launches a headed local Chrome through Playwright. */
export class LocalBrowserProvider extends BrowserProvider<Browser> {
  options: LocalLaunchOptions | undefined;
  session: Browser | undefined;

  constructor(options?: LocalLaunchOptions) {
    super();
    this.options = options;
  }

  /**
   * Launches Chrome. Caller-supplied `args` are appended after the flag that
   * disables Blink's automation-controlled feature.
   */
  async start(): Promise<Browser> {
    const launchArgs = this.options?.args ?? [];
    const browser = await chromium.launch({
      ...(this.options ?? {}),
      channel: "chrome",
      headless: false,
      args: ["--disable-blink-features=AutomationControlled", ...launchArgs],
    });
    this.session = browser;
    return this.session;
  }

  /** Closes the launched browser, if any. */
  async close(): Promise<void> {
    return await this.session?.close();
  }

  /** Returns the launched browser, or `null` before `start()` has run. */
  public getSession() {
    return this.session ?? null;
  }
}
// ---- src/cli/index.ts (definitions) ----

const program = new Command();

// Replaced with a fresh instance every time a line is persisted to the terminal.
let currentSpinner = ora();

/** Puts stdin into raw mode when it is a TTY; piped stdin has no raw mode. */
const enableRawMode = (): void => {
  if (process.stdin.isTTY) {
    process.stdin.setRawMode(true);
  }
};

/** Picks the tree glyph for entry `index` of a list with `total` entries. */
const treeBranch = (index: number, total: number): string =>
  index < total - 1 ? "├──" : "└──";

program
  .name("hyperbrowser")
  .description("CLI for Hyperbrowser - A powerful browser automation tool")
  .version("0.0.1");

program
  .command("run", { isDefault: true })
  .description("Run the interactive CLI")
  .option("-d, --debug", "Enable debug mode")
  // NOTE(review): the <...> value placeholders were stripped from the dump.
  // Without them commander parses these options as boolean flags; the
  // placeholder names below are reconstructed — confirm.
  .option("-c, --command <task description>", "Command to run")
  .option("-f, --file <file path>", "Path to a file containing a command")
  .option("-m, --mcp <mcp config file>", "Path to a file containing mcp config")
  .option("--hyperbrowser", "Use Hyperbrowser for the browser provider")
  .action(async function () {
    const options = this.opts();
    const debug = (options.debug as boolean) || false;
    const useHB = (options.hyperbrowser as boolean) || false;
    let taskDescription = (options.command as string) || undefined;
    const filePath = (options.file as string) || undefined;
    const mcpPath = (options.mcp as string) || undefined;

    console.log(chalk.blue("HyperAgent CLI"));
    currentSpinner.info(
      `Pause using ${chalk.bold("ctrl + p")} and resume using ${chalk.bold("ctrl + r")}\n`
    );
    try {
      // Check for API key if using Hyperbrowser
      if (useHB && !process.env.HYPERBROWSER_API_KEY) {
        const apiKey = await inquirer.password({
          message:
            "Hyperbrowser API key not found in environment variables. Please enter it here:",
          mask: "*",
        });
        if (!apiKey) {
          console.log(
            chalk.yellow("Hyperbrowser API key is required. Exiting.")
          );
          process.exit(0);
        }
        process.env.HYPERBROWSER_API_KEY = apiKey; // Set it for the current process
      }

      const agent = new HyperAgent({
        debug: debug,
        browserProvider: useHB ? "Hyperbrowser" : "Local",
        customActions: [
          UserInteractionAction(
            async ({ message, kind, choices }): Promise<ActionOutput> => {
              // The spinner is suspended while the prompt owns the terminal
              // and restarted with its previous text afterwards.
              const currentText = currentSpinner.text;
              try {
                currentSpinner.stop();
                currentSpinner.clear();
                if (kind === "text_input") {
                  const response = await inquirer.input({
                    message,
                    required: true,
                  });
                  return {
                    success: true,
                    message: `User responded with the text: "${response}"`,
                  };
                }
                if (kind === "confirm") {
                  const response = await inquirer.confirm({
                    message,
                  });
                  return {
                    success: true,
                    message: `User responded with "${response}"`,
                  };
                }
                if (kind === "password") {
                  console.warn(
                    chalk.red(
                      "Providing passwords to LLMs can be dangerous. Passwords are passed in plain-text to the LLM and can be read by other people."
                    )
                  );
                  const response = await inquirer.password({
                    message,
                  });
                  return {
                    success: true,
                    message: `User responded with password: ${response}`,
                  };
                }
                if (!choices) {
                  return {
                    success: false,
                    message:
                      "For choices kind of user interaction, an array of choices is required.",
                  };
                }
                const response = await inquirer.select({
                  message,
                  choices: choices.map((option) => ({
                    value: option,
                    name: option,
                  })),
                });
                return {
                  success: true,
                  message: `User selected the choice: ${response}`,
                };
              } finally {
                currentSpinner.start(currentText);
              }
            }
          ),
        ],
      });

      // Undefined until the first executeTaskAsync() resolves; the keypress
      // handler below can fire before that, so it must tolerate undefined.
      let task: Task | undefined;

      readline.emitKeypressEvents(process.stdin);

      process.stdin.on("keypress", async (_ch, key) => {
        if (!key || !key.ctrl) {
          return;
        }
        if (key.name === "p") {
          if (currentSpinner.isSpinning) {
            currentSpinner.stopAndPersist({ symbol: "⏸" });
          }
          currentSpinner.start(
            chalk.blue(
              "Hyperagent will pause after completing this operation. Press Ctrl+r again to resume."
            )
          );
          currentSpinner.stopAndPersist({ symbol: "⏸" });
          currentSpinner = ora();

          if (task?.getStatus() === TaskStatus.RUNNING) {
            task.pause();
          }
        } else if (key.name === "r") {
          if (task?.getStatus() === TaskStatus.PAUSED) {
            currentSpinner.start(chalk.blue("Hyperagent will resume"));
            currentSpinner.stopAndPersist({ symbol: "⏵" });
            currentSpinner = ora();

            task.resume();
          }
        } else if (key.name === "c") {
          // Raw mode swallows SIGINT, so Ctrl+C is handled explicitly.
          if (currentSpinner.isSpinning) {
            currentSpinner.stopAndPersist();
          }
          console.log("\nShutting down HyperAgent");
          try {
            await agent.closeAgent();
            process.exit(0);
          } catch (err) {
            console.error("Error during shutdown:", err);
            process.exit(1);
          }
        }
      });

      enableRawMode();

      /** Prints the finished step with one tree line per executed action. */
      const onStep = (params: AgentStep) => {
        const actionsList = zipWith(
          params.actionOutputs,
          params.agentOutput.actions,
          (output, action) => ({
            output,
            action,
          })
        );

        const actions = actionsList
          .map(({ output, action }, index, array) => {
            const label = output.success
              ? chalk.yellow(action.type)
              : chalk.red(action.type);
            const detail = output.success
              ? agent.pprintAction(action as ActionType)
              : chalk.red(output.message);
            return `  ${treeBranch(index, array.length)} [${label}] ${detail}`;
          })
          .join("\n");

        currentSpinner.succeed(
          `[${chalk.yellow("task")}]: ${params.agentOutput.nextGoal}\n${actions}`
        );
        currentSpinner = ora();
        enableRawMode();
        process.stdin.resume();
      };

      /** Shows the actions the agent is about to run while they execute. */
      const debugAgentOutput = (params: AgentOutput) => {
        const actions = params.actions.map(
          (action, index, array) =>
            `  ${treeBranch(index, array.length)} [${chalk.yellow(action.type)}] ${agent.pprintAction(action as ActionType)}`
        );
        currentSpinner.start(
          `[${chalk.yellow("task")}]: ${params.nextGoal}\n${actions.join("\n")}`
        );
        enableRawMode();
        process.stdin.resume();
      };

      /** Starts a task and wires its error channel. */
      const startTask = async (description: string): Promise<void> => {
        const started = await agent.executeTaskAsync(description, {
          onStep: onStep,
          onComplete: onComplete,
          debugOnAgentOutput: debugAgentOutput,
        });
        task = started;
        started.emitter.addListener("error", (error) => {
          started.cancel();
          throw error;
        });
      };

      /** Prints the final response and offers to run a follow-up task. */
      const onComplete = async (params: TaskOutput) => {
        console.log(
          boxen(params.output || "No Response", {
            title: chalk.yellow("HyperAgent Response"),
            titleAlignment: "center",
            float: "center",
            padding: 1,
            margin: { top: 2, left: 0, right: 0, bottom: 0 },
          })
        );
        console.log("\n");
        const continueTask = await inquirer.select({
          message: "Would you like to continue ",
          choices: [
            { name: "Yes", value: true },
            { name: "No", value: false },
          ],
        });
        if (!continueTask) {
          process.exit(0);
        }
        const nextDescription = await inquirer.input({
          message: "What should HyperAgent do next for you?",
          required: true,
        });

        enableRawMode();
        process.stdin.resume();

        await startTask(nextDescription);
      };

      if (!taskDescription) {
        if (filePath) {
          taskDescription = (await fs.promises.readFile(filePath)).toString();
        } else {
          taskDescription = await inquirer.input({
            message: "What should HyperAgent do for you today?",
            required: true,
          });
        }
      }

      if (mcpPath) {
        const mcpConfig = JSON.parse(
          (await fs.promises.readFile(mcpPath)).toString()
        );
        await agent.initializeMCPClient({ servers: mcpConfig });
      }

      if (useHB && !debug) {
        await agent.initBrowser();
        const session = agent.getSession() as SessionDetail;
        console.log(`Hyperbrowser Live URL: ${session.liveUrl}\n`);
      }

      await startTask(taskDescription);
    } catch (err) {
      // HyperagentError extends Error in this project, so one check covers both.
      const message = err instanceof Error ? err.message : err;
      console.log(chalk.red(message));
      if (debug) {
        console.trace(err);
      }
    }
  });

program.parse();
// ---- src/context-providers/dom/build-dom-view.ts (definitions) ----

/** Longest element text emitted into the DOM representation. */
const MAX_ELEMENT_TEXT_LENGTH = 1000;

/**
 * Converts an ImageBitmap to a PNG data URL by drawing it on a temporary
 * canvas. The bitmap is always closed afterwards, even on failure.
 *
 * @throws Error when the canvas cannot provide a 2D context.
 */
const imageBitmapToPngDataUrl = (bitmap: ImageBitmap): string => {
  try {
    const canvas = document.createElement("canvas");
    canvas.width = bitmap.width;
    canvas.height = bitmap.height;

    // getContext may return null; fail with a clear message instead of a
    // TypeError on drawImage.
    const ctx = canvas.getContext("2d");
    if (!ctx) {
      throw new Error("Unable to acquire a 2D canvas context");
    }
    ctx.drawImage(bitmap, 0, 0);

    return canvas.toDataURL("image/png");
  } finally {
    // Close the bitmap to free up resources.
    bitmap.close();
  }
};

/**
 * Returns the text that best describes an element.
 *
 * For `<input>` elements: the text of the `<label for=…>` pointing at the
 * input's id when one exists and is non-empty, otherwise the trimmed input
 * value. For every other element: its trimmed `textContent`.
 */
const getElementTextContent = (el: HTMLElement): string => {
  if (el.tagName.toLowerCase() !== "input") {
    return el.textContent?.trim() || "";
  }

  const inputElement = el as HTMLInputElement;
  let labelText: string | null = null;

  if (inputElement.id) {
    // Escape the id: a quote or backslash in it would otherwise make the
    // selector invalid and querySelector throw.
    const label = document.querySelector(
      `label[for="${CSS.escape(inputElement.id)}"]`
    );
    if (label) {
      labelText = label.textContent?.trim() || null;
    }
  }

  return labelText ?? inputElement.value?.trim() ?? "";
};

/**
 * Collects the non-empty text nodes among the siblings that follow `node`,
 * stopping at `nextNode` (or at the last sibling), joined by single spaces.
 */
const getTextBetween = (node: Node, nextNode: Node | null): string => {
  const texts: string[] = [];
  for (
    let current = node.nextSibling;
    current && current !== nextNode;
    current = current.nextSibling
  ) {
    if (current.nodeType === Node.TEXT_NODE && current.textContent) {
      const text = current.textContent.trim();
      if (text) texts.push(text);
    }
  }
  return texts.join(" ");
};

/**
 * Builds the DOM view handed to the agent: the interactive elements (with a
 * 1-based highlight index, CSS path and XPath filled in), a line-oriented
 * text representation of them, and a PNG data URL of the highlight overlay.
 */
export const buildDomView = (): DOMStateRaw => {
  const interactiveElements = findInteractiveElements();

  // 1. Render highlights to an ImageBitmap
  const screenBitmap = renderHighlightsOffscreen(
    interactiveElements.map((element, index) => ({
      element: element.element,
      index: index + 1, // highlight indices are 1-based
      parentIframe: element.iframe ?? null,
    })),
    window.innerWidth,
    window.innerHeight
  );

  // 2. Convert the ImageBitmap to a PNG data URL
  const screenshotPngDataUrl = imageBitmapToPngDataUrl(screenBitmap);

  interactiveElements.forEach((element, idx) => {
    element.highlightIndex = idx + 1; // must match the index drawn above
    element.cssPath = getCSSPath(element.element);
    element.xpath = getXPath(element.element);
  });

  const domRepresentation: string[] = [];

  interactiveElements.forEach((element, i) => {
    const el = element.element;
    const tagName = el.tagName.toLowerCase();

    const attributes = Array.from(el.attributes)
      .filter((attr) => CONTEXT_ATTRIBUTES.includes(attr.name))
      .map((attr) => ` ${attr.name}="${attr.value}"`)
      .join("");

    const textContent = getElementTextContent(el);
    const truncatedText =
      textContent.length > MAX_ELEMENT_TEXT_LENGTH
        ? textContent.substring(0, MAX_ELEMENT_TEXT_LENGTH - 3) + "..."
        : textContent;

    domRepresentation.push(
      `[${element.highlightIndex}]<${tagName}${attributes}>${truncatedText.replace(/\s+/g, " ")}`
    );

    // Loose text between this element and the next interactive one.
    const nextElement = interactiveElements[i + 1]?.element || null;
    const betweenText = getTextBetween(el, nextElement);
    if (betweenText) {
      domRepresentation.push(betweenText);
    }
  });

  return {
    elements: interactiveElements,
    domState: domRepresentation.join("\n"),
    screenshot: screenshotPngDataUrl,
  };
};

// ---- src/context-providers/dom/builder.ts (build script) ----
// Bundles build-dom-view.ts with esbuild and rewrites the bundle into a
// self-invoking script that returns buildDomView(), stored both as a .js file
// and as a TypeScript string constant.

const injectDir = path.join(__dirname, "./inject");
const bundlePath = path.join(injectDir, "build-dom-view-script.js");

fs.mkdirSync(injectDir, { recursive: true });

esbuild.buildSync({
  entryPoints: [path.join(__dirname, "build-dom-view.ts")],
  bundle: true,
  outfile: bundlePath,
});

const scriptContent = fs.readFileSync(bundlePath, "utf8");
const lines = scriptContent.trim().split("\n");
// Drops the first two and the last line of the bundle (presumably esbuild's
// own wrapper — confirm against the generated output) and re-wraps the rest.
const trimmedContent = `(() => {
${lines.slice(2, -1).join("\n")}
return buildDomView();
})();`;
fs.writeFileSync(bundlePath, trimmedContent);

// Escape for embedding inside a template literal.
const escapedContent = trimmedContent
  .replace(/\\/g, "\\\\")
  .replace(/`/g, "\\`")
  .replace(/\$\{/g, "\\${");
const tsConstFile = `export const buildDomViewJs = \`${escapedContent}\`;`;

fs.writeFileSync(path.join(injectDir, "build-dom-view.ts"), tsConstFile);
// ---- src/context-providers/dom/const.ts (definitions) ----

/** Tag names treated as interactive regardless of attributes. */
export const INTERACTIVE_ELEMENTS = new Set([
  "a",
  "input",
  "button",
  "select",
  "menu",
  "menuitem",
  "textarea",
  "canvas",
  "embed",
]);

/** Values of `role` / `aria-role` that mark an element as interactive. */
export const INTERACTIVE_ROLES = new Set([
  "button",
  "link",
  "checkbox",
  "radio",
  "textbox",
  "menuitem",
  "tab",
  "tabpanel",
  "tooltip",
  "slider",
  "progressbar",
  "switch",
  "listbox",
  "option",
  "combobox",
  "menu",
  "treeitem",
  "tree",
  "spinbutton",
  "scrollbar",
  "menuitemcheckbox",
  "menuitemradio",
  "action",
]);

/** Event names considered interactive. */
export const INTERACTIVE_EVENTS = new Set([
  "click",
  "mousedown",
  "mouseup",
  "touchstart",
  "touchend",
]);

/** ARIA state attributes whose presence marks an element as interactive. */
export const INTERACTIVE_ARIA_PROPS = [
  "aria-expanded",
  "aria-pressed",
  "aria-selected",
  "aria-checked",
];

/** Attributes that declare a click handler in markup. */
export const CLICK_ATTRIBUTES = ["onclick", "ng-click", "@click", "v-on:click"];

/** Attributes copied into the textual DOM representation. */
export const CONTEXT_ATTRIBUTES = [
  "title",
  "type",
  "name",
  "role",
  "aria-label",
  "placeholder",
  "value",
  "alt",
  "aria-expanded",
];

// ---- src/context-providers/dom/elem-interactive.ts (definitions) ----

/**
 * True when any whitespace-separated token of a role attribute value is an
 * interactive role. Tokens are compared in lower case, so `role="Button"` and
 * fallback lists such as `role="switch checkbox"` are recognised.
 */
const hasInteractiveRoleToken = (value: string | null): boolean =>
  value !== null &&
  value
    .toLowerCase()
    .split(/\s+/)
    .some((token) => INTERACTIVE_ROLES.has(token));

/**
 * Decides whether an element should be offered to the agent as interactive.
 *
 * Checks, in order: tag name, `role`, `aria-role`, click handlers, the
 * `data-has-interactive-listener` marker, interactive ARIA state attributes,
 * content-editability and draggability. The first match wins.
 *
 * @returns `isInteractive` plus a human-readable `reason` for the verdict.
 */
export const isInteractiveElem = (
  element: HTMLElement
): { isInteractive: boolean; reason: string } => {
  const interactive = (reason: string) => ({ isInteractive: true, reason });

  const tagName = element.tagName.toLowerCase();
  if (INTERACTIVE_ELEMENTS.has(tagName)) {
    return interactive(`Interactive HTML element: <${tagName}>`);
  }

  const role = element.getAttribute("role");
  if (hasInteractiveRoleToken(role)) {
    return interactive(`Interactive role: ${role}`);
  }

  const ariaRole = element.getAttribute("aria-role");
  if (hasInteractiveRoleToken(ariaRole)) {
    return interactive(`Interactive aria-role: ${ariaRole}`);
  }

  // CLICK_ATTRIBUTES contains "onclick", so the attribute form is covered by
  // the hasAttribute scan; element.onclick covers handlers set from script.
  const hasClickHandler =
    element.onclick !== null ||
    CLICK_ATTRIBUTES.some((attr) => element.hasAttribute(attr));
  if (hasClickHandler) {
    return interactive("Has click handler");
  }

  // Marker attribute set by the injected script.
  if (element.hasAttribute("data-has-interactive-listener")) {
    return interactive("Has interactive event listener (tracked)");
  }

  const ariaProps = INTERACTIVE_ARIA_PROPS.filter((prop) =>
    element.hasAttribute(prop)
  );
  if (ariaProps.length > 0) {
    return interactive(
      `Has interactive ARIA properties: ${ariaProps.join(", ")}`
    );
  }

  if (
    element.getAttribute("contenteditable") === "true" ||
    element.isContentEditable
  ) {
    return interactive("Is content editable");
  }

  if (element.draggable || element.getAttribute("draggable") === "true") {
    return interactive("Is draggable");
  }

  return { isInteractive: false, reason: "Not interactive" };
};

/**
 * True for elements that must never be offered to the agent: `<html>`,
 * `<body>`, elements whose bounding box has zero width or height, and
 * elements carrying `disabled` or `aria-disabled="true"`.
 */
export const isIgnoredElem = (element: HTMLElement): boolean => {
  const tagName = element.tagName.toLowerCase();
  const rect = element.getBoundingClientRect();
  const isNotVisible = rect.width === 0 || rect.height === 0;

  return (
    tagName === "html" ||
    tagName === "body" ||
    isNotVisible ||
    element.hasAttribute("disabled") ||
    element.getAttribute("aria-disabled") === "true"
  );
};
// ---- src/context-providers/dom/find-interactive-elements.ts (definitions) ----

/**
 * Collects every interactive, non-ignored element reachable from the main
 * document: the document itself, any open shadow roots found while walking
 * it, and the documents of its `<iframe>` elements that can be accessed.
 *
 * Elements are returned in traversal order. `cssPath` and `xpath` are left
 * empty here.
 */
export const findInteractiveElements = (): InteractiveElement[] => {
  const interactiveElements: InteractiveElement[] = [];
  // Guards against visiting the same element through more than one root.
  const processedElements = new Set<HTMLElement>();

  const processRoot = (
    root: Document | ShadowRoot,
    rootInfo: {
      iframe?: HTMLIFrameElement;
      shadowHost?: HTMLElement;
    } = {}
  ) => {
    const elements = root.querySelectorAll("*");
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i] as HTMLElement;
      if (processedElements.has(element)) {
        continue;
      }
      processedElements.add(element);

      // Descend into the shadow tree before classifying the host itself.
      if (element.shadowRoot) {
        processRoot(element.shadowRoot, {
          iframe: rootInfo.iframe,
          shadowHost: element,
        });
      }

      // Attribute checks first: isIgnoredElem reads layout, so it is only
      // evaluated for elements that are already known to be interactive.
      const { isInteractive, reason } = isInteractiveElem(element);
      if (!isInteractive || isIgnoredElem(element)) {
        continue;
      }

      interactiveElements.push({
        element,
        iframe: rootInfo.iframe,
        shadowHost: rootInfo.shadowHost,
        rect: element.getBoundingClientRect(),
        interactiveReason: reason,
        isUnderShadowRoot:
          element.getRootNode().nodeType === Node.DOCUMENT_FRAGMENT_NODE,
        cssPath: "",
        xpath: "",
      });
    }
  };

  processRoot(document);

  const iframes = document.querySelectorAll("iframe");
  for (let i = 0; i < iframes.length; i++) {
    const iframe = iframes[i] as HTMLIFrameElement;
    try {
      const iframeDoc =
        iframe.contentDocument || iframe.contentWindow?.document;
      if (iframeDoc) {
        processRoot(iframeDoc, { iframe });
      }
    } catch (e) {
      // Best effort: a frame that cannot be read is skipped, not fatal.
      console.warn("error processing iframe", e);
    }
  }

  return interactiveElements;
};
Try unique combination of classes 30 | const classes = Array.from(element.classList).map(escapeSelector).join("."); 31 | if (classes && parent) { 32 | const classSelector = `${tagName}.${classes}`; 33 | const siblingsWithSameClasses = Array.from( 34 | parent.querySelectorAll(`:scope > ${classSelector}`) 35 | ); 36 | if ( 37 | siblingsWithSameClasses.length === 1 && 38 | siblingsWithSameClasses[0] === element 39 | ) { 40 | return classSelector; 41 | } 42 | } 43 | 44 | // 3. Fallback to :nth-of-type 45 | let index = 1; // CSS :nth-of-type is 1-based 46 | let sibling = element.previousElementSibling; 47 | while (sibling) { 48 | if (sibling.tagName === element.tagName) { 49 | index++; 50 | } 51 | sibling = sibling.previousElementSibling; 52 | } 53 | 54 | // Only add :nth-of-type if there are other siblings of the same type 55 | let hasSameTypeSiblings = index > 1; // Already found preceding siblings 56 | if (!hasSameTypeSiblings && parent) { 57 | sibling = element.nextElementSibling; 58 | while (sibling) { 59 | if (sibling.tagName === element.tagName) { 60 | hasSameTypeSiblings = true; 61 | break; 62 | } 63 | sibling = sibling.nextElementSibling; 64 | } 65 | } 66 | 67 | return hasSameTypeSiblings ? `${tagName}:nth-of-type(${index})` : tagName; 68 | }; 69 | 70 | /** 71 | * Calculates a CSS selector path for an element relative to a boundary node (Document or ShadowRoot). 72 | * Uses '>' as the child combinator. 73 | * 74 | * @param element The target element. 75 | * @param boundary The node (Document or ShadowRoot) to stop traversal at. 76 | * @returns A relative CSS selector string. 
/**
 * Builds a CSS selector path from just below `boundary` down to `element`,
 * joining one segment per ancestor with the child combinator (" > ").
 *
 * @param element The target element.
 * @param boundary The node at which the upward walk stops; it contributes no segment.
 * @returns The relative selector, or "" when `element` is the boundary itself
 *          or is not an element node.
 */
const getRelativeCSSPath = (element: HTMLElement, boundary: Node): string => {
  const parts: string[] = [];

  // Starting on the boundary yields an empty path.
  let node: HTMLElement | null = element === boundary ? null : element;

  while (node && node.nodeType === Node.ELEMENT_NODE) {
    parts.push(getUniqueSegment(node));

    // Keep climbing only while the parent is an element other than the boundary.
    const up: HTMLElement | null = node.parentElement;
    node =
      up && up !== boundary && up.nodeType === Node.ELEMENT_NODE ? up : null;
  }

  // Segments were collected leaf-first; the selector reads root-first.
  return parts.reverse().join(" > ");
};
115 | */ 116 | export const getCSSPath = (element: HTMLElement | null): string => { 117 | if (!element || element.nodeType !== Node.ELEMENT_NODE) { 118 | // console.warn("getCSSPath called with invalid element:", element); 119 | return ""; 120 | } 121 | 122 | if (!element.isConnected) { 123 | // console.warn("getCSSPath called with disconnected element:", element); 124 | // Attempting to generate path anyway, might be useful in some rare debugging cases 125 | } 126 | 127 | const root = element.getRootNode(); 128 | 129 | if (root instanceof ShadowRoot) { 130 | // Element is inside a shadow DOM 131 | const host = root.host as HTMLElement; 132 | if (!host) { 133 | console.warn("ShadowRoot found without a host element:", root); 134 | // Cannot generate a path from the document root if the host is unknown 135 | return ""; // Or potentially just the relative path within the shadow root? Unreliable. 136 | } 137 | const hostPath = getCSSPath(host); // Recursive call to get path to the host 138 | const relativePath = getRelativeCSSPath(element, root); // Path within the shadow root 139 | 140 | if (!hostPath) { 141 | console.warn("Could not determine CSS path for host element:", host); 142 | return ""; // Cannot construct full path 143 | } 144 | if (!relativePath) { 145 | console.warn( 146 | "Could not determine relative CSS path within ShadowRoot for:", 147 | element 148 | ); 149 | // Element might be the direct child/root of the shadow DOM, or path generation failed. 150 | // Playwright needs a selector after >>, maybe ':host' or '*' or just return hostPath? 151 | // Returning just hostPath might select the host instead of the shadow content. 152 | // Let's assume relativePath should usually exist. If not, path is likely invalid. 
/**
 * Renders `value` as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape character inside string literals, so the quote
 * style is chosen to avoid the quotes contained in the value; when the value
 * contains both kinds, the literal is assembled with `concat()`.
 */
const toXPathLiteral = (value: string): string => {
  if (!value.includes('"')) {
    return `"${value}"`;
  }
  if (!value.includes("'")) {
    return `'${value}'`;
  }
  const pieces = value.split('"').map((piece) => `"${piece}"`);
  return `concat(${pieces.join(`, '"', `)})`;
};

/**
 * Builds a slash-separated XPath-style path for `element` by walking up
 * through its ancestors.
 *
 * Each step is `tag[@id=...]` when the element has a non-blank id, otherwise
 * `tag[n]` (1-based position among same-named siblings) when such siblings
 * exist, otherwise the bare `tag`.
 *
 * The walk stops when the current element's parent node is a ShadowRoot (or an
 * HTMLIFrameElement).
 * NOTE(review): the stop happens before the current element's own segment is
 * added, so the element sitting directly under a shadow root contributes no
 * segment. Confirm this is intended.
 *
 * @param element The element to describe.
 * @returns The path without a leading slash, e.g. `html/body/div[2]/span[@id="x"]`.
 */
export const getXPath = (element: HTMLElement) => {
  const segments: string[] = [];
  let currentElement: HTMLElement | null = element;

  while (currentElement && currentElement.nodeType === Node.ELEMENT_NODE) {
    if (
      currentElement.parentNode instanceof ShadowRoot ||
      currentElement.parentNode instanceof HTMLIFrameElement
    ) {
      break;
    }

    // Count preceding siblings with the same node name.
    let index = 0;
    let hasSiblings = false;
    let sibling = currentElement.previousSibling;
    while (sibling) {
      if (
        sibling.nodeType === Node.ELEMENT_NODE &&
        sibling.nodeName === currentElement.nodeName
      ) {
        index++;
        hasSiblings = true;
      }
      sibling = sibling.previousSibling;
    }

    // With none before, a same-named sibling after still forces an index.
    if (!hasSiblings) {
      sibling = currentElement.nextSibling;
      while (sibling) {
        if (
          sibling.nodeType === Node.ELEMENT_NODE &&
          sibling.nodeName === currentElement.nodeName
        ) {
          hasSiblings = true;
          break;
        }
        sibling = sibling.nextSibling;
      }
    }

    const tagName = currentElement.nodeName.toLowerCase();

    // XPath positions are 1-based.
    const xpathIndex = hasSiblings ? `[${index + 1}]` : "";

    if (currentElement.id && currentElement.id.toString().trim() !== "") {
      // Quote the id safely: a raw `"` inside the id would otherwise end the
      // literal early and produce an invalid expression.
      segments.unshift(`${tagName}[@id=${toXPathLiteral(currentElement.id)}]`);
    } else {
      segments.unshift(`${tagName}${xpathIndex}`);
    }

    currentElement = currentElement.parentElement;
  }

  return segments.join("/");
};
/**
 * Picks the colors used to draw the highlight with the given index.
 *
 * The palette holds 12 colors and is cycled with `index % 12`, so indices 12
 * apart share a color.
 *
 * @param index Highlight index of the element.
 * @returns `baseColor` (the palette entry) and `backgroundColor` (the same
 *          hex color with "1A" appended as an alpha byte).
 */
const getHighlightColor = (
  index: number
): { baseColor: string; backgroundColor: string } => {
  const palette = [
    "#FF0000",
    "#00FF00",
    "#0000FF",
    "#FFA500",
    "#800080",
    "#008080",
    "#FF69B4",
    "#4B0082",
    "#FF4500",
    "#2E8B57",
    "#DC143C",
    "#4682B4",
  ];
  const baseColor = palette[index % palette.length];
  return { baseColor, backgroundColor: `${baseColor}1A` };
};
/**
 * Creates a transparent bitmap of the given size (each side at least 1px).
 *
 * A 2D context is requested first because `transferToImageBitmap()` throws an
 * InvalidStateError on an OffscreenCanvas that has no rendering context.
 */
const createBlankBitmap = (width: number, height: number): ImageBitmap => {
  const blank = new OffscreenCanvas(Math.max(1, width), Math.max(1, height));
  blank.getContext("2d");
  return blank.transferToImageBitmap();
};

/**
 * Renders highlights for the given elements onto an OffscreenCanvas and
 * returns the result synchronously as an ImageBitmap.
 *
 * For every element that is still connected and at least partially inside the
 * viewport, a translucent rectangle, a 1px outline and a numbered label are
 * drawn. When `parentIframe` is set and connected, its top-left corner is
 * added to the element's rect. Drawing uses logical (CSS) pixels; the backing
 * canvas is scaled by `devicePixelRatio`.
 *
 * NOTE(review): the visibility test runs on the element's own rect before the
 * iframe offset is applied, so for iframe content it is an approximation.
 *
 * @param highlightInfos Elements to highlight, with their label index and optional parent iframe.
 * @param width Logical canvas width (e.g. window.innerWidth).
 * @param height Logical canvas height (e.g. window.innerHeight).
 * @returns The bitmap with the highlights. A blank bitmap is returned when
 *          `width`/`height` is not positive (sized to the window) or when
 *          drawing throws (1x1).
 * @throws Error when no 2D context can be obtained for the canvas.
 */
export function renderHighlightsOffscreen(
  highlightInfos: HighlightInfo[],
  width: number,
  height: number
): ImageBitmap {
  if (width <= 0 || height <= 0) {
    console.warn(
      "Attempted to render highlights on zero-sized canvas. Will default to innerWidth x innerHeight"
    );
    return createBlankBitmap(window.innerWidth, window.innerHeight);
  }

  const dpr = window.devicePixelRatio || 1;
  const offscreenCanvas = new OffscreenCanvas(width * dpr, height * dpr);
  // alpha: true keeps the canvas transparent outside the highlights.
  const ctx = offscreenCanvas.getContext("2d", { alpha: true });
  if (!ctx) {
    throw new Error("2D context unavailable for OffscreenCanvas");
  }

  // Scale once so that all coordinates below are logical pixels.
  ctx.scale(dpr, dpr);
  ctx.clearRect(0, 0, width, height);

  try {
    for (const { element, index, parentIframe } of highlightInfos) {
      // isConnected (unlike document.body.contains) is also true for elements
      // that live in an iframe document or inside a shadow root.
      if (!element.isConnected) {
        continue;
      }

      const rect = element.getBoundingClientRect();
      if (
        rect.width === 0 ||
        rect.height === 0 ||
        !isElementPartiallyVisible(rect)
      ) {
        continue;
      }

      // Rects of iframe content are relative to the iframe's viewport; shift
      // them by the iframe's position in the top-level viewport.
      const iframeOffset: IframeOffset = { x: 0, y: 0 };
      if (parentIframe && parentIframe.isConnected) {
        const iframeRect = parentIframe.getBoundingClientRect();
        iframeOffset.x = iframeRect.left;
        iframeOffset.y = iframeRect.top;
      }

      const colors = getHighlightColor(index);
      const drawTop = rect.top + iframeOffset.y;
      const drawLeft = rect.left + iframeOffset.x;

      // --- Overlay rectangle ---
      ctx.fillStyle = colors.backgroundColor;
      ctx.fillRect(drawLeft, drawTop, rect.width, rect.height);
      ctx.strokeStyle = colors.baseColor;
      ctx.lineWidth = 1;
      ctx.strokeRect(drawLeft, drawTop, rect.width, rect.height);

      // --- Label ---
      const labelText = index.toString();
      // Font size follows the element height, clamped to 9..12 logical px.
      const fontSize = Math.min(12, Math.max(9, rect.height * 0.3));
      ctx.font = `bold ${fontSize}px sans-serif`;
      ctx.textAlign = "center";
      ctx.textBaseline = "middle";

      const textMetrics = ctx.measureText(labelText);
      const labelPadding = 4;
      const labelHeight = fontSize + labelPadding;
      // Never narrower than tall, so short labels get a near-square box.
      const labelWidth = Math.max(
        labelHeight,
        textMetrics.width + labelPadding * 2
      );

      const labelPos = calculateLabelPosition(
        rect,
        iframeOffset,
        labelWidth,
        labelHeight,
        width,
        height
      );

      ctx.fillStyle = colors.baseColor;
      ctx.fillRect(labelPos.left, labelPos.top, labelWidth, labelHeight);

      // Text is centered in the label box (textAlign/textBaseline above).
      ctx.fillStyle = "white";
      ctx.fillText(
        labelText,
        labelPos.left + labelWidth / 2,
        labelPos.top + labelHeight / 2
      );
    }

    return offscreenCanvas.transferToImageBitmap();
  } catch (error) {
    console.error("Error drawing highlights onto OffscreenCanvas:", error);
    return createBlankBitmap(1, 1);
  }
}
/**
 * One interactive element found on the page, together with the data needed to
 * locate it again.
 */
export interface InteractiveElement {
  /** The DOM element itself. */
  element: HTMLElement;
  /** The iframe whose document contains the element, when there is one. */
  iframe?: HTMLIFrameElement;
  /** Host of the shadow root containing the element, when there is one. */
  shadowHost?: HTMLElement;
  isUnderShadowRoot: boolean;
  rect: DOMRect;
  /** Presumably a short explanation of why the element counts as interactive; verify against the finder. */
  interactiveReason?: string;
  /** Index used as the key in `DOMState.elements`; elements without one are left out of that map. */
  highlightIndex?: number;
  cssPath: string;
  xpath: string;
}

/**
 * DOM snapshot as returned by the in-page script: elements arrive as a plain
 * array.
 */
export interface DOMStateRaw {
  elements: InteractiveElement[];
  domState: string;
  screenshot: string;
}

/**
 * DOM snapshot with the elements indexed by their `highlightIndex`.
 */
export interface DOMState {
  elements: Map<number, InteractiveElement>;
  domState: string;
  screenshot: string;
}
/**
 * Zod schema for the parameters of the user-interaction action.
 *
 * The `describe()` texts are part of the schema and therefore runtime data,
 * not just documentation.
 *
 * - `message`: text shown to the user.
 * - `kind`: which kind of answer is expected ("password", "text_input",
 *   "select" or "confirm").
 * - `choices`: the options to present; only meaningful for kind "select".
 */
export const UserInteractionActionParams = z.object({
  message: z
    .string()
    .describe(
      "A message to provide to the user. Make it friendly and ask them for a suitable response. Keep it short and between 1-2 sentences if possible."
    ),
  kind: z
    .enum(["password", "text_input", "select", "confirm"])
    .describe(
      "The kind of response that is expected from the user. If you can't find a suitable option, then respond with confirm."
    ),
  choices: z
    .array(z.string())
    .optional()
    .describe(
      // The kind that takes a list of options is "select"; there is no kind
      // called "choices".
      "If you select select as the kind option, then what options should be offered to the user."
    ),
}).describe(`Action to request input from the user during task execution.
Use this when you need to collect information from the user such as text input, password,
selection from choices, or confirmation. The response will be returned to continue the workflow.`);
/**
 * Contract implemented by every action the agent can run.
 *
 * @typeParam T Zod object schema describing the action's parameters; the
 *              parameter type of each method is inferred from it.
 */
export interface AgentActionDefinition<
  T extends z.AnyZodObject = z.AnyZodObject,
> {
  /** Identifier of the action. */
  readonly type: string;
  /** Zod schema for the action's parameters. */
  actionParams: T;

  /**
   * Executes the action.
   * @param ctx Context handed to the action (page, DOM state, LLM, ...).
   * @param params Parameters matching `actionParams`.
   */
  run(ctx: ActionContext, params: z.infer<T>): Promise<ActionOutput>;
  /**
   * completeAction is only called if the name of this action is "complete". It is meant to format text into a proper format for output.
   * NOTE(review): resolving to a string is assumed from "format text"; confirm against the caller.
   * @param params
   */
  completeAction?(params: z.infer<T>): Promise<string>;
  /** Optional printable rendering of the action's parameters. */
  pprintAction?(params: z.infer<T>): string;
}
/**
 * Optional settings accepted when a task is started.
 *
 * The callbacks may be synchronous or return a promise.
 */
export interface TaskParams {
  /** Presumably an upper bound on the number of agent steps; verify against the task runner. */
  maxSteps?: number;
  /** Presumably a directory for debug artifacts; verify against the task runner. */
  debugDir?: string;
  /** Zod object schema for the task's output, when structured output is wanted. */
  outputSchema?: z.AnyZodObject;
  /** Callback receiving an agent step. */
  onStep?: (step: AgentStep) => Promise<void> | void;
  /** Callback receiving the task output. */
  onComplete?: (output: TaskOutput) => Promise<void> | void;
  /** Debug hook receiving an agent output. */
  debugOnAgentOutput?: (step: AgentOutput) => void;
}
/**
 * Base class for browser providers.
 *
 * @typeParam T Type of the provider-specific session object returned by
 *              `getSession()`.
 */
abstract class BrowserProvider<T> {
  /** Provider-specific session state. */
  abstract session: unknown;
  /** Starts the provider and resolves to the Playwright browser. */
  abstract start(): Promise<Browser>;
  /** Shuts the provider down. */
  abstract close(): Promise<void>;
  /** Returns the current session, or null when there is none. */
  abstract getSession(): T | null;
}
/**
 * Options accepted when constructing a HyperAgent.
 *
 * @typeParam T Which browser provider is selected.
 *              NOTE(review): the constraint and the "Local" default were
 *              missing in this copy and are reconstructed; confirm.
 */
export interface HyperAgentConfig<T extends BrowserProviders = "Local"> {
  /** Additional actions made available to the agent. */
  customActions?: Array<AgentActionDefinition>;

  /** Name of the browser provider to use. */
  browserProvider?: T;

  debug?: boolean;
  llm?: BaseChatModel;

  /**
   * First constructor argument of HyperbrowserProvider, minus its `debug`
   * field.
   */
  hyperbrowserConfig?: Omit<
    NonNullable<ConstructorParameters<typeof HyperbrowserProvider>[0]>,
    "debug"
  >;
  /** First constructor argument of LocalBrowserProvider. */
  localConfig?: ConstructorParameters<typeof LocalBrowserProvider>[0];
}
/** Events an ErrorEmitter can emit, mapped to their listener signatures. */
type ErrorEvents = {
  error: (error: Error) => void;
};

/**
 * EventEmitter whose listener-management methods are typed against
 * `ErrorEvents`, so only the "error" event with an `Error` payload
 * type-checks. Every method delegates directly to EventEmitter.
 */
export class ErrorEmitter extends EventEmitter {
  override on<K extends keyof ErrorEvents>(
    event: K,
    listener: ErrorEvents[K]
  ): this {
    return super.on(event, listener);
  }

  override once<K extends keyof ErrorEvents>(
    event: K,
    listener: ErrorEvents[K]
  ): this {
    return super.once(event, listener);
  }

  override off<K extends keyof ErrorEvents>(
    event: K,
    listener: ErrorEvents[K]
  ): this {
    return super.off(event, listener);
  }

  override emit<K extends keyof ErrorEvents>(
    event: K,
    ...args: Parameters<ErrorEvents[K]>
  ): boolean {
    return super.emit(event, ...args);
  }

  override addListener<K extends keyof ErrorEvents>(
    eventName: K,
    listener: (...args: Parameters<ErrorEvents[K]>) => void
  ): this {
    return super.addListener(eventName, listener);
  }
}
/**
 * Escapes line breaks that fall inside square brackets so that link text
 * spanning several lines stays a single markdown link.
 *
 * Bracket nesting is tracked with a depth counter ("[" increments, "]"
 * decrements but never below zero). While the depth is positive, every "\n"
 * is emitted as a backslash followed by the newline; all other characters are
 * copied unchanged.
 *
 * @param markdownContent Markdown text to process.
 * @returns The text with in-bracket newlines backslash-escaped.
 */
const processMultiLineLinks = (markdownContent: string): string => {
  const pieces: string[] = [];
  let depth = 0;

  for (const ch of markdownContent) {
    if (ch === "[") {
      depth += 1;
    } else if (ch === "]") {
      depth = depth > 0 ? depth - 1 : 0;
    }
    pieces.push(depth > 0 && ch === "\n" ? "\\\n" : ch);
  }

  return pieces.join("");
};
/**
 * Calls `func` until it succeeds or the attempts are used up, waiting with
 * exponential backoff (1s, 2s, 4s, ...) between attempts.
 *
 * No wait is performed after the final failed attempt: the last error is
 * rethrown immediately.
 *
 * @param func Operation to attempt.
 * @param params `retryCount` is the total number of attempts; a missing or 0
 *               value falls back to 3.
 * @param onError Called after each failed attempt with a
 *                "Retry Attempt: <n>" label (0-based) and the error.
 * @returns The value resolved by the first successful attempt.
 * @throws The error of the last failed attempt.
 */
export async function retry<T>({
  func,
  params,
  onError,
}: {
  func: () => Promise<T>;
  params?: { retryCount: number };
  onError?: (...err: Array<unknown>) => void;
}): Promise<T> {
  let err: unknown = null;
  const retryCount = params?.retryCount || 3;
  for (let i = 0; i < retryCount; i++) {
    try {
      return await func();
    } catch (error) {
      onError?.(`Retry Attempt: ${i}`, error);
      err = error;
      // Back off only when another attempt will actually follow.
      if (i < retryCount - 1) {
        await sleep(Math.pow(2, i) * 1000);
      }
    }
  }
  throw err;
}
"./dist", 12 | "paths": { 13 | "@hyperbrowser/agent": [ 14 | "./src/index" 15 | ], 16 | "@hyperbrowser/agent/types": [ 17 | "./src/types/index" 18 | ], 19 | "@hyperbrowser/agent/custom-actions": [ 20 | "./src/custom-actions/index" 21 | ], 22 | "@/*": [ 23 | "./src/*" 24 | ] 25 | }, 26 | "strict": true, 27 | "esModuleInterop": true, 28 | "skipLibCheck": true, 29 | "forceConsistentCasingInFileNames": true, 30 | "allowJs": true, 31 | }, 32 | "include": [ 33 | "src/**/*.ts", 34 | "src/**/*.js" 35 | ], 36 | "exclude": [ 37 | "node_modules", 38 | "dist", 39 | "debug" 40 | ] 41 | } --------------------------------------------------------------------------------