├── .gitignore
├── LICENSE
├── README.md
├── assets
    ├── flight-schedule.gif
    └── hyperagent-banner.png
├── cli.sh
├── eslint.config.mjs
├── evals
    ├── WebVoyager_data.jsonl
    └── WebVoyager_reference.json
├── examples
    ├── browser-providers
    │   └── hyperbrowser.ts
    ├── custom-tool
    │   ├── search
    │   │   └── exa.ts
    │   └── wikipedia-random-article
    │   │   └── run-custom-tool.ts
    ├── llms
    │   ├── anthropic.ts
    │   └── openai.ts
    ├── mcp
    │   ├── google-sheets
    │   │   ├── best-buy-reviews.ts
    │   │   ├── car-price-comparison.ts
    │   │   └── most-populated-states.ts
    │   ├── notion
    │   │   └── create-shopping-list.ts
    │   └── weather
    │   │   ├── get-weather-alert.ts
    │   │   └── servers
    │   │       └── weather-server.js
    ├── output-to-schema
    │   └── output-to-schema.ts
    └── simple
    │   └── add-to-amazon-cart.ts
├── package.json
├── scripts
    ├── run-webvoyager-eval.ts
    ├── test-async.ts
    ├── test-extract.ts
    ├── test-page-ai.ts
    ├── test-variables.ts
    └── test.ts
├── src
    ├── agent
    │   ├── actions
    │   │   ├── click-element.ts
    │   │   ├── complete-validator.ts
    │   │   ├── complete-with-output-schema.ts
    │   │   ├── complete.ts
    │   │   ├── extract.ts
    │   │   ├── go-to-url.ts
    │   │   ├── index.ts
    │   │   ├── input-text.ts
    │   │   ├── key-press.ts
    │   │   ├── page-back.ts
    │   │   ├── page-forward.ts
    │   │   ├── pdf.ts
    │   │   ├── refresh-page.ts
    │   │   ├── scroll.ts
    │   │   ├── select-option.ts
    │   │   ├── thinking.ts
    │   │   └── utils.ts
    │   ├── error.ts
    │   ├── index.ts
    │   ├── llms
    │   │   └── structured-output.ts
    │   ├── mcp
    │   │   └── client.ts
    │   ├── messages
    │   │   ├── builder.ts
    │   │   ├── examples-actions.ts
    │   │   ├── input-format.ts
    │   │   ├── output-format.ts
    │   │   ├── system-prompt.ts
    │   │   └── utils.ts
    │   └── tools
    │   │   ├── agent.ts
    │   │   └── types.ts
    ├── browser-providers
    │   ├── hyperbrowser.ts
    │   ├── index.ts
    │   └── local.ts
    ├── cli
    │   └── index.ts
    ├── context-providers
    │   └── dom
    │   │   ├── build-dom-view.ts
    │   │   ├── builder.ts
    │   │   ├── const.ts
    │   │   ├── elem-interactive.ts
    │   │   ├── find-interactive-elements.ts
    │   │   ├── get-css-path.ts
    │   │   ├── get-x-path.ts
    │   │   ├── highlight.ts
    │   │   ├── index.ts
    │   │   ├── inject
    │   │       ├── build-dom-view-script.js
    │   │       └── build-dom-view.ts
    │   │   ├── types.ts
    │   │   └── window-type.ts
    ├── custom-actions
    │   ├── index.ts
    │   └── user-interaction.ts
    ├── index.ts
    ├── types
    │   ├── agent
    │   │   ├── actions
    │   │   │   └── types.ts
    │   │   └── types.ts
    │   ├── browser-providers
    │   │   └── types.ts
    │   ├── config.ts
    │   └── index.ts
    └── utils
    │   ├── error-emitter.ts
    │   ├── html-to-markdown.ts
    │   ├── index.ts
    │   ├── retry.ts
    │   └── sleep.ts
├── tsconfig.json
└── yarn.lock


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | .pnpm-debug.log*
  9 | tmp
 10 | debug
 11 | 
 12 | # Diagnostic reports (https://nodejs.org/api/report.html)
 13 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 14 | 
 15 | # Runtime data
 16 | pids
 17 | *.pid
 18 | *.seed
 19 | *.pid.lock
 20 | 
 21 | .DS_Store
 22 | 
 23 | # Directory for instrumented libs generated by jscoverage/JSCover
 24 | lib-cov
 25 | 
 26 | # Coverage directory used by tools like istanbul
 27 | coverage
 28 | *.lcov
 29 | 
 30 | # nyc test coverage
 31 | .nyc_output
 32 | 
 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 34 | .grunt
 35 | 
 36 | # Bower dependency directory (https://bower.io/)
 37 | bower_components
 38 | 
 39 | # node-waf configuration
 40 | .lock-wscript
 41 | 
 42 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 43 | build/Release
 44 | 
 45 | # Dependency directories
 46 | node_modules/
 47 | jspm_packages/
 48 | 
 49 | # Snowpack dependency directory (https://snowpack.dev/)
 50 | web_modules/
 51 | 
 52 | # TypeScript cache
 53 | *.tsbuildinfo
 54 | 
 55 | # Optional npm cache directory
 56 | .npm
 57 | 
 58 | # Optional eslint cache
 59 | .eslintcache
 60 | 
 61 | # Optional stylelint cache
 62 | .stylelintcache
 63 | 
 64 | # Microbundle cache
 65 | .rpt2_cache/
 66 | .rts2_cache_cjs/
 67 | .rts2_cache_es/
 68 | .rts2_cache_umd/
 69 | 
 70 | # Optional REPL history
 71 | .node_repl_history
 72 | 
 73 | # Output of 'npm pack'
 74 | *.tgz
 75 | 
 76 | # Yarn Integrity file
 77 | .yarn-integrity
 78 | 
 79 | # dotenv environment variable files
 80 | .env
 81 | .env.development.local
 82 | .env.test.local
 83 | .env.production.local
 84 | .env.local
 85 | 
 86 | # parcel-bundler cache (https://parceljs.org/)
 87 | .cache
 88 | .parcel-cache
 89 | 
 90 | # Next.js build output
 91 | .next
 92 | out
 93 | 
 94 | # Nuxt.js build / generate output
 95 | .nuxt
 96 | dist
 97 | 
 98 | # Gatsby files
 99 | .cache/
100 | # Comment in the public line in if your project uses Gatsby and not Next.js
101 | # https://nextjs.org/blog/next-9-1#public-directory-support
102 | # public
103 | 
104 | # vuepress build output
105 | .vuepress/dist
106 | 
107 | # vuepress v2.x temp and cache directory
108 | .temp
109 | .cache
110 | 
111 | # Docusaurus cache and generated files
112 | .docusaurus
113 | 
114 | # Serverless directories
115 | .serverless/
116 | 
117 | # FuseBox cache
118 | .fusebox/
119 | 
120 | # DynamoDB Local files
121 | .dynamodb/
122 | 
123 | # TernJS port file
124 | .tern-port
125 | 
126 | # Stores VSCode versions used for testing VSCode extensions
127 | .vscode-test
128 | 
129 | # yarn v2
130 | .yarn/cache
131 | .yarn/unplugged
132 | .yarn/build-state.yml
133 | .yarn/install-state.gz
134 | .pnp.*
135 | 
136 | .ignore
137 | extensions
138 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | GNU AFFERO GENERAL PUBLIC LICENSE
 2 | Version 3, 19 November 2007
 3 | 
 4 | Copyright (c) 2025 S2 Labs Inc.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Affero General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Affero General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Affero General Public License
17 | along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 |   <img src="assets/hyperagent-banner.png" alt="Hyperagent Banner" width="800"/>
  3 | 
  4 |   <p align="center">
  5 |     <strong>Intelligent Browser Automation with LLMs</strong>
  6 |   </p>
  7 | 
  8 |   <p align="center">
  9 |     <a href="https://www.npmjs.com/package/@hyperbrowser/agent">
 10 |       <img src="https://img.shields.io/npm/v/@hyperbrowser/agent?style=flat-square" alt="npm version" />
 11 |     </a>
 12 |     <a href="https://github.com/hyperbrowserai/hyperagent/blob/main/LICENSE">
 13 |       <img src="https://img.shields.io/npm/l/@hyperbrowser/agent?style=flat-square" alt="license" />
 14 |     </a>
 15 |     <a href="https://discord.gg/zsYzsgVRjh" style="text-decoration:none;">
 16 |       <img alt="Discord" src="https://img.shields.io/discord/1313014141165764619?style=flat-square&color=blue">
 17 |     </a>
 18 |     <a href="https://x.com/AkshayShekhaw12">
 19 |       <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/AkshayShekhaw12?style=social">
 20 |     </a>
 21 |   </p>
 22 | </div>
 23 | 
 24 | ## Overview
 25 | 
 26 | Hyperagent is Playwright supercharged with AI. No more brittle scripts, just powerful natural language commands.
 27 | Just looking for scalable headless browsers or scraping infra? Go to [Hyperbrowser](https://app.hyperbrowser.ai/) to get started for free!
 28 | 
 29 | ### Features
 30 | 
 31 | - 🤖 **AI Commands**: Simple APIs like `page.ai()`, `page.extract()` and `executeTask()` for any AI automation
 32 | - ⚡ **Fallback to Regular Playwright**: Use regular Playwright when AI isn't needed
 33 | - 🥷 **Stealth Mode** – Avoid detection with built-in anti-bot patches
 34 | - ☁️ **Cloud Ready** – Instantly scale to hundreds of sessions via [Hyperbrowser](https://app.hyperbrowser.ai/)
 35 | - 🔌 **MCP Client** – Connect to tools like Composio for full workflows (e.g. writing web data to Google Sheets)
 36 | 
 37 | ## Quick Start
 38 | 
 39 | ### Installation
 40 | 
 41 | ```bash
 42 | # Using npm
 43 | npm install @hyperbrowser/agent
 44 | 
 45 | # Using yarn
 46 | yarn add @hyperbrowser/agent
 47 | ```
 48 | 
 49 | ### CLI
 50 | 
 51 | ```bash
 52 | $ npx @hyperbrowser/agent -c "Find a route from Miami to New Orleans, and provide the detailed route information."
 53 | ```
 54 | 
 55 | <p align="center">
 56 |   <img src="assets/flight-schedule.gif" alt="Hyperagent Demo"/>
 57 | </p>
 58 | 
  59 | The CLI supports options for enabling debug mode and for using Hyperbrowser instead of a local browser:
 60 | 
 61 | ```bash
 62 | -d, --debug                       Enable debug mode
 63 | -c, --command <task description>  Command to run
 64 | --hyperbrowser                    Use Hyperbrowser for the browser provider
 65 | ```
 66 | 
 67 | ### Library
 68 | 
 69 | ```typescript
 70 | import { HyperAgent } from "@hyperbrowser/agent";
 71 | import { ChatOpenAI } from "@langchain/openai";
 72 | import { z } from "zod";
 73 | 
 74 | // Initialize the agent
 75 | const agent = new HyperAgent({
 76 |   llm: new ChatOpenAI({
 77 |     openAIApiKey: process.env.OPENAI_API_KEY,
 78 |     modelName: "gpt-4o",
 79 |   }),
 80 | });
 81 | 
 82 | // Execute a task
 83 | const result = await agent.executeTask(
 84 |   "Navigate to amazon.com, search for 'laptop', and extract the prices of the first 5 results"
 85 | );
 86 | console.log(result.output);
 87 | 
 88 | // Use page.ai and page.extract
 89 | const page = await agent.newPage();
 90 | await page.goto("https://flights.google.com", { waitUntil: "load" });
 91 | await page.ai("search for flights from Rio to LAX from July 16 to July 22");
 92 | const res = await page.extract(
 93 |   "give me the flight options",
 94 |   z.object({
 95 |     flights: z.array(
 96 |       z.object({
 97 |         price: z.number(),
 98 |         departure: z.string(),
 99 |         arrival: z.string(),
100 |       })
101 |     ),
102 |   })
103 | );
104 | console.log(res);
105 | 
106 | // Clean up
107 | await agent.closeAgent();
108 | ```
109 | 
110 | ## ☁️ Cloud
111 | 
112 | You can scale HyperAgent with cloud headless browsers using Hyperbrowser
113 | 
 114 | 1. Get a free API key from [Hyperbrowser](https://app.hyperbrowser.ai/)
115 | 2. Add it to your env as `HYPERBROWSER_API_KEY`
116 | 3. Set your `browserProvider` to `"Hyperbrowser"`
117 | 
118 | ```typescript
119 | const agent = new HyperAgent({
120 |   browserProvider: "Hyperbrowser",
121 | });
122 | 
123 | const response = await agent.executeTask(
124 |   "Go to hackernews, and list me the 5 most recent article titles"
125 | );
126 | 
127 | console.log(response);
128 | await agent.closeAgent();
129 | ```
130 | 
131 | ## Usage Guide
132 | 
133 | ### Multi-Page Management
134 | 
135 | ```typescript
136 | // Create and manage multiple pages
137 | const page1 = await agent.newPage();
138 | const page2 = await agent.newPage();
139 | 
140 | // Execute tasks on specific pages
141 | const page1Response = await page1.ai(
142 |   "Go to google.com/travel/explore and set the starting location to New York. Then, return to me the first recommended destination that shows up. Return to me only the name of the location."
143 | );
144 | const page2Response = await page2.ai(
145 |   `I want to plan a trip to ${page1Response.output}. Recommend me places to visit there.`
146 | );
147 | 
148 | console.log(page2Response.output);
149 | 
150 | // Get all active pages
151 | const pages = await agent.getPages();
152 | await agent.closeAgent();
153 | ```
154 | 
155 | ## Customization
156 | 
157 | ### Output Schema Definition
158 | 
159 | HyperAgent can extract data in a specified schema. The schema can be passed in at a per-task level
160 | 
161 | ```typescript
162 | import { z } from "zod";
163 | 
164 | const agent = new HyperAgent();
165 | const agentResponse = await agent.executeTask(
166 |   "Navigate to imdb.com, search for 'The Matrix', and extract the director, release year, and rating",
167 |   {
168 |     outputSchema: z.object({
169 |       director: z.string().describe("The name of the movie director"),
170 |       releaseYear: z.number().describe("The year the movie was released"),
171 |       rating: z.string().describe("The IMDb rating of the movie"),
172 |     }),
173 |   }
174 | );
175 | console.log(agentResponse.output);
176 | await agent.closeAgent();
177 | ```
178 | 
 179 | ```json
180 | {
181 |   "director": "Lana Wachowski, Lilly Wachowski",
182 |   "releaseYear": 1999,
183 |   "rating": "8.7/10"
184 | }
185 | ```
186 | 
187 | ### Using Different LLM Providers
188 | 
 189 | Hyperagent supports multiple LLM providers. A provider can be any class that extends the LangChain `BaseChatModel` class.
190 | 
191 | ```typescript
192 | // Using OpenAI
193 | const agent = new HyperAgent({
194 |   llm: new ChatOpenAI({
195 |     openAIApiKey: process.env.OPENAI_API_KEY,
196 |     modelName: "gpt-4o",
197 |   }),
198 | });
199 | 
200 | // Using Anthropic's Claude
201 | const agent = new HyperAgent({
202 |   llm: new ChatAnthropic({
203 |     anthropicApiKey: process.env.ANTHROPIC_API_KEY,
204 |     modelName: "claude-3-7-sonnet-latest",
205 |   }),
206 | });
207 | ```
208 | 
209 | ### MCP Support
210 | 
211 | HyperAgent functions as a fully functional MCP client. For best results, we recommend using
212 | `gpt-4o` as your LLM.
213 | 
 214 | Here is an example that reads from Wikipedia and inserts the information into a Google Sheet using the Composio Google Sheets MCP server. For the full example, see [here](https://github.com/hyperbrowserai/HyperAgent/tree/main/examples/mcp/google-sheets/most-populated-states.ts).
215 | 
216 | ```typescript
217 | const agent = new HyperAgent({
218 |   llm: llm,
219 |   debug: true,
220 | });
221 | 
222 | await agent.initializeMCPClient({
223 |   servers: [
224 |     {
225 |       command: "npx",
226 |       args: [
227 |         "@composio/mcp@latest",
228 |         "start",
229 |         "--url",
230 |         "https://mcp.composio.dev/googlesheets/...",
231 |       ],
232 |       env: {
233 |         npm_config_yes: "true",
234 |       },
235 |     },
236 |   ],
237 | });
238 | 
239 | const response = await agent.executeTask(
240 |   "Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and get the data on the top 5 most populous states from the table. Then insert that data into a google sheet. You may need to first check if there is an active connection to google sheet, and if there isn't connect to it and present me with the link to sign in. "
241 | );
242 | 
243 | console.log(response);
244 | await agent.closeAgent();
245 | ```
246 | 
247 | ### Custom Actions
248 | 
249 | HyperAgent's capabilities can be extended with custom actions. Custom actions require 3 things:
250 | 
251 | - type: Name of the action. Should be something descriptive about the action.
252 | - actionParams: A zod object describing the parameters that the action may consume.
253 | - run: A function that takes in a context, and the params for the action and produces a result based on the params.
254 | 
255 | Here is an example that performs a search using Exa
256 | 
257 | ```typescript
258 | const exaInstance = new Exa(process.env.EXA_API_KEY);
259 | 
260 | export const RunSearchActionDefinition: AgentActionDefinition = {
261 |   type: "perform_search",
262 |   actionParams: z.object({
263 |     search: z
264 |       .string()
265 |       .describe(
266 |         "The search query for something you want to search about. Keep the search query concise and to-the-point."
267 |       ),
 268 |   }).describe("Search and return the results for a given query."),
269 |   run: async function (
270 |     ctx: ActionContext,
 271 |     params: { search: string }
272 |   ): Promise<ActionOutput> {
273 |     const results = (await exaInstance.search(params.search, {})).results
274 |       .map(
275 |         (res) =>
276 |           `title: ${res.title} || url: ${res.url} || relevance: ${res.score}`
277 |       )
278 |       .join("\n");
279 | 
280 |     return {
281 |       success: true,
 282 |       message: `Successfully performed search for query ${params.search}. Got results: \n${results}`,
283 |     };
284 |   },
285 | };
286 | 
 287 | const agent = new HyperAgent({
 288 |   customActions: [RunSearchActionDefinition],
 289 | });
 290 | await agent.executeTask("Search about the news for today in New York");
291 | ```
292 | 
293 | ## Contributing
294 | 
295 | We welcome contributions to Hyperagent! Here's how you can help:
296 | 
297 | 1. Fork the repository
298 | 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
299 | 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
300 | 4. Push to the branch (`git push origin feature/AmazingFeature`)
301 | 5. Open a Pull Request
302 | 
303 | ## Support
304 | 
305 | - 📚 [Documentation](https://docs.hyperbrowser.ai/hyperagent/about-hyperagent)
306 | - 💬 [Discord Community](https://discord.gg/zsYzsgVRjh)
307 | - 🐛 [Issue Tracker](https://github.com/hyperbrowserai/HyperAgent/issues)
308 | - 📧 [Email Support](mailto:info@hyperbrowser.ai)
309 | 


--------------------------------------------------------------------------------
/assets/flight-schedule.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyperbrowserai/HyperAgent/138076315fc49580c6955f2de6ce231a490be394/assets/flight-schedule.gif


--------------------------------------------------------------------------------
/assets/hyperagent-banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyperbrowserai/HyperAgent/138076315fc49580c6955f2de6ce231a490be394/assets/hyperagent-banner.png


--------------------------------------------------------------------------------
/cli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Launcher for the compiled CLI. Resolves the real directory of this script
3 | # (realpath follows symlinks) and runs dist/cli/index.js from there, forwarding
4 | # every argument unchanged.
5 | # The inner command substitution is quoted so paths containing spaces survive.
6 | SCRIPT_DIR="$(cd "$(dirname "$(realpath "$0")")" && pwd)"
7 | # NODE_OPTIONS is set for this single node invocation only.
8 | NODE_OPTIONS="--no-deprecation" node "$SCRIPT_DIR/dist/cli/index.js" "$@"


--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
 1 | import typescriptEslint from "@typescript-eslint/eslint-plugin";
 2 | import tsParser from "@typescript-eslint/parser";
 3 | import path from "node:path";
 4 | import { fileURLToPath } from "node:url";
 5 | import js from "@eslint/js";
 6 | import { FlatCompat } from "@eslint/eslintrc";
 7 | // Recreate __filename/__dirname from import.meta.url; __dirname is passed to FlatCompat as its base directory.
 8 | const __filename = fileURLToPath(import.meta.url);
 9 | const __dirname = path.dirname(__filename);
10 | const compat = new FlatCompat({ // used below to pull in eslintrc-style shareable configs via extends()
11 |   baseDirectory: __dirname,
12 |   recommendedConfig: js.configs.recommended,
13 |   allConfig: js.configs.all
14 | });
15 | // Exported flat config array: the extended presets first, then an entry registering the TypeScript plugin and parser.
16 | export default [
17 |   ...compat.extends("eslint:recommended", "plugin:@typescript-eslint/recommended", "prettier"),
18 |   {
19 |     plugins: {
20 |       "@typescript-eslint": typescriptEslint,
21 |     },
22 | 
23 |     languageOptions: {
24 |       parser: tsParser,
25 |     },
26 |   },
27 | ];


--------------------------------------------------------------------------------
/examples/browser-providers/hyperbrowser.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * # Hyperbrowser Provider Example
 3 |  *
 4 |  * This example demonstrates how to configure and use HyperAgent with the Hyperbrowser
 5 |  * provider for web browsing tasks with proxy support.
 6 |  *
 7 |  * ## What This Example Does
 8 |  *
 9 |  * The agent performs a simple web search task that:
10 |  * 1. Configures HyperAgent with Hyperbrowser-specific settings
11 |  * 2. Enables proxy support for enhanced privacy and reliability
12 |  * 3. Searches for and extracts specific information about a movie release date
13 |  *
14 |  * ## Prerequisites
15 |  *
16 |  * 1. Node.js environment
 17 |  * 2. OpenAI and Hyperbrowser API keys set in your .env file (OPENAI_API_KEY, HYPERBROWSER_API_KEY)
18 |  *
19 |  * ## Running the Example
20 |  *
21 |  * ```bash
 22 |  * yarn ts-node -r tsconfig-paths/register examples/browser-providers/hyperbrowser.ts
23 |  * ```
24 |  */
25 | 
26 | import "dotenv/config";
27 | import { HyperAgent } from "@hyperbrowser/agent";
28 | import { ChatOpenAI } from "@langchain/openai";
29 | import chalk from "chalk";
30 | 
31 | /**
32 |  * Runs one task on a HyperAgent that uses the Hyperbrowser provider (proxy
33 |  * enabled), logging each agent output and step, then prints and returns the
34 |  * task result.
35 |  *
36 |  * The agent is closed in a `finally` block, so it is closed even when
37 |  * `executeTask` rejects.
38 |  */
39 | async function runEval() {
40 |   const llm = new ChatOpenAI({
41 |     apiKey: process.env.OPENAI_API_KEY,
42 |     model: "gpt-4o",
43 |   });
44 | 
45 |   const agent = new HyperAgent({
46 |     llm: llm,
47 |     debug: true,
48 |     browserProvider: "Hyperbrowser",
49 |     hyperbrowserConfig: {
50 |       hyperbrowserSessionOptions: {
51 |         useProxy: true,
52 |       },
53 |     },
54 |   });
55 | 
56 |   try {
57 |     const result = await agent.executeTask(
58 |       "Find the initial release date for Guardians of the Galaxy Vol. 3 the movie",
59 |       {
60 |         debugOnAgentOutput: (agentOutput) => {
61 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
62 |           console.dir(agentOutput, { depth: null, colors: true });
63 |           console.log(chalk.cyan.bold("===============") + "\n");
64 |         },
65 |         onStep: (step) => {
66 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
67 |           console.dir(step, { depth: null, colors: true });
68 |           console.log(chalk.cyan.bold("===============") + "\n");
69 |         },
70 |       }
71 |     );
72 |     console.log(chalk.green.bold("\nResult:"));
73 |     console.log(chalk.white(result.output));
74 |     return result;
75 |   } finally {
76 |     // Close the agent whether the task succeeded or threw.
77 |     await agent.closeAgent();
78 |   }
79 | }
80 | 
81 | // Entry point: run the example; on failure log the error and exit with status 1.
82 | (async () => {
83 |   await runEval();
84 | })().catch((error) => {
85 |   console.error(chalk.red("Error:"), error);
86 |   process.exit(1);
87 | });
88 | 


--------------------------------------------------------------------------------
/examples/custom-tool/search/exa.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Custom Search Tool Example with Exa
  3 |  * 
  4 |  * This example demonstrates how to create and use a custom search tool with HyperAgent
  5 |  * using the Exa search API to perform web searches and process the results.
  6 |  * 
  7 |  * ## What This Example Does
  8 |  * 
  9 |  * The agent performs a multi-step task that showcases custom tool integration:
 10 |  * 1. Defines a custom search action using the Exa API
 11 |  * 2. Creates a schema for the search parameters using Zod
 12 |  * 3. Implements a search function that returns formatted results with titles, URLs, and relevance scores
 13 |  * 4. Demonstrates the tool usage with a complex travel planning task for Tokyo that:
 14 |  *    - Searches for relevant information about Tokyo attractions
 15 |  *    - Analyzes search results and filters for relevance
 16 |  *    - Navigates to selected URLs to extract detailed information
 17 |  *    - Compiles recommendations based on uniqueness and frequency
 18 |  * 
 19 |  * ## Prerequisites
 20 |  * 
 21 |  * 1. Node.js environment
 22 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 23 |  * 3. Exa API key set in your .env file (EXA_API_KEY)
 24 |  * 
 25 |  * ## Custom Tool Configuration
 26 |  * 
 27 |  * The example includes:
 28 |  * - Custom search action definition with Zod schema validation
 29 |  * - Integration with Exa search API
 30 |  * - Formatted result output with relevance scoring
 31 |  * 
 32 |  * ## Running the Example
 33 |  * 
 34 |  * ```bash
 35 |  * yarn ts-node -r tsconfig-paths/register examples/custom-tool/search/exa.ts
 36 |  * ```
 37 |  * 
 38 |  * ## Example Output
 39 |  * 
 40 |  * The final output will include a detailed trip plan for Tokyo based on
 41 |  * searched and analyzed web content, with recommended places and their details.
 42 |  */
 43 | 
 44 | import "dotenv/config";
 45 | import HyperAgent from "@hyperbrowser/agent";
 46 | import {
 47 |   AgentActionDefinition,
 48 |   ActionContext,
 49 |   ActionOutput,
 50 | } from "@hyperbrowser/agent/types";
 51 | import chalk from "chalk";
 52 | import { ChatOpenAI } from "@langchain/openai";
 53 | import Exa from "exa-js";
 54 | 
 55 | import * as z from "zod";
 56 | // Exa client used by the search action in this file; constructed with the EXA_API_KEY environment variable.
 57 | const exaInstance = new Exa(process.env.EXA_API_KEY);
 58 | /** Parameter schema for the search action: an object with a single string field, `search`. */
 59 | const searchSchema = z
 60 |   .object({
 61 |     search: z
 62 |       .string()
 63 |       .describe(
 64 |         "The search query for something you want to search about. Keep the search query concise and to-the-point."
 65 |       ),
 66 |   })
 67 |   .describe("Search and return the results for a given query.");
 68 | /** Custom `perform_search` action: sends the query to Exa and returns one "title || url || relevance" line per result. */
 69 | export const RunSearchActionDefinition: AgentActionDefinition = {
 70 |   type: "perform_search",
 71 |   actionParams: searchSchema,
 72 |   run: async function (
 73 |     ctx: ActionContext, // not used by this action
 74 |     params: z.infer<typeof searchSchema>
 75 |   ): Promise<ActionOutput> {
 76 |     const results = (await exaInstance.search(params.search, {})).results
 77 |       .map(
 78 |         (res) =>
 79 |           `title: ${res.title} || url: ${res.url} || relevance: ${res.score}`
 80 |       )
 81 |       .join("\n"); // one result per line
 82 | 
 83 |     return {
 84 |       success: true,
 85 |       message: `Succesfully performed search for query ${params.search}. Got results: \n${results}`,
 86 |     };
 87 |   },
 88 | };
 89 | 
 90 | /**
 91 |  * Creates a HyperAgent with the `perform_search` custom action registered,
 92 |  * asks it for a Tokyo trip plan while logging each agent output and step,
 93 |  * then prints and returns the task result.
 94 |  *
 95 |  * The agent is closed in a `finally` block, so it is closed even when
 96 |  * `executeTask` rejects.
 97 |  */
 98 | async function runEval() {
 99 |   console.log(chalk.cyan.bold("\n===== Running Custom Tool Example ====="));
100 | 
101 |   const llm = new ChatOpenAI({
102 |     apiKey: process.env.OPENAI_API_KEY,
103 |     model: "gpt-4o",
104 |   });
105 | 
106 |   const agent = new HyperAgent({
107 |     llm: llm,
108 |     debug: true,
109 |     customActions: [RunSearchActionDefinition],
110 |   });
111 | 
112 |   try {
113 |     // The prompt's continuation lines keep their original indentation:
114 |     // whitespace inside a template literal is part of the string.
115 |     const result = await agent.executeTask(
116 |       `Make me a trip plan for Tokyo. 
117 |     Steps:
118 |     
119 |     - Peform search about the place and things to see there using the 'perform_search' tool.
120 |     - Analyze part of the urls provided, filtering results for relevance, and information and collecting a subset of urls that you think warrant further examination.
121 |     - For each page that you've 
122 |         - Navigate to that url
123 |         - Extract information about trip recommendations
124 |         - You must do this in order. Navigate to a single page, and then perform extraction on that page. Do not perform multiple navigations one after another.
125 |     - Narrow down on places based on uniqueness, frequency of recommendation, and whatever else you feel is valuable.
126 |     - Return to me a list of places you recommend, and their details (if any)`,
127 |       {
128 |         debugOnAgentOutput: (agentOutput) => {
129 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
130 |           console.dir(agentOutput, { depth: null, colors: true });
131 |           console.log(chalk.cyan.bold("===============") + "\n");
132 |         },
133 |         onStep: (step) => {
134 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
135 |           console.dir(step, { depth: null, colors: true });
136 |           console.log(chalk.cyan.bold("===============") + "\n");
137 |         },
138 |       }
139 |     );
140 |     console.log(chalk.green.bold("\nResult:"));
141 |     console.log(chalk.white(result.output));
142 |     return result;
143 |   } finally {
144 |     // Close the agent whether the task succeeded or threw.
145 |     await agent.closeAgent();
146 |   }
147 | }
148 | 
149 | // Entry point: run the example; on failure log the error and exit with status 1.
150 | (async () => {
151 |   await runEval();
152 | })().catch((error) => {
153 |   console.error(chalk.red("Error:"), error);
154 |   process.exit(1);
155 | });
156 | 


--------------------------------------------------------------------------------
/examples/custom-tool/wikipedia-random-article/run-custom-tool.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Custom Wikipedia Random Article Tool Example
  3 |  * 
  4 |  * This example demonstrates how to create a simple custom tool for HyperAgent
  5 |  * that navigates to random Wikipedia articles and extracts their content.
  6 |  * 
  7 |  * ## What This Example Does
  8 |  * 
  9 |  * The agent performs a straightforward task using a custom tool that:
 10 |  * 1. Defines a custom action to navigate to Wikipedia's random article page
 11 |  * 2. Retrieves the page title and URL
 12 |  * 3. Extracts and describes the content of the randomly selected article
 13 |  * 
 14 |  * ## Prerequisites
 15 |  * 
 16 |  * 1. Node.js environment
 17 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 18 |  * 
 19 |  * ## Running the Example
 20 |  * 
 21 |  * ```bash
 22 |  * yarn ts-node -r tsconfig-paths/register examples/custom-tool/wikipedia-random-article/run-custom-tool.ts
 23 |  * ```
 24 |  */
 25 | 
 26 | import "dotenv/config";
 27 | import { HyperAgent } from "@hyperbrowser/agent";
 28 | import {
 29 |   AgentActionDefinition,
 30 |   ActionContext,
 31 |   ActionOutput,
 32 | } from "@hyperbrowser/agent/types";
 33 | import chalk from "chalk";
 34 | import { ChatOpenAI } from "@langchain/openai";
 35 | 
 36 | import * as z from "zod";
 37 | /** Custom action that opens Wikipedia's Special:Random URL and reports the URL and title of the page it ends up on. */
 38 | export const GoToWikipediaActionDefinition: AgentActionDefinition = {
 39 |   type: "go_to_random_wikipedia_page",
 40 |   actionParams: z
 41 |     .object({}) // the action takes no parameters
 42 |     .describe(
 43 |       "Navigate to a random wikipedia page and return the title and url of the page."
 44 |     ),
 45 |   run: async function (ctx: ActionContext): Promise<ActionOutput> {
 46 |     await ctx.page.goto("https://en.wikipedia.org/wiki/Special:Random", {
 47 |       waitUntil: "domcontentloaded",
 48 |     });
 49 | 
 50 |     const url = ctx.page.url(); // read after goto() has resolved
 51 |     const title = await ctx.page.title();
 52 |     return {
 53 |       success: true,
 54 |       message: `Succesfully navigated to URL: ${url} and title: ${title}`,
 55 |     };
 56 |   },
 57 | };
 58 | /** Runs the custom-tool demo: one task on a gpt-4o agent with the Wikipedia action; returns the task result. */
 59 | async function runEval() {
 60 |   console.log(chalk.cyan.bold("\n===== Running Custom Tool Example ====="));
 61 | 
 62 |   // The chat model is built inline; the custom action is registered next to it.
 63 |   const hyperAgent = new HyperAgent({
 64 |     llm: new ChatOpenAI({
 65 |       apiKey: process.env.OPENAI_API_KEY,
 66 |       model: "gpt-4o",
 67 |     }),
 68 |     debug: true,
 69 |     customActions: [GoToWikipediaActionDefinition],
 70 |   });
 71 | 
 72 |   const prompt =
 73 |     "Navigate to a random wikipedia page, and describe to me the contents of that page.";
 74 | 
 75 |   const outcome = await hyperAgent.executeTask(prompt, {
 76 |     debugOnAgentOutput: (agentOutput) => {
 77 |       console.log(`\n${chalk.cyan.bold("===== AGENT OUTPUT =====")}`);
 78 |       console.dir(agentOutput, { depth: null, colors: true });
 79 |       console.log(`${chalk.cyan.bold("===============")}\n`);
 80 |     },
 81 |     onStep: (step) => {
 82 |       console.log(`\n${chalk.cyan.bold(`===== STEP ${step.idx} =====`)}`);
 83 |       console.dir(step, { depth: null, colors: true });
 84 |       console.log(`${chalk.cyan.bold("===============")}\n`);
 85 |     },
 86 |   });
 87 |   // Close the agent first, then print the final answer.
 88 |   await hyperAgent.closeAgent();
 89 |   console.log(chalk.green.bold("\nResult:"));
 90 |   console.log(chalk.white(outcome.output));
 91 |   return outcome;
 92 | }
 93 | 
 94 | // Entry point: run the example; if it rejects, print the error in red and exit.
 95 | runEval().catch((failure) => {
 96 |   console.error(chalk.red("Error:"), failure);
 97 |   // Exit status 1 marks the run as failed.
 98 |   process.exit(1);
 99 | });
100 | 


--------------------------------------------------------------------------------
/examples/llms/anthropic.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * # Anthropic LLM Integration Example
 3 |  *
 4 |  * This example demonstrates how to configure and use HyperAgent with Anthropic's
 5 |  * Claude language models for web automation tasks.
 6 |  *
 7 |  * ## What This Example Does
 8 |  *
 9 |  * The agent performs a web scraping task that:
10 |  * 1. Configures HyperAgent with Anthropic's Claude 3.7 Sonnet model
11 |  * 2. Navigates to Hacker News
12 |  * 3. Searches for and extracts information about "Show HN" posts
13 |  *
14 |  * ## Prerequisites
15 |  *
16 |  * 1. Node.js environment
17 |  * 2. Anthropic API key set in your .env file (ANTHROPIC_API_KEY)
18 |  *
19 |  * ## Running the Example
20 |  *
21 |  * ```bash
22 |  * yarn ts-node -r tsconfig-paths/register examples/llms/anthropic.ts
23 |  * ```
24 |  */
25 | 
26 | import "dotenv/config";
27 | import HyperAgent from "@hyperbrowser/agent";
28 | 
29 | import chalk from "chalk";
30 | import { ChatAnthropic } from "@langchain/anthropic";
31 | // Prompt passed unchanged to agent.executeTask() inside runEval().
32 | const TASK =
33 |   "Go to hackernews, and find if there's any SHOW HN post up there. If it is, then tell me the title of the post.";
34 | /** Runs TASK on a HyperAgent backed by Claude 3.7 Sonnet, printing debug output; returns the task result. */
35 | async function runEval() {
36 |   // The Anthropic chat model is built inline as the agent's `llm` option.
37 |   const hyperAgent = new HyperAgent({
38 |     llm: new ChatAnthropic({
39 |       apiKey: process.env.ANTHROPIC_API_KEY,
40 |       model: "claude-3-7-sonnet-latest",
41 |     }),
42 |   });
43 | 
44 |   console.log("\n" + chalk.green("Running agent with Claude Sonnet 3.7") + "\n");
45 | 
46 |   const outcome = await hyperAgent.executeTask(TASK, {
47 |     debugOnAgentOutput: (agentOutput) => {
48 |       console.log(`\n${chalk.cyan.bold("===== AGENT OUTPUT =====")}`);
49 |       console.dir(agentOutput, { depth: null, colors: true });
50 |       console.log(`${chalk.cyan.bold("===============")}\n`);
51 |     },
52 |     onStep: (step) => {
53 |       console.log(`\n${chalk.cyan.bold(`===== STEP ${step.idx} =====`)}`);
54 |       console.dir(step, { depth: null, colors: true });
55 |       console.log(`${chalk.cyan.bold("===============")}\n`);
56 |     },
57 |   });
58 |   // Close the agent first, then print the final answer.
59 |   await hyperAgent.closeAgent();
60 |   console.log(chalk.green.bold("\nResult:"));
61 |   console.log(chalk.white(outcome.output));
62 |   return outcome;
63 | }
64 | 
65 | // Entry point: run the example; if it rejects, print the error in red and exit.
66 | runEval().catch((failure) => {
67 |   console.error(chalk.red("Error:"), failure);
68 |   // Exit status 1 marks the run as failed.
69 |   process.exit(1);
70 | });
71 | 


--------------------------------------------------------------------------------
/examples/llms/openai.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * # OpenAI LLM Integration Example
 3 |  *
 4 |  * This example demonstrates how to configure and use HyperAgent with OpenAI's
 5 |  * language models for web automation tasks.
 6 |  *
 7 |  * ## What This Example Does
 8 |  *
 9 |  * The agent performs a web scraping task that:
10 |  * 1. Configures HyperAgent with OpenAI's GPT-4o model
11 |  * 2. Navigates to Hacker News
12 |  * 3. Searches for and extracts information about "Show HN" posts
13 |  *
14 |  * ## Prerequisites
15 |  *
16 |  * 1. Node.js environment
17 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
18 |  *
19 |  * ## Running the Example
20 |  *
21 |  * ```bash
22 |  * yarn ts-node -r tsconfig-paths/register examples/llms/openai.ts
23 |  * ```
24 |  */
25 | 
26 | import "dotenv/config";
27 | import HyperAgent from "@hyperbrowser/agent";
28 | 
29 | import chalk from "chalk";
30 | import { ChatOpenAI } from "@langchain/openai";
31 | // Prompt passed unchanged to agent.executeTask() inside runEval().
32 | const TASK =
33 |   "Go to hackernews, and find if there's any SHOW HN post up there. If it is, then tell me the title of the post.";
34 | /** Runs TASK on a debug-mode HyperAgent backed by gpt-4o, printing debug output; returns the task result. */
35 | async function runEval() {
36 |   // The OpenAI chat model is built inline as the agent's `llm` option.
37 |   const hyperAgent = new HyperAgent({
38 |     llm: new ChatOpenAI({
39 |       apiKey: process.env.OPENAI_API_KEY,
40 |       model: "gpt-4o",
41 |     }),
42 |     debug: true,
43 |   });
44 | 
45 |   console.log("\n" + chalk.green("Running agent with GPT-4o") + "\n");
46 | 
47 |   const outcome = await hyperAgent.executeTask(TASK, {
48 |     debugOnAgentOutput: (agentOutput) => {
49 |       console.log(`\n${chalk.cyan.bold("===== AGENT OUTPUT =====")}`);
50 |       console.dir(agentOutput, { depth: null, colors: true });
51 |       console.log(`${chalk.cyan.bold("===============")}\n`);
52 |     },
53 |     onStep: (step) => {
54 |       console.log(`\n${chalk.cyan.bold(`===== STEP ${step.idx} =====`)}`);
55 |       console.dir(step, { depth: null, colors: true });
56 |       console.log(`${chalk.cyan.bold("===============")}\n`);
57 |     },
58 |   });
59 |   // Close the agent first, then print the final answer.
60 |   await hyperAgent.closeAgent();
61 |   console.log(chalk.green.bold("\nResult:"));
62 |   console.log(chalk.white(outcome.output));
63 |   return outcome;
64 | }
65 | 
66 | // Entry point: run the example; if it rejects, print the error in red and exit.
67 | runEval().catch((failure) => {
68 |   console.error(chalk.red("Error:"), failure);
69 |   // Exit status 1 marks the run as failed.
70 |   process.exit(1);
71 | });
72 | 


--------------------------------------------------------------------------------
/examples/mcp/google-sheets/best-buy-reviews.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Google Sheets MCP Server Example
  3 |  *
  4 |  * This example demonstrates how to use HyperAgent with the Composio Googlesheets MCP server
  5 |  * to connect to Google Sheets, create a new spreadsheet, and populate it with data scraped from the web.
  6 |  *
  7 |  * ## What This Example Does
  8 |  *
  9 |  * The agent performs a multi-step task that requires web browsing and Google Sheets integration:
 10 |  * 1. Checks if there is an active connection to Composio Googlesheets MCP server
 11 |  * 2. If no connection exists, initiates a connection and waits for the user to authenticate
 12 |  * 3. Creates a new spreadsheet titled "BestBuy Reviews"
 13 |  * 4. Navigates to BestBuy to gather data on the reviews for the MacBook Air M2
 14 |  * 5. Adds the data to the created spreadsheet
 15 |  *
 16 |  * ## Prerequisites
 17 |  *
 18 |  * 1. Node.js environment
 19 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 20 |  * 3. Need to have a Composio account, can sign up at https://app.composio.dev
 21 |  *    - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets
 22 |  *    - You will use the url to run the script, for example:
 23 |  *    ```
 24 |  *    yarn ts-node -r tsconfig-paths/register examples/mcp/google-sheets/best-buy-reviews.ts <your-mcp-url>
 25 |  *    ```
 26 |  *    - When running for the first time, there will be no active connection so you will need to login
 27 |  *      with Google OAUTH at the link provided by the agent to authenticate
 28 |  *
 29 |  * ## MCP Server Configuration
 30 |  *
 31 |  * This example uses the Composio Googlesheets MCP server which provides tools for:
 32 |  * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets
 33 |  * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets
 34 |  * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet
 35 |  * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format
 36 |  * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet
 37 |  * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: Retrieves information about a spreadsheet
 38 |  * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in a spreadsheet
 39 |  * - `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet
 40 |  * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet
 41 |  * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet
 42 |  * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations
 43 |  *
 44 |  * ## Debugging and Monitoring
 45 |  *
 46 |  * The example includes callback functions to monitor:
 47 |  * - Agent output: Raw output from the LLM agent
 48 |  * - Step execution: Each step the agent takes during the task
 49 |  *
 50 |  * ## Running the Example
 51 |  *
 52 |  * ```
 53 |  * yarn ts-node examples/mcp/google-sheets/best-buy-reviews.ts <your-mcp-url>
 54 |  * ```
 55 |  *
 56 |  * ## Example Output
 57 |  *
 58 |  * The final output will include confirmation that the agent has successfully created a new Google Sheet
 59 |  * and populated it with information about the reviews for the MacBook Air M2.
 60 |  */
 61 | 
 62 | import dotenv from "dotenv";
 63 | import chalk from "chalk";
 64 | import { ChatOpenAI } from "@langchain/openai";
 65 | import HyperAgent from "@hyperbrowser/agent";
 66 | 
 67 | dotenv.config(); // populate process.env (e.g. OPENAI_API_KEY) from the .env file
 68 | // Numbered instructions handed to the agent as-is; steps 1-3 establish the Google Sheets connection.
 69 | const TASK = `1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection.
 70 | 2. If there is an active connection, go to 4. Otherwise, go to 3.
 71 | 3. Run GOOGLESHEETS_INITIATE_CONNECTION and output the auth link to the user, then wait for the connection to be active.
 72 | 4. Create a new spreadsheet titled "BestBuy Reviews".
 73 | 5. Go to https://www.bestbuy.com/site/apple-macbook-air-13-inch-apple-m2-chip-built-for-apple-intelligence-16gb-memory-256gb-ssd-midnight/6602763.p?skuId=6602763 .
 74 | 6. Scroll down until you see the "See All Customer Reviews" button and click on the button.
 75 | 7. Once on the next page, get all the reviews from the first page.
 76 | 8. Add the reviews to the "BestBuy Reviews" spreadsheet. Include these columns and data for all the columns: Review Title, Rating, Review Text, Verified Purchase, and Review Date.
 77 | Make sure that the data is well formatted and the columns are all there, make sure to not cut off any of the full review text, please include it all and to get all the reviews.`;
 78 | /**
 79 |  * Runs TASK through a HyperAgent connected to the Composio MCP server at `mcpUrl`.
 80 |  * Errors are logged, not rethrown; in that case the promise resolves to undefined.
 81 |  */
 82 | async function run(mcpUrl: string) {
 83 |   console.log(chalk.cyan.bold("\n===== Running Task ====="));
 84 |   console.log(chalk.white(`Task: ${TASK}`));
 85 |   console.log(chalk.cyan.bold("=======================\n"));
 86 | 
 87 |   console.log(chalk.yellow("Initializing OpenAI LLM..."));
 88 |   const llm = new ChatOpenAI({
 89 |     apiKey: process.env.OPENAI_API_KEY,
 90 |     model: "gpt-4o",
 91 |   });
 92 | 
 93 |   console.log(chalk.yellow("Creating HyperAgent..."));
 94 |   try {
 95 |     const agent = new HyperAgent({ llm, debug: true });
 96 |     console.log(chalk.green("Agent created successfully"));
 97 |     // Inner try/finally: closeAgent() must run even when connecting or the task throws.
 98 |     try {
 99 |       console.log(
100 |         chalk.yellow("Connecting to Composio Googlesheets MCP server...")
101 |       );
102 |       await agent.initializeMCPClient({
103 |         servers: [
104 |           {
105 |             command: "npx",
106 |             args: ["@composio/mcp@latest", "start", "--url", mcpUrl],
107 |             env: {
108 |               npm_config_yes: "true",
109 |             },
110 |           },
111 |         ],
112 |       });
113 |       console.log(
114 |         chalk.green(
115 |           "Connected to Composio Googlesheets MCP server, executing task..."
116 |         )
117 |       );
118 |       const result = await agent.executeTask(TASK, {
119 |         debugOnAgentOutput: (agentOutput) => {
120 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
121 |           console.dir(agentOutput, { depth: null, colors: true });
122 |           console.log(chalk.cyan.bold("===============") + "\n");
123 |         },
124 |         onStep: (step) => {
125 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
126 |           console.dir(step, { depth: null, colors: true });
127 |           console.log(chalk.cyan.bold("===============") + "\n");
128 |         },
129 |       });
130 |       console.log(chalk.green.bold("\nResult:"));
131 |       console.log(chalk.white(result.output));
132 |       return result;
133 |     } finally {
134 |       await agent.closeAgent();
135 |     }
136 |   } catch (error) {
137 |     console.error(chalk.red.bold("Error creating agent or executing task:"));
138 |     console.error(
139 |       chalk.red(error instanceof Error ? error.stack : String(error))
140 |     );
141 |   }
142 | }
143 | // Entry point: the MCP URL must be the first CLI argument; errors that reach this handler exit with code 1.
144 | void (async function main() {
145 |   try {
146 |     const hasMcpUrl = process.argv.length >= 3;
147 |     if (!hasMcpUrl) {
148 |       const usage = "Error: Please provide your MCP URL as an argument";
149 |       console.error(chalk.red(usage));
150 |       process.exit(1);
151 |     }
152 |     await run(process.argv[2]);
153 |   } catch (err) {
154 |     console.error(chalk.red("Error:"), err);
155 |     process.exit(1);
156 |   }
157 | })();
158 | 


--------------------------------------------------------------------------------
/examples/mcp/google-sheets/car-price-comparison.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Car Price Comparison with Google Sheets MCP Server Example
  3 |  *
  4 |  * This example demonstrates how to use HyperAgent with the Composio Googlesheets MCP server
  5 |  * to connect to Google Sheets and populate it with car price comparison data scraped from multiple websites.
  6 |  *
  7 |  * ## What This Example Does
  8 |  *
  9 |  * The agent performs a multi-step task that requires web browsing and Google Sheets integration:
 10 |  * 1. Checks if there is an active connection to Composio Googlesheets MCP server
 11 |  * 2. If no connection exists, initiates a connection and waits for the user to authenticate
 12 |  * 3. Creates a new spreadsheet titled with the car name and current date
 13 |  * 4. Searches for the specified car (Toyota Corolla) on multiple car comparison sites:
 14 |  *    - Carvana
 15 |  *    - Carmax
 16 |  * 5. Collects the 5 cheapest listings from each site including details like price, mileage, model year, and trim
 17 |  * 6. Adds the collected data to the spreadsheet in a well-formatted manner
 18 |  *
 19 |  * ## Prerequisites
 20 |  *
 21 |  * 1. Node.js environment
 22 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 23 |  * 3. Need to have a Composio account, can sign up at https://app.composio.dev
 24 |  *    - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets
 25 |  *    - You will use the url to run the script, for example:
 26 |  *    ```
 27 |  *    yarn ts-node examples/mcp/google-sheets/car-price-comparison.ts <your-mcp-url>
 28 |  *    ```
 29 |  *    - When running for the first time, there will be no active connection so you will need to login
 30 |  *      with Google OAUTH at the link provided by the agent to authenticate
 31 |  *
 32 |  * ## MCP Server Configuration
 33 |  *
 34 |  * This example uses the Composio Googlesheets MCP server which provides tools for:
 35 |  * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets
 36 |  * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets
 37 |  * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet
 38 |  * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format
 39 |  * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet
 40 |  * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: Retrieves information about a spreadsheet
 41 |  * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in a spreadsheet
 42 |  * - `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet
 43 |  * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet
 44 |  * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet
 45 |  * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations
 46 |  *
 47 |  * ## Debugging and Monitoring
 48 |  *
 49 |  * The example includes callback functions to monitor:
 50 |  * - Agent output: Raw output from the LLM agent
 51 |  * - Step execution: Each step the agent takes during the task
 52 |  *
 53 |  * ## Running the Example
 54 |  *
 55 |  * ```
 56 |  * yarn ts-node examples/mcp/google-sheets/car-price-comparison.ts <your-mcp-url>
 57 |  * ```
 58 |  *
 59 |  * ## Example Output
 60 |  *
 61 |  * The final output will include confirmation that the agent has successfully created a new Google Sheet
 62 |  * and populated it with information about the car prices from different websites.
 63 |  */
 64 | 
 65 | import dotenv from "dotenv";
 66 | import chalk from "chalk";
 67 | import { ChatOpenAI } from "@langchain/openai";
 68 | import HyperbrowserAgent from "@hyperbrowser/agent";
 69 | 
 70 | dotenv.config(); // populate process.env (e.g. OPENAI_API_KEY) from the .env file
 71 | // Car searched for on every site; CAR_NAME and CURRENT_DATE also form the spreadsheet title.
 72 | const CAR_NAME = "Toyota Corolla";
 73 | const CURRENT_DATE = new Date().toISOString().slice(0, 10); // YYYY-MM-DD, in UTC
 74 | const TASK_STEPS = `
 75 | Your task is to search for a certain car, namely ${CAR_NAME} and compare its prices across multiple car price comparison sites, namely
 76 | - Carvana (https://www.carvana.com/)
 77 | - Carmax (https://www.carmax.com)
 78 | 
 79 | You will search for the results for the mentioned car on each of these websites, sort the results by lowest to highest, and then add the 5 cheapest results of each website to a google sheet. As much as possible, sort results using the websites own sort, and do not try to sort results by extraction. 
 80 | 
 81 | ## Google Sheet setup:
 82 | 1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection.
 83 | 2. If there is an active connection, go to 4. Otherwise, go to 3.
 84 | 3. Run GOOGLESHEETS_INITIATE_CONNECTION and output the auth link to the user, then wait for the connection to be active.
 85 | 4. Create a new spreadsheet titled "${CAR_NAME} Comparison - ${CURRENT_DATE}".
 86 | 5. Get the results from each website, and insert the relevant data (like price, mileage, model year, model name/trim), along with the website source.
 87 | 6. Add that information to the spreadsheet properly.
 88 | Make sure that the data is well formatted and the columns are all there.`;
 89 | /**
 90 |  * Runs the car-comparison task through an agent connected to the Composio MCP server at `mcpUrl`.
 91 |  * Errors are logged, not rethrown; in that case the promise resolves to undefined.
 92 |  */
 93 | async function run(mcpUrl: string) {
 94 |   console.log(chalk.cyan.bold("\n===== Running Task ====="));
 95 |   console.log(chalk.white(`Task: ${TASK_STEPS}`));
 96 |   console.log(chalk.cyan.bold("=======================\n"));
 97 | 
 98 |   console.log(chalk.yellow("Initializing OpenAI LLM..."));
 99 |   const llm = new ChatOpenAI({
100 |     apiKey: process.env.OPENAI_API_KEY,
101 |     model: "gpt-4o",
102 |   });
103 | 
104 |   console.log(chalk.yellow("Creating Hyperbrowser Agent..."));
105 |   try {
106 |     const agent = new HyperbrowserAgent({ llm, debug: true });
107 |     console.log(chalk.green("Agent created successfully"));
108 |     // Inner try/finally: closeAgent() must run even when connecting or the task throws.
109 |     try {
110 |       console.log(
111 |         chalk.yellow("Connecting to Composio Googlesheets MCP server...")
112 |       );
113 |       await agent.initializeMCPClient({
114 |         servers: [
115 |           {
116 |             command: "npx",
117 |             args: ["@composio/mcp@latest", "start", "--url", mcpUrl],
118 |             env: {
119 |               npm_config_yes: "true",
120 |             },
121 |           },
122 |         ],
123 |       });
124 |       console.log(
125 |         chalk.green(
126 |           "Connected to Composio Googlesheets MCP server, executing task..."
127 |         )
128 |       );
129 |       const result = await agent.executeTask(
130 |         `Your task is to look for a certain car on car comparisons website ${TASK_STEPS}`,
131 |         {
132 |           debugOnAgentOutput: (agentOutput) => {
133 |             console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
134 |             console.dir(agentOutput, { depth: null, colors: true });
135 |             console.log(chalk.cyan.bold("===============") + "\n");
136 |           },
137 |           onStep: (step) => {
138 |             console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
139 |             console.dir(step, { depth: null, colors: true });
140 |             console.log(chalk.cyan.bold("===============") + "\n");
141 |           },
142 |         }
143 |       );
144 |       console.log(chalk.green.bold("\nResult:"));
145 |       console.log(chalk.white(result.output));
146 |       return result;
147 |     } finally {
148 |       await agent.closeAgent();
149 |     }
150 |   } catch (error) {
151 |     console.error(chalk.red.bold("Error creating agent or executing task:"));
152 |     console.error(
153 |       chalk.red(error instanceof Error ? error.stack : String(error))
154 |     );
155 |   }
156 | }
157 | // Entry point: the MCP URL must be the first CLI argument; errors that reach this handler exit with code 1.
158 | void (async function main() {
159 |   try {
160 |     const hasMcpUrl = process.argv.length >= 3;
161 |     if (!hasMcpUrl) {
162 |       const usage = "Error: Please provide your MCP URL as an argument";
163 |       console.error(chalk.red(usage));
164 |       process.exit(1);
165 |     }
166 |     await run(process.argv[2]);
167 |   } catch (err) {
168 |     console.error(chalk.red("Error:"), err);
169 |     process.exit(1);
170 |   }
171 | })();
172 | 


--------------------------------------------------------------------------------
/examples/mcp/google-sheets/most-populated-states.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Google Sheets MCP Server Example
  3 |  *
  4 |  * This example demonstrates how to use HyperAgent with the Composio Googlesheets MCP server
  5 |  * to connect to Google Sheets, create a new spreadsheet, and populate it with data scraped from the web.
  6 |  *
  7 |  * ## What This Example Does
  8 |  *
  9 |  * The agent performs a multi-step task that requires web browsing and Google Sheets integration:
 10 |  * 1. Checks if there is an active connection to Composio Googlesheets MCP server
 11 |  * 2. If no connection exists, initiates a connection and waits for the user to authenticate
 12 |  * 3. Creates a new spreadsheet titled "Most Populated States"
 13 |  * 4. Navigates to Wikipedia to gather data on the 5 most populous US states
 14 |  * 5. Adds the data to the created spreadsheet
 15 |  *
 16 |  * ## Prerequisites
 17 |  *
 18 |  * 1. Node.js environment
 19 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 20 |  * 3. Need to have a Composio account, can sign up at https://app.composio.dev
 21 |  *    - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/googlesheets
 22 |  *    - You will use the url to run the script, for example:
 23 |  *    ```
 24 |  *    yarn ts-node -r tsconfig-paths/register examples/mcp/google-sheets/most-populated-states.ts <your-mcp-url>
 25 |  *    ```
 26 |  *    - When running for the first time, there will be no active connection so you will need to login
 27 |  *      with Google OAUTH at the link provided by the agent to authenticate
 28 |  *
 29 |  * ## MCP Server Configuration
 30 |  *
 31 |  * This example uses the Composio Googlesheets MCP server which provides tools for:
 32 |  * - `GOOGLESHEETS_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Google Sheets
 33 |  * - `GOOGLESHEETS_INITIATE_CONNECTION`: Starts the authentication process for Google Sheets
 34 |  * - `GOOGLESHEETS_CREATE_GOOGLE_SHEET1`: Creates a new Google Sheet
 35 |  * - `GOOGLESHEETS_SHEET_FROM_JSON`: Converts JSON data to a Google Sheet format
 36 |  * - `GOOGLESHEETS_BATCH_UPDATE`: Updates multiple cells in a spreadsheet
 37 |  * - `GOOGLESHEETS_GET_SPREADSHEET_INFO`: Retrieves information about a spreadsheet
 38 |  * - `GOOGLESHEETS_LOOKUP_SPREADSHEET_ROW`: Looks up a specific row in a spreadsheet
 39 |  * - `GOOGLESHEETS_BATCH_GET`: Gets values from multiple ranges in a spreadsheet
 40 |  * - `GOOGLESHEETS_GET_SHEET_NAMES`: Gets the names of all sheets in a spreadsheet
 41 |  * - `GOOGLESHEETS_CLEAR_VALUES`: Clears values from a range in a spreadsheet
 42 |  * - `GOOGLESHEETS_GET_REQUIRED_PARAMETERS`: Gets required parameters for Google Sheets operations
 43 |  *
 44 |  * ## Debugging and Monitoring
 45 |  *
 46 |  * The example includes callback functions to monitor:
 47 |  * - Agent output: Raw output from the LLM agent
 48 |  * - Step execution: Each step the agent takes during the task
 49 |  *
 50 |  * ## Running the Example
 51 |  *
 52 |  * ```
 53 |  * yarn ts-node examples/mcp/google-sheets/most-populated-states.ts <your-mcp-url>
 54 |  * ```
 55 |  *
 56 |  * ## Example Output
 57 |  *
 58 |  * The final output will include confirmation that the agent has successfully created a new Google Sheet
 59 |  * and populated it with information about the top 5 most populous US states.
 60 |  */
 61 | 
 62 | import dotenv from "dotenv";
 63 | import chalk from "chalk";
 64 | import { ChatOpenAI } from "@langchain/openai";
 65 | import HyperAgent from "@hyperbrowser/agent";
 66 | 
 67 | dotenv.config(); // populate process.env (e.g. OPENAI_API_KEY) from the .env file
 68 | // Numbered instructions handed to the agent as-is; steps 1-3 establish the Google Sheets connection.
 69 | const TASK = `1. Run GOOGLESHEETS_CHECK_ACTIVE_CONNECTION to check if there is an active connection.
 70 | 2. If there is an active connection, go to 4. Otherwise, go to 3.
 71 | 3. Run GOOGLESHEETS_INITIATE_CONNECTION and output the auth link to the user, then wait for the connection to be active.
 72 | 4. Create a new spreadsheet titled "Most Populated States".
 73 | 5. Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and get the data on the top 5 most populous states from the table.
 74 | 6. Add that information to the spreadsheet properly.
 75 | Make sure that the data is well formatted and the columns are all there.`;
 76 | /**
 77 |  * Runs TASK through a HyperAgent connected to the Composio MCP server at `mcpUrl`.
 78 |  * Errors are logged, not rethrown; in that case the promise resolves to undefined.
 79 |  */
 80 | async function run(mcpUrl: string) {
 81 |   console.log(chalk.cyan.bold("\n===== Running Task ====="));
 82 |   console.log(chalk.white(`Task: ${TASK}`));
 83 |   console.log(chalk.cyan.bold("=======================\n"));
 84 | 
 85 |   console.log(chalk.yellow("Initializing OpenAI LLM..."));
 86 |   const llm = new ChatOpenAI({
 87 |     apiKey: process.env.OPENAI_API_KEY,
 88 |     model: "gpt-4o",
 89 |   });
 90 | 
 91 |   console.log(chalk.yellow("Creating HyperAgent..."));
 92 |   try {
 93 |     const agent = new HyperAgent({ llm, debug: true });
 94 |     console.log(chalk.green("Agent created successfully"));
 95 |     // Inner try/finally: closeAgent() must run even when connecting or the task throws.
 96 |     try {
 97 |       console.log(
 98 |         chalk.yellow("Connecting to Composio Googlesheets MCP server...")
 99 |       );
100 |       await agent.initializeMCPClient({
101 |         servers: [
102 |           {
103 |             command: "npx",
104 |             args: ["@composio/mcp@latest", "start", "--url", mcpUrl],
105 |             env: {
106 |               npm_config_yes: "true",
107 |             },
108 |           },
109 |         ],
110 |       });
111 |       console.log(
112 |         chalk.green(
113 |           "Connected to Composio Googlesheets MCP server, executing task..."
114 |         )
115 |       );
116 |       const result = await agent.executeTask(TASK, {
117 |         debugOnAgentOutput: (agentOutput) => {
118 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
119 |           console.dir(agentOutput, { depth: null, colors: true });
120 |           console.log(chalk.cyan.bold("===============") + "\n");
121 |         },
122 |         onStep: (step) => {
123 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
124 |           console.dir(step, { depth: null, colors: true });
125 |           console.log(chalk.cyan.bold("===============") + "\n");
126 |         },
127 |       });
128 |       console.log(chalk.green.bold("\nResult:"));
129 |       console.log(chalk.white(result.output));
130 |       return result;
131 |     } finally {
132 |       await agent.closeAgent();
133 |     }
134 |   } catch (error) {
135 |     console.error(chalk.red.bold("Error creating agent or executing task:"));
136 |     console.error(
137 |       chalk.red(error instanceof Error ? error.stack : String(error))
138 |     );
139 |   }
140 | }
141 | // Entry point: the MCP URL must be the first CLI argument; errors that reach this handler exit with code 1.
142 | void (async function main() {
143 |   try {
144 |     const hasMcpUrl = process.argv.length >= 3;
145 |     if (!hasMcpUrl) {
146 |       const usage = "Error: Please provide your MCP URL as an argument";
147 |       console.error(chalk.red(usage));
148 |       process.exit(1);
149 |     }
150 |     await run(process.argv[2]);
151 |   } catch (err) {
152 |     console.error(chalk.red("Error:"), err);
153 |     process.exit(1);
154 |   }
155 | })();
156 | 


--------------------------------------------------------------------------------
/examples/mcp/notion/create-shopping-list.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Notion MCP Server Example
  3 |  *
  4 |  * This example demonstrates how to use HyperAgent with the Composio Notion MCP server
  5 |  * to connect to Notion, create a new page, and populate it with ingredients for a recipe scraped from allrecipes.
  6 |  *
  7 |  * ## What This Example Does
  8 |  *
  9 |  * The agent performs a multi-step task that requires web browsing and Notion MCP:
 10 |  * 1. Checks if there is an active connection to Composio Notion MCP server
 11 |  * 2. If no connection exists, initiates a connection and waits for the user to authenticate
 12 |  * 3. Creates a new notion page titled "{{RECIPE}} ingredients"
 13 |  * 4. Navigates to allrecipes and finds a recipe matching the criteria
 14 |  * 5. Adds the recipe's ingredients to the created Notion page
 15 |  *
 16 |  * ## Prerequisites
 17 |  *
 18 |  * 1. Node.js environment
 19 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
 20 |  * 3. Need to have a Composio account, can sign up at https://app.composio.dev
 21 |  *    - Go to this link and get your secure MCP URL (you just need the URL part from the command): https://mcp.composio.dev/notion
 22 |  *    - You will use the url to run the script, for example:
 23 |  *    ```
 24 |  *    yarn ts-node examples/mcp/notion/create-shopping-list.ts <your-mcp-url>
 25 |  *    ```
 26 |  *    - When running for the first time, there will be no active connection so you will need to login
 27 |  *      with Notion OAUTH at the link provided by the agent to authenticate
 28 |  *
 29 |  * ## MCP Server Configuration
 30 |  *
 31 |  * This example uses the Composio Notion MCP server, which provides tools for a number of use cases. The ones used here are:
 32 |  * - `NOTION_CHECK_ACTIVE_CONNECTION`: Verifies if there's an active connection to Notion
 33 |  * - `NOTION_INITIATE_CONNECTION`: Starts the authentication process for Notion
 34 |  * - `NOTION_ADD_PAGE_CONTENT`: Adds a single content block to a Notion page
 35 |  * - `NOTION_CREATE_PAGE`: Creates a new page in Notion
 36 |  *
 37 |  * ## Debugging and Monitoring
 38 |  *
 39 |  * The example includes callback functions to monitor:
 40 |  * - Agent output: Raw output from the LLM agent
 41 |  * - Step execution: Each step the agent takes during the task
 42 |  *
 43 |  * ## Running the Example
 44 |  *
 45 |  * ```
 46 |  * yarn ts-node examples/mcp/notion/create-shopping-list.ts <your-mcp-url>
 47 |  * ```
 48 |  *
 49 |  * ## Example Output
 50 |  *
 51 |  * The final output will include confirmation that the agent has successfully created a new Notion Page
 52 |  * and populated it with the ingredients for a recipe.
 53 |  */
 54 | 
 55 | import dotenv from "dotenv";
 56 | import chalk from "chalk";
 57 | import { ChatOpenAI } from "@langchain/openai";
 58 | import HyperbrowserAgent from "@hyperbrowser/agent";
 59 | 
 60 | dotenv.config();
 61 | 
 62 | // Dish to look up. Interpolated into TASK so the prompt never contains an unfilled {{RECIPE}} placeholder.
 63 | const RECIPE = "Salsa verde";
 64 | 
 65 | // Prompt for the agent: the goal, followed by the Notion MCP tool steps it should follow.
 66 | const TASK = `
 67 | Go to allrecipes and find a suitable recipe for ${RECIPE} with more than 100 ratings. Then insert each ingredient into a notion page. Don't get the trivial ingredients like salt, water, or pepper.
 68 | 
 69 | 
 70 | ## Steps to insert into a notion page:
 71 | 
 72 | 1. Run NOTION_CHECK_ACTIVE_CONNECTION to check if there is an active connection.
 73 | 2. If there is an active connection, go to 4. Otherwise, go to 3.
 74 | 3. Run NOTION_INITIATE_CONNECTION and output the auth link to the user, then wait for the connection to be active.
 75 | 4. Create a new notion page title - ${RECIPE} Ingredients
 76 | 5. Go to allrecipes, find a suitable recipe for ${RECIPE}, and get its ingredients
 77 | 6. For each ingredient, call NOTION_ADD_PAGE_CONTENT to insert a single ingredient
 78 | 
 79 | Make sure that the data is well formatted and the columns are all there.`;
 76 | 
 77 | /**
 78 |  * Runs the recipe-to-Notion task: creates the agent, connects it to the Composio
 79 |  * Notion MCP server, executes TASK and prints the result.
 80 |  *
 81 |  * Errors raised while creating the agent or running the task are logged and
 82 |  * swallowed; in that case the function resolves to `undefined`.
 83 |  *
 84 |  * @param mcpUrl - Composio MCP URL, passed to `@composio/mcp start --url`.
 85 |  * @returns The task result, or `undefined` if an error was logged.
 86 |  */
 87 | async function run(mcpUrl: string) {
 88 |   console.log(chalk.cyan.bold("\n===== Running Task ====="));
 89 |   console.log(chalk.white(`Task: ${TASK}`));
 90 |   console.log(chalk.cyan.bold("=======================\n"));
 91 | 
 92 |   console.log(chalk.yellow("Initializing OpenAI LLM..."));
 93 |   const llm = new ChatOpenAI({
 94 |     apiKey: process.env.OPENAI_API_KEY,
 95 |     model: "gpt-4o",
 96 |   });
 97 | 
 98 |   console.log(chalk.yellow("Creating Hyperbrowser Agent..."));
 99 | 
100 |   try {
101 |     const agent = new HyperbrowserAgent({
102 |       llm: llm,
103 |       debug: true,
104 |     });
105 |     console.log(chalk.green("Agent created successfully"));
106 | 
107 |     // From here on the agent must be closed on every exit path; without the
108 |     // finally clause a failed MCP connection or task would skip closeAgent().
109 |     try {
110 |       console.log(
111 |         chalk.yellow("Connecting to Composio Notion MCP server...")
112 |       );
113 |       await agent.initializeMCPClient({
114 |         servers: [
115 |           {
116 |             command: "npx",
117 |             args: ["@composio/mcp@latest", "start", "--url", mcpUrl],
118 |             env: {
119 |               npm_config_yes: "true",
120 |             },
121 |           },
122 |         ],
123 |       });
124 |       console.log(
125 |         chalk.green(
126 |           "Connected to Composio Notion MCP server, executing task..."
127 |         )
128 |       );
129 | 
130 |       const result = await agent.executeTask(TASK, {
131 |         debugOnAgentOutput: (agentOutput) => {
132 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
133 |           console.dir(agentOutput, { depth: null, colors: true });
134 |           console.log(chalk.cyan.bold("===============") + "\n");
135 |         },
136 |         onStep: (step) => {
137 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
138 |           console.dir(step, { depth: null, colors: true });
139 |           console.log(chalk.cyan.bold("===============") + "\n");
140 |         },
141 |       });
142 | 
143 |       console.log(chalk.green.bold("\nResult:"));
144 |       console.log(chalk.white(result.output));
145 |       return result;
146 |     } finally {
147 |       await agent.closeAgent();
148 |     }
149 |   } catch (error) {
150 |     console.error(chalk.red.bold("Error creating agent or executing task:"));
151 |     console.error(
152 |       chalk.red(error instanceof Error ? error.stack : String(error))
153 |     );
154 |   }
155 | }
141 | 
142 | // Entry point: the MCP URL is required as the first CLI argument.
143 | (async () => {
144 |   const [, , mcpUrl] = process.argv;
145 |   if (mcpUrl === undefined) {
146 |     console.error(
147 |       chalk.red("Error: Please provide your MCP URL as an argument")
148 |     );
149 |     process.exit(1);
150 |   }
151 |   await run(mcpUrl);
152 | })().catch((error) => {
153 |   console.error(chalk.red("Error:"), error);
154 |   process.exit(1);
155 | });
156 | 


--------------------------------------------------------------------------------
/examples/mcp/weather/get-weather-alert.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * # Weather MCP Server Example
  3 |  *
  4 |  * This example demonstrates how to use HyperAgent with an MCP (Model Context Protocol) server
  5 |  * to browse the web, extract information, and use that information to query a separate API service.
  6 |  *
  7 |  * ## What This Example Does
  8 |  *
  9 |  * The agent performs a multi-step task that requires web browsing and data extraction:
 10 |  * 1. Navigates to a Wikipedia page listing US states by population
 11 |  * 2. Identifies the most populated state
 12 |  * 3. Uses the custom weather MCP server to find weather alerts for that state
 13 |  *
 14 |  * ## Prerequisites
 15 |  *
 16 |  * - Node.js environment
 17 |  * - OpenAI API key set in your .env file (OPENAI_API_KEY)
 18 |  *
 19 |  * ## MCP Server Configuration
 20 |  *
 21 |  * This example uses a custom MCP server (weather-server.js) that provides tools for:
 22 |  * - `get-alerts`: Fetches weather alerts for a specific state from the National Weather Service API
 23 |  * - `get-forecast`: Retrieves weather forecasts for specific coordinates
 24 |  *
 25 |  *
 26 |  * ## Debugging and Monitoring
 27 |  *
 28 |  * The example includes callback functions to monitor:
 29 |  * - Agent output: Raw output from the LLM agent
 30 |  * - Step execution: Each step the agent takes during the task
 31 |  *
 32 |  * ## Running the Example
 33 |  *
 34 |  * ```
 35 |  * yarn ts-node examples/mcp/weather/get-weather-alert.ts
 36 |  * ```
 37 |  *
 38 |  * ## Example Output
 39 |  *
 40 |  * The final output will include the most populated US state and a list of current weather alerts for that state.
 41 |  */
 42 | 
 43 | import dotenv from "dotenv";
 44 | import chalk from "chalk";
 45 | import path from "path";
 46 | import { ChatOpenAI } from "@langchain/openai";
 47 | import HyperAgent from "@hyperbrowser/agent";
 48 | 
 49 | dotenv.config();
 50 | 
 51 | // Prompt for the agent: find the most populated state on Wikipedia, then list weather alerts for it.
 52 | const TASK = `Go to https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population and find the most populated state.
 53 | Then list 3 weather alerts for that state.`;
 53 | 
 54 | /**
 55 |  * Runs the weather-alert task: creates the agent, attaches the local weather MCP
 56 |  * server (servers/weather-server.js, started with `node`), executes TASK and
 57 |  * prints the result.
 58 |  *
 59 |  * Errors raised while creating the agent or running the task are logged and
 60 |  * swallowed; in that case the function resolves to `undefined`.
 61 |  *
 62 |  * @returns The task result, or `undefined` if an error was logged.
 63 |  */
 64 | async function run() {
 65 |   console.log(chalk.cyan.bold("\n===== Running Task ====="));
 66 |   console.log(chalk.white(`Task: ${TASK}`));
 67 |   console.log(chalk.cyan.bold("=======================\n"));
 68 | 
 69 |   const llm = new ChatOpenAI({
 70 |     apiKey: process.env.OPENAI_API_KEY,
 71 |     model: "gpt-4o",
 72 |   });
 73 | 
 74 |   const mcpServerPath = path.join(__dirname, "/servers/weather-server.js");
 75 | 
 76 |   console.log(chalk.yellow("Creating Hyperbrowser Agent..."));
 77 | 
 78 |   try {
 79 |     const agent = new HyperAgent({
 80 |       llm: llm,
 81 |       debug: true,
 82 |     });
 83 | 
 84 |     // From here on the agent must be closed on every exit path; without the
 85 |     // finally clause a failed MCP connection or task would skip closeAgent().
 86 |     try {
 87 |       await agent.initializeMCPClient({
 88 |         servers: [
 89 |           {
 90 |             command: "node",
 91 |             args: [mcpServerPath],
 92 |           },
 93 |         ],
 94 |       });
 95 | 
 96 |       const result = await agent.executeTask(TASK, {
 97 |         debugOnAgentOutput: (agentOutput) => {
 98 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
 99 |           console.dir(agentOutput, { depth: null, colors: true });
100 |           console.log(chalk.cyan.bold("===============") + "\n");
101 |         },
102 |         onStep: (step) => {
103 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
104 |           console.dir(step, { depth: null, colors: true });
105 |           console.log(chalk.cyan.bold("===============") + "\n");
106 |         },
107 |       });
108 | 
109 |       console.log(chalk.green.bold("\nResult:"));
110 |       console.log(chalk.white(result.output));
111 |       return result;
112 |     } finally {
113 |       await agent.closeAgent();
114 |     }
115 |   } catch (error) {
116 |     console.error(chalk.red("Error creating agent or executing task:"));
117 |     console.error(
118 |       chalk.red(error instanceof Error ? error.stack : String(error))
119 |     );
120 |   }
121 | }
107 | 
108 | // Entry point: start the example. Any error that escapes run() is printed
109 | // and the process exits with status 1.
110 | run().catch((error) => {
111 |   console.error(chalk.red("Error:"), error);
112 |   process.exit(1);
113 | });
116 | 


--------------------------------------------------------------------------------
/examples/mcp/weather/servers/weather-server.js:
--------------------------------------------------------------------------------
  1 | const { McpServer } = require("@modelcontextprotocol/sdk/server/mcp.js");
  2 | const { StdioServerTransport } = require("@modelcontextprotocol/sdk/server/stdio.js");
  3 | const { z } = require("zod");
  4 | 
  5 | // Base URL of the National Weather Service API; the request URLs below are built from it.
  6 | const NWS_API_BASE = "https://api.weather.gov";
  7 | // Value sent as the User-Agent header on every NWS request.
  8 | const USER_AGENT = "weather-app/1.0";
  7 | 
  8 | // Helper function for making NWS API requests
  9 | /**
 10 |  * @param {string} url
 11 |  * @returns {Promise<object|null>}
 12 |  */
 13 | async function makeNWSRequest(url) {
 14 |   const headers = {
 15 |     "User-Agent": USER_AGENT,
 16 |     Accept: "application/geo+json",
 17 |   };
 18 | 
 19 |   try {
 20 |     const response = await fetch(url, { headers });
 21 |     if (!response.ok) {
 22 |       throw new Error(`HTTP error! status: ${response.status}`);
 23 |     }
 24 |     return await response.json();
 25 |   } catch (error) {
 26 |     console.error("Error making NWS request:", error);
 27 |     return null;
 28 |   }
 29 | }
 30 | 
 31 | /**
 32 |  * @typedef {Object} AlertFeature
 33 |  * @property {Object} properties
 34 |  * @property {string} [properties.event]
 35 |  * @property {string} [properties.areaDesc]
 36 |  * @property {string} [properties.severity]
 37 |  * @property {string} [properties.status]
 38 |  * @property {string} [properties.headline]
 39 |  */
 40 | 
 41 | // Format alert data
 42 | /**
 43 |  * @param {AlertFeature} feature
 44 |  * @returns {string}
 45 |  */
 46 | function formatAlert(feature) {
 47 |   const props = feature.properties;
 48 |   return [
 49 |     `Event: ${props.event || "Unknown"}`,
 50 |     `Area: ${props.areaDesc || "Unknown"}`,
 51 |     `Severity: ${props.severity || "Unknown"}`,
 52 |     `Status: ${props.status || "Unknown"}`,
 53 |     `Headline: ${props.headline || "No headline"}`,
 54 |     "---",
 55 |   ].join("\n");
 56 | }
 57 | 
 58 | /**
 59 |  * @typedef {Object} ForecastPeriod
 60 |  * @property {string} [name]
 61 |  * @property {number} [temperature]
 62 |  * @property {string} [temperatureUnit]
 63 |  * @property {string} [windSpeed]
 64 |  * @property {string} [windDirection]
 65 |  * @property {string} [shortForecast]
 66 |  */
 67 | 
 68 | /**
 69 |  * @typedef {Object} AlertsResponse
 70 |  * @property {AlertFeature[]} features
 71 |  */
 72 | 
 73 | /**
 74 |  * @typedef {Object} PointsResponse
 75 |  * @property {Object} properties
 76 |  * @property {string} [properties.forecast]
 77 |  */
 78 | 
 79 | /**
 80 |  * @typedef {Object} ForecastResponse
 81 |  * @property {Object} properties
 82 |  * @property {ForecastPeriod[]} properties.periods
 83 |  */
 84 | 
 85 | // MCP server instance (name "weather", version "1.0.0") that the tools below are registered on
 86 | const server = new McpServer({
 87 |   name: "weather",
 88 |   version: "1.0.0",
 89 | });
 90 | 
 91 | // Register weather tools
 92 | server.tool(
 93 |   "get-alerts",
 94 |   "Get weather alerts for a state",
 95 |   {
 96 |     state: z.string().length(2).describe("Two-letter state code (e.g. CA, NY)"),
 97 |   },
 98 |   async ({ state }) => {
 99 |     const stateCode = state.toUpperCase();
100 |     const alertsUrl = `${NWS_API_BASE}/alerts?area=${stateCode}`;
101 |     const alertsData = await makeNWSRequest(alertsUrl);
102 | 
103 |     if (!alertsData) {
104 |       return {
105 |         content: [
106 |           {
107 |             type: "text",
108 |             text: "Failed to retrieve alerts data",
109 |           },
110 |         ],
111 |       };
112 |     }
113 | 
114 |     const features = alertsData.features || [];
115 |     if (features.length === 0) {
116 |       return {
117 |         content: [
118 |           {
119 |             type: "text",
120 |             text: `No active alerts for ${stateCode}`,
121 |           },
122 |         ],
123 |       };
124 |     }
125 | 
126 |     const formattedAlerts = features.map(formatAlert);
127 |     const alertsText = `Active alerts for ${stateCode}:\n\n${formattedAlerts.join("\n")}`;
128 | 
129 |     return {
130 |       content: [
131 |         {
132 |           type: "text",
133 |           text: alertsText,
134 |         },
135 |       ],
136 |     };
137 |   },
138 | );
139 | 
140 | server.tool(
141 |   "get-forecast",
142 |   "Get weather forecast for a location",
143 |   {
144 |     latitude: z.number().min(-90).max(90).describe("Latitude of the location"),
145 |     longitude: z
146 |       .number()
147 |       .min(-180)
148 |       .max(180)
149 |       .describe("Longitude of the location"),
150 |   },
151 |   async ({ latitude, longitude }) => {
152 |     // Get grid point data
153 |     const pointsUrl = `${NWS_API_BASE}/points/${latitude.toFixed(4)},${longitude.toFixed(4)}`;
154 |     const pointsData = await makeNWSRequest(pointsUrl);
155 | 
156 |     if (!pointsData) {
157 |       return {
158 |         content: [
159 |           {
160 |             type: "text",
161 |             text: `Failed to retrieve grid point data for coordinates: ${latitude}, ${longitude}. This location may not be supported by the NWS API (only US locations are supported).`,
162 |           },
163 |         ],
164 |       };
165 |     }
166 | 
167 |     const forecastUrl = pointsData.properties?.forecast;
168 |     if (!forecastUrl) {
169 |       return {
170 |         content: [
171 |           {
172 |             type: "text",
173 |             text: "Failed to get forecast URL from grid point data",
174 |           },
175 |         ],
176 |       };
177 |     }
178 | 
179 |     // Get forecast data
180 |     const forecastData = await makeNWSRequest(forecastUrl);
181 |     if (!forecastData) {
182 |       return {
183 |         content: [
184 |           {
185 |             type: "text",
186 |             text: "Failed to retrieve forecast data",
187 |           },
188 |         ],
189 |       };
190 |     }
191 | 
192 |     const periods = forecastData.properties?.periods || [];
193 |     if (periods.length === 0) {
194 |       return {
195 |         content: [
196 |           {
197 |             type: "text",
198 |             text: "No forecast periods available",
199 |           },
200 |         ],
201 |       };
202 |     }
203 | 
204 |     // Format forecast periods
205 |     const formattedForecast = periods.map((period) =>
206 |       [
207 |         `${period.name || "Unknown"}:`,
208 |         `Temperature: ${period.temperature || "Unknown"}°${period.temperatureUnit || "F"}`,
209 |         `Wind: ${period.windSpeed || "Unknown"} ${period.windDirection || ""}`,
210 |         `${period.shortForecast || "No forecast available"}`,
211 |         "---",
212 |       ].join("\n"),
213 |     );
214 | 
215 |     const forecastText = `Forecast for ${latitude}, ${longitude}:\n\n${formattedForecast.join("\n")}`;
216 | 
217 |     return {
218 |       content: [
219 |         {
220 |           type: "text",
221 |           text: forecastText,
222 |         },
223 |       ],
224 |     };
225 |   },
226 | );
227 | 
228 | // Start the server
229 | async function main() {
230 |   const transport = new StdioServerTransport();
231 |   await server.connect(transport);
232 |   console.error("Weather MCP Server running on stdio");
233 | }
234 | 
235 | main().catch((error) => {
236 |   console.error("Fatal error in main():", error);
237 |   process.exit(1);
238 | });


--------------------------------------------------------------------------------
/examples/output-to-schema/output-to-schema.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * # Output Schema Example
 3 |  *
 4 |  * This example demonstrates how to use HyperAgent with a defined output schema
 5 |  * to ensure structured and validated responses from the agent.
 6 |  *
 7 |  * ## What This Example Does
 8 |  *
 9 |  * The agent performs a task with structured output that:
10 |  * 1. Defines a Zod schema for the expected output format
11 |  * 2. Performs actions to complete the specified task
12 |  * 3. Returns movie information in a structured format specified
13 |  *
14 |  * ## Prerequisites
15 |  *
16 |  * 1. Node.js environment
17 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
18 |  *
19 |  * ## Running the Example
20 |  *
21 |  * ```bash
22 |  * yarn ts-node -r tsconfig-paths/register examples/output-to-schema/output-to-schema.ts
23 |  * ```
24 |  */
25 | 
26 | import "dotenv/config";
27 | import { HyperAgent } from "@hyperbrowser/agent";
28 | 
29 | import chalk from "chalk";
30 | import { sleep } from "../../src/utils/sleep";
31 | import { ChatOpenAI } from "@langchain/openai";
32 | import { z } from "zod";
33 | 
34 | // Prompt for the agent; the result is returned in the shape of the outputSchema passed to executeTask.
35 | const TASK =
36 |   "Navigate to imdb.com, search for 'The Matrix', and extract the director, release year, and rating";
36 | 
37 | /**
38 |  * Runs TASK with a Zod output schema and prints the structured result.
39 |  *
40 |  * The agent is closed in a finally clause, so it is released even when the
41 |  * task throws; the error itself still propagates to the caller.
42 |  *
43 |  * @returns The task result returned by `executeTask`.
44 |  */
45 | async function runEval() {
46 |   const llm = new ChatOpenAI({
47 |     apiKey: process.env.OPENAI_API_KEY,
48 |     model: "gpt-4o",
49 |   });
50 | 
51 |   const agent = new HyperAgent({
52 |     llm: llm,
53 |     debug: true,
54 |   });
55 | 
56 |   try {
57 |     await sleep(1000);
58 |     const result = await agent.executeTask(TASK, {
59 |       debugOnAgentOutput: (agentOutput) => {
60 |         console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
61 |         console.dir(agentOutput, { depth: null, colors: true });
62 |         console.log(chalk.cyan.bold("===============") + "\n");
63 |       },
64 |       onStep: (step) => {
65 |         console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
66 |         console.dir(step, { depth: null, colors: true });
67 |         console.log(chalk.cyan.bold("===============") + "\n");
68 |       },
69 |       outputSchema: z.object({
70 |         director: z.string().describe("The name of the movie director"),
71 |         releaseYear: z.number().describe("The year the movie was released"),
72 |         rating: z.string().describe("The IMDb rating of the movie"),
73 |       }),
74 |     });
75 |     console.log(chalk.green.bold("\nResult:"));
76 |     console.log(chalk.white(result.output));
77 |     return result;
78 |   } finally {
79 |     await agent.closeAgent();
80 |   }
81 | }
71 | 
72 | // Entry point: run the example; on failure print the error and exit with
73 | // status 1.
74 | runEval().catch((error) => {
75 |   console.error(chalk.red("Error:"), error);
76 |   process.exit(1);
77 | });
78 | 


--------------------------------------------------------------------------------
/examples/simple/add-to-amazon-cart.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * # Simple Amazon Cart Example
 3 |  * 
 4 |  * This example demonstrates how to use HyperAgent to automate a basic
 5 |  * e-commerce task on Amazon.com.
 6 |  * 
 7 |  * ## What This Example Does
 8 |  * 
 9 |  * The agent performs a simple shopping task that:
10 |  * 1. Navigates to Amazon.com
11 |  * 2. Searches for a specific product
12 |  * 3. Adds an item to the cart that matches the specific requirements (only a single item)
13 |  * 
14 |  * ## Prerequisites
15 |  * 
16 |  * 1. Node.js environment
17 |  * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
18 |  * 
19 |  * ## Running the Example
20 |  * 
21 |  * ```bash
22 |  * yarn ts-node -r tsconfig-paths/register examples/simple/add-to-amazon-cart.ts
23 |  * ```
24 |  */
25 | 
26 | import "dotenv/config";
27 | import { HyperAgent } from "@hyperbrowser/agent";
28 | import chalk from "chalk";
29 | import { ChatOpenAI } from "@langchain/openai";
30 | 
31 | /**
32 |  * Runs the Amazon add-to-cart task and prints the agent's final output.
33 |  *
34 |  * The agent is closed in a finally clause, so it is released even when the
35 |  * task throws; the error itself still propagates to the caller.
36 |  *
37 |  * @returns The task result returned by `executeTask`.
38 |  */
39 | async function runEval() {
40 |   console.log(chalk.cyan.bold("\n===== Running Add to amazon Example ====="));
41 | 
42 |   const llm = new ChatOpenAI({
43 |     apiKey: process.env.OPENAI_API_KEY,
44 |     model: "gpt-4o",
45 |   });
46 | 
47 |   const agent = new HyperAgent({
48 |     llm: llm,
49 |   });
50 | 
51 |   try {
52 |     const result = await agent.executeTask(
53 |       "Navigate to amazon.com, and add the one chip challenge to my cart. Add only the version containing a single item, not multiple items. Once you have added a single product, and do not get any sort of failure from that addition, finish up.",
54 |       {
55 |         debugOnAgentOutput: (agentOutput) => {
56 |           console.log("\n" + chalk.cyan.bold("===== AGENT OUTPUT ====="));
57 |           console.dir(agentOutput, { depth: null, colors: true });
58 |           console.log(chalk.cyan.bold("===============") + "\n");
59 |         },
60 |         onStep: (step) => {
61 |           console.log("\n" + chalk.cyan.bold(`===== STEP ${step.idx} =====`));
62 |           console.dir(step, { depth: null, colors: true });
63 |           console.log(chalk.cyan.bold("===============") + "\n");
64 |         },
65 |       }
66 |     );
67 |     console.log(chalk.green.bold("\nResult:"));
68 |     console.log(chalk.white(result.output));
69 |     return result;
70 |   } finally {
71 |     await agent.closeAgent();
72 |   }
73 | }
63 | 
64 | // Entry point: run the example; on failure print the error and exit with
65 | // status 1.
66 | runEval().catch((error) => {
67 |   console.error(chalk.red("Error:"), error);
68 |   process.exit(1);
69 | });
70 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "name": "@hyperbrowser/agent",
  3 |   "version": "0.3.1",
  4 |   "description": "Hyperbrowsers Web Agent",
  5 |   "author": "",
  6 |   "main": "dist/index.js",
  7 |   "types": "dist/index.d.ts",
  8 |   "type": "commonjs",
  9 |   "license": "AGPL-3.0",
 10 |   "scripts": {
 11 |     "build": "rm -rf dist && tsc && tsc-alias && node -e \"require('fs').chmodSync('dist/cli/index.js', '755')\" && node -e \"require('fs').chmodSync('cli.sh', '755')\"",
 12 |     "build-dom-tree-script": "ts-node src/context-providers/dom/builder.ts",
 13 |     "lint": "eslint src/**/*.ts",
 14 |     "prepare": "yarn build",
 15 |     "test": "jest",
 16 |     "format": "prettier --write 'src/**/*.ts'",
 17 |     "cli": "yarn ts-node -r tsconfig-paths/register src/cli/index.ts",
 18 |     "example": "yarn ts-node -r tsconfig-paths/register"
 19 |   },
 20 |   "bin": {
 21 |     "hyperagent-cli": "cli.sh"
 22 |   },
 23 |   "files": [
 24 |     "dist",
 25 |     "README.md",
 26 |     "LICENSE",
 27 |     "cli.sh"
 28 |   ],
 29 |   "keywords": [
 30 |     "hyperbrowser",
 31 |     "browser",
 32 |     "automation",
 33 |     "webscraping",
 34 |     "webcrawling",
 35 |     "scraping",
 36 |     "crawling",
 37 |     "ai"
 38 |   ],
 39 |   "dependencies": {
 40 |     "@google/genai": "^0.8.0",
 41 |     "@hyperbrowser/sdk": "^0.46.0",
 42 |     "@inquirer/prompts": "^7.4.1",
 43 |     "@langchain/core": "^0.3.43",
 44 |     "@modelcontextprotocol/sdk": "^1.9.0",
 45 |     "@types/crypto-js": "^4.2.2",
 46 |     "boxen": "5.1.2",
 47 |     "chalk": "4.1.2",
 48 |     "commander": "^13.1.0",
 49 |     "crypto-js": "^4.2.0",
 50 |     "dotenv": "^16.4.5",
 51 |     "joplin-turndown-plugin-gfm": "^1.0.12",
 52 |     "langchain": "^0.3.19",
 53 |     "lodash": "^4.17.21",
 54 |     "minimatch": "^9.0.3",
 55 |     "ora": "5.4.1",
 56 |     "playwright": "npm:rebrowser-playwright@1.49.1",
 57 |     "readline": "^1.3.0",
 58 |     "sharp": "^0.34.1",
 59 |     "turndown": "^7.2.0",
 60 |     "zod": "^3.24.1",
 61 |     "zod-to-json-schema": "^3.24.1"
 62 |   },
 63 |   "devDependencies": {
 64 |     "@langchain/anthropic": "^0.3.17",
 65 |     "@types/lodash": "^4.17.16",
 66 |     "@types/node": "^22.9.1",
 67 |     "@types/turndown": "^5.0.5",
 68 |     "@typescript-eslint/eslint-plugin": "^8.15.0",
 69 |     "@typescript-eslint/parser": "^8.15.0",
 70 |     "axios": "^1.8.4",
 71 |     "esbuild": "^0.25.2",
 72 |     "eslint": "^9.15.0",
 73 |     "eslint-config-prettier": "^9.1.0",
 74 |     "exa-js": "^1.5.13",
 75 |     "prettier": "^3.3.3",
 76 |     "ts-node": "^10.9.2",
 77 |     "tsc-alias": "^1.8.15",
 78 |     "tsconfig-paths": "^4.2.0",
 79 |     "tsx": "^4.19.3",
 80 |     "typescript": "^5.6.3"
 81 |   },
 82 |   "exports": {
 83 |     ".": {
 84 |       "types": "./dist/index.d.ts",
 85 |       "default": "./dist/index.js"
 86 |     },
 87 |     "./types": {
 88 |       "types": "./dist/types/index.d.ts",
 89 |       "default": "./dist/types/index.js"
 90 |     },
 91 |     "./custom-actions": {
 92 |       "types": "./dist/custom-actions/index.d.ts",
 93 |       "default": "./dist/custom-actions/index.js"
 94 |     }
 95 |   },
 96 |   "typesVersions": {
 97 |     "*": {
 98 |       ".": [
 99 |         "./dist/index.d.ts"
100 |       ],
101 |       "types": [
102 |         "./dist/types/index.d.ts"
103 |       ],
104 |       "./custom-actions": [
105 |         "./dist/custom-actions/index.d.ts"
106 |       ]
107 |     }
108 |   }
109 | }


--------------------------------------------------------------------------------
/scripts/run-webvoyager-eval.ts:
--------------------------------------------------------------------------------
  1 | import { HyperAgent } from "../src/agent";
  2 | import dotenv from "dotenv";
  3 | import chalk from "chalk";
  4 | import fs from "fs";
  5 | import path from "path";
  6 | import { sleep } from "../src/utils/sleep";
  7 | import { retry } from "../src/utils/retry";
  8 | import { ChatOpenAI } from "@langchain/openai";
  9 | import { z } from "zod";
 10 | import { minimatch } from "minimatch";
 11 | 
 12 | dotenv.config();
 13 | 
 14 | /**
 15 |  * Writes log messages for one eval to
 16 |  * `../logs/<runId>/<evalId>/webvoyager-eval.log` (relative to this script),
 17 |  * optionally echoing them to the console.
 18 |  */
 19 | class Logger {
 20 |   private readonly logStream: fs.WriteStream;
 21 |   private readonly logToConsole: boolean;
 22 | 
 23 |   /**
 24 |    * @param runId - Run identifier (first directory level under logs/).
 25 |    * @param evalId - Eval identifier (second directory level).
 26 |    * @param logToConsole - When true, messages are also printed to the console.
 27 |    */
 28 |   constructor(runId: string, evalId: string, logToConsole = false) {
 29 |     const logDir = path.join(__dirname, `../logs/${runId}/${evalId}`);
 30 |     // With `recursive: true`, mkdirSync succeeds when the directory already
 31 |     // exists, so a separate existsSync check is unnecessary (and racy).
 32 |     fs.mkdirSync(logDir, { recursive: true });
 33 |     const logPath = path.join(logDir, `webvoyager-eval.log`);
 34 |     this.logStream = fs.createWriteStream(logPath, { flags: "a" });
 35 |     this.logToConsole = logToConsole;
 36 |     this.log(`Log started at ${new Date().toISOString()}\n`);
 37 |   }
 38 | 
 39 |   /**
 40 |    * Appends `message` verbatim to the log file (no newline is added) and, when
 41 |    * console logging is enabled, prints it with a colour chosen by `type`.
 42 |    */
 43 |   log(message: string, type: "info" | "error" | "success" = "info") {
 44 |     this.logStream.write(message);
 45 |     if (this.logToConsole) {
 46 |       switch (type) {
 47 |         case "error":
 48 |           console.error(chalk.red(message));
 49 |           break;
 50 |         case "success":
 51 |           console.log(chalk.green(message));
 52 |           break;
 53 |         default:
 54 |           console.log(message);
 55 |       }
 56 |     }
 57 |   }
 58 | 
 59 |   /** Logs `obj` as 2-space-indented JSON, preceded by `prefix`. */
 60 |   logObject(obj: unknown, prefix = "") {
 61 |     const objString = JSON.stringify(obj, null, 2);
 62 |     this.log(`${prefix}${objString}`);
 63 |   }
 64 | 
 65 |   /** Ends the underlying write stream. */
 66 |   close() {
 67 |     this.logStream.end();
 68 |   }
 69 | }
 54 | 
 55 | /** One task record parsed from a line of WebVoyager_data.jsonl. */
 56 | interface WebVoyagerEval {
 57 |   /** Key used to look up this website's entry in the reference answers. */
 58 |   web_name: string;
 59 |   /** Eval identifier; the part after "--" is parsed as the index into the website's reference answers. */
 60 |   id: string;
 61 |   /** Task text handed to the agent. */
 62 |   ques: string;
 63 |   /** URL opened in the page before the task is executed. */
 64 |   web: string;
 65 | }
 61 | 
 62 | /** A single reference answer from WebVoyager_reference.json. */
 63 | interface ReferenceAnswer {
 64 |   id: number;
 65 |   type: string;
 66 |   /** Reference answer text the agent's output is evaluated against. */
 67 |   ans: string;
 68 |   /** Optional notes forwarded to the evaluator prompt. */
 69 |   notes?: string;
 70 | }
 71 | 
 72 | /** Reference data for one website. */
 73 | interface WebsiteReference {
 74 |   notice?: string;
 75 |   /** Reference answers; looked up by array position using the index taken from the eval id. */
 76 |   answers: ReferenceAnswer[];
 77 | }
 78 | 
 79 | /** All reference data, keyed by website name (matched against `WebVoyagerEval.web_name`). */
 80 | interface References {
 81 |   [website: string]: WebsiteReference;
 82 | }
 77 | 
 78 | /** Outcome of running and grading one eval. */
 79 | interface EvalResult {
 80 |   /** Id of the eval this result belongs to. */
 81 |   id: string;
 82 |   /** Whether the answer was judged correct; false when no reference could be found. */
 83 |   correct: boolean;
 84 |   /** The question that was given to the agent. */
 85 |   question: string;
 86 |   /** The agent's output. */
 87 |   actual?: string;
 88 |   /** The reference answer. */
 89 |   expected?: string;
 90 |   /** Why the eval could not be graded (e.g. no reference found). */
 91 |   reason?: string;
 92 |   /** The evaluator's explanation of its verdict. */
 93 |   evaluationReason?: string;
 94 |   /** Notes attached to the reference answer, if any. */
 95 |   notes?: string;
 96 | }
 88 | 
 89 | // Shape of the structured verdict requested from the evaluator LLM.
 90 | const AnswerEvaluationSchema = z.object({
 91 |   isCorrect: z
 92 |     .boolean()
 93 |     .describe(
 94 |       "Whether the generated answer is correct compared to the reference"
 95 |     ),
 96 |   reason: z.string().describe("Reason for the evaluation"),
 97 | });
 98 | 
 99 | // Static type derived from the schema above.
100 | type AnswerEvaluation = z.infer<typeof AnswerEvaluationSchema>;
 99 | 
100 | /**
101 |  * Loads the WebVoyager eval tasks from `../evals/WebVoyager_data.jsonl`
102 |  * (one JSON object per line).
103 |  *
104 |  * @returns The parsed tasks in file order; blank lines are ignored.
105 |  * @throws If the file cannot be read or a non-blank line is not valid JSON.
106 |  */
107 | async function loadEvals() {
108 |   const evalPath = path.join(__dirname, "../evals/WebVoyager_data.jsonl");
109 |   const fileContent = await fs.promises.readFile(evalPath, "utf-8");
110 |   const result: WebVoyagerEval[] = [];
111 |   for (const line of fileContent.split("\n")) {
112 |     // Skip blank lines *before* parsing: JSON.parse("") throws, so a file
113 |     // ending in a newline would otherwise fail to load.
114 |     if (!line.trim()) {
115 |       continue;
116 |     }
117 |     result.push(JSON.parse(line) as WebVoyagerEval);
118 |   }
119 |   return result;
120 | }
113 | 
114 | /** Reads `../evals/WebVoyager_reference.json` and returns its parsed content (not validated). */
115 | async function loadReferences(): Promise<References> {
116 |   const rawReferences = await fs.promises.readFile(
117 |     path.join(__dirname, "../evals/WebVoyager_reference.json"),
118 |     "utf-8"
119 |   );
120 |   return JSON.parse(rawReferences);
121 | }
119 | 
120 | /**
121 |  * Asks gpt-4o to judge whether the agent's answer is correct, given the question,
122 |  * the reference answer, optional notes and a screenshot of the final page.
123 |  *
124 |  * @param answer - Answer produced by the agent.
125 |  * @param reference - Reference answer to compare against.
126 |  * @param question - The question the agent was asked.
127 |  * @param screenshotPath - Screenshot file; read synchronously and sent as a base64 `image/png` data URL.
128 |  * @param notes - Optional notes, added to the prompt when non-empty.
129 |  * @returns The structured verdict produced through AnswerEvaluationSchema.
130 |  */
131 | async function checkAnswerAgainstReference(
132 |   answer: string,
133 |   reference: string,
134 |   question: string,
135 |   screenshotPath: string,
136 |   notes?: string
137 | ): Promise<AnswerEvaluation> {
138 |   const encodedScreenshot = fs.readFileSync(screenshotPath).toString("base64");
139 |   const imageUrl = `data:image/png;base64,${encodedScreenshot}`;
140 | 
141 |   const systemPrompt =
142 |     "You are an evaluator checking if a web navigation agent correctly answered a question. Your task is to verify the agent's answer by examining the final webpage screenshot and comparing it to a reference answer. Focus primarily on the visual evidence in the screenshot rather than just comparing text answers.";
143 | 
144 |   const userPrompt = `Question: ${question}
145 | 
146 | Reference Answer: ${reference}
147 | 
148 | Generated Answer: ${answer}
149 | 
150 | ${notes ? `Additional Notes: ${notes}` : ""}
151 | 
152 | Please evaluate if the generated answer is correct by:
153 | 1. Primarily using the screenshot to verify the information
154 | 2. Checking if key information matches between the reference and generated answer
155 | 3. Being somewhat lenient - if the main points are correct, minor differences in exact numbers or formatting are acceptable (especially stuff like ratings and reviews which may update over time)
156 | 
157 | Respond in JSON format with { isCorrect: true | false, reason: string }`;
158 | 
159 |   const messages = [
160 |     { role: "system", content: systemPrompt },
161 |     {
162 |       role: "user",
163 |       content: [
164 |         { type: "text", text: userPrompt },
165 |         { type: "image_url", image_url: { url: imageUrl } },
166 |       ],
167 |     },
168 |   ];
169 | 
170 |   // The evaluator model is constrained to the AnswerEvaluationSchema shape.
171 |   const evaluator = new ChatOpenAI({
172 |     apiKey: process.env.OPENAI_API_KEY,
173 |     model: "gpt-4o",
174 |   }).withStructuredOutput(AnswerEvaluationSchema);
175 |   return await evaluator.invoke(messages);
176 | }
176 | 
177 | /**
178 |  * Runs one WebVoyager task end to end: opens the task's site, lets the agent
179 |  * answer the question, screenshots the final page, closes the agent, and has
180 |  * the answer graded against the stored reference.
181 |  * @param agent Agent whose current page is driven; it is closed after the run.
182 |  * @param eval_data Task definition (id, site name, start URL, question).
183 |  * @param references Reference answers keyed by website name.
184 |  * @param logger Per-eval logger that receives progress and step output.
185 |  * @param runId Identifier of this run; used to build the log directory paths.
186 |  * @returns The graded result; lookup or grading problems yield `correct: false`.
187 |  * @throws Error when the agent finishes without producing any output.
188 |  */
189 | async function runEvalHelper(
190 |   agent: HyperAgent,
191 |   eval_data: WebVoyagerEval,
192 |   references: References,
193 |   logger: Logger,
194 |   runId: string
195 | ): Promise<EvalResult> {
196 |   const { id, ques: question, web_name: site } = eval_data;
197 |   const evalLogDir = `../logs/${runId}/${id}`;
198 | 
199 |   // Logs a reference lookup problem and turns it into a failed result.
200 |   const failWithoutReference = (reason: string): EvalResult => {
201 |     logger.log(reason, "error");
202 |     return { id, question, correct: false, reason };
203 |   };
204 | 
205 |   logger.log("\n===== Running Eval =====");
206 |   logger.log(`\nID: ${id}`);
207 |   logger.log(`\nWebsite: ${site}`);
208 |   logger.log(`\nQuestion: ${question}`);
209 |   logger.log("\n=======================\n");
210 | 
211 |   // Load the start page, then reload it once, pausing a second after each.
212 |   const page = await agent.getCurrentPage();
213 |   await page.goto(eval_data.web, { waitUntil: "domcontentloaded" });
214 |   await sleep(1000);
215 |   await page.reload({ waitUntil: "domcontentloaded" });
216 |   await sleep(1000);
217 | 
218 |   const { output } = await agent.executeTask(question, {
219 |     maxSteps: 25,
220 |     debugDir: path.join(__dirname, `${evalLogDir}/debug`),
221 |     debugOnAgentOutput: (agentOutput) => {
222 |       logger.log("\n===== AGENT OUTPUT =====");
223 |       logger.logObject(agentOutput);
224 |       logger.log("===============\n");
225 |     },
226 |     onStep: (step) => {
227 |       logger.log(`\n===== STEP ${step.idx} =====`);
228 |       logger.logObject(step);
229 |       logger.log("===============\n");
230 |     },
231 |   });
232 |   if (!output) {
233 |     throw new Error("No output from agent");
234 |   }
235 |   logger.log(output);
236 | 
237 |   // Capture the final page state before the agent is shut down.
238 |   const screenshotPath = path.join(__dirname, `${evalLogDir}/final-state.png`);
239 |   await page.screenshot({ path: screenshotPath, fullPage: true });
240 |   await agent.closeAgent();
241 | 
242 |   // The number after "--" in the eval ID indexes this website's answers.
243 |   const websiteRefs = references[site];
244 |   if (!websiteRefs) {
245 |     return failWithoutReference("No references found for this website");
246 |   }
247 |   const relevantRef = websiteRefs.answers[parseInt(id.split("--")[1])];
248 |   if (!relevantRef?.ans) {
249 |     return failWithoutReference(
250 |       "No reference found for this specific evaluation ID"
251 |     );
252 |   }
253 |   const expected = relevantRef.ans;
254 |   const notes = relevantRef.notes;
255 | 
256 |   logger.log("\nChecking against reference...");
257 |   try {
258 |     const { isCorrect, reason: evaluationReason } =
259 |       await checkAnswerAgainstReference(
260 |         output,
261 |         expected,
262 |         question,
263 |         screenshotPath,
264 |         notes
265 |       );
266 |     const verdict = isCorrect ? "✓ CORRECT" : "✗ INCORRECT";
267 |     logger.log(verdict, isCorrect ? "success" : "error");
268 |     return {
269 |       id,
270 |       question,
271 |       correct: isCorrect,
272 |       evaluationReason,
273 |       actual: output,
274 |       expected,
275 |       notes,
276 |     };
277 |   } catch (error) {
278 |     const reason = `Error checking answer against reference: ${error}`;
279 |     logger.log(reason, "error");
280 |     return { id, question, correct: false, actual: output, expected, reason };
281 |   }
282 | }
283 | 
284 | /**
285 |  * Runs one evaluation with its own logger, LLM client and agent, bounded by a
286 |  * 10 minute timeout. Never rejects: any failure becomes a `correct: false`
287 |  * result carrying the error text.
288 |  *
289 |  * The timeout timer is cleared once the race settles so it cannot keep the
290 |  * process alive after the eval is done, and an error while closing the agent
291 |  * is logged instead of replacing the failure result.
292 |  */
293 | const runEval = async (
294 |   eval_data: WebVoyagerEval,
295 |   references: References,
296 |   runId: string
297 | ): Promise<EvalResult> => {
298 |   const logger = new Logger(runId, eval_data.id);
299 |   const llm = new ChatOpenAI({
300 |     apiKey: process.env.OPENAI_API_KEY,
301 |     model: "gpt-4o",
302 |   });
303 |   const agent = new HyperAgent({
304 |     llm: llm,
305 |     hyperbrowserConfig: {
306 |       hyperbrowserSessionOptions: {
307 |         screen: { width: 1500, height: 1500 },
308 |       },
309 |     },
310 |     debug: true,
311 |   });
312 |   let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
313 |   try {
314 |     const timeoutPromise = new Promise<EvalResult>((_, reject) => {
315 |       timeoutHandle = setTimeout(
316 |         () => reject(new Error("Evaluation timed out after 10 minutes")),
317 |         10 * 60 * 1000
318 |       );
319 |     });
320 |     return await Promise.race([
321 |       retry({
322 |         func: async () =>
323 |           runEvalHelper(agent, eval_data, references, logger, runId),
324 |         params: { retryCount: 3 },
325 |       }),
326 |       timeoutPromise,
327 |     ]);
328 |   } catch (error) {
329 |     logger.log(`Error: ${error}`, "error");
330 |     try {
331 |       await agent.closeAgent();
332 |     } catch (closeError) {
333 |       logger.log(`Error closing agent: ${closeError}`, "error");
334 |     }
335 |     return {
336 |       id: eval_data.id,
337 |       question: eval_data.ques,
338 |       correct: false,
339 |       reason: `Error: ${error}`,
340 |     };
341 |   } finally {
342 |     clearTimeout(timeoutHandle);
343 |     logger.close();
344 |   }
345 | };
331 | 
332 | /**
333 |  * Runs every eval with at most `concurrency` in flight and resolves once all
334 |  * of them have finished. Results are returned in the same order as `evals`.
335 |  *
336 |  * A fixed set of workers pulls from a shared cursor. An eval whose promise
337 |  * rejects is recorded as a failed result instead of being dropped or aborting
338 |  * the batch, so the result count always matches the input count.
339 |  *
340 |  * @param evals Evals to run.
341 |  * @param references Reference answers passed through to each eval.
342 |  * @param runId Run identifier passed through to each eval.
343 |  * @param concurrency Maximum parallel evals; values below 1 are treated as 1.
344 |  * @returns One result per eval, in input order.
345 |  */
346 | async function runEvalsBatch(
347 |   evals: WebVoyagerEval[],
348 |   references: References,
349 |   runId: string,
350 |   concurrency: number = 25
351 | ): Promise<EvalResult[]> {
352 |   const results = new Array<EvalResult>(evals.length);
353 |   let nextIndex = 0;
354 | 
355 |   // Each worker keeps claiming the next unstarted eval until none remain.
356 |   const worker = async (): Promise<void> => {
357 |     while (nextIndex < evals.length) {
358 |       const index = nextIndex++;
359 |       const eval_data = evals[index];
360 |       try {
361 |         results[index] = await runEval(eval_data, references, runId);
362 |       } catch (error) {
363 |         console.error(`Error in evaluation ${eval_data.id}:`, error);
364 |         results[index] = {
365 |           id: eval_data.id,
366 |           question: eval_data.ques,
367 |           correct: false,
368 |           reason: `Error: ${error}`,
369 |         };
370 |       }
371 |     }
372 |   };
373 | 
374 |   // `|| 1` also covers NaN and 0; the pool never exceeds the number of
375 |   // evals, so an empty input starts no workers at all.
376 |   const requested = Math.max(1, Math.floor(concurrency) || 1);
377 |   const workerCount = Math.min(requested, evals.length);
378 |   await Promise.all(Array.from({ length: workerCount }, () => worker()));
379 |   return results;
380 | }
381 | 
382 | // Entry point: loads evals and references, optionally filters evals by the
383 | // glob given as the first CLI argument, runs them, and writes summary.json.
384 | (async () => {
385 |   let evals = await loadEvals();
386 |   const references = await loadReferences();
387 |   const targetId = process.argv[2];
388 |   const runId = new Date().toISOString().replace(/[:.]/g, "-");
389 |   const logDir = path.join(__dirname, `../logs/${runId}`);
390 |   // `recursive: true` makes this a no-op when the directory already exists.
391 |   fs.mkdirSync(logDir, { recursive: true });
392 | 
393 |   if (targetId) {
394 |     evals = evals.filter((e) => minimatch(e.id, targetId));
395 |     if (evals.length === 0) {
396 |       console.log(
397 |         chalk.red(`No evals found matching glob pattern: ${targetId}`)
398 |       );
399 |       process.exit(1);
400 |     }
401 |   }
402 | 
403 |   console.log(chalk.cyan(`Running ${evals.length} evaluations in parallel...`));
404 |   const results = await runEvalsBatch(evals, references, runId);
405 | 
406 |   const totalEvals = results.length;
407 |   const correctEvals = results.filter((r) => r.correct).length;
408 |   // Guard the empty case so the rate is 0 instead of NaN (null in the JSON).
409 |   const successRate =
410 |     totalEvals === 0 ? 0 : Math.round((correctEvals / totalEvals) * 100);
411 | 
412 |   const summary = {
413 |     totalEvaluations: totalEvals,
414 |     correctEvaluations: correctEvals,
415 |     failedEvaluations: totalEvals - correctEvals,
416 |     successRate,
417 |     detailedResults: results.map((result) => ({
418 |       id: result.id,
419 |       status: result.correct ? "PASSED" : "FAILED",
420 |       question: result.question,
421 |       actual: result.actual,
422 |       expected: result.expected,
423 |       reason: result.reason || null,
424 |       evaluationReason: result.evaluationReason || null,
425 |       notes: result.notes || null,
426 |     })),
427 |   };
428 |   const summaryPath = path.join(logDir, "summary.json");
429 |   fs.writeFileSync(summaryPath, JSON.stringify(summary, null, 2));
430 | 
431 |   // Also log to console for visibility
432 |   console.log(chalk.cyan("Evaluation results:"));
433 |   console.log(chalk.white(`Total evaluations: ${totalEvals}`));
434 |   console.log(chalk.green(`Correct: ${correctEvals} (${successRate}%)`));
435 |   console.log(chalk.red(`Failed: ${totalEvals - correctEvals}`));
436 |   console.log(chalk.white("\nDetailed results:"));
437 |   results.forEach((result) => {
438 |     console.log(
439 |       `${result.correct ? chalk.green("✓ PASSED") : chalk.red("✗ FAILED")} Eval ID: ${result.id}${
440 |         result.reason ? "\n  " + chalk.red(result.reason) : ""
441 |       }`
442 |     );
443 |   });
444 |   console.log(chalk.green("\nAll evaluations completed!"));
445 | })().catch((error) => {
446 |   console.error(chalk.red("Error running evaluations:"), error);
447 |   process.exit(1);
448 | });
449 | 


--------------------------------------------------------------------------------
/scripts/test-async.ts:
--------------------------------------------------------------------------------
 1 | import { HyperAgent } from "../src/agent";
 2 | import dotenv from "dotenv";
 3 | import chalk from "chalk";
 4 | 
 5 | dotenv.config();
 6 | 
 7 | const agent = new HyperAgent({
 8 |   // a: process.env.OPENAI_API_KEY,
 9 | });
10 | const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
11 | 
12 | // Manual pause/resume check: pause the task after 10s, resume it 20s later.
13 | (async () => {
14 |   const task = "Go to give me a summary of the second link on the show section of hacker news, be sure to actually go to it";
15 |   const control = await agent.executeTaskAsync(task, {
16 |     onStep: (step) => {
17 |       console.log("\n" + chalk.cyan.bold("===== STEP ====="));
18 |       console.dir(step, { depth: null, colors: true });
19 |       console.log(chalk.cyan.bold("===============") + "\n");
20 |     },
21 |   });
22 |   // console.log(chalk.green.bold("\nResult:"));
23 |   // console.log(chalk.white(result.output));
24 |   await wait(10000);
25 |   console.log("pausing");
26 |   control.pause();
27 |   await wait(20000);
28 |   console.log("resuming");
29 |   control.resume();
30 | })();
31 | 


--------------------------------------------------------------------------------
/scripts/test-extract.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { HyperAgent } from "../src/agent";
 3 | import dotenv from "dotenv";
 4 | 
 5 | dotenv.config();
 6 | 
 7 | const agent = new HyperAgent();
 8 | 
 9 | // Runs the same extraction twice: free-form first, then with a zod schema.
10 | (async () => {
11 |   const question = "What are the preselected options?";
12 |   const optionsSchema = z.object({ options: z.array(z.string()) });
13 | 
14 |   const page = await agent.newPage();
15 |   await page.goto("https://flights.google.com", { waitUntil: "load" });
16 | 
17 |   const freeForm = await page.extract(question);
18 |   console.log(freeForm);
19 |   const structured = await page.extract(question, optionsSchema);
20 |   console.log(structured);
21 | })();
22 | 


--------------------------------------------------------------------------------
/scripts/test-page-ai.ts:
--------------------------------------------------------------------------------
 1 | import { HyperAgent } from "../src/agent";
 2 | import dotenv from "dotenv";
 3 | 
 4 | dotenv.config();
 5 | 
 6 | const agent = new HyperAgent();
 7 | 
 8 | // Runs two page-level AI tasks at the same time on separate pages. Both are
 9 | // awaited together so the script observes their completion and any failure
10 | // instead of leaving the promises floating.
11 | (async () => {
12 |   const page = await agent.newPage();
13 |   const flightSearch = page.ai(
14 |     "Go to https://flights.google.com and find a round-trip flight from Rio de Janeiro to Los Angeles, leaving on May 15, 2025, and returning on May 22, 2025, and select the option with the least carbon dioxide emissions."
15 |   );
16 |   // Set up the second page while the first task is already running.
17 |   const restaurantSearch = (async () => {
18 |     const page2 = await agent.newPage();
19 |     await page2.goto("https://maps.google.com");
20 |     return page2.ai("Find the nearest restaurant to the current page");
21 |   })();
22 |   await Promise.all([flightSearch, restaurantSearch]);
23 | })();
17 | 


--------------------------------------------------------------------------------
/scripts/test-variables.ts:
--------------------------------------------------------------------------------
 1 | import { HyperAgent } from "../src/agent";
 2 | import dotenv from "dotenv";
 3 | import chalk from "chalk";
 4 | 
 5 | dotenv.config();
 6 | 
 7 | const agent = new HyperAgent({ debug: true });
 8 | 
 9 | // Prints a cyan banner, the full payload, then a closing rule.
10 | const printSection = (title: string, payload: unknown): void => {
11 |   console.log("\n" + chalk.cyan.bold(`===== ${title} =====`));
12 |   console.dir(payload, { depth: null, colors: true });
13 |   console.log(chalk.cyan.bold("===============") + "\n");
14 | };
15 | 
16 | (async () => {
17 |   // The task text omits the dates; they are supplied as agent variables.
18 |   [
19 |     {
20 |       key: "departure_date",
21 |       description: "Enter this date as the departure date",
22 |       value: "May 15, 2025",
23 |     },
24 |     {
25 |       key: "returning_date",
26 |       description: "Enter this date as the return date",
27 |       value: "May 22, 2025",
28 |     },
29 |   ].forEach((variable) => agent.addVariable(variable));
30 |   const result = await agent.executeTask(
31 |     "Go to https://flights.google.com and find a round-trip flight from Rio de Janeiro to Los Angeles and select the option with the least carbon dioxide emissions.",
32 |     {
33 |       debugOnAgentOutput: (out) => printSection("AGENT OUTPUT", out),
34 |       onStep: (step) => printSection("STEP", step),
35 |     }
36 |   );
37 |   console.log(chalk.green.bold("\nResult:"));
38 |   console.log(chalk.white(result.output));
39 | })();
40 | 


--------------------------------------------------------------------------------
/scripts/test.ts:
--------------------------------------------------------------------------------
 1 | import { HyperAgent } from "../src/agent";
 2 | import dotenv from "dotenv";
 3 | import chalk from "chalk";
 4 | 
 5 | dotenv.config();
 6 | 
 7 | const agent = new HyperAgent({
 8 |   debug: true,
 9 | });
10 | 
11 | // Prints a cyan banner, the full payload, then a closing rule.
12 | const printSection = (title: string, payload: unknown): void => {
13 |   console.log("\n" + chalk.cyan.bold(`===== ${title} =====`));
14 |   console.dir(payload, { depth: null, colors: true });
15 |   console.log(chalk.cyan.bold("===============") + "\n");
16 | };
17 | 
18 | // Runs the sample flight-search task and prints every step and the result.
19 | (async () => {
20 |   const task =
21 |     "Go to https://flights.google.com and find a round-trip flight from Rio de Janeiro to Los Angeles, leaving on May 15, 2025, and returning on May 22, 2025, and select the option with the least carbon dioxide emissions.";
22 |   const result = await agent.executeTask(task, {
23 |     debugOnAgentOutput: (out) => printSection("AGENT OUTPUT", out),
24 |     onStep: (step) => printSection("STEP", step),
25 |   });
26 | 
27 |   console.log(chalk.green.bold("\nResult:"));
28 |   console.log(chalk.white(result.output));
29 | })();
30 | 


--------------------------------------------------------------------------------
/src/agent/actions/click-element.ts:
--------------------------------------------------------------------------------
  1 | import { z } from "zod";
  2 | import { Locator } from "playwright";
  3 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
  4 | import { sleep } from "@/utils";
  5 | import { getLocator } from "./utils";
  6 | 
  7 | const ClickElementAction = z
  8 |   .object({
  9 |     index: z.number().describe("The numeric index of the element to click."),
 10 |   })
 11 |   .describe("Click on an element identified by its index");
 12 | 
 13 | type ClickElementActionType = z.infer<typeof ClickElementAction>;
 14 | 
 15 | // Consecutive matching bounding-box samples needed to call an element stable.
 16 | const MAX_STABLE_CHECKS = 2;
 17 | // Time budget in milliseconds for the scroll and for each pre-click wait.
 18 | const CLICK_CHECK_TIMEOUT_PERIOD = 2_500;
 19 | 
 20 | /**
 21 |  * Click action: resolves the indexed element, scrolls it into view, waits
 22 |  * until it is visible, enabled and no longer moving, then force-clicks it.
 23 |  */
 24 | export const ClickElementActionDefinition: AgentActionDefinition = {
 25 |   type: "clickElement" as const,
 26 |   actionParams: ClickElementAction,
 27 |   run: async function (
 28 |     ctx: ActionContext,
 29 |     action: ClickElementActionType
 30 |   ): Promise<ActionOutput> {
 31 |     const target = getLocator(ctx, action.index);
 32 |     if (!target) {
 33 |       return { success: false, message: "Element not found" };
 34 |     }
 35 |     if ((await target.count()) < 1) {
 36 |       return { success: false, message: "Element not found on page" };
 37 |     }
 38 | 
 39 |     const timeout = CLICK_CHECK_TIMEOUT_PERIOD;
 40 |     await target.scrollIntoViewIfNeeded({ timeout });
 41 |     const visible = target.waitFor({ state: "visible", timeout });
 42 |     const enabled = waitForElementToBeEnabled(target, timeout);
 43 |     const stable = waitForElementToBeStable(target, timeout);
 44 |     await Promise.all([visible, enabled, stable]);
 45 |     // `force` skips Playwright's own actionability checks, done above.
 46 |     await target.click({ force: true });
 47 |     return {
 48 |       success: true,
 49 |       message: `Clicked element with index ${action.index}`,
 50 |     };
 51 |   },
 52 |   pprintAction: function (params: ClickElementActionType): string {
 53 |     return `Click element at index ${params.index}`;
 54 |   },
 55 | };
 56 | 
 57 | /**
 58 |  * Waits for an element to become enabled with a timeout.
 59 |  *
 60 |  * The element is polled every 100ms. As soon as the wait ends, either way,
 61 |  * the polling loop is told to stop and the timer is cleared, so nothing keeps
 62 |  * running in the background after a timeout.
 63 |  * @param locator The Playwright locator to check
 64 |  * @param timeout Maximum time to wait in milliseconds
 65 |  * @returns Promise that resolves when element is enabled or rejects on timeout
 66 |  */
 67 | async function waitForElementToBeEnabled(
 68 |   locator: Locator,
 69 |   timeout: number = 5000
 70 | ): Promise<void> {
 71 |   let finished = false;
 72 |   let timer: ReturnType<typeof setTimeout> | undefined;
 73 | 
 74 |   const poll = async (): Promise<void> => {
 75 |     while (!finished) {
 76 |       if (await locator.isEnabled()) {
 77 |         return;
 78 |       }
 79 |       await sleep(100);
 80 |     }
 81 |   };
 82 |   const deadline = new Promise<never>((_, reject) => {
 83 |     timer = setTimeout(
 84 |       () => reject(new Error("Timeout waiting for element to be enabled")),
 85 |       timeout
 86 |     );
 87 |   });
 88 | 
 89 |   try {
 90 |     await Promise.race([poll(), deadline]);
 91 |   } finally {
 92 |     finished = true;
 93 |     clearTimeout(timer);
 94 |   }
 95 | }
 84 | 
 85 | /**
 86 |  * Waits for an element to become stable (not moving) with a timeout.
 87 |  *
 88 |  * The bounding box is sampled every 100ms and must match the previous sample
 89 |  * MAX_STABLE_CHECKS times in a row. Polling stops and the timer is cleared as
 90 |  * soon as the wait ends, so nothing keeps running after a timeout.
 91 |  * @param locator The Playwright locator to check
 92 |  * @param timeout Maximum time to wait in milliseconds
 93 |  * @returns Promise that resolves when element is stable or rejects on timeout
 94 |  */
 95 | async function waitForElementToBeStable(
 96 |   locator: Locator,
 97 |   timeout: number = 5000
 98 | ): Promise<void> {
 99 |   let finished = false;
100 |   let timer: ReturnType<typeof setTimeout> | undefined;
101 | 
102 |   const poll = async (): Promise<void> => {
103 |     let previousRect: {
104 |       x: number;
105 |       y: number;
106 |       width: number;
107 |       height: number;
108 |     } | null = null;
109 |     let stableCount = 0;
110 | 
111 |     while (!finished) {
112 |       const currentRect = await locator.boundingBox();
113 |       // No bounding box available yet: sample again without touching state.
114 |       if (!currentRect) {
115 |         await sleep(100);
116 |         continue;
117 |       }
118 | 
119 |       const unchanged =
120 |         previousRect !== null &&
121 |         previousRect.x === currentRect.x &&
122 |         previousRect.y === currentRect.y &&
123 |         previousRect.width === currentRect.width &&
124 |         previousRect.height === currentRect.height;
125 |       if (unchanged) {
126 |         stableCount++;
127 |         if (stableCount >= MAX_STABLE_CHECKS) {
128 |           return;
129 |         }
130 |       } else {
131 |         stableCount = 0;
132 |       }
133 | 
134 |       previousRect = currentRect;
135 |       await sleep(100);
136 |     }
137 |   };
138 |   const deadline = new Promise<never>((_, reject) => {
139 |     timer = setTimeout(
140 |       () => reject(new Error("Timeout waiting for element to be stable")),
141 |       timeout
142 |     );
143 |   });
144 | 
145 |   try {
146 |     await Promise.race([poll(), deadline]);
147 |   } finally {
148 |     finished = true;
149 |     clearTimeout(timer);
150 |   }
151 | }
140 | 


--------------------------------------------------------------------------------
/src/agent/actions/complete-validator.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
 3 | 
 4 | const SubTaskCriterion = z.object({
 5 |   subTask: z
 6 |     .string()
 7 |     .describe("The description of the specific sub task of the task."),
 8 |   subTaskSatisfied: z
 9 |     .boolean()
10 |     .describe("Is the specific sub task of the task completed."),
11 |   subTaskSatisfiedReason: z
12 |     .string()
13 |     .describe(
14 |       "How and why has this subtask been marked as completed (if completed). Provide the result as well if this response required an action, and that action produced a result."
15 |     ),
16 | });
17 | 
18 | export const CompletionValidateAction = z
19 |   .object({
20 |     task: z
21 |       .string()
22 |       .describe("The detailed description of the task to complete."),
23 |     completionCriteria: z.array(SubTaskCriterion),
24 |   })
25 |   .describe(
26 |     `Must run this before issuing the final complete action to validate that the task is completed.
27 |     Evaluate if all the sub parts of the task are completed, and so if the task itself is completed. If you don't run this step, you will be heavily penalized.`
28 |   );
29 | 
30 | export type CompleteValidateActionType = z.infer<
31 |   typeof CompletionValidateAction
32 | >;
33 | 
34 | // Validation step run before `complete`: no page interaction, it only echoes
35 | // the task and each sub-task's satisfied flag back as a report message.
36 | export const CompletionValidateActionDefinition: AgentActionDefinition = {
37 |   type: "taskCompleteValidation",
38 |   actionParams: CompletionValidateAction,
39 |   run: async (
40 |     ctx: ActionContext,
41 |     action: CompleteValidateActionType
42 |   ): Promise<ActionOutput> => {
43 |     const reportLines: string[] = [];
44 |     for (const { subTask, subTaskSatisfied } of action.completionCriteria) {
45 |       reportLines.push(
46 |         `subTask:${subTask} || condition satisfied: ${subTaskSatisfied}`
47 |       );
48 |     }
49 |     const message = `Task Completion Report: \ntask:${action.task} \nsubtasks: \n${reportLines.join("\n")}`;
50 |     return { success: true, message };
51 |   },
52 | };
53 | 


--------------------------------------------------------------------------------
/src/agent/actions/complete-with-output-schema.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
 3 | 
 4 | // Builds the "complete" action used when the caller supplied an output schema;
 5 | // the model-filled object is returned as `extract` and, by completeAction, as JSON.
 6 | export const generateCompleteActionWithOutputDefinition = (
 7 |   outputSchema: z.AnyZodObject
 8 | ): AgentActionDefinition => {
 9 |   const completionParams = z
10 |     .object({
11 |       success: z
12 |         .boolean()
13 |         .describe("Whether the task was completed successfully."),
14 |       outputSchema: outputSchema
15 |         .nullable()
16 |         .describe(
17 |           "The output model to return the response in. Given the previous data, try your best to fit the final response into the given schema."
18 |         ),
19 |     })
20 |     .describe(
21 |       "Complete the task. An output schema has been provided to you. Try your best to provide your response so that it fits the output schema provided."
22 |     );
23 |   type CompletionParams = z.infer<typeof completionParams>;
24 | 
25 |   return {
26 |     type: "complete" as const,
27 |     actionParams: completionParams,
28 |     run: async (
29 |       ctx: ActionContext,
30 |       actionParams: CompletionParams
31 |     ): Promise<ActionOutput> => {
32 |       const { success, outputSchema: output } = actionParams;
33 |       // Anything short of a successful run with a non-null object is a failure.
34 |       if (!success || !output) {
35 |         return {
36 |           success: false,
37 |           message:
38 |             "Could not complete task and/or could not extract response into output schema.",
39 |         };
40 |       }
41 |       return {
42 |         success: true,
43 |         message: "The action generated an object",
44 |         extract: output,
45 |       };
46 |     },
47 |     completeAction: async (params: CompletionParams) =>
48 |       JSON.stringify(params.outputSchema, null, 2),
49 |   };
50 | };
50 | 


--------------------------------------------------------------------------------
/src/agent/actions/complete.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionOutput, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const CompleteAction = z
 5 |   .object({
 6 |     success: z
 7 |       .boolean()
 8 |       .describe("Whether the task was completed successfully."),
 9 |     text: z
10 |       .string()
11 |       .nullable()
12 |       .describe(
13 |         "The text to complete the task with, make this answer the ultimate goal of the task. Be sure to include all the information requested in the task in explicit detail."
14 |       ),
15 |   })
16 |   .describe("Complete the task, this must be the final action in the sequence");
17 | 
18 | export type CompleteActionType = z.infer<typeof CompleteAction>;
19 | 
20 | // Final action of a task; `completeAction` yields the answer text or a fallback.
21 | export const CompleteActionDefinition: AgentActionDefinition = {
22 |   type: "complete" as const,
23 |   actionParams: CompleteAction,
24 |   run: async (): Promise<ActionOutput> => ({
25 |     success: true,
26 |     message: "Task Complete",
27 |   }),
28 |   completeAction: async ({ text }: CompleteActionType) =>
29 |     text ?? "No response text found",
30 |   pprintAction: ({ success }: CompleteActionType): string =>
31 |     `Complete task with ${success ? "success" : "failure"}`,
32 | };
33 | 


--------------------------------------------------------------------------------
/src/agent/actions/extract.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
 3 | import { parseMarkdown } from "@/utils/html-to-markdown";
 4 | import fs from "fs";
 5 | 
 6 | export const ExtractAction = z
 7 |   .object({
 8 |     objective: z.string().describe("The goal of the extraction."),
 9 |   })
10 |   .describe(
11 |     "Extract content from the page according to the objective, e.g. product prices, contact information, article text, table data, or specific metadata fields"
12 |   );
13 | 
14 | export type ExtractActionType = z.infer<typeof ExtractAction>;
15 | 
16 | // Extract action: sends the page content (converted to markdown and trimmed
17 | // to the token limit) plus a screenshot to the LLM and returns its answer.
18 | // Failures are reported as `success: false` instead of being thrown.
19 | export const ExtractActionDefinition: AgentActionDefinition = {
20 |   type: "extract" as const,
21 |   actionParams: ExtractAction,
22 |   run: async (
23 |     ctx: ActionContext,
24 |     action: ExtractActionType
25 |   ): Promise<ActionOutput> => {
26 |     try {
27 |       const content = await ctx.page.content();
28 |       const markdown = await parseMarkdown(content);
29 | 
30 |       // Take a screenshot of the page. The CDP session is detached whether or
31 |       // not the capture succeeds; a failed detach is ignored (best effort).
32 |       const cdpSession = await ctx.page.context().newCDPSession(ctx.page);
33 |       const screenshot = await cdpSession
34 |         .send("Page.captureScreenshot")
35 |         .finally(() => cdpSession.detach().catch(() => undefined));
36 | 
37 |       // Save screenshot to debug dir if exists
38 |       if (ctx.debugDir) {
39 |         fs.writeFileSync(
40 |           `${ctx.debugDir}/extract-screenshot.png`,
41 |           Buffer.from(screenshot.data, "base64")
42 |         );
43 |       }
44 | 
45 |       // Trim markdown to stay within token limit
46 |       // TODO: this is a hack, we should use a better token counting method
47 |       const avgTokensPerChar = 0.75; // Conservative estimate of tokens per character
48 |       const maxChars = Math.floor(ctx.tokenLimit / avgTokensPerChar);
49 |       const trimmedMarkdown =
50 |         markdown.length > maxChars
51 |           ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
52 |           : markdown;
53 |       if (ctx.debugDir) {
54 |         fs.writeFileSync(
55 |           `${ctx.debugDir}/extract-markdown-content.md`,
56 |           trimmedMarkdown
57 |         );
58 |       }
59 | 
60 |       const response = await ctx.llm.invoke([
61 |         {
62 |           role: "user",
63 |           content: [
64 |             {
65 |               type: "text",
66 |               text: `Extract the following information from the page according to this objective: "${action.objective}"\n\nPage content:\n${trimmedMarkdown}\nHere is as screenshot of the page:\n`,
67 |             },
68 |             {
69 |               type: "image_url",
70 |               image_url: {
71 |                 url: `data:image/png;base64,${screenshot.data}`,
72 |               },
73 |             },
74 |           ],
75 |         },
76 |       ]);
77 |       // The reply may be a plain string or a list of content parts; keep only
78 |       // the text so the message never contains "[object Object]".
79 |       const reply = response.content;
80 |       const extracted = Array.isArray(reply)
81 |         ? reply.map((part) => ("text" in part ? part.text : "")).join("")
82 |         : reply;
83 |       if (extracted.length === 0) {
84 |         return { success: false, message: `No content extracted from page.` };
85 |       }
86 |       return {
87 |         success: true,
88 |         message: `Extracted content from page:\n${extracted}`,
89 |       };
90 |     } catch (error) {
91 |       return { success: false, message: `Failed to extract content: ${error}` };
92 |     }
93 |   },
94 |   pprintAction: function (params: ExtractActionType): string {
95 |     return `Extract content from page with objective: "${params.objective}"`;
96 |   },
97 | };
94 | 


--------------------------------------------------------------------------------
/src/agent/actions/go-to-url.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const GoToUrlAction = z
 5 |   .object({
 6 |     url: z.string().describe("The URL you want to navigate to."),
 7 |   })
 8 |   .describe("Navigate to a specific URL in the browser");
 9 | 
10 | export type GoToUrlActionType = z.infer<typeof GoToUrlAction>;
11 | 
12 | // Navigation action: loads the requested URL in the context's page and
13 | // reports success once `goto` has resolved.
14 | export const GoToURLActionDefinition: AgentActionDefinition = {
15 |   type: "goToUrl" as const,
16 |   actionParams: GoToUrlAction,
17 |   run: async (ctx: ActionContext, { url }: GoToUrlActionType) => {
18 |     await ctx.page.goto(url);
19 |     return { success: true, message: `Navigated to ${url}` };
20 |   },
21 |   pprintAction: ({ url }: GoToUrlActionType): string =>
22 |     `Navigate to URL: ${url}`,
23 | };
24 | 


--------------------------------------------------------------------------------
/src/agent/actions/index.ts:
--------------------------------------------------------------------------------
 1 | import { GoToURLActionDefinition } from "./go-to-url";
 2 | import { ClickElementActionDefinition } from "./click-element";
 3 | import { InputTextActionDefinition } from "./input-text";
 4 | import { CompleteActionDefinition } from "./complete";
 5 | import { generateCompleteActionWithOutputDefinition } from "./complete-with-output-schema";
 6 | import { ExtractActionDefinition } from "./extract";
 7 | import { SelectOptionActionDefinition } from "./select-option";
 8 | import { ScrollActionDefinition } from "./scroll";
 9 | import { PageBackActionDefinition } from "./page-back";
10 | import { PageForwardActionDefinition } from "./page-forward";
11 | import { KeyPressActionDefinition } from "./key-press";
12 | import { ThinkingActionDefinition } from "./thinking";
13 | import { RefreshPageActionDefinition } from "./refresh-page";
14 | import { PDFActionDefinition } from "./pdf";
15 | 
16 | /**
17 |  * Error raised for an action type that is missing from the action registry.
18 |  * A dedicated class lets callers tell this case apart from other errors.
19 |  */
20 | export class ActionNotFoundError extends Error {
21 |   constructor(actionType: string) {
22 |     const message = `Action type "${actionType}" not found in the action registry`;
23 |     super(message);
24 |     this.name = "ActionNotFoundError";
25 |     // V8-only API: keeps this constructor's own frame out of the stack trace.
26 |     if (Error.captureStackTrace) {
27 |       Error.captureStackTrace(this, ActionNotFoundError);
28 |     }
29 |   }
30 | }
31 | 
32 | // Actions registered by default. The PDF action is appended only when a
33 | // GEMINI_API_KEY is present in the environment at the time this module is
34 | // loaded.
35 | const DEFAULT_ACTIONS = [
36 |   GoToURLActionDefinition,
37 |   PageBackActionDefinition,
38 |   PageForwardActionDefinition,
39 |   RefreshPageActionDefinition,
40 |   ExtractActionDefinition,
41 |   ClickElementActionDefinition,
42 |   SelectOptionActionDefinition,
43 |   ScrollActionDefinition,
44 |   InputTextActionDefinition,
45 |   KeyPressActionDefinition,
46 |   ThinkingActionDefinition,
47 |   ...(process.env.GEMINI_API_KEY ? [PDFActionDefinition] : []),
48 | ];
49 | 
50 | export {
51 |   DEFAULT_ACTIONS,
52 |   CompleteActionDefinition,
53 |   generateCompleteActionWithOutputDefinition,
54 | };
55 | 


--------------------------------------------------------------------------------
/src/agent/actions/input-text.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | import { getLocator } from "./utils";
 4 | 
 5 | export const InputTextAction = z
 6 |   .object({
 7 |     index: z
 8 |       .number()
 9 |       .describe("The numeric index of the element to input text."),
10 |     text: z.string().describe("The text to input."),
11 |   })
12 |   .describe("Input text into a input interactive element");
13 | 
14 | export type InputTextActionType = z.infer<typeof InputTextAction>;
15 | 
/**
 * Action definition for typing text into the element at a given index.
 *
 * Every `<<key>>` placeholder in the requested text is replaced with the value
 * of the matching entry in `ctx.variables` before the element is filled.
 * Returns a failure output (without throwing) when no element is registered
 * for the index.
 */
export const InputTextActionDefinition: AgentActionDefinition = {
  type: "inputText" as const,
  actionParams: InputTextAction,
  run: async (ctx: ActionContext, action: InputTextActionType) => {
    const { index } = action;
    const locator = getLocator(ctx, index);
    if (!locator) {
      return { success: false, message: "Element not found" };
    }

    // split/join substitutes EVERY occurrence and inserts the value literally.
    // String.prototype.replace with a string pattern would only touch the first
    // match and would interpret sequences such as `$&` or `$1` in the value.
    let text = action.text;
    for (const variable of ctx.variables) {
      text = text.split(`<<${variable.key}>>`).join(variable.value);
    }

    await locator.fill(text, { timeout: 5_000 });
    return {
      success: true,
      message: `Inputted text "${text}" into element with index ${index}`,
    };
  },
  pprintAction: function (params: InputTextActionType): string {
    return `Input text "${params.text}" into element at index ${params.index}`;
  },
};
38 | 


--------------------------------------------------------------------------------
/src/agent/actions/key-press.ts:
--------------------------------------------------------------------------------
  1 | import { z } from "zod";
  2 | import { ActionContext, AgentActionDefinition } from "@/types";
  3 | 
  4 | /**
  5 |  * Translates xdotool-like key strings to Playwright-compatible keys.
  6 |  * Reference: https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent/key/Key_Values
  7 |  */
  8 | function translateKey(key: string): string {
  9 |   const keyMap: Record<string, string> = {
 10 |     // Common / Basic Keys
 11 |     return: "Enter",
 12 |     enter: "Enter",
 13 |     tab: "Tab",
 14 |     backspace: "Backspace",
 15 |     up: "ArrowUp",
 16 |     down: "ArrowDown",
 17 |     left: "ArrowLeft",
 18 |     right: "ArrowRight",
 19 |     space: "Space",
 20 |     ctrl: "Control",
 21 |     control: "Control",
 22 |     alt: "Alt",
 23 |     shift: "Shift",
 24 |     meta: "Meta",
 25 |     command: "Meta",
 26 |     cmd: "Meta",
 27 |     windows: "Meta",
 28 |     esc: "Escape",
 29 |     escape: "Escape",
 30 |     // Numpad Keys
 31 |     kp_0: "Numpad0",
 32 |     kp_1: "Numpad1",
 33 |     kp_2: "Numpad2",
 34 |     kp_3: "Numpad3",
 35 |     kp_4: "Numpad4",
 36 |     kp_5: "Numpad5",
 37 |     kp_6: "Numpad6",
 38 |     kp_7: "Numpad7",
 39 |     kp_8: "Numpad8",
 40 |     kp_9: "Numpad9",
 41 |     // Numpad Operations
 42 |     kp_enter: "NumpadEnter",
 43 |     kp_multiply: "NumpadMultiply",
 44 |     kp_add: "NumpadAdd",
 45 |     kp_subtract: "NumpadSubtract",
 46 |     kp_decimal: "NumpadDecimal",
 47 |     kp_divide: "NumpadDivide",
 48 |     // Navigation
 49 |     page_down: "PageDown",
 50 |     page_up: "PageUp",
 51 |     home: "Home",
 52 |     end: "End",
 53 |     insert: "Insert",
 54 |     delete: "Delete",
 55 |     // Function Keys
 56 |     f1: "F1",
 57 |     f2: "F2",
 58 |     f3: "F3",
 59 |     f4: "F4",
 60 |     f5: "F5",
 61 |     f6: "F6",
 62 |     f7: "F7",
 63 |     f8: "F8",
 64 |     f9: "F9",
 65 |     f10: "F10",
 66 |     f11: "F11",
 67 |     f12: "F12",
 68 |     // Left/Right Variants
 69 |     shift_l: "ShiftLeft",
 70 |     shift_r: "ShiftRight",
 71 |     control_l: "ControlLeft",
 72 |     control_r: "ControlRight",
 73 |     alt_l: "AltLeft",
 74 |     alt_r: "AltRight",
 75 |     // Media Keys
 76 |     audiovolumemute: "AudioVolumeMute",
 77 |     audiovolumedown: "AudioVolumeDown",
 78 |     audiovolumeup: "AudioVolumeUp",
 79 |     // Additional Special Keys
 80 |     print: "PrintScreen",
 81 |     scroll_lock: "ScrollLock",
 82 |     pause: "Pause",
 83 |     menu: "ContextMenu",
 84 |   };
 85 | 
 86 |   return keyMap[key.toLowerCase()] || key;
 87 | }
 88 | 
 89 | export const KeyPressAction = z
 90 |   .object({
 91 |     text: z.string().describe(
 92 |       `Press a key or key-combination on the keyboard.\n
 93 | - This supports xdotool's \`key\` syntax.\n
 94 | - Examples: "a", "Return", "alt+Tab", "ctrl+s", "Up", "KP_0" (for the numpad 0 key).
 95 | `
 96 |     ),
 97 |   })
 98 |   .describe("Press a key or key-combination on the keyboard");
 99 | 
100 | export type KeyPressActionType = z.infer<typeof KeyPressAction>;
101 | 
/**
 * Action definition for pressing a key, a key combination, or a sequence of them.
 *
 * The `text` parameter is read as whitespace-separated keystrokes, each of which
 * may be a `+`-joined chord: "a", "alt+Tab", "Up Up Down", "ctrl+a ctrl+c".
 * Whitespace around `+` is tolerated ("ctrl + s") and empty tokens are ignored.
 * For a chord, every key but the last is held down while the last is pressed;
 * held keys are always released afterwards, even when the press fails.
 * Returns a failure output when `text` contains no key at all.
 */
export const KeyPressActionDefinition: AgentActionDefinition = {
  type: "keyPress" as const,
  actionParams: KeyPressAction,
  run: async (ctx: ActionContext, action: KeyPressActionType) => {
    const { text } = action;
    const keyboard = ctx.page.keyboard;

    // Collapse whitespace around "+" so "ctrl + s" stays one chord, then split
    // the remaining whitespace-separated keystrokes.
    const keystrokes = text
      .replace(/\s*\+\s*/g, "+")
      .split(/\s+/)
      .filter((stroke) => stroke.length > 0);

    if (keystrokes.length === 0) {
      return { success: false, message: `No key specified in "${text}"` };
    }

    for (const stroke of keystrokes) {
      const parts = stroke.split("+").filter((part) => part.length > 0);
      // A lone "+" or a chord ending in "++" means the plus key itself is the final key.
      if (stroke === "+" || stroke.endsWith("++")) {
        parts.push("+");
      }
      if (parts.length === 0) {
        continue;
      }

      const finalKey = translateKey(parts[parts.length - 1]);
      const modifiers = parts.slice(0, -1).map((part) => translateKey(part));
      const held: string[] = [];
      try {
        for (const modifier of modifiers) {
          await keyboard.down(modifier);
          held.push(modifier);
        }
        await keyboard.press(finalKey);
      } finally {
        // Release in reverse order so no modifier stays stuck after an error.
        for (const modifier of held.reverse()) {
          await keyboard.up(modifier);
        }
      }
    }

    return {
      success: true,
      message: `Pressed key "${text}"`,
    };
  },
  pprintAction: function (params: KeyPressActionType): string {
    return `Press key "${params.text}"`;
  },
};
135 | 


--------------------------------------------------------------------------------
/src/agent/actions/page-back.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const PageBackAction = z
 5 |   .object({})
 6 |   .describe("Navigate back to the previous page in the browser history");
 7 | 
 8 | export type PageBackActionType = z.infer<typeof PageBackAction>;
 9 | 
/** Action definition that asks the page to go one entry back in its history. */
export const PageBackActionDefinition: AgentActionDefinition = {
  type: "pageBack" as const,
  actionParams: PageBackAction,
  async run(ctx: ActionContext) {
    await ctx.page.goBack();
    const message = "Navigated back to the previous page";
    return { success: true, message };
  },
  pprintAction: (): string => "Navigate back to previous page",
};
21 | 


--------------------------------------------------------------------------------
/src/agent/actions/page-forward.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const PageForwardAction = z
 5 |   .object({})
 6 |   .describe("Navigate forward to the next page in the browser history");
 7 | 
 8 | export type PageForwardActionType = z.infer<typeof PageForwardAction>;
 9 | 
/** Action definition that asks the page to go one entry forward in its history. */
export const PageForwardActionDefinition: AgentActionDefinition = {
  type: "pageForward" as const,
  actionParams: PageForwardAction,
  async run(ctx: ActionContext) {
    await ctx.page.goForward();
    const message = "Navigated forward to the next page";
    return { success: true, message };
  },
  pprintAction: (): string => "Navigate forward to next page",
};
21 | 


--------------------------------------------------------------------------------
/src/agent/actions/pdf.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | import { config } from "dotenv";
 4 | import { GoogleGenAI } from "@google/genai";
 5 | 
 6 | config();
 7 | 
 8 | export const PDFAction = z
 9 |   .object({
10 |     pdfUrl: z.string().describe("The URL of the PDF to analyze."),
11 |     prompt: z.string().describe("The prompt/question to ask about the PDF."),
12 |   })
13 |   .describe("Analyze a PDF using Gemini and a prompt");
14 | 
15 | export type PDFActionType = z.infer<typeof PDFAction>;
16 | 
/**
 * Action definition that downloads a PDF and asks Gemini a question about it.
 *
 * The PDF is fetched with the page's request context first; when that response
 * is not an OK PDF response, the page navigates to the URL and the matching
 * PDF response is captured instead. The bytes are then sent inline (base64)
 * to Gemini together with the prompt.
 *
 * Every failure mode — missing GEMINI_API_KEY, download error, Gemini error —
 * is reported as `{ success: false, message }` rather than thrown.
 */
export const PDFActionDefinition: AgentActionDefinition = {
  type: "analyzePdf" as const,
  actionParams: PDFAction,
  run: async (ctx: ActionContext, action: PDFActionType) => {
    const { pdfUrl, prompt } = action;

    // Fail fast with a clear message instead of handing an undefined key to the SDK.
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      return {
        success: false,
        message: "GEMINI_API_KEY is not set; cannot analyze PDF.",
      };
    }

    let pdfBuffer: Buffer | null = null;
    try {
      // Try direct request first (works for direct PDF links)
      const response = await ctx.page.request.get(pdfUrl);
      if (
        response.ok() &&
        response.headers()["content-type"]?.includes("pdf")
      ) {
        pdfBuffer = Buffer.from(await response.body());
      } else {
        // Fallback: navigate and intercept the response carrying the PDF.
        const [resp] = await Promise.all([
          ctx.page.waitForResponse(
            (r) =>
              r.url() === pdfUrl && r.headers()["content-type"]?.includes("pdf")
          ),
          ctx.page.goto(pdfUrl, { waitUntil: "networkidle" }),
        ]);
        pdfBuffer = Buffer.from(await resp.body());
      }
    } catch (err) {
      return {
        success: false,
        message: `Failed to download PDF: ${err}`,
      };
    }
    if (!pdfBuffer) {
      return {
        success: false,
        message: "Could not retrieve PDF file.",
      };
    }

    try {
      const goog = new GoogleGenAI({ apiKey });
      const geminiResponse = await goog.models.generateContent({
        model: "gemini-2.5-pro-preview-03-25",
        contents: [
          { text: prompt },
          {
            inlineData: {
              mimeType: "application/pdf",
              data: pdfBuffer.toString("base64"),
            },
          },
        ],
      });
      return {
        success: true,
        message: geminiResponse.text || "No response text returned.",
      };
    } catch (err) {
      // Mirror the download-failure handling so callers get one consistent shape.
      return {
        success: false,
        message: `Failed to analyze PDF with Gemini: ${err}`,
      };
    }
  },
  pprintAction: function (params: PDFActionType): string {
    return `Analyze PDF at URL: ${params.pdfUrl} with prompt: ${params.prompt}`;
  },
};
77 | 


--------------------------------------------------------------------------------
/src/agent/actions/refresh-page.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const RefreshPageAction = z
 5 |   .object({})
 6 |   .describe(
 7 |     "Refresh a webpage. Refreshing a webpage is usually a good way if you need to reset the state on a page. Take care since every thing you did on that page will be reset."
 8 |   );
 9 | 
10 | export type RefreshPageActionType = z.infer<typeof RefreshPageAction>;
11 | 
/** Action definition that reloads the current page. */
export const RefreshPageActionDefinition: AgentActionDefinition = {
  type: "refreshPage" as const,
  actionParams: RefreshPageAction,
  async run(ctx: ActionContext) {
    await ctx.page.reload();
    const message = "Succesfully refreshed a page.";
    return { success: true, message };
  },
  pprintAction: (): string => "Refresh current page",
};
23 | 


--------------------------------------------------------------------------------
/src/agent/actions/scroll.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const ScrollAction = z
 5 |   .object({
 6 |     direction: z
 7 |       .enum(["up", "down", "left", "right"])
 8 |       .describe("The direction to scroll."),
 9 |   })
10 |   .describe("Scroll in a specific direction in the browser");
11 | 
12 | export type ScrollActionType = z.infer<typeof ScrollAction>;
13 | 
/**
 * Action definition that scrolls the window by one viewport in a direction.
 * Vertical scrolls move by `window.innerHeight`, horizontal ones by `window.innerWidth`.
 */
export const ScrollActionDefinition: AgentActionDefinition = {
  type: "scroll" as const,
  actionParams: ScrollAction,
  run: async (ctx: ActionContext, action: ScrollActionType) => {
    const { direction } = action;

    // One scroller per direction; each is evaluated in the page via page.evaluate.
    // A Map is used so an unexpected direction simply finds nothing to run.
    const scrollers = new Map<string, () => void>([
      ["up", () => window.scrollBy(0, -window.innerHeight)],
      ["down", () => window.scrollBy(0, window.innerHeight)],
      ["left", () => window.scrollBy(-window.innerWidth, 0)],
      ["right", () => window.scrollBy(window.innerWidth, 0)],
    ]);

    const scroller = scrollers.get(direction);
    if (scroller) {
      await ctx.page.evaluate(scroller);
    }
    return { success: true, message: `Scrolled ${direction}` };
  },
  pprintAction: (params: ScrollActionType): string =>
    `Scroll ${params.direction}`,
};
39 | 


--------------------------------------------------------------------------------
/src/agent/actions/select-option.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | import { getLocator } from "./utils";
 4 | 
 5 | export const SelectOptionAction = z
 6 |   .object({
 7 |     index: z
 8 |       .number()
 9 |       .describe("The numeric index of the  element to select an option."),
10 |     text: z.string().describe("The text of the option to select."),
11 |   })
12 |   .describe("Select an option from a dropdown element");
13 | 
14 | export type SelectOptionActionType = z.infer<typeof SelectOptionAction>;
15 | 
/**
 * Action definition that selects an option, matched by its label, on the
 * element at the given index. Returns a failure output when no element is
 * registered for that index.
 */
export const SelectOptionActionDefinition: AgentActionDefinition = {
  type: "selectOption" as const,
  actionParams: SelectOptionAction,
  run: async (ctx: ActionContext, action: SelectOptionActionType) => {
    const target = getLocator(ctx, action.index);
    if (target) {
      await target.selectOption({ label: action.text });
      return {
        success: true,
        message: `Selected option "${action.text}" from element with index ${action.index}`,
      };
    }
    return { success: false, message: "Element not found" };
  },
  pprintAction: (params: SelectOptionActionType): string =>
    `Select option "${params.text}" from element at index ${params.index}`,
};
35 | 


--------------------------------------------------------------------------------
/src/agent/actions/thinking.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionContext, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const ThinkingAction = z
 5 |   .object({
 6 |     thought: z
 7 |       .string()
 8 |       .describe(
 9 |         "Think about what your current course of action, and your future steps, and what difficulties you might encounter, and how you'd tackle them."
10 |       ),
11 |   })
12 |   .describe(
13 |     `Think about a course of action. Think what your current task is, what your next should be, and how you would possibly do that. This step is especially useful if performing a complex task, and/or working on a visually complex page (think nodes > 300).`
14 |   );
15 | 
16 | export type ThinkingActionType = z.infer<typeof ThinkingAction>;
17 | 
/**
 * Action definition that performs no page interaction: it echoes the supplied
 * thought back in the success message.
 */
export const ThinkingActionDefinition: AgentActionDefinition = {
  type: "thinkAction" as const,
  actionParams: ThinkingAction,
  run: async (ctx: ActionContext, action: ThinkingActionType) => {
    const message = `A simple thought process about your next steps. You thought about: ${action.thought}`;
    return { success: true, message };
  },
  pprintAction: (params: ThinkingActionType): string =>
    `Think about: "${params.thought}"`,
};
32 | 


--------------------------------------------------------------------------------
/src/agent/actions/utils.ts:
--------------------------------------------------------------------------------
 1 | import { ActionContext } from "@hyperbrowser/agent/types";
 2 | 
 3 | export function getLocator(ctx: ActionContext, index: number) {
 4 |   const element = ctx.domState.elements.get(index);
 5 |   if (!element) {
 6 |     return null;
 7 |   }
 8 |   if (element.isUnderShadowRoot) {
 9 |     return ctx.page.locator(element.cssPath);
10 |   } else {
11 |     return ctx.page.locator(`xpath=${element.xpath}`);
12 |   }
13 | }
14 | 


--------------------------------------------------------------------------------
/src/agent/error.ts:
--------------------------------------------------------------------------------
 1 | export class HyperagentError extends Error {
 2 |   constructor(
 3 |     message: string,
 4 |     public statusCode?: number
 5 |   ) {
 6 |     super(`[Hyperagent]: ${message}`);
 7 |     this.name = "HyperagentError";
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/agent/llms/structured-output.ts:
--------------------------------------------------------------------------------
 1 | import { BaseChatModel } from "@langchain/core/language_models/chat_models";
 2 | 
 3 | /**
 4 |  * Determines the appropriate structured output method based on the LLM type
 5 |  * @param llm The language model instance
 6 |  * @returns The structured output method to use ("functionCalling" or "jsonMode")
 7 |  */
 8 | export function getStructuredOutputMethod(llm: BaseChatModel) {
 9 |   const modelName = llm.getName();
10 |   if (modelName === "ChatAnthropic") {
11 |     return "functionCalling";
12 |   } else if (modelName === "ChatOpenAI") {
13 |     return undefined;
14 |   }
15 | 
16 |   // Default to functionCalling for other models
17 |   return "functionCalling";
18 | }
19 | 


--------------------------------------------------------------------------------
/src/agent/mcp/client.ts:
--------------------------------------------------------------------------------
  1 | import { z } from "zod";
  2 | import { Client } from "@modelcontextprotocol/sdk/client/index.js";
  3 | import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
  4 | import { SSEClientTransport } from "@modelcontextprotocol/sdk/client/sse.js";
  5 | import { Tool } from "@modelcontextprotocol/sdk/types";
  6 | import { MCPServerConfig } from "@/types/config";
  7 | import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
  8 | import { v4 as uuidv4 } from "uuid";
  9 | 
 10 | interface ServerConnection {
 11 |   id: string;
 12 |   config: MCPServerConfig;
 13 |   client: Client;
 14 |   transport: StdioClientTransport | SSEClientTransport;
 15 |   tools: Map<string, Tool>;
 16 |   actions: AgentActionDefinition[];
 17 | }
 18 | 
 19 | class MCPClient {
 20 |   private servers: Map<string, ServerConnection> = new Map();
 21 |   private debug: boolean;
 22 |   constructor(debug: boolean = false) {
 23 |     this.debug = debug;
 24 |   }
 25 | 
 26 |   /**
 27 |    * Connect to an MCP server and register its tools
 28 |    * @param serverConfig The server configuration
 29 |    * @returns List of action definitions provided by the server
 30 |    */
 31 |   async connectToServer(
 32 |     serverConfig: MCPServerConfig
 33 |   ): Promise<{ serverId: string; actions: AgentActionDefinition[] }> {
 34 |     try {
 35 |       // Generate or use provided server ID
 36 |       const serverId = serverConfig.id || uuidv4();
 37 | 
 38 |       // Create transport for this server
 39 |       let transport;
 40 |       const connectionType = serverConfig?.connectionType || "stdio";
 41 | 
 42 |       if (connectionType === "sse") {
 43 |         if (!serverConfig.sseUrl) {
 44 |           throw new Error("SSE URL is required for SSE connection type");
 45 |         }
 46 | 
 47 |         if (this.debug) {
 48 |           console.log(
 49 |             `Establishing SSE connection to ${serverConfig.sseUrl}...`
 50 |           );
 51 |         }
 52 | 
 53 |         transport = new SSEClientTransport(
 54 |           new URL(serverConfig.sseUrl),
 55 |           serverConfig.sseHeaders
 56 |             ? {
 57 |                 requestInit: {
 58 |                   headers: serverConfig.sseHeaders,
 59 |                 },
 60 |               }
 61 |             : undefined
 62 |         );
 63 | 
 64 |         transport.onerror = (error: any) => {
 65 |           console.error(`SSE error: ${error.message}`);
 66 |         };
 67 |       } else {
 68 |         if (!serverConfig.command) {
 69 |           throw new Error("Command is required for stdio connection type");
 70 |         }
 71 | 
 72 |         transport = new StdioClientTransport({
 73 |           command: serverConfig.command,
 74 |           args: serverConfig.args,
 75 |           env: {
 76 |             ...((process.env ?? {}) as Record<string, string>),
 77 |             ...(serverConfig.env ?? {}),
 78 |           },
 79 |           // Pipe stdin/stdout, ignore stderr
 80 |           stderr: this.debug ? "inherit" : "ignore",
 81 |         });
 82 |       }
 83 | 
 84 |       const client = new Client({
 85 |         name: `hyperagent-mcp-client-${serverId}`,
 86 |         version: "1.0.0",
 87 |       });
 88 | 
 89 |       await client.connect(transport);
 90 | 
 91 |       const toolsResult = await client.listTools();
 92 |       const toolsMap = new Map<string, Tool>();
 93 | 
 94 |       // Create actions for each tool
 95 |       const actions = toolsResult.tools
 96 |         .filter((tool) => {
 97 |           if (
 98 |             serverConfig.includeTools &&
 99 |             !serverConfig.includeTools.includes(tool.name)
100 |           ) {
101 |             return false;
102 |           }
103 |           if (
104 |             serverConfig.excludeTools &&
105 |             serverConfig.excludeTools.includes(tool.name)
106 |           ) {
107 |             return false;
108 |           }
109 |           return true;
110 |         })
111 |         .map((tool) => {
112 |           // Store tool reference for later use
113 |           toolsMap.set(tool.name, tool);
114 | 
115 |           // Create action definition
116 |           return {
117 |             type: tool.name,
118 |             actionParams: z
119 |               .object({
120 |                 params: z
121 |                   .string()
122 |                   .describe(
123 |                     `The stringified parameters to the ${tool.name} MCP tool. Here is the schema: ${JSON.stringify(tool.inputSchema)}`
124 |                   ),
125 |               })
126 |               .describe(tool.description ?? ""),
127 |             run: async (
128 |               ctx: ActionContext,
129 |               action: any
130 |             ): Promise<ActionOutput> => {
131 |               if (!ctx.mcpClient) {
132 |                 throw new Error(
133 |                   "MCP client not available. Please ensure an MCP server is connected."
134 |                 );
135 |               }
136 | 
137 |               const params = JSON.parse(action.params);
138 |               const targetServerId = serverId;
139 | 
140 |               const result = await ctx.mcpClient.executeTool(
141 |                 tool.name,
142 |                 params,
143 |                 targetServerId
144 |               );
145 | 
146 |               return {
147 |                 success: true,
148 |                 message: `MCP tool ${tool.name} execution successful: ${JSON.stringify(result)}`,
149 |               };
150 |             },
151 |           };
152 |         });
153 | 
154 |       // Store server connection
155 |       this.servers.set(serverId, {
156 |         id: serverId,
157 |         config: serverConfig,
158 |         client,
159 |         transport,
160 |         tools: toolsMap,
161 |         actions,
162 |       });
163 |       if (this.debug) {
164 |         console.log(`Connected to MCP server with ID: ${serverId}`);
165 |         console.log("Added tools:", Array.from(toolsMap.keys()));
166 |       }
167 |       return { serverId, actions };
168 |     } catch (e) {
169 |       console.error("Failed to connect to MCP server: ", e);
170 |       throw e;
171 |     }
172 |   }
173 | 
174 |   /**
175 |    * Execute a tool on a specific server
176 |    * @param toolName The name of the tool to execute
177 |    * @param parameters The parameters to pass to the tool
178 |    * @param serverId The ID of the server to use (optional)
179 |    * @returns The result of the tool execution
180 |    */
181 |   async executeTool(
182 |     toolName: string,
183 |     parameters: Record<string, any>,
184 |     serverId?: string
185 |   ): Promise<any> {
186 |     // If no server ID provided and only one server exists, use that one
187 |     if (!serverId && this.servers.size === 1) {
188 |       serverId = [...this.servers.keys()][0];
189 |     }
190 | 
191 |     // If no server ID provided and multiple servers exist, try to find one with the tool
192 |     if (!serverId && this.servers.size > 1) {
193 |       for (const [id, server] of this.servers.entries()) {
194 |         if (server.tools.has(toolName)) {
195 |           serverId = id;
196 |           break;
197 |         }
198 |       }
199 |     }
200 | 
201 |     if (!serverId || !this.servers.has(serverId)) {
202 |       throw new Error(`No valid server found for tool ${toolName}`);
203 |     }
204 | 
205 |     const server = this.servers.get(serverId);
206 |     if (!server) {
207 |       throw new Error(`Server with ID ${serverId} not found`);
208 |     }
209 | 
210 |     try {
211 |       const result = await server.client.callTool({
212 |         name: toolName,
213 |         arguments: parameters,
214 |       });
215 | 
216 |       return result;
217 |     } catch (e) {
218 |       console.error(
219 |         `Error executing tool ${toolName} on server ${serverId}:`,
220 |         e
221 |       );
222 |       throw e;
223 |     }
224 |   }
225 | 
226 |   /**
227 |    * Get all registered action definitions from all connected servers
228 |    * @returns Array of action definitions
229 |    */
230 |   getAllActions(): AgentActionDefinition[] {
231 |     const allActions: AgentActionDefinition[] = [];
232 |     for (const server of this.servers.values()) {
233 |       allActions.push(...server.actions);
234 |     }
235 |     return allActions;
236 |   }
237 | 
238 |   /**
239 |    * Get the IDs of all connected servers
240 |    * @returns Array of server IDs
241 |    */
242 |   getServerIds(): string[] {
243 |     return [...this.servers.keys()];
244 |   }
245 | 
246 |   /**
247 |    * Disconnect from a specific server
248 |    * @param serverId The ID of the server to disconnect from
249 |    */
250 |   async disconnectServer(serverId: string): Promise<void> {
251 |     const server = this.servers.get(serverId);
252 |     if (server) {
253 |       await server.transport.close();
254 |       this.servers.delete(serverId);
255 |       if (this.debug) {
256 |         console.log(`Disconnected from MCP server with ID: ${serverId}`);
257 |       }
258 |     }
259 |   }
260 | 
261 |   /**
262 |    * Disconnect from all servers
263 |    */
264 |   async disconnect(): Promise<void> {
265 |     for (const serverId of this.servers.keys()) {
266 |       await this.disconnectServer(serverId);
267 |     }
268 |   }
269 | 
270 |   /**
271 |    * Check if a tool exists on any connected server
272 |    * @param toolName The name of the tool to check
273 |    * @returns Boolean indicating if the tool exists and the server ID it exists on
274 |    */
275 |   hasTool(toolName: string): { exists: boolean; serverId?: string } {
276 |     for (const [serverId, server] of this.servers.entries()) {
277 |       if (server.tools.has(toolName)) {
278 |         return { exists: true, serverId };
279 |       }
280 |     }
281 |     return { exists: false };
282 |   }
283 | 
284 |   /**
285 |    * Get information about all connected servers
286 |    * @returns Array of server information objects
287 |    */
288 |   getServerInfo(): Array<{
289 |     id: string;
290 |     toolCount: number;
291 |     toolNames: string[];
292 |   }> {
293 |     return Array.from(this.servers.entries()).map(([id, server]) => ({
294 |       id,
295 |       toolCount: server.tools.size,
296 |       toolNames: Array.from(server.tools.keys()),
297 |     }));
298 |   }
299 | 
300 |   /**
301 |    * Check if any servers are connected
302 |    * @returns Boolean indicating if any servers are connected
303 |    */
304 |   hasConnections(): boolean {
305 |     return this.servers.size > 0;
306 |   }
307 | }
308 | 
309 | export { MCPClient };
310 | 


--------------------------------------------------------------------------------
/src/agent/messages/builder.ts:
--------------------------------------------------------------------------------
 1 | import { AgentStep } from "@/types";
 2 | import { BaseMessageLike } from "@langchain/core/messages";
 3 | import { Page } from "playwright";
 4 | import { getScrollInfo } from "./utils";
 5 | import { retry } from "@/utils/retry";
 6 | import { DOMState } from "@/context-providers/dom/types";
 7 | import { HyperVariable } from "@/types/agent/types";
 8 | 
 9 | export const buildAgentStepMessages = async (
10 |   baseMessages: BaseMessageLike[],
11 |   steps: AgentStep[],
12 |   task: string,
13 |   page: Page,
14 |   domState: DOMState,
15 |   screenshot: string,
16 |   variables: HyperVariable[]
17 | ): Promise<BaseMessageLike[]> => {
18 |   const messages = [...baseMessages];
19 | 
20 |   // Add the final goal section
21 |   messages.push({
22 |     role: "user",
23 |     content: `=== Final Goal ===\n${task}\n`,
24 |   });
25 | 
26 |   // Add current URL section
27 |   messages.push({
28 |     role: "user",
29 |     content: `=== Current URL ===\n${page.url()}\n`,
30 |   });
31 | 
32 |   // Add variables section
33 |   messages.push({
34 |     role: "user",
35 |     content: `=== Variables ===\n${variables.map((v) => `<<${v.key}>> - ${v.description}`).join("\n")}\n`,
36 |   });
37 | 
38 |   // Add previous actions section if there are steps
39 |   if (steps.length > 0) {
40 |     messages.push({
41 |       role: "user",
42 |       content: "=== Previous Actions ===\n",
43 |     });
44 |     for (const step of steps) {
45 |       messages.push({
46 |         role: "ai",
47 |         content: JSON.stringify(step.agentOutput),
48 |       });
49 |       for (const actionOutput of step.actionOutputs) {
50 |         messages.push({
51 |           role: "user",
52 |           content: actionOutput.extract
53 |             ? `${actionOutput.message} :\n ${JSON.stringify(actionOutput.extract)}`
54 |             : actionOutput.message,
55 |         });
56 |       }
57 |     }
58 |   }
59 | 
60 |   // Add elements section with DOM tree
61 |   messages.push({
62 |     role: "user",
63 |     content: `=== Elements ===\n${domState.domState}\n`,
64 |   });
65 | 
66 |   // Add page screenshot section
67 |   const scrollInfo = await retry({ func: () => getScrollInfo(page) });
68 |   messages.push({
69 |     role: "user",
70 |     content: [
71 |       {
72 |         type: "text",
73 |         text: "=== Page Screenshot ===\n",
74 |       },
75 |       {
76 |         type: "image_url",
77 |         image_url: {
78 |           url: `data:image/png;base64,${screenshot}`,
79 |         },
80 |       },
81 |       {
82 |         type: "text",
83 |         text: `=== Page State ===\nPixels above: ${scrollInfo[0]}\nPixels below: ${scrollInfo[1]}\n`,
84 |       },
85 |     ],
86 |   });
87 | 
88 |   return messages;
89 | };
90 | 


--------------------------------------------------------------------------------
/src/agent/messages/examples-actions.ts:
--------------------------------------------------------------------------------
 1 | export const EXAMPLE_ACTIONS = `- Search: [
 2 |     {"type": "textInput", "params": {"text": "search query"}},
 3 |     {"type": "keyPress", "params": {"key": "Enter"}}
 4 | ]
 5 | - Clicking on an element: [
 6 |     {"type": "clickElement", "params": {"index": 1}}
 7 | ]
 8 | - Extracting content (if your goal is to find any information on a page): [
 9 |     {"type": "extractContent", "params": {"goal": "what specifically you need to extract"}}
10 | ]
11 | - Forms: [
12 |     {"type": "inputText", "params": {"index": 1, "text": "first name"}},
13 |     {"type": "inputText", "params": {"index": 2, "text": "last name"}},
14 |     {"type": "inputText", "params": {"index": 2, "text": "job title"}},
15 |     {"type": "clickElement", "params": {"index": 3}}
16 | ]`;
17 | 


--------------------------------------------------------------------------------
/src/agent/messages/input-format.ts:
--------------------------------------------------------------------------------
 1 | export const INPUT_FORMAT = `=== Final Goal ===
 2 | [The final goal that needs to be accomplished]
 3 | === Open Tabs ===
 4 | [The open tabs]
 5 | === Current URL ===
 6 | [The current URL]
 7 | === Variables ===
 8 | [Variables that can be used in the task]
 9 | - Variables are referenced using <<name>> syntax
10 | - Each variable has a name and description
11 | - Variables persist across actions and can be referenced in subsequent steps
12 | - Format: <<name>> - {description}
13 | === Elements ===
14 | [A list of the elements on the page in the following format]
15 | [index]<type attributes...>value</type>
16 | - type: HTML element type (button, input, etc.)
17 | - index: Numeric identifier for interaction 
18 | - attributes: All HTML attributes of the element like type, name, value, class, etc. This can include:
19 |   * Data attributes
20 |   * ARIA attributes 
21 |   * Custom attributes
22 |   * Any other valid HTML attributes
23 |   * The attributes provide important context about the element's behavior, accessibility, and styling
24 | === Previous Actions ===
25 | [The previous steps of the task]
26 | === Page Screenshot ===
27 | - A screenshot of the current page with the interactive elements highlighted with their index
28 | === Page State ===
29 | - Pixels below
30 | - Pixels above`;
31 | 


--------------------------------------------------------------------------------
/src/agent/messages/output-format.ts:
--------------------------------------------------------------------------------
 1 | export const OUTPUT_FORMAT = `Your response MUST be in this exact format:
 2 | {
 3 |   "thoughts": "Your thoughts on the task at hand, was the previous goal successful?",
 4 |   "memory": "Information that you need to remember to accomplish subsequent goals",
 5 |   "nextGoal": "The next goal you are trying to accomplish with the actions you have chosen",
 6 |   "actions": [
 7 |     {
 8 |       "action": "The action you will take",
 9 |       "params": {
10 |         ...Action Arguments...
11 |       }
12 |     }
13 |   ]
14 | }`


--------------------------------------------------------------------------------
/src/agent/messages/system-prompt.ts:
--------------------------------------------------------------------------------
  1 | import { INPUT_FORMAT } from "./input-format";
  2 | import { OUTPUT_FORMAT } from "./output-format";
  3 | import { EXAMPLE_ACTIONS } from "./examples-actions";
  4 | 
  5 | const DATE_STRING = new Date().toLocaleString(undefined, {
  6 |   year: "numeric",
  7 |   month: "2-digit",
  8 |   day: "2-digit",
  9 |   weekday: "long",
 10 | });
 11 | 
 12 | export const SYSTEM_PROMPT = `You are a smart and sophisticated agent that is designed to automate web browser interactions.
 13 | You try to accomplish goals in a quick and concise manner.
 14 | Your goal is to accomplish the final goal following the rules by using the provided actions and breaking down the task into smaller steps.
 15 | You are provided with a set of actions that you can use to accomplish the task.
 16 | 
 17 | # World State
 18 | The current Date is ${DATE_STRING}. The date format is MM/DD/YYYY.
 19 | 
 20 | # Input Format
 21 | ${INPUT_FORMAT}
 22 | 
 23 | # Output Format
 24 | ${OUTPUT_FORMAT}
 25 | 
 26 | ## Action Rules:
 27 | - You can run multiple actions in the output, they will be executed in the given order
 28 | - If you do run multiple actions, sequence similar ones together for efficiency.
 29 | - Do NOT run actions that change the page entirely, you will get the new DOM after those actions and you can run the next actions then.
 30 | - Use a maximum of 25 actions per sequence.
 31 | 
 32 | ## Action Execution:
 33 | - Actions are executed in the given order
 34 | - If the page changes after an action, the sequence is interrupted and you get the new state.
 35 | 
 36 | ## Common action examples:
 37 | ${EXAMPLE_ACTIONS}
 38 | 
 39 | # Rules
 40 | 1. FINAL GOAL COMPLETION:
 41 | - Only use the "complete" action when you have fully accomplished everything specified in the task
 42 | - The "complete" action must be the final action in your sequence
 43 | - Before using "complete", verify you have gathered all requested information and met all task requirements
 44 | - Include detailed results in the "complete" action's text parameter to show how you satisfied each requirement
 45 | 
 46 | 2. Validation:
 47 | - Before you finish up your task, call the taskCompleteValidation. It will double check your task and it's subtasks. That will be used to see if you're done with all tasks and subtasks of that at this point. You **MUST** run this before performing a tool call to the "complete" tool.
 48 | 
 49 | # Guidelines
 50 | 1. NAVIGATION
 51 | - If no suitable elements exist, use other functions to complete the task
 52 | - Use scroll to find elements you are looking for
 53 | - If you want to research something, open a new tab instead of using the current tab
 54 | 
 55 | 2. GETTING UNSTUCK
 56 | - Avoid getting stuck in loops.
 57 |   * You know your previous actions, and you know your current state. Do not keep repeating yourself expecting something to change.
 58 | - If stuck, try:
 59 |   * Going back to a previous page
 60 |   * Starting a new search
 61 |   * Opening a new tab
 62 |   * Using alternative navigation paths
 63 |   * Trying a different website or source
 64 |   * Use the thinking action to think about the task and how to accomplish it
 65 | 
 66 | 3. SPECIAL CASES
 67 | - Cookies: Either try accepting the banner or closing it
 68 | - Captcha: First try to solve it, otherwise try to refresh the website, if that doesn't work, try a different method to accomplish the task 
 69 | 
 70 | 4. Form filling:
 71 | - If your action sequence is interrupted after filling an input field, it likely means the page changed (e.g., autocomplete suggestions appeared).
 72 | - When suggestions appear, select an appropriate one before continuing. Important thing to note with this, you should prioritize selecting the most specific/detailed option when hierarchical or nested options are available.
 73 | - For date selection, use the calendar/date picker controls (usually arrows to navigate through the months and years) or type the date directly into the input field rather than scrolling. Ensure the dates selected are the correct ones.
 74 | - After completing all form fields, remember to click the submit/search button to process the form.
 75 | 
 76 | 5. For Date Pickers with Calendars:
 77 |   - First try to type the date directly into the input field and send the enter key press action
 78 |     * Be sure to send the enter key press action after typing the date, if you don't do that, the date will not be selected
 79 |   - If that doesn't work, use the right arrow key to navigate through months and years until finding the correct date
 80 |     * Be patient and persistent with calendar navigation - it may take multiple attempts to reach the target month/year
 81 |     * Verify the correct date is selected before proceeding
 82 | 
 83 | 5. For Flight Search:
 84 |   - If you are typing in the where from, ALWAYS send an enter key press action after typing the value
 85 |   - If you are typing in the where to, ALWAYS send an enter key press action after typing the value
 86 | 
 87 | 5. For flight sources and destinations:
 88 |   - Send enter key press action after typing the source or destination
 89 | 
 90 | # Search Strategy
 91 | When searching, follow these best practices:
 92 | 
 93 | 1. Primary Search Method:
 94 | - Use textInput action followed by keyPress action with 'Enter'
 95 | - If unsuccessful, look for clickable 'Search' text or magnifying glass icon
 96 | - Only click search elements that are marked as interactive
 97 | 
 98 | 2. Query Construction:
 99 | - Search Engines (Google, Bing):
100 |   * Can handle complex, natural language queries
101 |   * Example: "trending python repositories" or "wizards latest game score"
102 | 
103 | - Specific Websites:
104 |   * Use simpler, more targeted queries
105 |   * Follow up with filters and sorting
106 |   * Example on GitHub: Search "language:python", then sort by trending/stars
107 |   * Example on ESPN: Search "wizards", navigate to team page, find latest score
108 | 
109 | 3. Important Considerations:
110 | - For date-based queries, use current date: ${DATE_STRING}
111 | - Use relative dates only when explicitly requested
112 | - With autocomplete:
113 |   * You can ignore suggestions and enter custom input
114 |   * Verify suggested options match requirements before selecting
115 | 
116 | 4. Search Refinement:
117 | - Use available filters and sort options
118 | - Consider in-memory filtering when site options are limited
119 | - Break down complex searches into smaller, manageable steps
120 | `;
121 | 


--------------------------------------------------------------------------------
/src/agent/messages/utils.ts:
--------------------------------------------------------------------------------
 1 | import { Page } from "playwright";
 2 | 
 3 | export const getScrollInfo = async (page: Page): Promise<[number, number]> => {
 4 |   const scrollY = (await page.evaluate("window.scrollY")) as number;
 5 |   const viewportHeight = (await page.evaluate("window.innerHeight")) as number;
 6 |   const totalHeight = (await page.evaluate(
 7 |     "document.documentElement.scrollHeight"
 8 |   )) as number;
 9 |   const pixelsAbove = scrollY;
10 |   const pixelsBelow = totalHeight - (scrollY + viewportHeight);
11 |   return [pixelsAbove, pixelsBelow];
12 | };
13 | 


--------------------------------------------------------------------------------
/src/agent/tools/agent.ts:
--------------------------------------------------------------------------------
  1 | import { AgentStep } from "@/types/agent/types";
  2 | import fs from "fs";
  3 | 
  4 | import {
  5 |   ActionContext,
  6 |   ActionOutput,
  7 |   ActionType,
  8 |   AgentActionDefinition,
  9 | } from "@/types";
 10 | import { getDom } from "@/context-providers/dom";
 11 | import { retry } from "@/utils/retry";
 12 | import { sleep } from "@/utils/sleep";
 13 | 
 14 | import { AgentOutputFn, endTaskStatuses } from "@hyperbrowser/agent/types";
 15 | import {
 16 |   TaskParams,
 17 |   TaskOutput,
 18 |   TaskState,
 19 |   TaskStatus,
 20 | } from "@hyperbrowser/agent/types";
 21 | 
 22 | import { HyperagentError } from "../error";
 23 | import { buildAgentStepMessages } from "../messages/builder";
 24 | import { getStructuredOutputMethod } from "../llms/structured-output";
 25 | import { SYSTEM_PROMPT } from "../messages/system-prompt";
 26 | import { z } from "zod";
 27 | import { DOMState } from "@/context-providers/dom/types";
 28 | import { Page } from "playwright";
 29 | import { ActionNotFoundError } from "../actions";
 30 | import { AgentCtx } from "./types";
 31 | import sharp from "sharp";
 32 | 
 33 | const compositeScreenshot = async (page: Page, overlay: string) => {
 34 |   const screenshot = await page.screenshot();
 35 |   const responseBuffer = await sharp(screenshot)
 36 |     .composite([{ input: Buffer.from(overlay, "base64") }])
 37 |     .png()
 38 |     .toBuffer();
 39 |   return responseBuffer.toString("base64");
 40 | };
 41 | 
 42 | const getActionSchema = (actions: Array<AgentActionDefinition>) => {
 43 |   const zodDefs = actions.map((action) =>
 44 |     z.object({
 45 |       type: z.nativeEnum([action.type] as unknown as z.EnumLike),
 46 |       params: action.actionParams,
 47 |       actionDescription: z
 48 |         .string()
 49 |         .describe(
 50 |           "Describe why you are performing this action and what you aim to perform with this action."
 51 |         ),
 52 |     })
 53 |   );
 54 |   return z.union([zodDefs[0], zodDefs[1], ...zodDefs.splice(2)]);
 55 | };
 56 | 
 57 | const getActionHandler = (
 58 |   actions: Array<AgentActionDefinition>,
 59 |   type: string
 60 | ) => {
 61 |   const foundAction = actions.find((actions) => actions.type === type);
 62 |   if (foundAction) {
 63 |     return foundAction.run;
 64 |   } else {
 65 |     throw new ActionNotFoundError(type);
 66 |   }
 67 | };
 68 | 
 69 | const runAction = async (
 70 |   action: ActionType,
 71 |   domState: DOMState,
 72 |   page: Page,
 73 |   ctx: AgentCtx
 74 | ): Promise<ActionOutput> => {
 75 |   const actionCtx: ActionContext = {
 76 |     domState,
 77 |     page,
 78 |     tokenLimit: ctx.tokenLimit,
 79 |     llm: ctx.llm,
 80 |     debugDir: ctx.debugDir,
 81 |     mcpClient: ctx.mcpClient || undefined,
 82 |     variables: Object.values(ctx.variables),
 83 |   };
 84 |   const actionType = action.type;
 85 |   const actionHandler = getActionHandler(ctx.actions, action.type);
 86 |   if (!actionHandler) {
 87 |     return {
 88 |       success: false,
 89 |       message: `Unknown action type: ${actionType}`,
 90 |     };
 91 |   }
 92 |   try {
 93 |     return await actionHandler(actionCtx, action.params);
 94 |   } catch (error) {
 95 |     return {
 96 |       success: false,
 97 |       message: `Action ${action.type} failed: ${error}`,
 98 |     };
 99 |   }
100 | };
101 | 
/**
 * Runs the agent loop for one task until it reaches an end status.
 *
 * Each iteration:
 *  1. honours the pause / end-status / `maxSteps` checks,
 *  2. reads the DOM state and composites its overlay onto a fresh screenshot,
 *  3. builds the step messages and invokes the structured-output LLM,
 *  4. executes the returned actions in order and records the step on `taskState`.
 *
 * Execution of a step's actions stops after a "complete" action: the task is
 * already marked completed at that point, so anything listed after it is not
 * run. When `ctx.debug` is set, per-step artifacts and the final task output
 * are written below the debug directory.
 *
 * @param ctx - Agent context (LLM, registered actions, variables, debug flag).
 * @param taskState - Mutable task record; `status` and `steps` are updated in place.
 * @param params - Optional settings and callbacks read here: `debugDir`,
 *   `maxSteps`, `debugOnAgentOutput`, `onStep`, `onComplete`.
 * @returns The final status, the recorded steps and the output produced by the
 *   "complete" action (an empty string when none ran).
 * @throws HyperagentError when `taskState` is missing or `ctx.llm` is not set.
 */
export const runAgentTask = async (
  ctx: AgentCtx,
  taskState: TaskState,
  params?: TaskParams
): Promise<TaskOutput> => {
  // Check first: reading `taskState.id` on a missing state would throw a
  // TypeError before this descriptive error could ever be raised.
  if (!taskState) {
    throw new HyperagentError("Task not found");
  }
  const taskId = taskState.id;
  const debugDir = params?.debugDir || `debug/${taskId}`;
  if (ctx.debug) {
    console.log(`Debugging task ${taskId} in ${debugDir}`);
    // Create the directory up front so the final taskOutput.json can be
    // written even if the loop exits before any step directory was created.
    fs.mkdirSync(debugDir, { recursive: true });
  }

  taskState.status = TaskStatus.RUNNING as TaskStatus;
  if (!ctx.llm) {
    throw new HyperagentError("LLM not initialized");
  }
  const llmStructured = ctx.llm.withStructuredOutput(
    AgentOutputFn(getActionSchema(ctx.actions)),
    {
      method: getStructuredOutputMethod(ctx.llm),
    }
  );
  const baseMsgs = [{ role: "system", content: SYSTEM_PROMPT }];

  let output = "";
  const page = taskState.startingPage;
  let currStep = 0;
  while (true) {
    // Status Checks. `taskState.status` can be changed from outside this
    // function while it is awaiting, hence the re-reads and casts.
    if ((taskState.status as TaskStatus) === TaskStatus.PAUSED) {
      await sleep(100);
      continue;
    }
    if (endTaskStatuses.has(taskState.status)) {
      break;
    }
    if (params?.maxSteps && currStep >= params.maxSteps) {
      taskState.status = TaskStatus.CANCELLED;
      break;
    }
    const debugStepDir = `${debugDir}/step-${currStep}`;
    if (ctx.debug) {
      fs.mkdirSync(debugStepDir, { recursive: true });
    }

    // Get DOM State
    const domState = await retry({ func: () => getDom(page) });
    if (!domState) {
      console.log("no dom state, waiting 1 second.");
      await sleep(1000);
      continue;
    }

    // The overlay may arrive as a data URL; strip the prefix before decoding.
    const dataUrlPrefix = "data:image/png;base64,";
    const trimmedScreenshot = await compositeScreenshot(
      page,
      domState.screenshot.startsWith(dataUrlPrefix)
        ? domState.screenshot.slice(dataUrlPrefix.length)
        : domState.screenshot
    );

    // Store Dom State for Debugging
    if (ctx.debug) {
      fs.writeFileSync(`${debugStepDir}/elems.txt`, domState.domState);
      if (trimmedScreenshot) {
        fs.writeFileSync(
          `${debugStepDir}/screenshot.png`,
          Buffer.from(trimmedScreenshot, "base64")
        );
      }
    }

    // Build Agent Step Messages
    const msgs = await buildAgentStepMessages(
      baseMsgs,
      taskState.steps,
      taskState.task,
      page,
      domState,
      trimmedScreenshot as string,
      Object.values(ctx.variables)
    );

    // Store Agent Step Messages for Debugging
    if (ctx.debug) {
      fs.writeFileSync(
        `${debugStepDir}/msgs.json`,
        JSON.stringify(msgs, null, 2)
      );
    }

    // Invoke LLM
    const agentOutput = await retry({
      func: () => llmStructured.invoke(msgs),
    });

    params?.debugOnAgentOutput?.(agentOutput);

    // Status Checks (the status may have changed while the LLM was running)
    if ((taskState.status as TaskStatus) === TaskStatus.PAUSED) {
      await sleep(100);
      continue;
    }
    if (endTaskStatuses.has(taskState.status)) {
      break;
    }

    // Run Actions
    const agentStepActions = agentOutput.actions;
    const actionOutputs: ActionOutput[] = [];
    for (const action of agentStepActions) {
      const isComplete = action.type === "complete";
      if (isComplete) {
        taskState.status = TaskStatus.COMPLETED;
        const actionDefinition = ctx.actions.find(
          (candidate) => candidate.type === "complete"
        );
        output =
          (await actionDefinition?.completeAction?.(action.params)) ??
          "No complete action found";
      }
      const actionOutput = await runAction(
        action as ActionType,
        domState,
        page,
        ctx
      );
      actionOutputs.push(actionOutput);
      if (isComplete) {
        // "complete" ends the task; do not execute anything listed after it.
        break;
      }
      await sleep(2000); // TODO: look at this - smarter page loading
    }
    const step: AgentStep = {
      idx: currStep,
      agentOutput: agentOutput,
      actionOutputs,
    };
    taskState.steps.push(step);
    await params?.onStep?.(step);
    currStep = currStep + 1;

    if (ctx.debug) {
      fs.writeFileSync(
        `${debugStepDir}/stepOutput.json`,
        JSON.stringify(step, null, 2)
      );
    }
  }

  const taskOutput: TaskOutput = {
    status: taskState.status,
    steps: taskState.steps,
    output,
  };
  if (ctx.debug) {
    fs.writeFileSync(
      `${debugDir}/taskOutput.json`,
      JSON.stringify(taskOutput, null, 2)
    );
  }
  await params?.onComplete?.(taskOutput);
  return taskOutput;
};
268 | 


--------------------------------------------------------------------------------
/src/agent/tools/types.ts:
--------------------------------------------------------------------------------
 1 | import { AgentActionDefinition } from "@/types/agent/actions/types";
 2 | import { MCPClient } from "../mcp/client";
 3 | import { BaseChatModel } from "@langchain/core/language_models/chat_models";
 4 | import { HyperVariable } from "@/types/agent/types";
 5 | 
 6 | export interface AgentCtx {
 7 |   mcpClient?: MCPClient;
 8 |   debugDir?: string;
 9 |   debug?: boolean;
10 |   variables: Record<string, HyperVariable>;
11 |   actions: Array<AgentActionDefinition>;
12 |   tokenLimit: number;
13 |   llm: BaseChatModel;
14 | }
15 | 


--------------------------------------------------------------------------------
/src/browser-providers/hyperbrowser.ts:
--------------------------------------------------------------------------------
 1 | import { chromium, Browser, ConnectOverCDPOptions } from "playwright";
 2 | import { Hyperbrowser } from "@hyperbrowser/sdk";
 3 | import {
 4 |   CreateSessionParams,
 5 |   HyperbrowserConfig,
 6 |   SessionDetail,
 7 | } from "@hyperbrowser/sdk/types";
 8 | 
 9 | import BrowserProvider from "@/types/browser-providers/types";
10 | 
/**
 * Browser provider backed by a remote Hyperbrowser session.
 *
 * `start` creates a session through the Hyperbrowser SDK and connects to it
 * over CDP; `close` closes the browser connection and stops the session.
 */
export class HyperbrowserProvider extends BrowserProvider<SessionDetail> {
  browserConfig: Omit<ConnectOverCDPOptions, "endpointURL"> | undefined;
  sessionConfig: CreateSessionParams | undefined;
  config: HyperbrowserConfig | undefined;
  browser: Browser | undefined;
  session: SessionDetail | undefined;
  hbClient: Hyperbrowser | undefined;
  debug: boolean;

  /**
   * @param params - Optional settings: `debug` enables session-info logging,
   *   `browserConfig` is passed to the CDP connection, `sessionConfig` to
   *   session creation and `config` to the Hyperbrowser client.
   */
  constructor(params?: {
    debug?: boolean;
    browserConfig?: Omit<ConnectOverCDPOptions, "endpointURL">;
    sessionConfig?: CreateSessionParams;
    config?: HyperbrowserConfig;
  }) {
    super();
    this.debug = params?.debug ?? false;
    this.browserConfig = params?.browserConfig;
    this.sessionConfig = params?.sessionConfig;
    this.config = params?.config;
  }

  /**
   * Creates a session and connects to it over CDP.
   *
   * If the connection fails, the freshly created session is stopped before the
   * error is rethrown so that it does not stay running without an owner.
   *
   * @returns The connected browser.
   */
  async start(): Promise<Browser> {
    const client = new Hyperbrowser(this.config);
    const session = await client.sessions.create(this.sessionConfig);
    this.hbClient = client;
    this.session = session;
    try {
      this.browser = await chromium.connectOverCDP(
        session.wsEndpoint,
        this.browserConfig
      );
    } catch (error) {
      try {
        await client.sessions.stop(session.id);
        this.session = undefined;
      } catch {
        // Keep `this.session` so a later close() can retry stopping it; the
        // connection error below is the one the caller needs to see.
      }
      throw error;
    }

    if (this.debug) {
      console.log(
        "\nHyperbrowser session info:",
        {
          liveUrl: session.liveUrl,
          sessionID: session.id,
          infoUrl: session.sessionUrl,
        },
        "\n"
      );
    }

    return this.browser;
  }

  /**
   * Closes the browser connection and stops the session.
   *
   * The session is stopped in a `finally` block so it is released even when
   * closing the browser throws.
   */
  async close(): Promise<void> {
    try {
      await this.browser?.close();
    } finally {
      if (this.session) {
        await this.hbClient?.sessions.stop(this.session.id);
      }
    }
  }

  /** @returns The current session details, or `null` if none has been created. */
  public getSession() {
    return this.session ?? null;
  }
}
72 | 


--------------------------------------------------------------------------------
/src/browser-providers/index.ts:
--------------------------------------------------------------------------------
1 | import { HyperbrowserProvider } from "./hyperbrowser";
2 | import { LocalBrowserProvider } from "./local";
3 | 
4 | export { HyperbrowserProvider, LocalBrowserProvider };
5 | 


--------------------------------------------------------------------------------
/src/browser-providers/local.ts:
--------------------------------------------------------------------------------
 1 | import { chromium, Browser, LaunchOptions } from "playwright";
 2 | import BrowserProvider from "@/types/browser-providers/types";
 3 | 
 4 | export class LocalBrowserProvider extends BrowserProvider<Browser> {
 5 |   options: Omit<Omit<LaunchOptions, "headless">, "channel"> | undefined;
 6 |   session: Browser | undefined;
 7 |   constructor(options?: Omit<Omit<LaunchOptions, "headless">, "channel">) {
 8 |     super();
 9 |     this.options = options;
10 |   }
11 |   async start(): Promise<Browser> {
12 |     const launchArgs = this.options?.args ?? [];
13 |     const browser = await chromium.launch({
14 |       ...(this.options ?? {}),
15 |       channel: "chrome",
16 |       headless: false,
17 |       args: ["--disable-blink-features=AutomationControlled", ...launchArgs],
18 |     });
19 |     this.session = browser;
20 |     return this.session;
21 |   }
22 |   async close(): Promise<void> {
23 |     return await this.session?.close();
24 |   }
25 |   public getSession() {
26 |     if (!this.session) {
27 |       return null;
28 |     }
29 |     return this.session;
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/src/cli/index.ts:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | import "dotenv/config";
  3 | import fs from "node:fs";
  4 | import { Command } from "commander";
  5 | import * as inquirer from "@inquirer/prompts";
  6 | import ora from "ora";
  7 | import boxen from "boxen";
  8 | import chalk from "chalk";
  9 | import readline from "readline";
 10 | import { zipWith } from "lodash";
 11 | 
 12 | import { HyperAgent } from "@/agent";
 13 | import { UserInteractionAction } from "@/custom-actions";
 14 | import {
 15 |   ActionOutput,
 16 |   ActionType,
 17 |   AgentOutput,
 18 |   AgentStep,
 19 |   Task,
 20 |   TaskOutput,
 21 |   TaskStatus,
 22 | } from "@/types";
 23 | import { HyperagentError } from "@/agent/error";
 24 | import { SessionDetail } from "@hyperbrowser/sdk/types";
 25 | 
 26 | const program = new Command();
 27 | 
 28 | let currentSpinner = ora();
 29 | 
 30 | program
 31 |   .name("hyperbrowser")
 32 |   .description("CLI for Hyperbrowser - A powerful browser automation tool")
 33 |   .version("0.0.1");
 34 | 
 35 | program
 36 |   .command("run", { isDefault: true })
 37 |   .description("Run the interactive CLI")
 38 |   .option("-d, --debug", "Enable debug mode")
 39 |   .option("-c, --command <task description>", "Command to run")
 40 |   .option("-f, --file <file path>", "Path to a file containing a command")
 41 |   .option("-m, --mcp <mcp config file>", "Path to a file containing mcp config")
 42 |   .option("--hyperbrowser", "Use Hyperbrowser for the browser provider")
 43 |   .action(async function () {
 44 |     const options = this.opts();
 45 |     const debug = (options.debug as boolean) || false;
 46 |     const useHB = (options.hyperbrowser as boolean) || false;
 47 |     let taskDescription = (options.command as string) || undefined;
 48 |     const filePath = (options.file as string) || undefined;
 49 |     const mcpPath = (options.mcp as string) || undefined;
 50 | 
 51 |     console.log(chalk.blue("HyperAgent CLI"));
 52 |     currentSpinner.info(
 53 |       `Pause using ${chalk.bold("ctrl + p")} and resume using ${chalk.bold("ctrl + r")}\n`
 54 |     );
 55 |     try {
 56 |       // Check for API key if using Hyperbrowser
 57 |       if (useHB && !process.env.HYPERBROWSER_API_KEY) {
 58 |         const apiKey = await inquirer.password({
 59 |           message:
 60 |             "Hyperbrowser API key not found in environment variables. Please enter it here:",
 61 |           mask: "*",
 62 |         });
 63 |         if (!apiKey) {
 64 |           console.log(
 65 |             chalk.yellow("Hyperbrowser API key is required. Exiting.")
 66 |           );
 67 |           process.exit(0);
 68 |         }
 69 |         process.env.HYPERBROWSER_API_KEY = apiKey; // Set it for the current process
 70 |       }
 71 | 
 72 |       const agent = new HyperAgent({
 73 |         debug: debug,
 74 |         browserProvider: useHB ? "Hyperbrowser" : "Local",
 75 |         customActions: [
 76 |           UserInteractionAction(
 77 |             async ({ message, kind, choices }): Promise<ActionOutput> => {
 78 |               const currentText = currentSpinner.text;
 79 |               try {
 80 |                 currentSpinner.stop();
 81 |                 currentSpinner.clear();
 82 |                 if (kind === "text_input") {
 83 |                   const response = await inquirer.input({
 84 |                     message,
 85 |                     required: true,
 86 |                   });
 87 |                   return {
 88 |                     success: true,
 89 |                     message: `User responded with the text: "${response}"`,
 90 |                   };
 91 |                 } else if (kind === "confirm") {
 92 |                   const response = await inquirer.confirm({
 93 |                     message,
 94 |                   });
 95 |                   return {
 96 |                     success: true,
 97 |                     message: `User responded with "${response}"`,
 98 |                   };
 99 |                 } else if (kind === "password") {
100 |                   console.warn(
101 |                     chalk.red(
102 |                       "Providing passwords to LLMs can be dangerous. Passwords are passed in plain-text to the LLM and can be read by other people."
103 |                     )
104 |                   );
105 |                   const response = await inquirer.password({
106 |                     message,
107 |                   });
108 |                   return {
109 |                     success: true,
110 |                     message: `User responded with password: ${response}`,
111 |                   };
112 |                 } else {
113 |                   if (!choices) {
114 |                     return {
115 |                       success: false,
116 |                       message:
117 |                         "For choices kind of user interaction, an array of choices is required.",
118 |                     };
119 |                   } else {
120 |                     const response = await inquirer.select({
121 |                       message,
122 |                       choices: choices.map((option) => ({
123 |                         value: option,
124 |                         name: option,
125 |                       })),
126 |                     });
127 |                     return {
128 |                       success: true,
129 |                       message: `User selected the choice: ${response}`,
130 |                     };
131 |                   }
132 |                 }
133 |               } finally {
134 |                 currentSpinner.start(currentText);
135 |               }
136 |             }
137 |           ),
138 |         ],
139 |       });
140 | 
141 |       let task: Task;
142 | 
143 |       readline.emitKeypressEvents(process.stdin);
144 | 
145 |       process.stdin.on("keypress", async (ch, key) => {
146 |         if (key && key.ctrl && key.name == "p") {
147 |           if (currentSpinner.isSpinning) {
148 |             currentSpinner.stopAndPersist({ symbol: "⏸" });
149 |           }
150 |           currentSpinner.start(
151 |             chalk.blue(
152 |               "Hyperagent will pause after completing this operation. Press Ctrl+r again to resume."
153 |             )
154 |           );
155 |           currentSpinner.stopAndPersist({ symbol: "⏸" });
156 |           currentSpinner = ora();
157 | 
158 |           if (task.getStatus() == TaskStatus.RUNNING) {
159 |             task.pause();
160 |           }
161 |         } else if (key && key.ctrl && key.name == "r") {
162 |           if (task.getStatus() == TaskStatus.PAUSED) {
163 |             currentSpinner.start(chalk.blue("Hyperagent will resume"));
164 |             currentSpinner.stopAndPersist({ symbol: "⏵" });
165 |             currentSpinner = ora();
166 | 
167 |             task.resume();
168 |           }
169 |         } else if (key && key.ctrl && key.name == "c") {
170 |           if (currentSpinner.isSpinning) {
171 |             currentSpinner.stopAndPersist();
172 |           }
173 |           console.log("\nShutting down HyperAgent");
174 |           try {
175 |             await agent.closeAgent();
176 |             process.exit(0);
177 |           } catch (err) {
178 |             console.error("Error during shutdown:", err);
179 |             process.exit(1);
180 |           }
181 |         }
182 |       });
183 | 
184 |       process.stdin.setRawMode(true);
185 | 
186 |       const onStep = (params: AgentStep) => {
187 |         const actionsList = zipWith(
188 |           params.actionOutputs,
189 |           params.agentOutput.actions,
190 |           (output, action) => ({
191 |             output,
192 |             action,
193 |           })
194 |         );
195 | 
196 |         const actions = actionsList
197 |           .map((action, index, array) =>
198 |             index < array.length - 1
199 |               ? `  ├── [${action.output.success ? chalk.yellow(action.action.type) : chalk.red(action.action.type)}] ${action.output.success ? agent.pprintAction(action.action as ActionType) : chalk.red(action.output.message)}`
200 |               : `  └── [${action.output.success ? chalk.yellow(action.action.type) : chalk.red(action.action.type)}] ${action.output.success ? agent.pprintAction(action.action as ActionType) : chalk.red(action.output.message)}`
201 |           )
202 |           .join("\n");
203 | 
204 |         currentSpinner.succeed(
205 |           `[${chalk.yellow("task")}]: ${params.agentOutput.nextGoal}\n${actions}`
206 |         );
207 |         currentSpinner = ora();
208 |         process.stdin.setRawMode(true);
209 |         process.stdin.resume();
210 |       };
211 | 
212 |       const debugAgentOutput = (params: AgentOutput) => {
213 |         const actions = params.actions.map((action, index, array) =>
214 |           index < array.length - 1
215 |             ? `  ├── [${chalk.yellow(action.type)}] ${agent.pprintAction(action as ActionType)}`
216 |             : `  └── [${chalk.yellow(action.type)}] ${agent.pprintAction(action as ActionType)}`
217 |         );
218 |         currentSpinner.start(
219 |           `[${chalk.yellow("task")}]: ${params.nextGoal}\n${actions.join("\n")}`
220 |         );
221 |         process.stdin.setRawMode(true);
222 |         process.stdin.resume();
223 |       };
224 | 
225 |       const onComplete = async (params: TaskOutput) => {
226 |         console.log(
227 |           boxen(params.output || "No Response", {
228 |             title: chalk.yellow("HyperAgent Response"),
229 |             titleAlignment: "center",
230 |             float: "center",
231 |             padding: 1,
232 |             margin: { top: 2, left: 0, right: 0, bottom: 0 },
233 |           })
234 |         );
235 |         console.log("\n");
236 |         const continueTask = await inquirer.select({
237 |           message: "Would you like to continue ",
238 |           choices: [
239 |             { name: "Yes", value: true },
240 |             { name: "No", value: false },
241 |           ],
242 |         });
243 |         if (continueTask) {
244 |           const taskDescription = await inquirer.input({
245 |             message: "What should HyperAgent do next for you?",
246 |             required: true,
247 |           });
248 | 
249 |           process.stdin.setRawMode(true);
250 |           process.stdin.resume();
251 | 
252 |           task = await agent.executeTaskAsync(taskDescription, {
253 |             onStep: onStep,
254 |             debugOnAgentOutput: debugAgentOutput,
255 |             onComplete: onComplete,
256 |           });
257 |           task.emitter.addListener("error", (error) => {
258 |             task.cancel();
259 |             throw error;
260 |           });
261 |         } else {
262 |           process.exit(0);
263 |         }
264 |       };
265 |       if (!taskDescription) {
266 |         if (filePath) {
267 |           taskDescription = (await fs.promises.readFile(filePath)).toString();
268 |         } else {
269 |           taskDescription = await inquirer.input({
270 |             message: "What should HyperAgent do for you today?",
271 |             required: true,
272 |           });
273 |         }
274 |       }
275 | 
276 |       if (mcpPath) {
277 |         const mcpConfig = JSON.parse(
278 |           (await fs.promises.readFile(mcpPath)).toString()
279 |         );
280 |         await agent.initializeMCPClient({ servers: mcpConfig });
281 |       }
282 | 
283 |       if (useHB && !debug) {
284 |         await agent.initBrowser();
285 |         const session = agent.getSession() as SessionDetail;
286 |         console.log(`Hyperbrowser Live URL: ${session.liveUrl}\n`);
287 |       }
288 | 
289 |       task = await agent.executeTaskAsync(taskDescription, {
290 |         onStep: onStep,
291 |         onComplete: onComplete,
292 |         debugOnAgentOutput: debugAgentOutput,
293 |       });
294 |       task.emitter.addListener("error", (error) => {
295 |         task.cancel();
296 |         throw error;
297 |       });
298 |     } catch (err) {
299 |       if (err instanceof HyperagentError || err instanceof Error) {
300 |         console.log(chalk.red(err.message));
301 |         if (debug) {
302 |           console.trace(err);
303 |         }
304 |       } else {
305 |         console.log(chalk.red(err));
306 |         if (debug) {
307 |           console.trace(err);
308 |         }
309 |       }
310 |     }
311 |   });
312 | 
313 | program.parse();
314 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/build-dom-view.ts:
--------------------------------------------------------------------------------
  1 | import { findInteractiveElements } from "./find-interactive-elements";
  2 | import { renderHighlightsOffscreen } from "./highlight";
  3 | import { getCSSPath } from "./get-css-path";
  4 | import { CONTEXT_ATTRIBUTES } from "./const";
  5 | import { DOMStateRaw } from "./types";
  6 | import { getXPath } from "./get-x-path";
  7 | 
  8 | // Helper function to convert ImageBitmap to PNG Data URL
  9 | const imageBitmapToPngDataUrl = (bitmap: ImageBitmap): string => {
 10 |   try {
 11 |     // Create an intermediate canvas
 12 |     const canvas = document.createElement("canvas");
 13 |     canvas.width = bitmap.width;
 14 |     canvas.height = bitmap.height;
 15 | 
 16 |     // Get context and draw the bitmap
 17 |     const ctx = canvas.getContext("2d") as CanvasRenderingContext2D;
 18 |     ctx.drawImage(bitmap, 0, 0);
 19 | 
 20 |     // Export as PNG Data URL
 21 |     // Note: might want to add error handling for toDataURL
 22 |     return canvas.toDataURL("image/png");
 23 |   } finally {
 24 |     // Close the bitmap to free up resources (important!)
 25 |     bitmap.close();
 26 |   }
 27 | };
 28 | 
 29 | // --- Start new function definition ---
 30 | const getElementTextContent = (el: HTMLElement): string => {
 31 |   const tagName = el.tagName.toLowerCase();
 32 | 
 33 |   if (tagName === "input") {
 34 |     const inputElement = el as HTMLInputElement;
 35 |     let labelText: string | null = null;
 36 | 
 37 |     // Try finding label by "for" attribute
 38 |     if (inputElement.id) {
 39 |       const label = document.querySelector(`label[for="${inputElement.id}"]`);
 40 |       if (label) {
 41 |         labelText = label.textContent?.trim() || null;
 42 |       }
 43 |     }
 44 | 
 45 |     // Use label text if found, otherwise use input value. Fallback to empty string if neither.
 46 |     return labelText ?? inputElement.value?.trim() ?? "";
 47 |   } else {
 48 |     // Original logic for non-input elements
 49 |     return el.textContent?.trim() || "";
 50 |   }
 51 | };
 52 | // --- End new function definition ---
 53 | 
 54 | export const buildDomView = (): DOMStateRaw => {
 55 |   const interactiveElements = findInteractiveElements();
 56 | 
 57 |   // 1. Render highlights to an ImageBitmap
 58 |   const screenBitmap = renderHighlightsOffscreen(
 59 |     interactiveElements.map((element, index) => ({
 60 |       element: element.element,
 61 |       index: index + 1, // index range from 1 -> index
 62 |       parentIframe: element.iframe ?? null,
 63 |     })),
 64 |     window.innerWidth,
 65 |     window.innerHeight
 66 |   );
 67 | 
 68 |   // 2. Convert the ImageBitmap to a PNG Data URL
 69 |   const screenshotPngDataUrl = imageBitmapToPngDataUrl(screenBitmap);
 70 | 
 71 |   for (let idx = 0; idx < interactiveElements.length; idx++) {
 72 |     const element = interactiveElements[idx];
 73 |     element.highlightIndex = idx + 1; // index range from 1 -> index
 74 |     element.cssPath = getCSSPath(element.element);
 75 |     element.xpath = getXPath(element.element);
 76 |   }
 77 | 
 78 |   const domRepresentation: string[] = [];
 79 | 
 80 |   const getTextBetween = (node: Node, nextNode: Node | null): string => {
 81 |     const texts: string[] = [];
 82 |     let current = node.nextSibling;
 83 | 
 84 |     while (current && current !== nextNode) {
 85 |       if (current.nodeType === Node.TEXT_NODE && current.textContent) {
 86 |         const text = current.textContent.trim();
 87 |         if (text) texts.push(text);
 88 |       }
 89 |       current = current.nextSibling;
 90 |     }
 91 | 
 92 |     return texts.join(" ");
 93 |   };
 94 | 
 95 |   for (let i = 0; i < interactiveElements.length; i++) {
 96 |     const element = interactiveElements[i];
 97 |     const el = element.element;
 98 |     const tagName = el.tagName.toLowerCase();
 99 | 
100 |     let attributes = "";
101 |     Array.from(el.attributes).forEach((attr) => {
102 |       if (CONTEXT_ATTRIBUTES.includes(attr.name)) {
103 |         attributes += ` ${attr.name}="${attr.value}"`;
104 |       }
105 |     });
106 | 
107 |     // Use the helper function to get text content
108 |     const textContent = getElementTextContent(el);
109 | 
110 |     const indexPrefix = `[${element.highlightIndex}]`;
111 |     const truncatedText =
112 |       textContent.length > 1000
113 |         ? textContent.substring(0, 997) + "..."
114 |         : textContent;
115 |     const elementString = `${indexPrefix}<${tagName}${attributes}>${truncatedText.replace(/\s+/g, " ")}</${tagName}>`;
116 |     domRepresentation.push(elementString);
117 | 
118 |     const nextElement = interactiveElements[i + 1]?.element || null;
119 |     const betweenText = getTextBetween(el, nextElement);
120 |     if (betweenText) {
121 |       domRepresentation.push(betweenText);
122 |     }
123 |   }
124 | 
125 |   return {
126 |     elements: interactiveElements,
127 |     domState: domRepresentation.join("\n"),
128 |     screenshot: screenshotPngDataUrl,
129 |   };
130 | };
131 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/builder.ts:
--------------------------------------------------------------------------------
 1 | import fs from "fs";
 2 | import path from "path";
 3 | import esbuild from "esbuild";
 4 | 
 5 | fs.mkdirSync(path.join(__dirname, "./inject"), { recursive: true });
 6 | 
 7 | esbuild.buildSync({
 8 |   entryPoints: [path.join(__dirname, "build-dom-view.ts")],
 9 |   bundle: true,
10 |   outfile: path.join(__dirname, "inject", "build-dom-view-script.js"),
11 | });
12 | 
13 | const scriptContent = fs.readFileSync(
14 |   path.join(__dirname, "./inject/build-dom-view-script.js"),
15 |   "utf8"
16 | );
17 | const lines = scriptContent.trim().split("\n");
18 | const trimmedContent = `(() => {
19 | ${lines.slice(2, -1).join("\n")}
20 |   return buildDomView();
21 | })();`;
22 | fs.writeFileSync(
23 |   path.join(__dirname, "./inject/build-dom-view-script.js"),
24 |   trimmedContent
25 | );
26 | const escapedContent = trimmedContent
27 |   .replace(/\\/g, "\\\\")
28 |   .replace(/`/g, "\\`")
29 |   .replace(/\$\{/g, "\\${");
30 | const tsConstFile = `export const buildDomViewJs = \`${escapedContent}\`;`;
31 | 
32 | fs.writeFileSync(
33 |   path.join(__dirname, "./inject/build-dom-view.ts"),
34 |   tsConstFile
35 | );
36 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/const.ts:
--------------------------------------------------------------------------------
// Lower-case tag names that isInteractiveElem treats as interactive purely
// because of the tag, regardless of attributes.
export const INTERACTIVE_ELEMENTS = new Set([
  "a",
  "input",
  "button",
  "select",
  "menu",
  "menuitem",
  "textarea",
  "canvas",
  "embed",
]);
12 | 
// Values of an element's `role` (or `aria-role`) attribute that make
// isInteractiveElem report it as interactive.
export const INTERACTIVE_ROLES = new Set([
  "button",
  "link",
  "checkbox",
  "radio",
  "textbox",
  "menuitem",
  "tab",
  "tabpanel",
  "tooltip",
  "slider",
  "progressbar",
  "switch",
  "listbox",
  "option",
  "combobox",
  "menu",
  "treeitem",
  "tree",
  "spinbutton",
  "scrollbar",
  "menuitemcheckbox",
  "menuitemradio",
  "action",
]);
38 | 
// Pointer and touch event names regarded as interactive.
// NOTE(review): no consumer is visible next to this file; presumably used by
// the script that tags elements with `data-has-interactive-listener` — confirm.
export const INTERACTIVE_EVENTS = new Set([
  "click",
  "mousedown",
  "mouseup",
  "touchstart",
  "touchend",
]);
46 | 
// ARIA state attributes whose mere presence (whatever the value) makes
// isInteractiveElem report an element as interactive.
export const INTERACTIVE_ARIA_PROPS = [
  "aria-expanded",
  "aria-pressed",
  "aria-selected",
  "aria-checked",
];
53 | 
// Click-binding attribute names (inline `onclick` plus framework bindings such
// as AngularJS `ng-click` and Vue `@click` / `v-on:click`); isInteractiveElem
// treats an element carrying any of them as having a click handler.
export const CLICK_ATTRIBUTES = ["onclick", "ng-click", "@click", "v-on:click"];
55 | 
// Attribute names that buildDomView copies into each element's serialized
// tag; attributes not listed here are left out of the DOM representation.
export const CONTEXT_ATTRIBUTES = [
  "title",
  "type",
  "name",
  "role",
  "aria-label",
  "placeholder",
  "value",
  "alt",
  "aria-expanded",
];
67 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/elem-interactive.ts:
--------------------------------------------------------------------------------
 1 | import {
 2 |   INTERACTIVE_ELEMENTS,
 3 |   INTERACTIVE_ROLES,
 4 |   INTERACTIVE_ARIA_PROPS,
 5 |   CLICK_ATTRIBUTES,
 6 | } from "./const";
 7 | 
 8 | export const isInteractiveElem = (
 9 |   element: HTMLElement
10 | ): { isInteractive: boolean; reason: string } => {
11 |   const tagName = element.tagName.toLowerCase();
12 |   const role = element.getAttribute("role");
13 |   const ariaRole = element.getAttribute("aria-role");
14 | 
15 |   const hasInteractiveRole =
16 |     INTERACTIVE_ELEMENTS.has(tagName) ||
17 |     INTERACTIVE_ROLES.has(role || "") ||
18 |     INTERACTIVE_ROLES.has(ariaRole || "");
19 | 
20 |   if (hasInteractiveRole) {
21 |     let reason = "";
22 |     if (INTERACTIVE_ELEMENTS.has(tagName)) {
23 |       reason = `Interactive HTML element: <${tagName}>`;
24 |     } else if (INTERACTIVE_ROLES.has(role || "")) {
25 |       reason = `Interactive role: ${role}`;
26 |     } else if (INTERACTIVE_ROLES.has(ariaRole || "")) {
27 |       reason = `Interactive aria-role: ${ariaRole}`;
28 |     }
29 |     return { isInteractive: true, reason };
30 |   }
31 | 
32 |   const hasClickHandler =
33 |     element.onclick !== null ||
34 |     element.getAttribute("onclick") !== null ||
35 |     CLICK_ATTRIBUTES.some((attr) => element.hasAttribute(attr));
36 | 
37 |   if (hasClickHandler) {
38 |     return { isInteractive: true, reason: "Has click handler" };
39 |   }
40 | 
41 |   // Check for the marker attribute set by the injected script
42 |   const hasInjectedListener = element.hasAttribute("data-has-interactive-listener");
43 | 
44 |   if (hasInjectedListener) {
45 |     return { isInteractive: true, reason: "Has interactive event listener (tracked)" };
46 |   }
47 | 
48 |   const hasAriaProps = INTERACTIVE_ARIA_PROPS.some((prop) =>
49 |     element.hasAttribute(prop)
50 |   );
51 | 
52 |   if (hasAriaProps) {
53 |     const props = INTERACTIVE_ARIA_PROPS.filter((prop) =>
54 |       element.hasAttribute(prop)
55 |     );
56 |     return {
57 |       isInteractive: true,
58 |       reason: `Has interactive ARIA properties: ${props.join(", ")}`,
59 |     };
60 |   }
61 | 
62 |   const isContentEditable =
63 |     element.getAttribute("contenteditable") === "true" ||
64 |     element.isContentEditable;
65 | 
66 |   if (isContentEditable) {
67 |     return { isInteractive: true, reason: "Is content editable" };
68 |   }
69 | 
70 |   const isDraggable =
71 |     element.draggable || element.getAttribute("draggable") === "true";
72 | 
73 |   if (isDraggable) {
74 |     return { isInteractive: true, reason: "Is draggable" };
75 |   }
76 | 
77 |   return { isInteractive: false, reason: "Not interactive" };
78 | };
79 | 
/**
 * Tells whether an element should be left out of the interactive set:
 * `<html>` / `<body>`, elements whose bounding box has zero width or height,
 * and elements with a `disabled` attribute or `aria-disabled="true"`.
 */
export const isIgnoredElem = (element: HTMLElement): boolean => {
  const { width, height } = element.getBoundingClientRect();
  const tag = element.tagName.toLowerCase();

  if (tag === "html" || tag === "body") {
    return true;
  }
  if (width === 0 || height === 0) {
    return true;
  }
  return (
    element.hasAttribute("disabled") ||
    element.getAttribute("aria-disabled") === "true"
  );
};
92 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/find-interactive-elements.ts:
--------------------------------------------------------------------------------
 1 | import { isIgnoredElem, isInteractiveElem } from "./elem-interactive";
 2 | import { InteractiveElement } from "./types";
 3 | 
 4 | export const findInteractiveElements = (): InteractiveElement[] => {
 5 |   const interactiveElements: InteractiveElement[] = [];
 6 |   const processedElements = new Set<HTMLElement>();
 7 | 
 8 |   const processRoot = (
 9 |     root: Document | ShadowRoot,
10 |     rootInfo: {
11 |       iframe?: HTMLIFrameElement;
12 |       shadowHost?: HTMLElement;
13 |     } = {}
14 |   ) => {
15 |     const elements = root.querySelectorAll("*");
16 |     for (let i = 0; i < elements.length; i++) {
17 |       const element = elements[i] as HTMLElement;
18 |       if (processedElements.has(element)) {
19 |         continue;
20 |       }
21 |       processedElements.add(element);
22 |       if (element.shadowRoot) {
23 |         processRoot(element.shadowRoot, {
24 |           iframe: rootInfo.iframe,
25 |           shadowHost: element,
26 |         });
27 |       }
28 |       const { isInteractive, reason } = isInteractiveElem(element);
29 |       if (isIgnoredElem(element) || !isInteractive) {
30 |         continue;
31 |       }
32 |       interactiveElements.push({
33 |         element,
34 |         iframe: rootInfo.iframe,
35 |         shadowHost: rootInfo.shadowHost,
36 |         rect: element.getBoundingClientRect(),
37 |         interactiveReason: reason,
38 |         isUnderShadowRoot:
39 |           element.getRootNode().nodeType === Node.DOCUMENT_FRAGMENT_NODE,
40 |         cssPath: "",
41 |         xpath: "",
42 |       });
43 |     }
44 |   };
45 | 
46 |   processRoot(document);
47 | 
48 |   const iframes = document.querySelectorAll("iframe");
49 |   for (let i = 0; i < iframes.length; i++) {
50 |     const iframe = iframes[i] as HTMLIFrameElement;
51 |     try {
52 |       const iframeDoc =
53 |         iframe.contentDocument || iframe.contentWindow?.document;
54 |       if (iframeDoc) {
55 |         processRoot(iframeDoc, { iframe });
56 |       }
57 |     } catch (e) {
58 |       console.warn("error processing iframe", e);
59 |     }
60 |   }
61 | 
62 |   return interactiveElements;
63 | };
64 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/get-css-path.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Escapes characters that have special meaning in CSS selectors.
  3 |  * Handles common cases like IDs and class names.
  4 |  *
  5 |  * @param value The string to escape (e.g., an ID or class name).
  6 |  * @returns The escaped string suitable for use in a CSS selector.
  7 |  */
  8 | const escapeSelector = (value: string): string => {
  9 |   return CSS.escape(value);
 10 | };
 11 | 
 12 | /**
 13 |  * Generates a unique CSS selector segment for a given element relative to its siblings.
 14 |  * Prefers ID, then unique classes, then :nth-of-type.
 15 |  *
 16 |  * @param element The element to generate the selector for.
 17 |  * @returns A CSS selector segment string (e.g., "div#myId", "button.btn.primary", "span:nth-of-type(2)").
 18 |  */
 19 | const getUniqueSegment = (element: HTMLElement): string => {
 20 |   const tagName = element.tagName.toLowerCase();
 21 |   const parent = element.parentElement;
 22 | 
 23 |   // 1. Try ID
 24 |   if (element.id) {
 25 |     const idSelector = `#${escapeSelector(element.id)}`;
 26 |     return idSelector;
 27 |   }
 28 | 
 29 |   // 2. Try unique combination of classes
 30 |   const classes = Array.from(element.classList).map(escapeSelector).join(".");
 31 |   if (classes && parent) {
 32 |     const classSelector = `${tagName}.${classes}`;
 33 |     const siblingsWithSameClasses = Array.from(
 34 |       parent.querySelectorAll(`:scope > ${classSelector}`)
 35 |     );
 36 |     if (
 37 |       siblingsWithSameClasses.length === 1 &&
 38 |       siblingsWithSameClasses[0] === element
 39 |     ) {
 40 |       return classSelector;
 41 |     }
 42 |   }
 43 | 
 44 |   // 3. Fallback to :nth-of-type
 45 |   let index = 1; // CSS :nth-of-type is 1-based
 46 |   let sibling = element.previousElementSibling;
 47 |   while (sibling) {
 48 |     if (sibling.tagName === element.tagName) {
 49 |       index++;
 50 |     }
 51 |     sibling = sibling.previousElementSibling;
 52 |   }
 53 | 
 54 |   // Only add :nth-of-type if there are other siblings of the same type
 55 |   let hasSameTypeSiblings = index > 1; // Already found preceding siblings
 56 |   if (!hasSameTypeSiblings && parent) {
 57 |     sibling = element.nextElementSibling;
 58 |     while (sibling) {
 59 |       if (sibling.tagName === element.tagName) {
 60 |         hasSameTypeSiblings = true;
 61 |         break;
 62 |       }
 63 |       sibling = sibling.nextElementSibling;
 64 |     }
 65 |   }
 66 | 
 67 |   return hasSameTypeSiblings ? `${tagName}:nth-of-type(${index})` : tagName;
 68 | };
 69 | 
 70 | /**
 71 |  * Calculates a CSS selector path for an element relative to a boundary node (Document or ShadowRoot).
 72 |  * Uses '>' as the child combinator.
 73 |  *
 74 |  * @param element The target element.
 75 |  * @param boundary The node (Document or ShadowRoot) to stop traversal at.
 76 |  * @returns A relative CSS selector string.
 77 |  */
 78 | const getRelativeCSSPath = (element: HTMLElement, boundary: Node): string => {
 79 |   if (element === boundary) {
 80 |     return ""; // Should not happen if called correctly, but return empty if it does
 81 |   }
 82 | 
 83 |   const segments: string[] = [];
 84 |   let currentElement: HTMLElement | null = element;
 85 | 
 86 |   while (
 87 |     currentElement &&
 88 |     currentElement !== boundary &&
 89 |     currentElement.nodeType === Node.ELEMENT_NODE
 90 |   ) {
 91 |     const segment = getUniqueSegment(currentElement);
 92 |     segments.unshift(segment);
 93 | 
 94 |     const parent = currentElement.parentElement;
 95 |     // Stop if parent is null, not an element, or the boundary itself
 96 |     if (
 97 |       !parent ||
 98 |       parent === boundary ||
 99 |       parent.nodeType !== Node.ELEMENT_NODE
100 |     ) {
101 |       break;
102 |     }
103 |     currentElement = parent as HTMLElement;
104 |   }
105 | 
106 |   return segments.join(" > ");
107 | };
108 | 
/**
 * Recognizes a shadow root by node shape rather than `instanceof`, so that
 * roots created in another realm (for example inside an iframe) are detected.
 */
const isShadowRootNode = (node: Node): node is ShadowRoot =>
  node.nodeType === Node.DOCUMENT_FRAGMENT_NODE && "host" in node;

/**
 * Generates a full CSS selector path for an element, handling shadow DOM
 * boundaries. Shadow transitions are written with Playwright's '>>' syntax:
 * `<path to host> >> <path inside the shadow root>`.
 *
 * Root detection uses `nodeType` instead of `instanceof`, because elements
 * taken from iframe documents belong to a different realm than the
 * `Document` / `ShadowRoot` constructors visible to this script.
 *
 * @param element The target HTMLElement.
 * @returns A selector usable with Playwright locators, or "" when the element
 *   is missing, is not an element node, or a shadow-DOM path cannot be built.
 */
export const getCSSPath = (element: HTMLElement | null): string => {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) {
    return "";
  }

  const root = element.getRootNode();

  if (isShadowRootNode(root)) {
    const host = root.host as HTMLElement;
    if (!host) {
      console.warn("ShadowRoot found without a host element:", root);
      return "";
    }

    const hostPath = getCSSPath(host); // path to the host, recursively
    const relativePath = getRelativeCSSPath(element, root); // path inside the shadow root

    if (!hostPath) {
      console.warn("Could not determine CSS path for host element:", host);
      return "";
    }
    if (!relativePath) {
      // A selector is required after '>>'; without one the path is unusable.
      console.warn(
        "Could not determine relative CSS path within ShadowRoot for:",
        element
      );
      return "";
    }

    return `${hostPath} >> ${relativePath}`;
  }

  if (root.nodeType !== Node.DOCUMENT_NODE) {
    // E.g. a detached subtree: still compute a path relative to its own root.
    console.warn(
      "Element root is neither Document nor ShadowRoot:",
      root,
      "for element:",
      element
    );
  }

  // Main document, iframe document, or the fallback root above.
  return getRelativeCSSPath(element, root);
};
172 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/get-x-path.ts:
--------------------------------------------------------------------------------
 1 | export const getXPath = (element: HTMLElement) => {
 2 |   const segments = [];
 3 |   let currentElement: HTMLElement | null = element;
 4 | 
 5 |   while (currentElement && currentElement.nodeType === Node.ELEMENT_NODE) {
 6 |     if (
 7 |       currentElement.parentNode instanceof ShadowRoot ||
 8 |       currentElement.parentNode instanceof HTMLIFrameElement
 9 |     ) {
10 |       break;
11 |     }
12 | 
13 |     let index = 0;
14 |     let hasSiblings = false;
15 |     let sibling = currentElement.previousSibling;
16 |     while (sibling) {
17 |       if (
18 |         sibling.nodeType === Node.ELEMENT_NODE &&
19 |         sibling.nodeName === currentElement.nodeName
20 |       ) {
21 |         index++;
22 |         hasSiblings = true;
23 |       }
24 |       sibling = sibling.previousSibling;
25 |     }
26 | 
27 |     if (!hasSiblings) {
28 |       sibling = currentElement.nextSibling;
29 |       while (sibling) {
30 |         if (
31 |           sibling.nodeType === Node.ELEMENT_NODE &&
32 |           sibling.nodeName === currentElement.nodeName
33 |         ) {
34 |           hasSiblings = true;
35 |           break;
36 |         }
37 |         sibling = sibling.nextSibling;
38 |       }
39 |     }
40 | 
41 |     const tagName = currentElement.nodeName.toLowerCase();
42 | 
43 |     // Always include position index if there are siblings with the same tag name
44 |     // This ensures uniqueness of the XPath
45 |     const xpathIndex = hasSiblings ? `[${index + 1}]` : "";
46 | 
47 |     // Add id attribute for even more uniqueness if present
48 |     if (currentElement.id && currentElement.id.toString().trim() !== "") {
49 |       segments.unshift(`${tagName}[@id="${currentElement.id}"]`);
50 |     } else {
51 |       segments.unshift(`${tagName}${xpathIndex}`);
52 |     }
53 | 
54 |     currentElement = currentElement.parentElement;
55 |   }
56 | 
57 |   return segments.join("/");
58 | };
59 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/highlight.ts:
--------------------------------------------------------------------------------
  1 | // --- Interfaces ---
  2 | 
/** One element to highlight, as passed to renderHighlightsOffscreen. */
interface HighlightInfo {
  // The element to highlight.
  element: HTMLElement;
  // Index for this element's highlight; buildDomView passes 1-based values.
  index: number;
  // The iframe the element lives in, or null when it has none.
  parentIframe: HTMLElement | null;
}
  8 | 
/**
 * Offset added to an element's rect (`x` to `left`, `y` to `top`) when
 * calculateLabelPosition converts it to canvas coordinates.
 */
interface IframeOffset {
  x: number;
  y: number;
}
 13 | 
 14 | // --- Helper Functions (Stateless) ---
 15 | 
 16 | const isElementPartiallyVisible = (rect: DOMRect): boolean => {
 17 |   // Check if the element is within the viewport, considering potential zero dimensions
 18 |   return (
 19 |     rect.width > 0 &&
 20 |     rect.height > 0 &&
 21 |     rect.top < window.innerHeight && // These checks are relative to the current viewport
 22 |     rect.bottom > 0 && // where the rect was calculated.
 23 |     rect.left < window.innerWidth &&
 24 |     rect.right > 0
 25 |   );
 26 | };
 27 | 
 28 | const getHighlightColor = (
 29 |   index: number
 30 | ): { baseColor: string; backgroundColor: string } => {
 31 |   const colors = [
 32 |     "#FF0000",
 33 |     "#00FF00",
 34 |     "#0000FF",
 35 |     "#FFA500",
 36 |     "#800080",
 37 |     "#008080",
 38 |     "#FF69B4",
 39 |     "#4B0082",
 40 |     "#FF4500",
 41 |     "#2E8B57",
 42 |     "#DC143C",
 43 |     "#4682B4",
 44 |   ];
 45 |   const colorIndex = index % colors.length;
 46 |   const baseColor = colors[colorIndex];
 47 |   const backgroundColor = baseColor + "1A";
 48 |   return { baseColor, backgroundColor };
 49 | };
 50 | 
 51 | // Calculates label position relative to the canvas (0,0 top-left)
 52 | const calculateLabelPosition = (
 53 |   rect: DOMRect,
 54 |   iframeOffset: IframeOffset,
 55 |   labelWidth: number,
 56 |   labelHeight: number,
 57 |   canvasWidth: number, // Pass canvas dims for bounds checking
 58 |   canvasHeight: number
 59 | ): { top: number; left: number } => {
 60 |   const top = rect.top + iframeOffset.y;
 61 |   const left = rect.left + iframeOffset.x;
 62 | 
 63 |   // Default: top-right corner relative to element
 64 |   let labelTop = top - labelHeight;
 65 |   let labelLeft = left + rect.width - labelWidth;
 66 | 
 67 |   // Constraints to keep label within *canvas* bounds
 68 |   labelTop = Math.min(labelTop, canvasHeight - labelHeight);
 69 |   labelLeft = Math.min(labelLeft, canvasWidth - labelWidth);
 70 | 
 71 |   // Basic overlap check (can be improved) - position relative to element
 72 |   const elementBottom = top + rect.height;
 73 |   const elementRight = left + rect.width;
 74 | 
 75 |   // If the calculated top-left of the label is inside the element's box
 76 |   if (
 77 |     labelTop + labelHeight > top &&
 78 |     labelTop < elementBottom &&
 79 |     labelLeft + labelWidth > left &&
 80 |     labelLeft < elementRight
 81 |   ) {
 82 |     // Try bottom-right corner relative to element
 83 |     labelTop = elementBottom;
 84 |     labelLeft = elementRight - labelWidth;
 85 | 
 86 |     // Re-apply constraints
 87 |     labelTop = Math.min(labelTop, canvasHeight - labelHeight);
 88 |     labelLeft = Math.min(labelLeft, canvasWidth - labelWidth);
 89 |   }
 90 | 
 91 |   return { top: labelTop, left: labelLeft };
 92 | };
 93 | 
 94 | // --- Public API ---
 95 | 
 96 | /**
 97 |  * Renders highlights for the given elements onto an OffscreenCanvas
 98 |  * and returns an ImageBitmap.
 99 |  *
100 |  * @param highlightInfos Array of objects describing elements to highlight.
101 |  * @param width The desired width of the canvas (e.g., window.innerWidth).
102 |  * @param height The desired height of the canvas (e.g., window.innerHeight).
103 |  * @returns A Promise resolving to an ImageBitmap containing the highlights.
104 |  */
105 | export function renderHighlightsOffscreen(
106 |   highlightInfos: HighlightInfo[],
107 |   width: number,
108 |   height: number
109 | ): ImageBitmap {
110 |   if (width <= 0 || height <= 0) {
111 |     console.warn(
112 |       "Attempted to render highlights on zero-sized canvas. Will default to innerWidth x innerHeight"
113 |     );
114 |     // Return an empty bitmap maybe? Or null.
115 |     const emptyCanvas = new OffscreenCanvas(
116 |       window.innerWidth,
117 |       window.innerHeight
118 |     );
119 |     return emptyCanvas.transferToImageBitmap();
120 |   }
121 | 
122 |   const dpr = window.devicePixelRatio || 1;
123 |   const canvasWidth = width * dpr;
124 |   const canvasHeight = height * dpr;
125 |   const offscreenCanvas = new OffscreenCanvas(canvasWidth, canvasHeight);
126 |   const ctx = offscreenCanvas.getContext("2d", {
127 |     alpha: true,
128 |   }) as OffscreenCanvasRenderingContext2D; // Ensure alpha for transparency
129 | 
130 |   // Scale context for DPI awareness. All drawing coords should be in logical pixels.
131 |   ctx.scale(dpr, dpr);
132 | 
133 |   // Clear canvas (important for transparency)
134 |   ctx.clearRect(0, 0, width, height);
135 | 
136 |   try {
137 |     highlightInfos.forEach(({ element, index, parentIframe }) => {
138 |       // Element might be stale, ensure it's still in the DOM
139 |       if (!document.body.contains(element)) {
140 |         return; // Skip elements not in DOM
141 |       }
142 | 
143 |       const rect = element.getBoundingClientRect();
144 |       // Skip elements that are not visible or have no dimensions
145 |       if (
146 |         !rect ||
147 |         rect.width === 0 ||
148 |         rect.height === 0 ||
149 |         !isElementPartiallyVisible(rect)
150 |       ) {
151 |         return;
152 |       }
153 | 
154 |       const iframeOffset: IframeOffset = { x: 0, y: 0 };
155 |       if (parentIframe && document.body.contains(parentIframe)) {
156 |         const iframeRect = parentIframe.getBoundingClientRect();
157 |         iframeOffset.x = iframeRect.left;
158 |         iframeOffset.y = iframeRect.top;
159 |       }
160 | 
161 |       const colors = getHighlightColor(index);
162 |       const drawTop = rect.top + iframeOffset.y;
163 |       const drawLeft = rect.left + iframeOffset.x;
164 | 
165 |       // --- Draw overlay rectangle ---
166 |       ctx.fillStyle = colors.backgroundColor;
167 |       ctx.fillRect(drawLeft, drawTop, rect.width, rect.height);
168 |       ctx.strokeStyle = colors.baseColor;
169 |       ctx.lineWidth = 1; // Use 1 logical pixel for crispness after scaling
170 |       ctx.strokeRect(drawLeft, drawTop, rect.width, rect.height);
171 | 
172 |       // --- Draw label ---
173 |       const labelText = index.toString();
174 |       // Font size calculation needs to consider DPR if you want physical pixel size
175 |       // Or keep it simple with logical pixels. Let's use logical pixels.
176 |       const fontSize = Math.min(12, Math.max(9, rect.height * 0.3));
177 |       ctx.font = `bold ${fontSize}px sans-serif`;
178 |       ctx.textAlign = "center";
179 |       ctx.textBaseline = "middle";
180 | 
181 |       // Estimate label dimensions in logical pixels
182 |       const textMetrics = ctx.measureText(labelText);
183 |       const labelPadding = 4;
184 |       const labelHeight = fontSize + labelPadding;
185 |       // Ensure width is at least height for near-square background
186 |       const labelWidth = Math.max(
187 |         labelHeight,
188 |         textMetrics.width + labelPadding * 2
189 |       );
190 | 
191 |       // Calculate position relative to the canvas (using logical pixels)
192 |       const labelPos = calculateLabelPosition(
193 |         rect,
194 |         iframeOffset,
195 |         labelWidth,
196 |         labelHeight,
197 |         width,
198 |         height
199 |       );
200 | 
201 |       // Draw label background (logical pixels)
202 |       ctx.fillStyle = colors.baseColor;
203 |       ctx.fillRect(labelPos.left, labelPos.top, labelWidth, labelHeight);
204 | 
205 |       // Draw label text (logical pixels)
206 |       ctx.fillStyle = "white";
207 |       ctx.fillText(
208 |         labelText,
209 |         labelPos.left + labelWidth / 2,
210 |         labelPos.top + labelHeight / 2
211 |       );
212 |     });
213 | 
214 |     // Transfer the bitmap
215 |     return offscreenCanvas.transferToImageBitmap();
216 |   } catch (error) {
217 |     console.error("Error drawing highlights onto OffscreenCanvas:", error);
218 |     // In case of error, maybe return an empty bitmap or null
219 |     const emptyCanvas = new OffscreenCanvas(1, 1);
220 |     return emptyCanvas.transferToImageBitmap(); // Or return null
221 |   }
222 | }
223 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/index.ts:
--------------------------------------------------------------------------------
 1 | import { Page } from "playwright";
 2 | import { buildDomViewJs } from "./inject/build-dom-view";
 3 | import { DOMState, DOMStateRaw, InteractiveElement } from "./types";
 4 | 
 5 | export const getDom = async (page: Page): Promise<DOMState | null> => {
 6 |   const result = (await page.evaluate(buildDomViewJs)) as DOMStateRaw;
 7 |   const elements = new Map<number, InteractiveElement>();
 8 |   for (const element of result.elements) {
 9 |     if (element.highlightIndex !== undefined) {
10 |       elements.set(element.highlightIndex, element);
11 |     }
12 |   }
13 |   return {
14 |     elements,
15 |     domState: result.domState,
16 |     screenshot: result.screenshot,
17 |   };
18 | };
19 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/types.ts:
--------------------------------------------------------------------------------
/**
 * Describes one interactive element discovered in a page's DOM.
 *
 * NOTE(review): most field meanings below are inferred from names and types
 * only — confirm against the code that builds these objects.
 */
export interface InteractiveElement {
  /** The DOM element itself. */
  element: HTMLElement;
  /** Presumably the iframe containing the element, if any — TODO confirm. */
  iframe?: HTMLIFrameElement;
  /** Presumably the shadow host the element lives under, if any — TODO confirm. */
  shadowHost?: HTMLElement;
  isUnderShadowRoot: boolean;
  /** Bounding rectangle of the element. */
  rect: DOMRect;
  /** Presumably why the element was classified as interactive — TODO confirm. */
  interactiveReason?: string;
  /**
   * Key under which `getDom` stores this element in `DOMState.elements`;
   * elements without it are left out of that map.
   */
  highlightIndex?: number;
  cssPath: string;
  xpath: string;
}
12 | 
/**
 * Shape that `getDom` casts the result of evaluating `buildDomViewJs` to,
 * before the elements are indexed by `highlightIndex`.
 */
export interface DOMStateRaw {
  /** All elements reported by the script, as a flat list. */
  elements: InteractiveElement[];
  domState: string;
  // NOTE(review): encoding of the screenshot string is not visible here — confirm.
  screenshot: string;
}
18 | 
/**
 * DOM state returned by `getDom`: the same `domState` and `screenshot` as
 * `DOMStateRaw`, with the elements keyed by their `highlightIndex`.
 */
export interface DOMState {
  /** Interactive elements keyed by `highlightIndex`. */
  elements: Map<number, InteractiveElement>;
  domState: string;
  screenshot: string;
}
24 | 


--------------------------------------------------------------------------------
/src/context-providers/dom/window-type.ts:
--------------------------------------------------------------------------------
/**
 * Augments the global `Window` type with an optional `getEventListeners`
 * function.
 *
 * NOTE(review): presumably this models the `getEventListeners` helper that
 * some browser devtools expose; it is declared optional, so callers must
 * check for its presence — confirm where it is expected to exist.
 */
interface Window {
  /**
   * Returns, per event name, the listeners registered on `element` together
   * with their `useCapture` flag.
   */
  getEventListeners?: (element: HTMLElement) => {
    [eventName: string]: Array<{
      listener: Function;
      useCapture: boolean;
    }>;
  };
}
9 | 


--------------------------------------------------------------------------------
/src/custom-actions/index.ts:
--------------------------------------------------------------------------------
1 | import { UserInteractionAction } from "./user-interaction";
2 | 
3 | export { UserInteractionAction };
4 | 


--------------------------------------------------------------------------------
/src/custom-actions/user-interaction.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionOutput, AgentActionDefinition } from "@/types";
 3 | 
 4 | export const UserInteractionActionParams = z.object({
 5 |   message: z
 6 |     .string()
 7 |     .describe(
 8 |       "A message to provide to the user. Make it friendly and ask them for a suitable response. Keep it short and between 1-2 sentences if possible."
 9 |     ),
10 |   kind: z
11 |     .enum(["password", "text_input", "select", "confirm"])
12 |     .describe(
13 |       "The kind of response that is expected from the user. If you can't find a suitable option, then respond with confirm."
14 |     ),
15 |   choices: z
16 |     .array(z.string())
17 |     .optional()
18 |     .describe(
19 |       "If you select choices as the kind option, then what options should be offered to the user."
20 |     ),
21 | }).describe(`Action to request input from the user during task execution.
22 |     Use this when you need to collect information from the user such as text input, password, 
23 |     selection from choices, or confirmation. The response will be returned to continue the workflow.`);
24 | 
/**
 * Type of the `UserInteractionActionParams` schema object itself (not of the
 * values it parses; use `z.infer` on this type for those).
 */
export type UserInteractionActionParamsType =
  typeof UserInteractionActionParams;
27 | 
/**
 * Callback that receives the parsed user-interaction parameters and resolves
 * to the action's output.
 */
type userInputFn = (
  params: z.infer<UserInteractionActionParamsType>
) => Promise<ActionOutput>;
31 | 
/**
 * Builds the user-interaction action definition around a caller-supplied
 * input callback.
 *
 * The returned definition's `run` ignores the action context and simply
 * forwards the parsed parameters to `userInputFn`, resolving to whatever
 * output it produces.
 *
 * @param userInputFn Callback that handles the request and resolves to the
 *   action output.
 * @returns Action definition using `UserInteractionActionParams` as its
 *   parameter schema.
 */
export const UserInteractionAction = (
  userInputFn: userInputFn
): AgentActionDefinition<UserInteractionActionParamsType> => {
  const definition: AgentActionDefinition<UserInteractionActionParamsType> = {
    type: "UserInteractionActionParams",
    actionParams: UserInteractionActionParams,
    async run(_ctx, action): Promise<ActionOutput> {
      const output = await userInputFn(action);
      return output;
    },
  };
  return definition;
};
42 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | import { HyperAgent } from "./agent";
 2 | import { TaskStatus } from "./types/agent/types";
 3 | 
 4 | export { TaskStatus, HyperAgent };
 5 | export default HyperAgent;
 6 | 
// For CommonJS compatibility: when a CommonJS `module` object exists, replace
// `module.exports` with the HyperAgent class itself and then re-attach the
// named exports (and `default`) as properties on it.
// Order matters: `module.exports` is reassigned first, so the properties
// below are set on HyperAgent rather than on the previous exports object.
if (typeof module !== "undefined" && module.exports) {
  module.exports = HyperAgent;
  module.exports.HyperAgent = HyperAgent;
  module.exports.TaskStatus = TaskStatus;
  module.exports.default = HyperAgent;
}
14 | 


--------------------------------------------------------------------------------
/src/types/agent/actions/types.ts:
--------------------------------------------------------------------------------
 1 | import { Page } from "playwright";
 2 | import { DOMState } from "../../../context-providers/dom/types";
 3 | import { BaseChatModel } from "@langchain/core/language_models/chat_models";
 4 | import { z } from "zod";
 5 | import { MCPClient } from "../../../agent/mcp/client";
 6 | import { HyperVariable } from "../types";
 7 | 
/**
 * Context object passed as the first argument to an action's `run` method.
 *
 * NOTE(review): field meanings are inferred from names and types — confirm
 * against the code that constructs this context.
 */
export interface ActionContext {
  /** Playwright page the action operates on. */
  page: Page;
  /** DOM state (indexed interactive elements, DOM text, screenshot). */
  domState: DOMState;
  /** Chat model available to the action. */
  llm: BaseChatModel;
  // NOTE(review): presumably a token budget for LLM calls — confirm.
  tokenLimit: number;
  variables: HyperVariable[];
  /** Optional directory for debug output. */
  debugDir?: string;
  /** Optional MCP client. */
  mcpClient?: MCPClient;
}
17 | 
/** Result produced by running an action. */
export interface ActionOutput {
  /** Whether the action succeeded. */
  success: boolean;
  /** Message describing the outcome. */
  message: string;
  // NOTE(review): presumably data extracted by the action, when there is any — confirm.
  extract?: object;
}
23 | 
/**
 * Zod object type for a single action: a string-literal `type` tag plus a
 * `params` object. Both the parsed input and output shapes are
 * `{ params: object; type: string }`, and unknown keys use the "strip" mode.
 */
export type ActionSchemaType = z.ZodObject<
  {
    type: z.ZodLiteral<string>;

    // eslint-disable-next-line @typescript-eslint/no-empty-object-type
    params: z.ZodObject<{}, "strip", z.ZodTypeAny, {}, {}>;
  },
  "strip",
  z.ZodTypeAny,
  {
    params: object;
    type: string;
  },
  {
    params: object;
    type: string;
  }
>;
42 | 
/** Parsed value of an `ActionSchemaType`: `{ params: object; type: string }`. */
export type ActionType = z.infer<ActionSchemaType>;
44 | 
/**
 * Definition of an agent action: a type name, a Zod schema for its
 * parameters, and the code that runs it.
 *
 * @typeParam T Zod object schema describing the action's parameters.
 */
export interface AgentActionDefinition<
  T extends z.AnyZodObject = z.AnyZodObject,
> {
  /** Name identifying the action. */
  readonly type: string;
  /** Zod schema for the action's parameters. */
  actionParams: T;

  /**
   * Executes the action.
   * @param ctx Context the action runs in.
   * @param params Parameters matching `actionParams`.
   * @returns The action's output.
   */
  run(ctx: ActionContext, params: z.infer<T>): Promise<ActionOutput>;
  /**
   * completeAction is only called if the name of this action is "complete". It is meant to format text into a proper format for output.
   * @param params
   */
  completeAction?(params: z.infer<T>): Promise<string>;
  // NOTE(review): presumably returns a human-readable ("pretty-printed") description of the action — confirm.
  pprintAction?(params: z.infer<T>): string;
}
59 | 


--------------------------------------------------------------------------------
/src/types/agent/types.ts:
--------------------------------------------------------------------------------
 1 | import { z } from "zod";
 2 | import { ActionOutput } from "./actions/types";
 3 | import { Page } from "playwright";
 4 | import { ErrorEmitter } from "@/utils";
 5 | 
 6 | export const AgentOutputFn = (
 7 |   actionsSchema: z.ZodUnion<readonly [z.AnyZodObject, ...z.AnyZodObject[]]>
 8 | ) =>
 9 |   z.object({
10 |     thoughts: z
11 |       .string()
12 |       .describe(
13 |         "Your thoughts on the task at hand, was the previous goal successful?"
14 |       ),
15 |     memory: z
16 |       .string()
17 |       .describe(
18 |         "Information that you need to remember to accomplish subsequent goals"
19 |       ),
20 |     nextGoal: z
21 |       .string()
22 |       .describe(
23 |         "The next goal you are trying to accomplish with the actions you have chosen"
24 |       ),
25 |     actions: z.array(actionsSchema),
26 |   });
27 | 
/** Parsed value of the schema returned by `AgentOutputFn`. */
export type AgentOutput = z.infer<ReturnType<typeof AgentOutputFn>>;
29 | 
/** One step of a task: the agent's output plus the outputs of its actions. */
export interface AgentStep {
  // NOTE(review): presumably the step's position within the task — confirm whether 0- or 1-based.
  idx: number;
  agentOutput: AgentOutput;
  // NOTE(review): presumably one entry per executed action in `agentOutput.actions` — confirm.
  actionOutputs: ActionOutput[];
}
35 | 
/**
 * Optional settings for running a task.
 *
 * NOTE(review): semantics below are inferred from names and types — confirm
 * against the task runner.
 */
export interface TaskParams {
  /** Presumably the upper bound on the number of agent steps — TODO confirm. */
  maxSteps?: number;
  /** Optional directory for debug output. */
  debugDir?: string;
  /** Optional Zod object schema for the task's output. */
  outputSchema?: z.AnyZodObject;
  /** Callback receiving an `AgentStep`; may be sync or async. */
  onStep?: (step: AgentStep) => Promise<void> | void;
  /** Callback receiving the `TaskOutput`; may be sync or async. */
  onComplete?: (output: TaskOutput) => Promise<void> | void;
  /** Synchronous debug callback receiving an `AgentOutput`. */
  debugOnAgentOutput?: (step: AgentOutput) => void;
}
44 | 
/** Outcome of a task: its status, the recorded steps, and an optional output string. */
export interface TaskOutput {
  status?: TaskStatus;
  steps: AgentStep[];
  output?: string;
}
50 | 
/**
 * Control handle for a task. Each control method returns a `TaskStatus`.
 */
export interface Task {
  /** Returns the task's status. */
  getStatus: () => TaskStatus;
  // NOTE(review): pause/resume/cancel presumably return the status after the request was applied — confirm.
  pause: () => TaskStatus;
  resume: () => TaskStatus;
  cancel: () => TaskStatus;
  /** Emitter typed for `error` events. */
  emitter: ErrorEmitter;
}
58 | 
/**
 * Task states. String enum: each member's runtime value is its lowercase name.
 * CANCELLED, COMPLETED and FAILED are the members collected in
 * `endTaskStatuses`.
 */
export enum TaskStatus {
  PENDING = "pending",
  RUNNING = "running",
  PAUSED = "paused",
  CANCELLED = "cancelled",
  COMPLETED = "completed",
  FAILED = "failed",
}
67 | 
/**
 * The set of end statuses: cancelled, completed and failed.
 */
export const endTaskStatuses = new Set<TaskStatus>([
  TaskStatus.CANCELLED,
  TaskStatus.COMPLETED,
  TaskStatus.FAILED,
]);
73 | 
/**
 * State record kept for a task.
 *
 * NOTE(review): field meanings are inferred from names and types — confirm.
 */
export interface TaskState {
  /** Identifier of the task. */
  id: string;
  /** Presumably the task description given to the agent — TODO confirm. */
  task: string;
  status: TaskStatus;
  /** Playwright page associated with the task at its start. */
  startingPage: Page;
  steps: AgentStep[];
  output?: string;
  error?: string;
}
83 | 
/** A named string variable with an accompanying description. */
export interface HyperVariable {
  key: string;
  value: string;
  description: string;
}
89 | 
/**
 * A Playwright `Page` extended with `ai`, `aiAsync` and `extract`.
 */
export interface HyperPage extends Page {
  /** Takes a task string and resolves to the `TaskOutput`. */
  ai: (task: string, params?: TaskParams) => Promise<TaskOutput>;
  /** Takes a task string and resolves to a `Task` control handle. */
  aiAsync: (task: string, params?: TaskParams) => Promise<Task>;
  /**
   * Resolves to a value of the schema's inferred type when `T` is a Zod
   * object schema, and to a string otherwise.
   */
  extract<T extends z.AnyZodObject | undefined = undefined>(
    task?: string,
    outputSchema?: T
  ): Promise<T extends z.AnyZodObject ? z.infer<T> : string>;
}
98 | 


--------------------------------------------------------------------------------
/src/types/browser-providers/types.ts:
--------------------------------------------------------------------------------
 1 | import { Browser } from "playwright";
 2 | 
/**
 * Abstract base class for browser providers.
 *
 * @typeParam T Type returned by `getSession`.
 */
abstract class BrowserProvider<T> {
  /** Provider-specific session object. */
  abstract session: unknown;
  /** Resolves to a Playwright `Browser`. */
  abstract start(): Promise<Browser>;
  abstract close(): Promise<void>;
  /** Returns the session as `T`, or `null`. */
  abstract getSession(): T|null;
}
 9 | 
10 | export default BrowserProvider;
11 | 


--------------------------------------------------------------------------------
/src/types/config.ts:
--------------------------------------------------------------------------------
 1 | import { BaseChatModel } from "@langchain/core/language_models/chat_models";
 2 | import { AgentActionDefinition } from "./agent/actions/types";
 3 | 
 4 | import {
 5 |   HyperbrowserProvider,
 6 |   LocalBrowserProvider,
 7 | } from "@/browser-providers";
 8 | 
/**
 * Configuration for a single MCP server connection.
 */
export interface MCPServerConfig {
  /**
   * Optional identifier for this server.
   */
  id?: string;

  /**
   * The type of MCP server connection to use: "stdio" or "sse".
   */
  connectionType?: "stdio" | "sse";

  /**
   * The executable to run to start the server.
   */
  command?: string;
  /**
   * Command line arguments to pass to the executable.
   */
  args?: string[];
  /**
   * The environment to use when spawning the process.
   */
  env?: Record<string, string>;

  /**
   * URL for SSE connection (required when connectionType is "sse")
   */
  sseUrl?: string;
  /**
   * Headers for SSE connection
   */
  sseHeaders?: Record<string, string>;

  /**
   * List of tools to exclude from the MCP config
   */
  excludeTools?: string[];
  /**
   * List of tools to include from the MCP config
   */
  includeTools?: string[];
}
49 | 
/**
 * Top-level MCP configuration: the list of server configurations.
 */
export interface MCPConfig {
  /**
   * List of servers to connect to
   */
  servers: MCPServerConfig[];
}
56 | 
/** Names of the supported browser providers. */
export type BrowserProviders = "Local" | "Hyperbrowser";
58 | 
/**
 * Configuration for a HyperAgent.
 *
 * @typeParam T Browser provider name; defaults to "Local".
 */
export interface HyperAgentConfig<T extends BrowserProviders = "Local"> {
  /** Additional action definitions. */
  customActions?: Array<AgentActionDefinition>;

  /** Which browser provider to use. */
  browserProvider?: T;

  debug?: boolean;
  /** Chat model to use. */
  llm?: BaseChatModel;

  /**
   * Same shape as the first constructor argument of `HyperbrowserProvider`,
   * without its `debug` key.
   */
  hyperbrowserConfig?: Omit<
    NonNullable<ConstructorParameters<typeof HyperbrowserProvider>[0]>,
    "debug"
  >;
  /** Same shape as the first constructor argument of `LocalBrowserProvider`. */
  localConfig?: ConstructorParameters<typeof LocalBrowserProvider>[0];
}
73 | 


--------------------------------------------------------------------------------
/src/types/index.ts:
--------------------------------------------------------------------------------
 1 | // Agent Action Types
 2 | import {
 3 |   ActionType,
 4 |   ActionSchemaType,
 5 |   AgentActionDefinition,
 6 |   ActionContext,
 7 |   ActionOutput,
 8 | } from "./agent/actions/types";
 9 | 
10 | // Agent Types
11 | import {
12 |   AgentOutputFn,
13 |   AgentOutput,
14 |   AgentStep,
15 |   TaskParams,
16 |   TaskOutput,
17 |   Task,
18 |   TaskStatus,
19 |   TaskState,
20 |   endTaskStatuses,
21 | } from "./agent/types";
22 | 
23 | // Config Types
24 | import { MCPServerConfig, MCPConfig, HyperAgentConfig } from "./config";
25 | 
26 | // Browser Provider Types
27 | import BrowserProvider from "./browser-providers/types";
28 | 
29 | // Export all types
30 | export {
31 |   // Agent Action Types
32 |   ActionType,
33 |   ActionSchemaType,
34 |   AgentActionDefinition,
35 |   ActionContext,
36 |   ActionOutput,
37 | 
38 |   // Agent Types
39 |   AgentOutputFn,
40 |   AgentOutput,
41 |   AgentStep,
42 |   TaskParams,
43 |   TaskOutput,
44 |   Task,
45 |   TaskStatus,
46 |   TaskState,
47 | 
48 |   // Config Types
49 |   MCPServerConfig,
50 |   MCPConfig,
51 |   HyperAgentConfig,
52 | 
53 |   // Browser Provider Types
54 |   BrowserProvider,
55 |   endTaskStatuses,
56 | };
57 | 
// Extend NodeJS.ProcessEnv to include our environment variables.
// Both keys are declared optional, so reads of `process.env.OPENAI_API_KEY`
// and `process.env.GEMINI_API_KEY` are typed `string | undefined`.
declare global {
  // eslint-disable-next-line @typescript-eslint/no-namespace
  namespace NodeJS {
    interface ProcessEnv {
      OPENAI_API_KEY?: string;
      GEMINI_API_KEY?: string;
    }
  }
}
68 | 


--------------------------------------------------------------------------------
/src/utils/error-emitter.ts:
--------------------------------------------------------------------------------
 1 | import EventEmitter from "events";
 2 | 
 3 | type ErrorEvents = {
 4 |   error: (error: Error) => void;
 5 | };
 6 | 
 7 | export class ErrorEmitter extends EventEmitter {
 8 |   override on<K extends keyof ErrorEvents>(
 9 |     event: K,
10 |     listener: ErrorEvents[K]
11 |   ): this {
12 |     return super.on(event, listener);
13 |   }
14 | 
15 |   override once<K extends keyof ErrorEvents>(
16 |     event: K,
17 |     listener: ErrorEvents[K]
18 |   ): this {
19 |     return super.once(event, listener);
20 |   }
21 | 
22 |   override off<K extends keyof ErrorEvents>(
23 |     event: K,
24 |     listener: ErrorEvents[K]
25 |   ): this {
26 |     return super.off(event, listener);
27 |   }
28 | 
29 |   override emit<K extends keyof ErrorEvents>(
30 |     event: K,
31 |     ...args: Parameters<ErrorEvents[K]>
32 |   ): boolean {
33 |     return super.emit(event, ...args);
34 |   }
35 | 
36 |   override addListener<K extends keyof ErrorEvents>(
37 |     eventName: K,
38 |     listener: (...args: Parameters<ErrorEvents[K]>) => void
39 |   ): this {
40 |     return super.addListener(eventName, listener);
41 |   }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/utils/html-to-markdown.ts:
--------------------------------------------------------------------------------
 1 | import TurndownService from "turndown";
 2 | // TODO: Add gfm plugin
 3 | // import { gfm } from "joplin-turndown-plugin-gfm";
 4 | 
 5 | export const turndownService = new TurndownService();
 6 | 
 7 | turndownService.addRule("removeUnwantedTags", {
 8 |   filter: ["head", "script", "style"],
 9 |   replacement: function () {
10 |     return "";
11 |   },
12 | });
13 | 
14 | turndownService.addRule("inlineLink", {
15 |   filter: function (node: any, options: any) {
16 |     return (
17 |       options.linkStyle === "inlined" &&
18 |       node.nodeName === "A" &&
19 |       node.getAttribute("href")
20 |     );
21 |   },
22 |   replacement: function (content: string, node: any) {
23 |     var href = node.getAttribute("href").trim();
24 |     var title = node.title ? ' "' + node.title + '"' : "";
25 |     return "[" + content.trim() + "](" + href + title + ")\n";
26 |   },
27 | });
28 | // turndownService.use(gfm);
29 | 
/**
 * Puts a backslash in front of every newline that occurs while at least one
 * `[` is still unclosed, so bracketed text spanning several lines gets
 * backslash-terminated line breaks.
 *
 * The bracket depth goes up on `[` and down on `]` (never below zero); all
 * other characters are copied through unchanged.
 *
 * @param markdownContent Markdown text to process.
 * @returns The text with newlines inside open brackets prefixed by `\`.
 */
const processMultiLineLinks = (markdownContent: string): string => {
  const pieces: string[] = [];
  let openBrackets = 0;

  for (const ch of markdownContent) {
    if (ch === "[") {
      openBrackets += 1;
    } else if (ch === "]" && openBrackets > 0) {
      openBrackets -= 1;
    }

    const escapeNewline = openBrackets > 0 && ch === "\n";
    pieces.push(escapeNewline ? "\\\n" : ch);
  }

  return pieces.join("");
};
52 | 
/**
 * Strips "skip to content" anchor links such as `[Skip to Content](#page)`
 * or `[Skip to content](#skip)`. The link text is matched
 * case-insensitively and the target must start with `#`.
 *
 * @param markdownContent Markdown text to clean.
 * @returns The text with every such link removed.
 */
const removeSkipToContentLinks = (markdownContent: string): string =>
  markdownContent.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
61 | 
/**
 * Converts an HTML string to Markdown.
 *
 * The HTML is run through `turndownService`, then through
 * `processMultiLineLinks` and `removeSkipToContentLinks`, in that order.
 *
 * @param html HTML to convert; `null`, `undefined` and `""` yield `""`.
 * @returns The Markdown text, or `""` if the conversion throws (the error is
 *   logged with `console.error`).
 */
export async function parseMarkdown(
  html: string | null | undefined
): Promise<string> {
  if (!html) return "";

  try {
    const converted = turndownService.turndown(html);
    return removeSkipToContentLinks(processMultiLineLinks(converted));
  } catch (error) {
    console.error("Error converting HTML to Markdown", { error });
    // Best effort: conversion failures produce an empty document.
    return "";
  }
}
78 | 


--------------------------------------------------------------------------------
/src/utils/index.ts:
--------------------------------------------------------------------------------
1 | import { sleep } from "./sleep";
2 | import { retry } from "./retry";
3 | import { ErrorEmitter } from "./error-emitter";
4 | 
5 | export { sleep, retry, ErrorEmitter };
6 | 


--------------------------------------------------------------------------------
/src/utils/retry.ts:
--------------------------------------------------------------------------------
 1 | import { sleep } from "./sleep";
 2 | export async function retry<T>({
 3 |   func,
 4 |   params,
 5 |   onError,
 6 | }: {
 7 |   func: () => Promise<T>;
 8 |   params?: { retryCount: number };
 9 |   onError?: (...err: Array<unknown>) => void;
10 | }) {
11 |   let err = null;
12 |   const retryCount = params?.retryCount || 3;
13 |   for (let i = 0; i < retryCount; i++) {
14 |     try {
15 |       const resp = await func();
16 |       return resp;
17 |     } catch (error) {
18 |       onError?.(`Retry Attempt: ${i}`, error);
19 |       err = error;
20 |       await sleep(Math.pow(2, i) * 1000);
21 |       continue;
22 |     }
23 |   }
24 |   throw err;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/utils/sleep.ts:
--------------------------------------------------------------------------------
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds,
 * scheduled with `setTimeout`.
 *
 * @param ms Delay in milliseconds.
 */
export const sleep = (ms: number): Promise<void> =>
  new Promise<void>((done) => {
    setTimeout(done, ms);
  });
4 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es2020",
 4 |     "lib": [
 5 |       "es2020",
 6 |       "DOM"
 7 |     ],
 8 |     "module": "commonjs",
 9 |     "moduleResolution": "node",
10 |     "declaration": true,
11 |     "outDir": "./dist",
12 |     "paths": {
13 |       "@hyperbrowser/agent": [
14 |         "./src/index"
15 |       ],
16 |       "@hyperbrowser/agent/types": [
17 |         "./src/types/index"
18 |       ],
19 |       "@hyperbrowser/agent/custom-actions": [
20 |         "./src/custom-actions/index"
21 |       ],
22 |       "@/*": [
23 |         "./src/*"
24 |       ]
25 |     },
26 |     "strict": true,
27 |     "esModuleInterop": true,
28 |     "skipLibCheck": true,
29 |     "forceConsistentCasingInFileNames": true,
30 |     "allowJs": true,
31 |   },
32 |   "include": [
33 |     "src/**/*.ts",
34 |     "src/**/*.js"
35 |   ],
36 |   "exclude": [
37 |     "node_modules",
38 |     "dist",
39 |     "debug"
40 |   ]
41 | }


--------------------------------------------------------------------------------