├── .changeset ├── config.json ├── cyan-symbols-double.md ├── empty-bugs-occur.md ├── fifty-cats-sell.md ├── floppy-experts-wash.md ├── green-signs-live.md ├── short-banks-sit.md ├── solid-rice-admire.md ├── sweet-glasses-hope.md ├── vast-vans-crash.md └── whole-yaks-cheat.md ├── .cursorrules ├── .env.example ├── .github ├── pull_request_template └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .prettierignore ├── .prettierrc ├── .vscode └── settings.json ├── CHANGELOG.md ├── LICENSE ├── README.md ├── docs ├── logging.md ├── media │ ├── chunks.png │ └── stagehand-playwright.png └── release.md ├── eslint.config.mjs ├── evals ├── args.ts ├── assets │ ├── cart.html │ └── peeler.html ├── deterministic │ ├── auxiliary │ │ └── logo.png │ ├── bb.playwright.config.ts │ ├── e2e.playwright.config.ts │ ├── local.playwright.config.ts │ ├── stagehand.config.ts │ └── tests │ │ ├── BrowserContext │ │ ├── addInitScript.test.ts │ │ ├── cookies.test.ts │ │ ├── multiPage.test.ts │ │ ├── page.test.ts │ │ └── routing.test.ts │ │ ├── Errors │ │ └── apiKeyError.test.ts │ │ ├── browserbase │ │ ├── contexts.test.ts │ │ ├── downloads.test.ts │ │ ├── sessions.test.ts │ │ └── uploads.test.ts │ │ ├── local │ │ └── create.test.ts │ │ └── page │ │ ├── addInitScript.test.ts │ │ ├── addRemoveLocatorHandler.test.ts │ │ ├── addTags.test.ts │ │ ├── bringToFront.test.ts │ │ ├── content.test.ts │ │ ├── evaluate.test.ts │ │ ├── expose.test.ts │ │ ├── frames.test.ts │ │ ├── getBy.test.ts │ │ ├── navigation.test.ts │ │ ├── on.test.ts │ │ ├── pageContext.test.ts │ │ ├── reload.test.ts │ │ └── waitFor.test.ts ├── env.ts ├── evals.config.json ├── evaluator.ts ├── index.eval.ts ├── initStagehand.ts ├── llm_clients │ ├── hn_aisdk.ts │ ├── hn_customOpenAI.ts │ └── hn_langchain.ts ├── logger.ts ├── scoring.ts ├── taskConfig.ts ├── tasks │ ├── agent │ │ ├── google_flights.ts │ │ ├── iframe_form.ts │ │ ├── iframe_form_multiple.ts │ │ ├── sf_library_card.ts │ │ └── sf_library_card_multiple.ts │ ├── 
allrecipes.ts │ ├── amazon_add_to_cart.ts │ ├── apple.ts │ ├── arxiv.ts │ ├── bidnet.ts │ ├── checkboxes.ts │ ├── combination_sauce.ts │ ├── costar.ts │ ├── dropdown.ts │ ├── expect_act_timeout.ts │ ├── expedia.ts │ ├── expedia_search.ts │ ├── extract_aigrant_companies.ts │ ├── extract_aigrant_targeted.ts │ ├── extract_aigrant_targeted_2.ts │ ├── extract_apartments.ts │ ├── extract_area_codes.ts │ ├── extract_baptist_health.ts │ ├── extract_capacitor_info.ts │ ├── extract_collaborators.ts │ ├── extract_csa.ts │ ├── extract_geniusee.ts │ ├── extract_geniusee_2.ts │ ├── extract_github_commits.ts │ ├── extract_github_stars.ts │ ├── extract_hamilton_weather.ts │ ├── extract_jfk_links.ts │ ├── extract_jstor_news.ts │ ├── extract_memorial_healthcare.ts │ ├── extract_nhl_stats.ts │ ├── extract_partners.ts │ ├── extract_press_releases.ts │ ├── extract_professional_info.ts │ ├── extract_public_notices.ts │ ├── extract_recipe.ts │ ├── extract_regulations_table.ts │ ├── extract_repo_name.ts │ ├── extract_resistor_info.ts │ ├── extract_rockauto.ts │ ├── extract_single_link.ts │ ├── extract_snowshoeing_destinations.ts │ ├── extract_staff_members.ts │ ├── extract_zillow.ts │ ├── google_flights.ts │ ├── google_jobs.ts │ ├── history.ts │ ├── homedepot.ts │ ├── imdb_movie_details.ts │ ├── instructions.ts │ ├── ionwave.ts │ ├── ionwave_observe.ts │ ├── nextChunk.ts │ ├── nonsense_action.ts │ ├── observe_amazon_add_to_cart.ts │ ├── observe_github.ts │ ├── observe_iframes1.ts │ ├── observe_iframes2.ts │ ├── observe_simple_google_search.ts │ ├── observe_taxes.ts │ ├── observe_vantechjournal.ts │ ├── observe_yc_startup.ts │ ├── panamcs.ts │ ├── peeler_complex.ts │ ├── peeler_simple.ts │ ├── prevChunk.ts │ ├── radio_btn.ts │ ├── rakuten_jp.ts │ ├── sciquest.ts │ ├── scroll_50.ts │ ├── scroll_75.ts │ ├── simple_google_search.ts │ ├── stock_x.ts │ ├── ted_talk.ts │ ├── vanta_h.ts │ ├── vantechjournal.ts │ ├── wichita.ts │ └── wikipedia.ts └── utils.ts ├── examples ├── 2048.ts ├── 
actionable_observe_example.ts ├── ai_sdk_example.ts ├── cua-example.ts ├── debugUrl.ts ├── example.ts ├── external_client.ts ├── external_clients │ ├── aisdk.ts │ ├── customOpenAI.ts │ └── langchain.ts ├── form_filling_sensible.ts ├── form_filling_sensible_cerebras.ts ├── form_filling_sensible_groq.ts ├── google_enter.ts ├── instructions.ts ├── langchain.ts ├── operator-example.ts ├── parameterizeApiKey.ts ├── popup.ts └── try_wordle.ts ├── lib ├── StagehandContext.ts ├── StagehandPage.ts ├── a11y │ └── utils.ts ├── agent │ ├── AgentClient.ts │ ├── AgentProvider.ts │ ├── AnthropicCUAClient.ts │ ├── OpenAICUAClient.ts │ └── StagehandAgent.ts ├── api.ts ├── cache.ts ├── cache │ ├── ActionCache.ts │ ├── BaseCache.ts │ └── LLMCache.ts ├── dom │ ├── DomChunk.ts │ ├── ElementContainer.ts │ ├── GlobalPageContainer.ts │ ├── StagehandContainer.ts │ ├── candidateCollector.ts │ ├── containerFactory.ts │ ├── elementCheckUtils.ts │ ├── genDomScripts.ts │ ├── global.d.ts │ ├── index.ts │ ├── process.ts │ ├── utils.ts │ └── xpathUtils.ts ├── handlers │ ├── actHandler.ts │ ├── agentHandler.ts │ ├── extractHandler.ts │ ├── handlerUtils │ │ └── actHandlerUtils.ts │ ├── observeHandler.ts │ └── operatorHandler.ts ├── index.ts ├── inference.ts ├── inferenceLogUtils.ts ├── llm │ ├── AnthropicClient.ts │ ├── CerebrasClient.ts │ ├── GoogleClient.ts │ ├── GroqClient.ts │ ├── LLMClient.ts │ ├── LLMProvider.ts │ └── OpenAIClient.ts ├── logger.ts ├── prompt.ts └── utils.ts ├── media ├── create-browser-app.gif └── github_demo.gif ├── package-lock.json ├── package.json ├── stagehand.config.ts ├── tsconfig.json └── types ├── act.ts ├── agent.ts ├── api.ts ├── browser.ts ├── context.ts ├── evals.ts ├── evaluator.ts ├── inference.ts ├── llm.ts ├── log.ts ├── model.ts ├── operator.ts ├── page.ts ├── playwright.ts ├── stagehand.ts ├── stagehandApiErrors.ts ├── stagehandErrors.ts └── textannotation.ts /.changeset/config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@2.1.1/schema.json", 3 | "commit": false, 4 | "fixed": [], 5 | "linked": [], 6 | "baseBranch": "main", 7 | "updateInternalDependencies": "patch", 8 | "access": "public", 9 | "changelog": [ 10 | "@changesets/changelog-github", 11 | { 12 | "repo": "browserbase/stagehand" 13 | } 14 | ], 15 | "snapshot": { 16 | "useCalculatedVersion": true, 17 | "prereleaseTemplate": "alpha-{commit}", 18 | "tag": "alpha" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.changeset/cyan-symbols-double.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | Updated the list of OpenAI models (4.1, o3...) 6 | -------------------------------------------------------------------------------- /.changeset/empty-bugs-occur.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | wrap page.evaluate to make sure we have injected browser side scripts before calling them 6 | -------------------------------------------------------------------------------- /.changeset/fifty-cats-sell.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": minor 3 | --- 4 | 5 | extract links 6 | -------------------------------------------------------------------------------- /.changeset/floppy-experts-wash.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | remove unnecessary log 6 | -------------------------------------------------------------------------------- /.changeset/green-signs-live.md: -------------------------------------------------------------------------------- 1 | --- 2 | 
"@browserbasehq/stagehand": patch 3 | --- 4 | 5 | use javascript click instead of playwright 6 | -------------------------------------------------------------------------------- /.changeset/short-banks-sit.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | Fixed the schema input for Gemini's response model 6 | -------------------------------------------------------------------------------- /.changeset/solid-rice-admire.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": minor 3 | --- 4 | 5 | Added Gemini 2.5 Flash to Google supported models 6 | -------------------------------------------------------------------------------- /.changeset/sweet-glasses-hope.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | allow form filling when form is not top-most element 6 | -------------------------------------------------------------------------------- /.changeset/vast-vans-crash.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": patch 3 | --- 4 | 5 | Fixes a redundant unnecessary log 6 | -------------------------------------------------------------------------------- /.changeset/whole-yaks-cheat.md: -------------------------------------------------------------------------------- 1 | --- 2 | "@browserbasehq/stagehand": minor 3 | --- 4 | 5 | Added a new class - Stagehand Evaluator - that wraps around a Stagehand object to determine whether a task is successful or not. 
Currently used for agent evals 6 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="" 2 | CEREBRAS_API_KEY="" 3 | GROQ_API_KEY="" 4 | BROWSERBASE_API_KEY="" 5 | BRAINTRUST_API_KEY="" 6 | ANTHROPIC_API_KEY="" 7 | HEADLESS=false 8 | ENABLE_CACHING=false 9 | EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest" 10 | EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview" 11 | EVAL_CATEGORIES="observe,act,combination,extract,experimental" 12 | STAGEHAND_API_URL="http://localhost:80" 13 | -------------------------------------------------------------------------------- /.github/pull_request_template: -------------------------------------------------------------------------------- 1 | # why 2 | 3 | # what changed 4 | 5 | # test plan 6 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | concurrency: ${{ github.workflow }}-${{ github.ref }} 13 | 14 | jobs: 15 | release: 16 | name: Release 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout Repo 20 | uses: actions/checkout@v3 21 | 22 | - name: Setup Node.js 20.x 23 | uses: actions/setup-node@v3 24 | with: 25 | node-version: 20.x 26 | registry-url: "https://registry.npmjs.org" 27 | 28 | - name: Install dependencies 29 | run: | 30 | rm -rf node_modules 31 | rm -f package-lock.json 32 | npm install 33 | 34 | - name: Build 35 | run: npm run build 36 | 37 | - name: Create Release Pull Request or Publish to npm 38 | id: changesets 39 | uses: changesets/action@v1 40 | with: 41 | publish: npm run release 42 | env: 43 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 44 | NODE_AUTH_TOKEN: 
${{ secrets.NPM_TOKEN }} 45 | 46 | - name: Publish Canary 47 | if: github.ref == 'refs/heads/main' 48 | run: | 49 | npm config set //registry.npmjs.org/:_authToken=${NODE_AUTH_TOKEN} 50 | git checkout main 51 | npm run release-canary 52 | env: 53 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 54 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | /test-results/ 3 | /playwright-report/ 4 | /blob-report/ 5 | /playwright/.cache/ 6 | screenshot.png 7 | .DS_STORE 8 | .cache/ 9 | .env 10 | downloads/ 11 | dist/ 12 | evals/**/public 13 | lib/dom/build/ 14 | evals/public 15 | *.tgz 16 | evals/playground.ts 17 | tmp/ 18 | eval-summary.json 19 | pnpm-lock.yaml 20 | evals/deterministic/tests/BrowserContext/tmp-test.har 21 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | pnpm-lock.yaml 2 | README.md 3 | **/*.json -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "esbenp.prettier-vscode", 3 | "editor.formatOnSave": true 4 | } 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Browserbase Inc. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /docs/media/chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browserbase/stagehand/74f9339bac3404078bfcc47fecda44cc4b0f3876/docs/media/chunks.png -------------------------------------------------------------------------------- /docs/media/stagehand-playwright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browserbase/stagehand/74f9339bac3404078bfcc47fecda44cc4b0f3876/docs/media/stagehand-playwright.png -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import globals from "globals"; 2 | import pluginJs from "@eslint/js"; 3 | import tseslint from "typescript-eslint"; 4 | 5 | /** @type {import('eslint').Linter.Config[]} */ 6 | export default [ 7 | { files: ["**/*.{js,mjs,cjs,ts}"] }, 8 | { languageOptions: { globals: globals.browser } }, 9 | { ignores: ["**/dist/**", "lib/dom/build/**"] }, 10 | pluginJs.configs.recommended, 11 | ...tseslint.configs.recommended, 12 | ]; 13 | -------------------------------------------------------------------------------- /evals/assets/cart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Document 7 | 8 | 9 | 20 |
21 | 22 | 23 | -------------------------------------------------------------------------------- /evals/assets/peeler.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Document 7 | 8 | 9 |

Welcome to Our Page

10 | 11 |
12 |
13 |

Knife Set

14 |

15 | High-quality stainless steel knives for all your cooking needs.my stuff 18 | more stuff 19 |

20 |
21 | 22 |
23 |
24 |
25 |

Peeler

26 |

The ultimate tool for peeling fruits and vegetables.

27 |
28 | 29 |
30 | 31 |
hi world
32 |
33 |

34 | Baseball evolved from older 35 | bat-and-ball games 38 | already being played in England by the mid-18th century. This game was 39 | brought by immigrants to North America, 40 | where the modern version developed. 45 |

46 | 47 | 48 | -------------------------------------------------------------------------------- /evals/deterministic/auxiliary/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browserbase/stagehand/74f9339bac3404078bfcc47fecda44cc4b0f3876/evals/deterministic/auxiliary/logo.png -------------------------------------------------------------------------------- /evals/deterministic/bb.playwright.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig, devices } from "@playwright/test"; 2 | 3 | /** 4 | * See https://playwright.dev/docs/test-configuration. 5 | */ 6 | export default defineConfig({ 7 | testDir: "./tests/browserbase", 8 | 9 | /* Fail the build on CI if you accidentally left test.only in the source code. */ 10 | /* Run tests in files in parallel */ 11 | fullyParallel: true, 12 | /* Reporter to use. See https://playwright.dev/docs/test-reporters */ 13 | // reporter: "html", 14 | reporter: "line", 15 | /* Retry on CI only */ 16 | retries: 2, 17 | 18 | /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ 19 | use: { 20 | /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ 21 | trace: "on-first-retry", 22 | }, 23 | 24 | /* Configure projects for major browsers */ 25 | projects: [ 26 | { 27 | name: "chromium", 28 | use: { ...devices["Desktop Chrome"] }, 29 | }, 30 | ], 31 | }); 32 | -------------------------------------------------------------------------------- /evals/deterministic/e2e.playwright.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig, devices } from "@playwright/test"; 2 | 3 | /** 4 | * See https://playwright.dev/docs/test-configuration. 5 | */ 6 | export default defineConfig({ 7 | // Look in "tests" for test files... 
8 | testDir: "./tests", 9 | // ...but ignore anything in "tests/browserbase" & "tests/local" 10 | testIgnore: ["**/browserbase/**", "**/local/**"], 11 | 12 | /* Fail the build on CI if you accidentally left test.only in the source code. */ 13 | /* Run tests in files in parallel */ 14 | fullyParallel: true, 15 | /* Reporter to use. See https://playwright.dev/docs/test-reporters */ 16 | // reporter: "html", 17 | reporter: "line", 18 | retries: 2, 19 | 20 | /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ 21 | use: { 22 | /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ 23 | trace: "on-first-retry", 24 | }, 25 | 26 | /* Configure projects for major browsers */ 27 | projects: [ 28 | { 29 | name: "chromium", 30 | use: { ...devices["Desktop Chrome"] }, 31 | }, 32 | ], 33 | }); 34 | -------------------------------------------------------------------------------- /evals/deterministic/local.playwright.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig, devices } from "@playwright/test"; 2 | 3 | /** 4 | * See https://playwright.dev/docs/test-configuration. 5 | */ 6 | export default defineConfig({ 7 | testDir: "./tests/local", 8 | 9 | /* Maximum time one test can run for. */ 10 | timeout: 30 * 1000, 11 | 12 | /* Fail the build on CI if you accidentally left test.only in the source code. */ 13 | forbidOnly: !!process.env.CI, 14 | 15 | /* Run tests in files in parallel */ 16 | fullyParallel: false, 17 | 18 | /* Reporter to use */ 19 | reporter: "line", 20 | 21 | /* Retry on CI only */ 22 | retries: process.env.CI ? 2 : 0, 23 | 24 | /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ 25 | use: { 26 | /* Collect trace when retrying the failed test. 
See https://playwright.dev/docs/trace-viewer */ 27 | trace: "on-first-retry", 28 | }, 29 | 30 | /* Configure projects for major browsers */ 31 | projects: [ 32 | { 33 | name: "chromium", 34 | use: { ...devices["Desktop Chrome"] }, 35 | }, 36 | ], 37 | }); 38 | -------------------------------------------------------------------------------- /evals/deterministic/stagehand.config.ts: -------------------------------------------------------------------------------- 1 | import { default as DefaultStagehandConfig } from "@/stagehand.config"; 2 | import type { ConstructorParams } from "@/dist"; 3 | import dotenv from "dotenv"; 4 | dotenv.config({ path: "../../.env" }); 5 | 6 | const StagehandConfig: ConstructorParams = { 7 | ...DefaultStagehandConfig, 8 | env: "LOCAL" /* Environment to run Stagehand in */, 9 | verbose: 1 /* Logging verbosity level (0=quiet, 1=normal, 2=verbose) */, 10 | browserbaseSessionCreateParams: { 11 | projectId: process.env.BROWSERBASE_PROJECT_ID, 12 | }, 13 | enableCaching: false /* Enable caching functionality */, 14 | localBrowserLaunchOptions: { 15 | headless: true /* Run browser in headless mode */, 16 | }, 17 | }; 18 | export default StagehandConfig; 19 | -------------------------------------------------------------------------------- /evals/deterministic/tests/BrowserContext/addInitScript.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandContext - addInitScript", () => { 6 | test("should inject a script on the context before pages load", async () => { 7 | const stagehand = new Stagehand(StagehandConfig); 8 | await stagehand.init(); 9 | 10 | const context = stagehand.context; 11 | 12 | await context.addInitScript(() => { 13 | const w = window as typeof window & { 14 | __testContextScriptVar?: string; 15 | }; 
16 | w.__testContextScriptVar = "Hello from context.initScript!"; 17 | }); 18 | 19 | const pageA = await context.newPage(); 20 | await pageA.goto("https://example.com"); 21 | 22 | const resultA = await pageA.evaluate(() => { 23 | const w = window as typeof window & { 24 | __testContextScriptVar?: string; 25 | }; 26 | return w.__testContextScriptVar; 27 | }); 28 | expect(resultA).toBe("Hello from context.initScript!"); 29 | 30 | const pageB = await context.newPage(); 31 | await pageB.goto("https://docs.browserbase.com"); 32 | 33 | const resultB = await pageB.evaluate(() => { 34 | const w = window as typeof window & { 35 | __testContextScriptVar?: string; 36 | }; 37 | return w.__testContextScriptVar; 38 | }); 39 | expect(resultB).toBe("Hello from context.initScript!"); 40 | 41 | await stagehand.close(); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /evals/deterministic/tests/BrowserContext/cookies.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandContext - Cookies", () => { 6 | let stagehand: Stagehand; 7 | 8 | test.beforeEach(async () => { 9 | stagehand = new Stagehand(StagehandConfig); 10 | await stagehand.init(); 11 | }); 12 | 13 | test.afterEach(async () => { 14 | await stagehand.close(); 15 | }); 16 | 17 | test("should add cookies and retrieve them", async () => { 18 | const context = stagehand.context; // This is the wrapped BrowserContext 19 | const url = "https://example.com"; 20 | 21 | await context.addCookies([ 22 | { 23 | name: "myCookie", 24 | value: "myValue", 25 | domain: "example.com", 26 | path: "/", 27 | expires: Math.floor(Date.now() / 1000) + 3600, 28 | httpOnly: false, 29 | secure: false, 30 | sameSite: "Lax", 31 | }, 32 | ]); 33 | 34 | const cookies = 
await context.cookies(url); 35 | expect(cookies.length).toBeGreaterThan(0); 36 | 37 | const myCookie = cookies.find((c) => c.name === "myCookie"); 38 | expect(myCookie).toBeDefined(); 39 | expect(myCookie?.value).toBe("myValue"); 40 | }); 41 | 42 | test("should clear all cookies", async () => { 43 | const context = stagehand.context; 44 | const url = "https://example.com"; 45 | 46 | await context.addCookies([ 47 | { 48 | name: "myOtherCookie", 49 | value: "anotherValue", 50 | domain: "example.com", 51 | path: "/", 52 | expires: Math.floor(Date.now() / 1000) + 3600, 53 | httpOnly: false, 54 | secure: false, 55 | sameSite: "Lax", 56 | }, 57 | ]); 58 | 59 | const cookiesBefore = await context.cookies(url); 60 | const found = cookiesBefore.some((c) => c.name === "myOtherCookie"); 61 | expect(found).toBe(true); 62 | 63 | await context.clearCookies(); 64 | 65 | const cookiesAfter = await context.cookies(url); 66 | const stillFound = cookiesAfter.some((c) => c.name === "myOtherCookie"); 67 | expect(stillFound).toBe(false); 68 | }); 69 | }); 70 | -------------------------------------------------------------------------------- /evals/deterministic/tests/Errors/apiKeyError.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | import { z } from "zod"; 5 | 6 | test.describe("API key/LLMClient error", () => { 7 | test("Should confirm that we get an error if we call extract without LLM API key or LLMClient", async () => { 8 | const stagehand = new Stagehand(StagehandConfig); 9 | await stagehand.init(); 10 | await stagehand.page.goto("https://docs.browserbase.com/introduction"); 11 | 12 | let errorThrown: Error | null = null; 13 | 14 | try { 15 | await stagehand.page.extract({ 16 | instruction: 17 | "From the introduction page, extract the explanation of what Browserbase is.", 
18 | schema: z.object({ 19 | stars: z.string().describe("the explanation of what Browserbase is"), 20 | }), 21 | }); 22 | } catch (error) { 23 | errorThrown = error as Error; 24 | } 25 | 26 | expect(errorThrown).toBeInstanceOf(Error); 27 | expect(errorThrown?.message).toContain( 28 | "No LLM API key or LLM Client configured", 29 | ); 30 | 31 | await stagehand.close(); 32 | }); 33 | 34 | test("Should confirm that we get an error if we call act without LLM API key or LLMClient", async () => { 35 | const stagehand = new Stagehand(StagehandConfig); 36 | await stagehand.init(); 37 | await stagehand.page.goto("https://docs.browserbase.com/introduction"); 38 | 39 | let errorThrown: Error | null = null; 40 | 41 | try { 42 | await stagehand.page.act({ 43 | action: "Click on the 'Quickstart' section", 44 | }); 45 | } catch (error) { 46 | errorThrown = error as Error; 47 | } 48 | 49 | expect(errorThrown).toBeInstanceOf(Error); 50 | expect(errorThrown?.message).toContain( 51 | "No LLM API key or LLM Client configured", 52 | ); 53 | 54 | await stagehand.close(); 55 | }); 56 | 57 | test("Should confirm that we get an error if we call observe without LLM API key or LLMClient", async () => { 58 | const stagehand = new Stagehand(StagehandConfig); 59 | await stagehand.init(); 60 | await stagehand.page.goto("https://docs.browserbase.com/introduction"); 61 | 62 | let errorThrown: Error | null = null; 63 | 64 | try { 65 | await stagehand.page.observe(); 66 | } catch (error) { 67 | errorThrown = error as Error; 68 | } 69 | 70 | expect(errorThrown).toBeInstanceOf(Error); 71 | expect(errorThrown?.message).toContain( 72 | "No LLM API key or LLM Client configured", 73 | ); 74 | 75 | await stagehand.close(); 76 | }); 77 | }); 78 | -------------------------------------------------------------------------------- /evals/deterministic/tests/browserbase/downloads.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from 
"@playwright/test"; 2 | import AdmZip from "adm-zip"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | import { Stagehand } from "@/dist"; 5 | import Browserbase from "@browserbasehq/sdk"; 6 | 7 | const downloadRe = /sandstorm-(\d{13})+\.mp3/; 8 | 9 | test("Downloads", async () => { 10 | const stagehand = new Stagehand(StagehandConfig); 11 | await stagehand.init(); 12 | const page = stagehand.page; 13 | const context = stagehand.context; 14 | 15 | const client = await context.newCDPSession(page); 16 | await client.send("Browser.setDownloadBehavior", { 17 | behavior: "allow", 18 | // `downloadPath` gets appended to the browser's default download directory. 19 | // set to "downloads", it ends up being "/app/apps/browser/downloads/". 20 | downloadPath: "downloads", 21 | eventsEnabled: true, 22 | }); 23 | 24 | await page.goto("https://browser-tests-alpha.vercel.app/api/download-test"); 25 | 26 | const [download] = await Promise.all([ 27 | page.waitForEvent("download"), 28 | page.locator("#download").click(), 29 | ]); 30 | 31 | const downloadError = await download.failure(); 32 | 33 | await stagehand.close(); 34 | 35 | if (downloadError !== null) { 36 | throw new Error( 37 | `Download for session ${stagehand.browserbaseSessionID} failed: ${downloadError}`, 38 | ); 39 | } 40 | // toPass() retries the callback asynchronously and returns a promise; it must be awaited, otherwise the test ends before the checks below ever run. 41 | await expect(async () => { 42 | const bb = new Browserbase(); 43 | const zipBuffer = await bb.sessions.downloads.list( 44 | stagehand.browserbaseSessionID, 45 | ); 46 | if (!zipBuffer) { 47 | throw new Error( 48 | `Download buffer is empty for session ${stagehand.browserbaseSessionID}`, 49 | ); 50 | } 51 | 52 | const zip = new AdmZip(Buffer.from(await zipBuffer.arrayBuffer())); 53 | const zipEntries = zip.getEntries(); 54 | const mp3Entry = zipEntries.find((entry) => 55 | downloadRe.test(entry.entryName), 56 | ); 57 | 58 | if (!mp3Entry) { 59 | throw new Error( 60 | `Session ${stagehand.browserbaseSessionID} is missing a file matching "${downloadRe.toString()}" in its 
/**
 * Verifies that a Browserbase session created with `keepAlive: true` can be
 * re-attached to, both by handing its session ID to Stagehand and by
 * connecting to its CDP URL.
 */
test.describe("Browserbase Sessions", () => {
  let browserbase: Browserbase;
  let sessionId: string;
  let bigStagehand: Stagehand;

  // Create one keep-alive session, park it on a known URL and remember its ID
  // so that every test below can resume it.
  test.beforeAll(async () => {
    browserbase = new Browserbase({
      apiKey: process.env.BROWSERBASE_API_KEY,
    });
    bigStagehand = new Stagehand({
      ...StagehandConfig,
      env: "BROWSERBASE",
      browserbaseSessionCreateParams: {
        projectId: process.env.BROWSERBASE_PROJECT_ID,
        keepAlive: true,
      },
    });
    await bigStagehand.init();
    await bigStagehand.page.goto(
      "https://docs.stagehand.dev/get_started/introduction",
    );
    sessionId = bigStagehand.browserbaseSessionID;
    if (!sessionId) {
      throw new Error("Failed to get browserbase session ID");
    }
  });

  test.afterAll(async () => {
    // beforeAll may have thrown before the instance was assigned; an
    // unguarded `.close()` would then raise a TypeError that hides the
    // original failure.
    await bigStagehand?.close();
  });

  test("resumes a session via sessionId", async () => {
    const stagehand = new Stagehand({
      ...StagehandConfig,
      env: "BROWSERBASE",
      browserbaseSessionID: sessionId,
    });
    await stagehand.init();

    try {
      expect(stagehand.page.url()).toBe(
        "https://docs.stagehand.dev/get_started/introduction",
      );
    } finally {
      // Release the connection even when the assertion fails.
      await stagehand.close();
    }
  });

  test("resumes a session via CDP URL", async () => {
    const session = await browserbase.sessions.retrieve(sessionId);
    const stagehand = new Stagehand({
      ...StagehandConfig,
      env: "LOCAL",
      localBrowserLaunchOptions: {
        headless: true,
        cdpUrl: session.connectUrl,
      },
    });
    await stagehand.init();

    try {
      expect(stagehand.page.url()).toBe(
        "https://docs.stagehand.dev/get_started/introduction",
      );
    } finally {
      // The original test never closed this second connection.
      await stagehand.close();
    }
  });
});
/**
 * Tests for init-script handling on the Stagehand page:
 *  - a script registered with `addInitScript` is present after every navigation
 *  - Stagehand's own injected helpers are restored after being deleted
 */
test.describe("StagehandPage - addInitScript", () => {
  test("should inject a script before the page loads", async () => {
    const stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();

    try {
      const page = stagehand.page;

      await page.addInitScript(() => {
        const w = window as typeof window & {
          __testInitScriptVar?: string;
        };
        w.__testInitScriptVar = "Hello from init script!";
      });

      // Reads the marker back from the current document. The callback runs in
      // the browser, so it must stay self-contained.
      const readMarker = () =>
        page.evaluate(() => {
          const w = window as typeof window & {
            __testInitScriptVar?: string;
          };
          return w.__testInitScriptVar;
        });

      await page.goto("https://example.com");
      expect(await readMarker()).toBe("Hello from init script!");

      // The init script must run again on a fresh navigation.
      await page.goto("https://docs.browserbase.com/");
      expect(await readMarker()).toBe("Hello from init script!");
    } finally {
      // Close the browser even when a navigation or assertion fails.
      await stagehand.close();
    }
  });

  test("checks if init scripts are re-added and available even if they've been deleted", async () => {
    const stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();

    try {
      const page = stagehand.page;
      await page.goto(
        "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/",
      );

      // delete the __stagehandInjected flag, and delete the
      // getScrollableElementXpaths function
      await page.evaluate(() => {
        delete window.getScrollableElementXpaths;
        delete window.__stagehandInjected;
      });

      // attempt to call the getScrollableElementXpaths function
      // which we previously deleted. page.evaluate should realize
      // it has been deleted and re-inject it
      const xpaths = await page.evaluate(() => {
        return window.getScrollableElementXpaths();
      });

      // this is the only scrollable element on the page
      expect(xpaths).toContain("/html");
    } finally {
      await stagehand.close();
    }
  });
});

Hello, world!

24 | 25 | 26 | `); 27 | 28 | await page.addScriptTag({ 29 | content: ` 30 | window.sayHello = function() { 31 | document.getElementById("greeting").textContent = "Hello from injected script!"; 32 | } 33 | `, 34 | }); 35 | 36 | await page.evaluate(() => { 37 | const w = window as typeof window & { 38 | sayHello?: () => void; 39 | }; 40 | w.sayHello?.(); 41 | }); 42 | 43 | const text = await page.locator("#greeting").textContent(); 44 | expect(text).toBe("Hello from injected script!"); 45 | }); 46 | 47 | test("should inject a style tag and apply styles", async () => { 48 | const { page } = stagehand; 49 | 50 | await page.setContent(` 51 | 52 | 53 |
Some text
/**
 * Exercises `bringToFront` with two pages: the Stagehand page and a second
 * page opened on the same context. Titles are captured while the browser is
 * alive and asserted after it has been closed.
 */
test.describe("StagehandPage - bringToFront", () => {
  test("should bring a background page to the front and allow further actions", async () => {
    const stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();

    const primaryPage = stagehand.page;
    const secondaryPage = await stagehand.context.newPage();

    // Secondary page: load and record its title.
    await secondaryPage.goto("https://example.com");
    const secondaryTitle = await secondaryPage.title();
    console.log("Page2 Title:", secondaryTitle);

    // Primary page: record the title before it is brought to the front.
    await primaryPage.goto("https://www.google.com");
    const primaryTitleBefore = await primaryPage.title();
    console.log("Page1 Title before:", primaryTitleBefore);

    await primaryPage.bringToFront();

    // The primary page must still accept navigation once it is frontmost.
    await primaryPage.goto("https://docs.browserbase.com");
    const primaryTitleAfter = await primaryPage.title();
    console.log("Page1 Title after:", primaryTitleAfter);

    await secondaryPage.bringToFront();
    console.log("Page2 URL before navigation:", secondaryPage.url());

    await stagehand.close();

    expect(primaryTitleBefore).toContain("Google");
    expect(primaryTitleAfter).toContain("Browserbase");
    expect(secondaryTitle).toContain("Example Domain");
  });
});

Example Domain

"); 15 | 16 | await stagehand.close(); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /evals/deterministic/tests/page/evaluate.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandPage - JavaScript Evaluation", () => { 6 | test("can evaluate JavaScript in the page context", async () => { 7 | const stagehand = new Stagehand(StagehandConfig); 8 | await stagehand.init(); 9 | 10 | const page = stagehand.page; 11 | 12 | await page.goto("https://example.com"); 13 | 14 | const sum = await page.evaluate(() => 2 + 2); 15 | expect(sum).toBe(4); 16 | 17 | const pageTitle = await page.evaluate(() => document.title); 18 | expect(pageTitle).toMatch(/example/i); 19 | 20 | const obj = await page.evaluate(() => { 21 | return { 22 | message: "Hello from the browser", 23 | userAgent: navigator.userAgent, 24 | }; 25 | }); 26 | expect(obj).toHaveProperty("message", "Hello from the browser"); 27 | expect(obj.userAgent).toBeDefined(); 28 | 29 | await stagehand.close(); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /evals/deterministic/tests/page/expose.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandPage - evaluateHandle, exposeBinding, exposeFunction", () => { 6 | let stagehand: Stagehand; 7 | 8 | test.beforeAll(async () => { 9 | stagehand = new Stagehand(StagehandConfig); 10 | await stagehand.init(); 11 | }); 12 | 13 | test.afterAll(async () => { 14 | await stagehand.close(); 15 | }); 16 | 17 | 
test("demonstrates evaluateHandle, exposeBinding, and exposeFunction", async () => { 18 | const { page } = stagehand; 19 | 20 | await page.setContent(` 21 | 22 | 23 |
Initial Text
24 | 25 | 26 | `); 27 | 28 | const divHandle = await page.evaluateHandle(() => { 29 | return document.getElementById("myDiv"); 30 | }); 31 | await divHandle.evaluate((div, newText) => { 32 | div.textContent = newText; 33 | }, "Text updated via evaluateHandle"); 34 | 35 | const text = await page.locator("#myDiv").textContent(); 36 | expect(text).toBe("Text updated via evaluateHandle"); 37 | 38 | await page.exposeBinding("myBinding", async (source, arg: string) => { 39 | console.log("myBinding called from page with arg:", arg); 40 | return `Node responded with: I got your message: "${arg}"`; 41 | }); 42 | 43 | const responseFromBinding = await page.evaluate(async () => { 44 | const w = window as typeof window & { 45 | myBinding?: (arg: string) => Promise; 46 | }; 47 | return w.myBinding?.("Hello from the browser"); 48 | }); 49 | expect(responseFromBinding).toMatch(/I got your message/); 50 | 51 | await page.exposeFunction("addNumbers", (a: number, b: number) => { 52 | return a + b; 53 | }); 54 | 55 | const sum = await page.evaluate(async () => { 56 | const w = window as typeof window & { 57 | addNumbers?: (a: number, b: number) => number; 58 | }; 59 | return w.addNumbers?.(3, 7); 60 | }); 61 | expect(sum).toBe(10); 62 | }); 63 | }); 64 | -------------------------------------------------------------------------------- /evals/deterministic/tests/page/frames.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandPage - frame operations", () => { 6 | let stagehand: Stagehand; 7 | 8 | test.beforeAll(async () => { 9 | stagehand = new Stagehand(StagehandConfig); 10 | await stagehand.init(); 11 | }); 12 | 13 | test.afterAll(async () => { 14 | await stagehand.close(); 15 | }); 16 | 17 | test("should use page.mainFrame(), page.frames(), 
page.frame(), and page.frameLocator()", async () => { 18 | const { page } = stagehand; 19 | 20 | await page.setContent(` 21 | 22 | 23 | 27 | 28 | 32 | 33 | 34 | `); 35 | 36 | await page.waitForSelector('iframe[name="frame-one"]'); 37 | await page.waitForSelector('iframe[name="frame-two"]'); 38 | 39 | const frames = page.frames(); 40 | console.log( 41 | "All frames found:", 42 | frames.map((f) => f.name()), 43 | ); 44 | expect(frames).toHaveLength(3); 45 | 46 | const mainFrame = page.mainFrame(); 47 | console.log("Main frame name:", mainFrame.name()); 48 | expect(mainFrame.name()).toBe(""); 49 | 50 | const frameOne = page.frame({ name: "frame-one" }); 51 | expect(frameOne).not.toBeNull(); 52 | 53 | const frameOneText = await frameOne?.locator("h1").textContent(); 54 | expect(frameOneText).toBe("Hello from Frame 1"); 55 | 56 | const frameTwoLocator = page.frameLocator("iframe[name='frame-two']"); 57 | const frameTwoText = await frameTwoLocator.locator("h1").textContent(); 58 | expect(frameTwoText).toBe("Hello from Frame 2"); 59 | 60 | const frameTwo = page.frame({ name: "frame-two" }); 61 | expect(frameTwo).not.toBeNull(); 62 | 63 | const frameTwoTextAgain = await frameTwo?.locator("h1").textContent(); 64 | expect(frameTwoTextAgain).toBe("Hello from Frame 2"); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /evals/deterministic/tests/page/getBy.test.ts: -------------------------------------------------------------------------------- 1 | import { test, expect } from "@playwright/test"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/evals/deterministic/stagehand.config"; 4 | 5 | test.describe("StagehandPage - Built-in locators", () => { 6 | let stagehand: Stagehand; 7 | 8 | test.beforeAll(async () => { 9 | stagehand = new Stagehand(StagehandConfig); 10 | await stagehand.init(); 11 | }); 12 | 13 | test.afterAll(async () => { 14 | await stagehand.close(); 15 | }); 16 | 17 | 
test("demonstrates getByAltText, getByLabel, getByPlaceholder, getByRole, getByTestId, getByText, getByTitle", async () => { 18 | const { page } = stagehand; 19 | await page.setContent(` 20 | 21 | 22 | Profile picture 23 | 24 | 25 | 26 | 27 |
Hello World!
28 |

This is some descriptive text on the page.

29 |

Site Title

/**
 * Checks history navigation on the Stagehand page: two `goto` calls followed
 * by `goBack` and `goForward`, asserting the URL after each step.
 */
test.describe("StagehandPage - Navigation", () => {
  test("should navigate back and forward between pages", async () => {
    const stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();

    try {
      const page = stagehand.page;

      await page.goto("https://example.com");
      expect(page.url()).toBe("https://example.com/");

      await page.goto("https://docs.browserbase.com/introduction");
      expect(page.url()).toBe("https://docs.browserbase.com/introduction");

      await page.goBack();
      expect(page.url()).toBe("https://example.com/");

      await page.goForward();
      expect(page.url()).toBe("https://docs.browserbase.com/introduction");
    } finally {
      // Close the browser even when a navigation or assertion fails, so a
      // red test does not leave a session running.
      await stagehand.close();
    }
  });
});
/**
 * Confirms that the context returned by `page.context()` and
 * `stagehand.context` observe the same cookies and the same set of pages.
 */
test.describe("StagehandPage - page.context()", () => {
  let stagehand: Stagehand;

  test.beforeEach(async () => {
    stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();
  });

  test.afterEach(async () => {
    if (!stagehand) {
      console.log("[afterEach] Stagehand was not defined, skipping close().");
      return;
    }

    // Best-effort teardown: a failing close is reported but not rethrown.
    try {
      await stagehand.close();
    } catch (error) {
      console.error("[afterEach] Error during stagehand.close():", error);
    }
  });

  test("should confirm page.context() and stagehand.context share state", async () => {
    const page = stagehand.page;
    const stagehandContext = stagehand.context;
    const pageContext = page.context();

    // Write a cookie through the page's context ...
    const oneHourFromNow = Math.floor(Date.now() / 1000) + 3600; // 1 hour
    await pageContext.addCookies([
      {
        name: "stagehandTestCookie",
        value: "hello-stagehand",
        domain: "example.com",
        path: "/",
        expires: oneHourFromNow,
        httpOnly: false,
        secure: false,
        sameSite: "Lax",
      },
    ]);

    // ... and read it back through Stagehand's context.
    const storedCookies = await stagehandContext.cookies("https://example.com");
    const testCookie = storedCookies.find(
      (cookie) => cookie.name === "stagehandTestCookie",
    );
    expect(testCookie).toBeDefined();
    expect(testCookie?.value).toBe("hello-stagehand");

    // A page opened via the page's context should be recognized by
    // stagehandContext as well.
    const extraPage = await pageContext.newPage();
    await extraPage.goto("https://example.com");

    const foundExtraPage = stagehandContext
      .pages()
      .find((candidate) => candidate.url() === "https://example.com/");
    expect(foundExtraPage).toBeDefined();
  });
});
/**
 * Scoring function: exactMatch
 *
 * Returns a score of 1 when the task output matches the expectation and 0
 * otherwise.
 *
 * - When `expected` is absent or `true`, the output counts as a match if it
 *   is the boolean `true`, or an object whose `_success` flag is truthy.
 * - For any other `expected` value the output is compared to it with `===`.
 *
 * A missing (`null` / `undefined`) output is scored 0 instead of throwing.
 *
 * @param args - Scorer arguments: the eval input, the task output and the
 *   optional expected value.
 * @returns The "Exact match" score, 0 or 1.
 */
export function exactMatch(
  args: EvalArgs<EvalInput, boolean | { _success: boolean }, unknown>,
): EvalResult {
  console.log(`Task "${args.input.name}" returned: ${args.output}`);

  const expected = args.expected ?? true;
  if (expected === true) {
    // Success is expected: accept a bare `true` or a truthy `_success` flag.
    // Optional chaining keeps a task that produced no output from crashing
    // the scorer.
    const succeeded =
      typeof args.output === "boolean"
        ? args.output
        : Boolean(args.output?._success);
    return {
      name: "Exact match",
      score: succeeded ? 1 : 0,
    };
  }

  // If expected is not true, just directly compare the output to expected.
  return {
    name: "Exact match",
    score: args.output === expected ? 1 : 0,
  };
}

/**
 * Scoring function: errorMatch
 *
 * Reports whether the task output carries an error.
 *
 * @param args - Scorer arguments; only `input.name` and `output` are read.
 * @returns The "Error rate" score: 1 when the output is an object with a
 *   defined `error` property, otherwise 0.
 */
export function errorMatch(
  args: EvalArgs<
    EvalInput,
    boolean | { _success: boolean; error?: unknown },
    unknown
  >,
): EvalResult {
  console.log(`Task "${args.input.name}" returned: ${args.output}`);

  // `typeof null` is "object", so null has to be excluded explicitly before
  // the property access.
  const hasError =
    typeof args.output === "object" &&
    args.output !== null &&
    args.output.error !== undefined;

  return {
    name: "Error rate",
    score: hasError ? 1 : 0,
  };
}
/**
 * Agent eval: fill the name and e-mail inputs of the Tucows abuse form in two
 * separate agent runs, asking the evaluator after each run whether the field
 * is filled.
 *
 * The task succeeds only when both evaluator answers are "YES". An evaluator
 * answer other than "YES" or "NO" ends the task immediately as a failure.
 * The Stagehand instance is closed before every return.
 */
export const iframe_form: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  // Closes the browser and builds the result object shared by every exit.
  const finish = async (_success: boolean, observations: string) => {
    await stagehand.close();
    return {
      _success,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  };

  // The evaluator is asked for a strict response; anything else is invalid.
  const isVerdict = (evaluation: unknown): boolean =>
    evaluation === "YES" || evaluation === "NO";

  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");

  const agent = stagehand.agent({
    // Derive the provider from the model name, as the other agent tasks do,
    // instead of hard-coding "anthropic" for every model.
    provider: modelName.startsWith("claude") ? "anthropic" : "openai",
    model: modelName,
  });

  // Step 1: name field.
  const agentResult = await agent.execute({
    instruction: "Fill in the form name with 'John Smith'",
    maxSteps: 3,
  });
  logger.log(agentResult);

  await stagehand.page.mouse.wheel(0, -1000);
  const evaluator = new Evaluator(stagehand);
  const result = await evaluator.evaluate({
    question: "Is the form name input filled with 'John Smith'?",
    strictResponse: true,
  });

  if (!isVerdict(result.evaluation)) {
    return finish(false, "Evaluator provided an invalid response");
  }

  // Step 2: e-mail field.
  const agentResult2 = await agent.execute({
    instruction: "Fill in the form email with 'john.smith@example.com'",
    maxSteps: 3,
  });
  logger.log(agentResult2);

  await stagehand.page.mouse.wheel(0, -1000);
  const result2 = await evaluator.evaluate({
    question: "Is the form email input filled with 'john.smith@example.com'?",
    strictResponse: true,
  });

  if (!isVerdict(result2.evaluation)) {
    return finish(false, "Evaluator provided an invalid response");
  }

  const allFilled = result.evaluation === "YES" && result2.evaluation === "YES";
  return finish(
    allFilled,
    allFilled
      ? "All fields were filled correctly"
      : "One or more fields were not filled correctly",
  );
};
/**
 * Agent eval: fill the name, e-mail and "domain owner" fields of the Tucows
 * abuse form in a single agent run, then batch-evaluate all three fields.
 *
 * The first evaluator answer that is not "YES" decides the failure: a "NO"
 * reports the evaluator's reasoning, anything else is reported as an invalid
 * response. The Stagehand instance is closed before the result is built.
 */
export const iframe_form_multiple: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");

  const provider = modelName.startsWith("claude") ? "anthropic" : "openai";
  const agent = stagehand.agent({ provider, model: modelName });

  const agentResult = await agent.execute({
    instruction:
      "Fill in the form name with 'John Smith', the email with 'john.smith@example.com', and select the 'Are you the domain owner?' option as 'No'",
    maxSteps: 10,
  });
  logger.log(agentResult);

  await stagehand.page.mouse.wheel(0, -1000);

  const questions = [
    "Is the form name input filled with 'John Smith'?",
    "Is the form email input filled with 'john.smith@example.com'?",
    "Is the 'Are you the domain owner?' option selected as 'No'?",
  ];
  const evaluator = new Evaluator(stagehand);
  const results = await evaluator.batchEvaluate({
    questions,
    strictResponse: true,
  });

  // Results are inspected in order; the earliest non-"YES" answer wins.
  const firstFailure = results.find((r) => r.evaluation !== "YES");

  await stagehand.close();

  if (!firstFailure) {
    return {
      _success: true,
      observations: "All fields were filled correctly",
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  return {
    _success: false,
    observations:
      firstFailure.evaluation === "NO"
        ? firstFailure.reasoning
        : "Evaluator provided an invalid response",
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};
The current page is ${await stagehand.page.title()}`, 17 | }); 18 | 19 | const agentResult = await agent.execute({ 20 | instruction: "Fill in the 'Residential Address' field with '166 Geary St'", 21 | maxSteps: 3, 22 | }); 23 | logger.log(agentResult); 24 | 25 | await stagehand.page.mouse.wheel(0, -1000); 26 | const evaluator = new Evaluator(stagehand); 27 | const result = await evaluator.evaluate({ 28 | question: 29 | "Does the page show the 'Residential Address' field filled with '166 Geary St'?", 30 | strictResponse: true, 31 | }); 32 | 33 | if (result.evaluation !== "YES" && result.evaluation !== "NO") { 34 | await stagehand.close(); 35 | return { 36 | _success: false, 37 | observations: "Evaluator provided an invalid response", 38 | debugUrl, 39 | sessionUrl, 40 | logs: logger.getLogs(), 41 | }; 42 | } 43 | 44 | if (result.evaluation === "YES") { 45 | await stagehand.close(); 46 | return { 47 | _success: true, 48 | observations: result.reasoning, 49 | debugUrl, 50 | sessionUrl, 51 | logs: logger.getLogs(), 52 | }; 53 | } else { 54 | await stagehand.close(); 55 | return { 56 | _success: false, 57 | observations: result.reasoning, 58 | debugUrl, 59 | sessionUrl, 60 | logs: logger.getLogs(), 61 | }; 62 | } 63 | }; 64 | -------------------------------------------------------------------------------- /evals/tasks/agent/sf_library_card_multiple.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { Evaluator } from "../../evaluator"; 3 | 4 | export const sf_library_card_multiple: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | modelName, 10 | }) => { 11 | await stagehand.page.goto("https://sflib1.sfpl.org/selfreg"); 12 | 13 | const agent = stagehand.agent({ 14 | model: modelName, 15 | provider: modelName.startsWith("claude") ? "anthropic" : "openai", 16 | instructions: `You are a helpful assistant that can help me with my tasks. 
You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, 17 | }); 18 | 19 | const agentResult = await agent.execute({ 20 | instruction: 21 | "Fill in ALL the required fields with mock data. DO NOT submit the form", 22 | maxSteps: 20, 23 | }); 24 | logger.log(agentResult); 25 | 26 | const evaluator = new Evaluator(stagehand); 27 | const result = await evaluator.evaluate({ 28 | question: "Does the page show all the required fields filled?", 29 | strictResponse: true, 30 | }); 31 | 32 | if (result.evaluation !== "YES" && result.evaluation !== "NO") { 33 | await stagehand.close(); 34 | return { 35 | _success: false, 36 | observations: "Evaluator provided an invalid response", 37 | debugUrl, 38 | sessionUrl, 39 | logs: logger.getLogs(), 40 | }; 41 | } 42 | 43 | if (result.evaluation === "YES") { 44 | await stagehand.close(); 45 | return { 46 | _success: true, 47 | observations: result.reasoning, 48 | debugUrl, 49 | sessionUrl, 50 | logs: logger.getLogs(), 51 | }; 52 | } else { 53 | await stagehand.close(); 54 | return { 55 | _success: false, 56 | observations: result.reasoning, 57 | debugUrl, 58 | sessionUrl, 59 | logs: logger.getLogs(), 60 | }; 61 | } 62 | }; 63 | -------------------------------------------------------------------------------- /evals/tasks/allrecipes.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const allrecipes: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | await stagehand.page.goto("https://www.allrecipes.com/", { 12 | waitUntil: "domcontentloaded", 13 | }); 14 | 15 | await stagehand.page.act({ 16 | action: 'Type "chocolate chip cookies" in the search bar', 17 | }); 18 | await stagehand.page.act({ 19 | action: "press enter", 20 | }); 21 | 22 | const recipeDetails = 
await stagehand.page.extract({ 23 | instruction: 24 | "Extract the title of the first recipe and the total number of ratings it has received.", 25 | schema: z.object({ 26 | title: z.string().describe("Title of the recipe"), 27 | total_ratings: z 28 | .string() 29 | .describe("Total number of ratings for the recipe"), 30 | }), 31 | useTextExtract, 32 | }); 33 | 34 | await stagehand.close(); 35 | 36 | const { title, total_ratings } = recipeDetails; 37 | const expectedTitle = "Best Chocolate Chip Cookies"; 38 | const expectedRatings = 19164; 39 | 40 | const extractedRatings = parseInt(total_ratings.replace(/[^\d]/g, ""), 10); 41 | const isRatingsWithinRange = 42 | extractedRatings >= expectedRatings - 1000 && 43 | extractedRatings <= expectedRatings + 1000; 44 | 45 | if (title !== expectedTitle || !isRatingsWithinRange) { 46 | const errors = []; 47 | if (title !== expectedTitle) { 48 | errors.push({ 49 | message: "Extracted title does not match the expected title", 50 | expected: expectedTitle, 51 | actual: title, 52 | }); 53 | } 54 | if (!isRatingsWithinRange) { 55 | errors.push({ 56 | message: "Extracted ratings are not within the expected range", 57 | expected: `${expectedRatings} ± 1000`, 58 | actual: extractedRatings.toString(), 59 | }); 60 | } 61 | 62 | logger.error({ 63 | message: "Failed to extract correct recipe details", 64 | level: 0, 65 | auxiliary: { 66 | errors: { 67 | value: JSON.stringify(errors), 68 | type: "object", 69 | }, 70 | }, 71 | }); 72 | 73 | return { 74 | _success: false, 75 | error: "Recipe details extraction validation failed", 76 | logs: logger.getLogs(), 77 | debugUrl, 78 | sessionUrl, 79 | }; 80 | } 81 | 82 | return { 83 | _success: true, 84 | recipeDetails: { 85 | title, 86 | total_ratings: extractedRatings, 87 | }, 88 | logs: logger.getLogs(), 89 | debugUrl, 90 | sessionUrl, 91 | }; 92 | }; 93 | -------------------------------------------------------------------------------- /evals/tasks/amazon_add_to_cart.ts: 
-------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const amazon_add_to_cart: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/", 11 | ); 12 | 13 | await stagehand.page.waitForTimeout(5000); 14 | 15 | await stagehand.page.act({ 16 | action: "click the 'Add to Cart' button", 17 | }); 18 | 19 | await stagehand.page.waitForTimeout(2000); 20 | 21 | await stagehand.page.act({ 22 | action: "click the 'Proceed to checkout' button", 23 | }); 24 | 25 | await stagehand.page.waitForTimeout(2000); 26 | const currentUrl = stagehand.page.url(); 27 | const expectedUrl = 28 | "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html"; 29 | 30 | await stagehand.close(); 31 | 32 | return { 33 | _success: currentUrl === expectedUrl, 34 | currentUrl, 35 | debugUrl, 36 | sessionUrl, 37 | logs: logger.getLogs(), 38 | }; 39 | }; 40 | -------------------------------------------------------------------------------- /evals/tasks/apple.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const apple: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | await stagehand.page.goto("https://www.apple.com/iphone-16-pro/"); 10 | 11 | await stagehand.page.act({ action: "click on the buy button" }); 12 | await stagehand.page.act({ action: "select the Pro Max model" }); 13 | await stagehand.page.act({ action: "select the natural titanium color" }); 14 | await stagehand.page.act({ action: "select the 256GB storage option" }); 15 | await stagehand.page.act({ 16 | action: "click on the 'select a smartphone' trade-in option", 17 | }); 18 | 19 | await stagehand.page.act({ 20 | action: "select the iPhone 13 mini 
model from the dropdown", 21 | }); 22 | await stagehand.page.act({ 23 | action: "select the iPhone 13 mini is in good condition", 24 | }); 25 | 26 | const successMessageLocator = stagehand.page.locator( 27 | 'text="Good News. Your iPhone 13 mini qualifies for credit."', 28 | ); 29 | const isVisible = await successMessageLocator.isVisible(); 30 | 31 | await stagehand.close(); 32 | 33 | return { 34 | _success: isVisible, 35 | debugUrl, 36 | sessionUrl, 37 | logs: logger.getLogs(), 38 | }; 39 | }; 40 | -------------------------------------------------------------------------------- /evals/tasks/bidnet.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const bidnet: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | await stagehand.page.goto("https://www.bidnetdirect.com/"); 10 | 11 | await stagehand.page.act({ 12 | action: 'Click on the "Construction" keyword', 13 | }); 14 | 15 | const expectedUrl = 16 | "https://www.bidnetdirect.com/public/solicitations/open?keywords=Construction"; 17 | const currentUrl = stagehand.page.url(); 18 | 19 | await stagehand.close(); 20 | 21 | return { 22 | _success: currentUrl.startsWith(expectedUrl), 23 | currentUrl, 24 | debugUrl, 25 | sessionUrl, 26 | logs: logger.getLogs(), 27 | }; 28 | }; 29 | -------------------------------------------------------------------------------- /evals/tasks/checkboxes.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const checkboxes: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://browserbase.github.io/stagehand-eval-sites/sites/checkboxes/", 11 | ); 12 | 13 | await stagehand.page.act({ 14 | action: "click the 'baseball' option", 15 | }); 16 | 17 | await stagehand.page.act({ 
18 | action: "click the 'netball' option", 19 | }); 20 | 21 | const baseballChecked = await stagehand.page 22 | .locator('input[type="checkbox"][name="sports"][value="baseball"]') 23 | .isChecked(); 24 | 25 | const netballChecked = await stagehand.page 26 | .locator('input[type="checkbox"][name="sports"][value="netball"]') 27 | .isChecked(); 28 | 29 | await stagehand.close(); 30 | 31 | return { 32 | _success: baseballChecked && netballChecked, 33 | debugUrl, 34 | sessionUrl, 35 | logs: logger.getLogs(), 36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /evals/tasks/combination_sauce.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const combination_sauce: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | try { 12 | await stagehand.page.goto("https://www.saucedemo.com/"); 13 | 14 | const { usernames, password } = await stagehand.page.extract({ 15 | instruction: "extract the accepted usernames and the password for login", 16 | schema: z.object({ 17 | usernames: z.array(z.string()).describe("the accepted usernames"), 18 | password: z.string().describe("the password for login"), 19 | }), 20 | useTextExtract, 21 | }); 22 | 23 | await stagehand.page.act({ 24 | action: `enter username 'standard_user'`, 25 | }); 26 | 27 | await stagehand.page.act({ 28 | action: `enter password '${password}'`, 29 | }); 30 | 31 | await stagehand.page.act({ 32 | action: "click on 'login'", 33 | }); 34 | 35 | const observations = await stagehand.page.observe({ 36 | instruction: "find all the 'add to cart' buttons", 37 | }); 38 | 39 | console.log("observations", observations); 40 | console.log("observations length", observations.length); 41 | 42 | const url = await stagehand.page.url(); 43 | 44 | await stagehand.close(); 45 | 46 | const 
usernamesCheck = usernames.length === 6; 47 | const urlCheck = url === "https://www.saucedemo.com/inventory.html"; 48 | const observationsCheck = observations.length === 6; 49 | 50 | return { 51 | _success: usernamesCheck && urlCheck && observationsCheck, 52 | debugUrl, 53 | sessionUrl, 54 | logs: logger.getLogs(), 55 | }; 56 | } catch (error) { 57 | console.error("Error or timeout occurred:", error); 58 | 59 | await stagehand.close(); 60 | 61 | return { 62 | _success: false, 63 | error: JSON.parse(JSON.stringify(error, null, 2)), 64 | debugUrl, 65 | sessionUrl, 66 | logs: logger.getLogs(), 67 | }; 68 | } 69 | }; 70 | -------------------------------------------------------------------------------- /evals/tasks/costar.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const costar: EvalFunction = async ({ 5 | logger, 6 | debugUrl, 7 | sessionUrl, 8 | stagehand, 9 | useTextExtract, 10 | }) => { 11 | // TODO: fix this eval - does not work in headless mode 12 | try { 13 | await Promise.race([ 14 | stagehand.page.goto("https://www.costar.com/"), 15 | new Promise((_, reject) => 16 | setTimeout(() => reject(new Error("Navigation timeout")), 30000), 17 | ), 18 | ]); 19 | 20 | await stagehand.page.act({ action: "click on the first article" }); 21 | 22 | await stagehand.page.act({ 23 | action: "click on the learn more button for the first job", 24 | }); 25 | 26 | const articleTitle = await stagehand.page.extract({ 27 | instruction: "extract the title of the article", 28 | schema: z.object({ 29 | title: z.string().describe("the title of the article").nullable(), 30 | }), 31 | useTextExtract, 32 | }); 33 | 34 | logger.log({ 35 | message: "got article title", 36 | level: 1, 37 | auxiliary: { 38 | articleTitle: { 39 | value: JSON.stringify(articleTitle), 40 | type: "object", 41 | }, 42 | }, 43 | }); 44 | 45 | // Check if the title is more than 5 
characters 46 | const isTitleValid = 47 | articleTitle.title !== null && articleTitle.title.length > 5; 48 | 49 | await stagehand.close(); 50 | 51 | return { 52 | title: articleTitle.title, 53 | _success: isTitleValid, 54 | debugUrl, 55 | sessionUrl, 56 | logs: logger.getLogs(), 57 | }; 58 | } catch (error) { 59 | logger.error({ 60 | message: "error in costar function", 61 | level: 0, 62 | auxiliary: { 63 | error: { 64 | value: error.message, 65 | type: "string", 66 | }, 67 | trace: { 68 | value: error.stack, 69 | type: "string", 70 | }, 71 | }, 72 | }); 73 | 74 | await stagehand.close(); 75 | 76 | return { 77 | title: null, 78 | _success: false, 79 | debugUrl, 80 | sessionUrl, 81 | logs: logger.getLogs(), 82 | }; 83 | } 84 | }; 85 | -------------------------------------------------------------------------------- /evals/tasks/dropdown.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const dropdown: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://browserbase.github.io/stagehand-eval-sites/sites/dropdown/", 11 | ); 12 | 13 | // click the dropdown element to expand it 14 | const xpath = "xpath=/html/body/div/div/button"; 15 | await stagehand.page.locator(xpath).click(); 16 | 17 | // type into the input box (which should be hidden behind the 18 | // expanded dropdown) 19 | await stagehand.page.act("type 'test fill' into the input field"); 20 | 21 | const input = stagehand.page.locator(`xpath=/html/body/div/input`); 22 | const expectedValue = "test fill"; 23 | 24 | // get the value of the input box 25 | const actualValue = await input.inputValue(); 26 | await stagehand.close(); 27 | 28 | // pass if the value matches expected 29 | return { 30 | _success: actualValue === expectedValue, 31 | expectedValue, 32 | actualValue, 33 | debugUrl, 34 | sessionUrl, 35 | logs: logger.getLogs(), 
36 | }; 37 | }; 38 | -------------------------------------------------------------------------------- /evals/tasks/expect_act_timeout.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const expect_act_timeout: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | await stagehand.page.goto("https://docs.stagehand.dev"); 10 | const result = await stagehand.page.act({ 11 | action: "search for 'Stagehand'", 12 | timeoutMs: 1_000, 13 | }); 14 | 15 | await stagehand.close(); 16 | 17 | return { 18 | _success: !result.success, 19 | debugUrl, 20 | sessionUrl, 21 | logs: logger.getLogs(), 22 | }; 23 | }; 24 | -------------------------------------------------------------------------------- /evals/tasks/expedia.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const expedia: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | try { 10 | await stagehand.page.goto("https://www.expedia.com/flights"); 11 | await stagehand.page.act( 12 | "find round-trip flights from San Francisco (SFO) to Toronto (YYZ) for Jan 1, 2025 (up to one to two weeks)", 13 | ); 14 | await stagehand.page.act("Go to the first non-stop flight"); 15 | await stagehand.page.act("select the cheapest flight"); 16 | await stagehand.page.act("click on the first non-stop flight"); 17 | await stagehand.page.act("Take me to the checkout page"); 18 | 19 | const url = stagehand.page.url(); 20 | return { 21 | _success: url.startsWith("https://www.expedia.com/Checkout/"), 22 | logs: logger.getLogs(), 23 | debugUrl, 24 | sessionUrl, 25 | }; 26 | } catch (error) { 27 | logger.error({ 28 | message: "Error in expedia eval", 29 | level: 0, 30 | auxiliary: { 31 | error: { value: error.message, type: "string" }, 32 | trace: { value: error.stack, 
type: "string" }, 33 | }, 34 | }); 35 | 36 | return { 37 | _success: false, 38 | logs: logger.getLogs(), 39 | debugUrl, 40 | sessionUrl, 41 | }; 42 | } finally { 43 | await stagehand.close(); 44 | } 45 | }; 46 | -------------------------------------------------------------------------------- /evals/tasks/expedia_search.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const expedia_search: EvalFunction = async ({ 4 | logger, 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | }) => { 9 | try { 10 | await stagehand.page.goto("https://www.expedia.com/flights"); 11 | await stagehand.page.act({ 12 | action: 13 | "find round-trip flights from San Francisco (SFO) to Toronto (YYZ) for Jan 1, 2025 (up to one to two weeks)", 14 | }); 15 | 16 | await stagehand.page.act({ action: "Go to the first non-stop flight" }); 17 | 18 | await stagehand.page.act({ action: "select the cheapest flight" }); 19 | 20 | await stagehand.page.act({ action: "click on the first non-stop flight" }); 21 | 22 | await stagehand.page.act({ 23 | action: "Take me to the checkout page", 24 | }); 25 | 26 | const url = stagehand.page.url(); 27 | return { 28 | _success: url.startsWith("https://www.expedia.com/Checkout/"), 29 | logs: logger.getLogs(), 30 | debugUrl, 31 | sessionUrl, 32 | }; 33 | } catch (error) { 34 | logger.error({ 35 | message: `error in expedia function`, 36 | level: 0, 37 | auxiliary: { 38 | error: { 39 | value: JSON.stringify(error, null, 2), 40 | type: "object", 41 | }, 42 | trace: { 43 | value: error.stack, 44 | type: "string", 45 | }, 46 | }, 47 | }); 48 | return { 49 | _success: false, 50 | error: JSON.parse(JSON.stringify(error, null, 2)), 51 | debugUrl, 52 | sessionUrl, 53 | logs: logger.getLogs(), 54 | }; 55 | } finally { 56 | await stagehand.close(); 57 | } 58 | }; 59 | -------------------------------------------------------------------------------- 
/evals/tasks/extract_aigrant_targeted.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const extract_aigrant_targeted: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", 13 | ); 14 | const selector = "/html/body/div/ul[5]/li[28]"; 15 | const company = await stagehand.page.extract({ 16 | instruction: "Extract the company name.", 17 | schema: z.object({ 18 | company_name: z.string(), 19 | }), 20 | useTextExtract, 21 | selector: selector, 22 | }); 23 | 24 | await stagehand.close(); 25 | const companyName = company.company_name; 26 | 27 | const expectedName = { 28 | company_name: "Coframe", 29 | }; 30 | 31 | const nameMatches = companyName == expectedName.company_name; 32 | 33 | if (!nameMatches) { 34 | logger.error({ 35 | message: "extracted company name does not match expected", 36 | level: 0, 37 | auxiliary: { 38 | expected: { 39 | value: expectedName.company_name, 40 | type: "string", 41 | }, 42 | actual: { 43 | value: companyName, 44 | type: "string", 45 | }, 46 | }, 47 | }); 48 | return { 49 | _success: false, 50 | error: "Company name does not match expected", 51 | logs: logger.getLogs(), 52 | debugUrl, 53 | sessionUrl, 54 | }; 55 | } 56 | 57 | return { 58 | _success: true, 59 | logs: logger.getLogs(), 60 | debugUrl, 61 | sessionUrl, 62 | }; 63 | }; 64 | -------------------------------------------------------------------------------- /evals/tasks/extract_aigrant_targeted_2.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const extract_aigrant_targeted_2: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 
| stagehand, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", 13 | ); 14 | const selector = "/html/body/div/ul[5]/li[28]"; 15 | const company = await stagehand.page.extract({ 16 | instruction: "Extract the name of the company that comes after 'Coframe'.", 17 | schema: z.object({ 18 | company_name: z.string(), 19 | }), 20 | useTextExtract, 21 | selector: selector, 22 | }); 23 | 24 | await stagehand.close(); 25 | const companyName = company.company_name; 26 | 27 | // nameWeShouldNotGet matches the name of the company that comes after 28 | // CoFrame on the website. Since we are using targeted_extract here, 29 | // and passing in a selector that does NOT contain the nameWeShouldNotGet, 30 | // the LLM should have no visibility into what comes after 'CoFrame' if 31 | // targeted_extract is performing correctly 32 | const nameWeShouldNotGet = { 33 | company_name: "OpusClip", 34 | }; 35 | 36 | const nameMatches = companyName == nameWeShouldNotGet.company_name; 37 | 38 | if (nameMatches) { 39 | logger.error({ 40 | message: 41 | "extracted company name matches the company name that we SHOULD NOT get", 42 | level: 0, 43 | auxiliary: { 44 | expected: { 45 | value: nameWeShouldNotGet.company_name, 46 | type: "string", 47 | }, 48 | actual: { 49 | value: companyName, 50 | type: "string", 51 | }, 52 | }, 53 | }); 54 | return { 55 | _success: false, 56 | error: 57 | "extracted company name matches the company name that we SHOULD NOT get", 58 | logs: logger.getLogs(), 59 | debugUrl, 60 | sessionUrl, 61 | }; 62 | } 63 | 64 | return { 65 | _success: true, 66 | logs: logger.getLogs(), 67 | debugUrl, 68 | sessionUrl, 69 | }; 70 | }; 71 | -------------------------------------------------------------------------------- /evals/tasks/extract_apartments.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "../../types/evals"; 3 
| 4 | export const extract_apartments: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://www.apartments.com/san-francisco-ca/2-bedrooms/", 13 | ); 14 | const apartment_listings = await stagehand.page.extract({ 15 | instruction: 16 | "Extract all the apartment listings with their prices and their addresses.", 17 | schema: z.object({ 18 | listings: z.array( 19 | z.object({ 20 | price: z.string().describe("The price of the listing"), 21 | trails: z.string().describe("The address of the listing"), 22 | }), 23 | ), 24 | }), 25 | useTextExtract, 26 | }); 27 | 28 | await stagehand.close(); 29 | const listings = apartment_listings.listings; 30 | const expectedLength = 40; 31 | 32 | if (listings.length < expectedLength) { 33 | logger.error({ 34 | message: "Incorrect number of listings extracted", 35 | level: 0, 36 | auxiliary: { 37 | expected: { 38 | value: expectedLength.toString(), 39 | type: "integer", 40 | }, 41 | actual: { 42 | value: listings.length.toString(), 43 | type: "integer", 44 | }, 45 | }, 46 | }); 47 | return { 48 | _success: false, 49 | error: "Incorrect number of listings extracted", 50 | logs: logger.getLogs(), 51 | debugUrl, 52 | sessionUrl, 53 | }; 54 | } 55 | 56 | return { 57 | _success: true, 58 | logs: logger.getLogs(), 59 | debugUrl, 60 | sessionUrl, 61 | }; 62 | }; 63 | -------------------------------------------------------------------------------- /evals/tasks/extract_baptist_health.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { compareStrings } from "@/evals/utils"; 3 | import { z } from "zod"; 4 | 5 | export const extract_baptist_health: EvalFunction = async ({ 6 | logger, 7 | useTextExtract, 8 | debugUrl, 9 | sessionUrl, 10 | stagehand, 11 | }) => { 12 | await stagehand.page.goto( 13 | 
"https://browserbase.github.io/stagehand-eval-sites/sites/baptist-health/", 14 | ); 15 | 16 | const result = await stagehand.page.extract({ 17 | instruction: 18 | "Extract the address, phone number, and fax number of the healthcare location.", 19 | schema: z.object({ 20 | address: z.string(), 21 | phone: z.string(), 22 | fax: z.string(), 23 | }), 24 | useTextExtract, 25 | }); 26 | 27 | await stagehand.close(); 28 | 29 | const { address, phone, fax } = result; 30 | const expected = { 31 | address: "2055 East South Blvd; Suite 908 Montgomery, AL 36116", 32 | phone: "334-747-2273", 33 | fax: "334-747-7501", 34 | }; 35 | 36 | const similarityThreshold = 0.85; 37 | const failedFields: Array<{ 38 | field: string; 39 | similarity: number; 40 | expected: string; 41 | actual: string; 42 | }> = []; 43 | 44 | const compareField = ( 45 | actualVal: string, 46 | expectedVal: string, 47 | fieldName: string, 48 | ) => { 49 | const { similarity, meetsThreshold } = compareStrings( 50 | actualVal, 51 | expectedVal, 52 | similarityThreshold, 53 | ); 54 | 55 | if (!meetsThreshold) { 56 | failedFields.push({ 57 | field: fieldName, 58 | similarity, 59 | expected: expectedVal, 60 | actual: actualVal, 61 | }); 62 | logger.error({ 63 | message: `${fieldName} extracted does not meet similarity threshold`, 64 | level: 0, 65 | auxiliary: { 66 | field: { value: fieldName, type: "string" }, 67 | similarity: { value: similarity.toFixed(2), type: "string" }, 68 | expected: { value: expectedVal, type: "string" }, 69 | actual: { value: actualVal, type: "string" }, 70 | }, 71 | }); 72 | } 73 | 74 | return meetsThreshold; 75 | }; 76 | 77 | const addressOk = compareField(address, expected.address, "Address"); 78 | const phoneOk = compareField(phone, expected.phone, "Phone number"); 79 | const faxOk = compareField(fax, expected.fax, "Fax number"); 80 | 81 | if (!addressOk || !phoneOk || !faxOk) { 82 | return { 83 | _success: false, 84 | error: "Some fields did not meet similarity threshold", 85 | logs: 
logger.getLogs(), 86 | debugUrl, 87 | sessionUrl, 88 | failedFields, 89 | }; 90 | } 91 | 92 | return { 93 | _success: true, 94 | logs: logger.getLogs(), 95 | debugUrl, 96 | sessionUrl, 97 | }; 98 | }; 99 | -------------------------------------------------------------------------------- /evals/tasks/extract_collaborators.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const extract_collaborators: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | try { 12 | await stagehand.page.goto("https://github.com/facebook/react"); 13 | await stagehand.page.act({ 14 | action: "find and click the contributors section", 15 | }); 16 | 17 | await stagehand.page.waitForLoadState("domcontentloaded"); 18 | await stagehand.page.waitForLoadState("networkidle"); 19 | await stagehand.page.waitForTimeout(5000); 20 | 21 | const { contributors } = await stagehand.page.extract({ 22 | instruction: "Extract top 5 contributors of this repository", 23 | schema: z.object({ 24 | contributors: z.array( 25 | z.object({ 26 | github_username: z 27 | .string() 28 | .describe("the github username of the contributor"), 29 | commits: z.number().describe("number of commits contributed"), 30 | }), 31 | ), 32 | }), 33 | useTextExtract, 34 | }); 35 | 36 | await stagehand.close(); 37 | 38 | const EXPECTED_CONTRIBUTORS = [ 39 | "zpao", 40 | "gaearon", 41 | "sebmarkbage", 42 | "acdlite", 43 | "sophiebits", 44 | ]; 45 | return { 46 | _success: 47 | contributors.length === EXPECTED_CONTRIBUTORS.length && 48 | contributors.every( 49 | (c, i) => 50 | EXPECTED_CONTRIBUTORS[i] === c.github_username && c.commits >= 1000, 51 | ), 52 | contributors, 53 | debugUrl, 54 | sessionUrl, 55 | logs: logger.getLogs(), 56 | }; 57 | } catch (error) { 58 | console.error("Error or timeout occurred:", error); 59 | 60 | await 
stagehand.close(); 61 | 62 | return { 63 | _success: false, 64 | error: JSON.parse(JSON.stringify(error, null, 2)), 65 | debugUrl, 66 | sessionUrl, 67 | logs: logger.getLogs(), 68 | }; 69 | } 70 | }; 71 | -------------------------------------------------------------------------------- /evals/tasks/extract_geniusee.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const extract_geniusee: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/", 13 | ); 14 | const selector = "/html/body/main/div[2]/div[2]/div[2]/table"; 15 | const scalability = await stagehand.page.extract({ 16 | instruction: 17 | "Extract the scalability comment in the table for Gemini (Google)", 18 | schema: z.object({ 19 | scalability: z.string(), 20 | }), 21 | useTextExtract, 22 | selector: selector, 23 | }); 24 | 25 | await stagehand.close(); 26 | const scalabilityComment = scalability.scalability; 27 | 28 | const expectedScalabilityComment = { 29 | scalability: "Scalable architecture with API access", 30 | }; 31 | 32 | const commentMatches = 33 | scalabilityComment == expectedScalabilityComment.scalability; 34 | 35 | if (!commentMatches) { 36 | logger.error({ 37 | message: "extracted scalability comment does not match expected", 38 | level: 0, 39 | auxiliary: { 40 | expected: { 41 | value: expectedScalabilityComment.scalability, 42 | type: "string", 43 | }, 44 | actual: { 45 | value: scalabilityComment, 46 | type: "string", 47 | }, 48 | }, 49 | }); 50 | return { 51 | _success: false, 52 | error: "extracted scalability comment does not match expected", 53 | logs: logger.getLogs(), 54 | debugUrl, 55 | sessionUrl, 56 | }; 57 | } 58 | 59 | return { 60 | _success: true, 61 | logs: logger.getLogs(), 62 | debugUrl, 
63 | sessionUrl, 64 | }; 65 | }; 66 | -------------------------------------------------------------------------------- /evals/tasks/extract_geniusee_2.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const extract_geniusee_2: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/", 13 | ); 14 | const selector = "/html/body/main/div[2]/div[2]/div[2]/table/tbody/tr[9]"; 15 | const scalability = await stagehand.page.extract({ 16 | instruction: 17 | "Extract the scalability comment in the table for Gemini (Google)", 18 | schema: z.object({ 19 | scalability: z.string(), 20 | }), 21 | useTextExtract, 22 | selector: selector, 23 | }); 24 | 25 | await stagehand.close(); 26 | const scalabilityComment = scalability.scalability; 27 | 28 | // scalabilityCommentWeShouldNotGet matches a scalability comment in the table, 29 | // but since we are using targeted_extract here, 30 | // and passing in a selector that does NOT contain the scalabilityCommentWeShouldNotGet, 31 | // the LLM should have no visibility into scalabilityCommentWeShouldNotGet if 32 | // targeted_extract is performing correctly 33 | const scalabilityCommentWeShouldNotGet = { 34 | scalability: "Scalable architecture with API access", 35 | }; 36 | 37 | const commentMatches = 38 | scalabilityComment == scalabilityCommentWeShouldNotGet.scalability; 39 | 40 | if (commentMatches) { 41 | logger.error({ 42 | message: 43 | "extracted scalability comment matches the scalability comment that we SHOULD NOT get", 44 | level: 0, 45 | auxiliary: { 46 | expected: { 47 | value: scalabilityCommentWeShouldNotGet.scalability, 48 | type: "string", 49 | }, 50 | actual: { 51 | value: scalabilityComment, 52 | type: "string", 53 | }, 54 | }, 55 | 
}); 56 | return { 57 | _success: false, 58 | error: 59 | "scalability comment matches the scalability comment that we SHOULD NOT get", 60 | logs: logger.getLogs(), 61 | debugUrl, 62 | sessionUrl, 63 | }; 64 | } 65 | 66 | return { 67 | _success: true, 68 | logs: logger.getLogs(), 69 | debugUrl, 70 | sessionUrl, 71 | }; 72 | }; 73 | -------------------------------------------------------------------------------- /evals/tasks/extract_github_commits.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const extract_github_commits: EvalFunction = async ({ 5 | logger, 6 | useTextExtract, 7 | debugUrl, 8 | sessionUrl, 9 | stagehand, 10 | }) => { 11 | try { 12 | await stagehand.page.goto("https://github.com/facebook/react"); 13 | 14 | await stagehand.page.act({ 15 | action: 16 | "find commit history, generally described by the number of commits", 17 | }); 18 | const { commits } = await stagehand.page.extract({ 19 | instruction: "Extract last 20 commits", 20 | schema: z.object({ 21 | commits: z.array( 22 | z.object({ 23 | commit_message: z.string(), 24 | commit_url: z.string(), 25 | commit_hash: z.string(), 26 | }), 27 | ), 28 | }), 29 | useTextExtract, 30 | }); 31 | 32 | logger.log({ 33 | message: "Extracted commits", 34 | level: 1, 35 | auxiliary: { 36 | commits: { 37 | value: JSON.stringify(commits), 38 | type: "object", 39 | }, 40 | }, 41 | }); 42 | 43 | await stagehand.close(); 44 | 45 | return { 46 | _success: commits.length === 20, 47 | commits, 48 | debugUrl, 49 | sessionUrl, 50 | logs: logger.getLogs(), 51 | }; 52 | } catch (error) { 53 | console.error("Error or timeout occurred:", error); 54 | 55 | await stagehand.close(); 56 | 57 | return { 58 | _success: false, 59 | error: JSON.parse(JSON.stringify(error, null, 2)), 60 | debugUrl, 61 | sessionUrl, 62 | logs: logger.getLogs(), 63 | }; 64 | } 65 | }; 66 | 
--------------------------------------------------------------------------------
/evals/tasks/extract_github_stars.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Extracts the star count of facebook/react and compares it with the value
 * shown in the page's `#repo-stars-counter-star` element.
 *
 * A counter ending in "k" (case-insensitive) is read as thousands. The eval
 * passes when the extracted number is within 1000 of that value.
 */
export const extract_github_stars: EvalFunction = async ({
  logger,
  useTextExtract,
  debugUrl,
  sessionUrl,
  stagehand,
}) => {
  try {
    await stagehand.page.goto("https://github.com/facebook/react");

    const extracted = await stagehand.page.extract({
      instruction: "Extract the number of stars for the project",
      schema: z.object({
        stars: z.number().describe("the number of stars for the project"),
      }),
      useTextExtract,
    });
    const stars = extracted.stars;

    const counterLocator = stagehand.page
      .locator("#repo-stars-counter-star")
      .first();
    const counterText = await counterLocator.innerHTML();

    // Convert the abbreviated counter (e.g. "12.3k") into a plain number.
    let expectedStars: number;
    if (counterText.toLowerCase().endsWith("k")) {
      expectedStars = parseFloat(counterText.slice(0, -1)) * 1000;
    } else {
      expectedStars = parseFloat(counterText);
    }

    const maxDelta = 1000;
    const delta = Math.abs(stars - expectedStars);
    const isWithinTolerance = delta <= maxDelta;

    await stagehand.close();

    return {
      _success: isWithinTolerance,
      stars,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    console.error("Error or timeout occurred:", error);

    await stagehand.close();

    return {
      _success: false,
      error: JSON.parse(JSON.stringify(error, null, 2)),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

--------------------------------------------------------------------------------
/evals/tasks/extract_hamilton_weather.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Targeted extract of one weather-table row from the hamilton-weather eval
 * site. Passes only when all six extracted fields equal the expected strings
 * exactly.
 */
export const extract_hamilton_weather: EvalFunction = async ({
  logger,
  useTextExtract,
  debugUrl,
  sessionUrl,
  stagehand,
}) => {
  try {
    // XPath of the table that extraction is restricted to.
    const tableXpath =
      "/html/body[1]/div[5]/main[1]/article[1]/div[6]/div[2]/div[1]/table[1]";

    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/hamilton-weather/",
    );

    const weatherSchema = z.object({
      temperature: z.string(),
      weather_description: z.string(),
      wind: z.string(),
      humidity: z.string(),
      barometer: z.string(),
      visibility: z.string(),
    });

    const weatherData = await stagehand.page.extract({
      instruction: "extract the weather data for Sun, Feb 23 at 11PM",
      schema: weatherSchema,
      useTextExtract,
      selector: tableXpath,
    });

    // Values the targeted row is expected to contain.
    const expectedWeatherData = {
      temperature: "27 °F",
      weather_description: "Light snow. Overcast.",
      wind: "6 mph",
      humidity: "93%",
      barometer: '30.07 "Hg',
      visibility: "10 mi",
    };

    // Every listed field must match its expected value exactly.
    const comparedFields = [
      "temperature",
      "weather_description",
      "wind",
      "humidity",
      "barometer",
      "visibility",
    ] as const;
    const isWeatherCorrect = comparedFields.every(
      (field) => weatherData[field] === expectedWeatherData[field],
    );

    await stagehand.close();

    return {
      _success: isWeatherCorrect,
      weatherData,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    console.error("Error or timeout occurred:", error);

    await stagehand.close();

    return {
      _success: false,
      error: JSON.parse(JSON.stringify(error, null, 2)),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

--------------------------------------------------------------------------------
/evals/tasks/extract_partners.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Navigates ramp.com to its partners page through three `act` steps and
 * extracts the partner categories.
 *
 * Passes when every expected category appears among the extracted ones
 * (case-insensitive comparison). Errors are logged through `logger` and
 * reported as `_success: false`.
 */
export const extract_partners: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  useTextExtract,
}) => {
  try {
    await stagehand.page.goto("https://ramp.com");

    await stagehand.page.act({
      action: "move down to the bottom of the page.",
    });

    await stagehand.page.act({
      action: "Close the popup.",
    });

    await stagehand.page.act({
      action: "Find and click on the link that leads to the partners page.",
    });

    const partners = await stagehand.page.extract({
      instruction: `
      Extract all of the partner categories on the page.
    `,
      schema: z.object({
        partners: z.array(
          z.object({
            partner_category: z.string().describe("The partner category"),
          }),
        ),
        explanation: z
          .string()
          .optional()
          .describe("Any explanation about partner listing or absence thereof"),
      }),
      useTextExtract,
    });

    const expectedPartners = [
      "Accounting Partners",
      "Private Equity & Venture Capital Partners",
      "Services Partners",
      "Affiliates",
    ];

    // Lower-case both sides so the comparison ignores capitalisation.
    const foundPartners = partners.partners.map((partner) =>
      partner.partner_category.toLowerCase(),
    );

    const allExpectedPartnersFound = expectedPartners.every((partner) =>
      foundPartners.includes(partner.toLowerCase()),
    );

    await stagehand.close();

    return {
      _success: allExpectedPartnersFound,
      partners,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    logger.error({
      message: "error in extractPartners function",
      level: 0,
      auxiliary: {
        error: {
          value: error.message,
          type: "string",
        },
        trace: {
          value: error.stack,
          type: "string",
        },
      },
    });

    await stagehand.close();

    return {
      _success: false,
      debugUrl,
      sessionUrl,
      error: JSON.parse(JSON.stringify(error, null, 2)),
      logs: logger.getLogs(),
    };
  }
};

--------------------------------------------------------------------------------
/evals/tasks/extract_regulations_table.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Targeted extract of the allottee table on the ncc-numbering-plan eval site.
 *
 * Passes when the first entry, the last entry, and the total entry count all
 * match the expected values. Entries are compared through their
 * `JSON.stringify` output, so property order matters as well as values.
 */
export const extract_regulations_table: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  useTextExtract,
}) => {
  try {
    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/ncc-numbering-plan/",
    );

    const xpath =
      "/html/body/div[3]/main/div[2]/div[2]/div/div/div[2]/article/div[2]/div[1]/div/table";

    const allottees = await stagehand.page.extract({
      instruction:
        "Extract ALL of the Allottees and their corresponding name, area, and area code.",
      schema: z.object({
        allottee_list: z.array(
          z.object({
            allottee_name: z.string(),
            area: z.string(),
            area_code: z.string(),
            access_code: z.string(),
          }),
        ),
      }),
      useTextExtract,
      selector: xpath,
    });

    // Define the expected first and last allottee entries
    const allottees_expected_first = {
      allottee_name: "101 Communications Limited",
      area: "Lagos",
      area_code: "0201",
      access_code: "249",
    };

    const allottees_expected_last = {
      allottee_name: "Airtel Networks Limited",
      area: "National",
      area_code: "0708",
      access_code: "708",
    };

    const expected_length = 25;

    const allotteeList = allottees.allottee_list;

    // Check that the first entry, last entry, and total number match expectations
    const isFirstCorrect =
      JSON.stringify(allotteeList[0]) ===
      JSON.stringify(allottees_expected_first);
    const isLastCorrect =
      JSON.stringify(allotteeList[allotteeList.length - 1]) ===
      JSON.stringify(allottees_expected_last);
    const isLengthCorrect = allotteeList.length === expected_length;

    const isRegulationsCorrect =
      isFirstCorrect && isLastCorrect && isLengthCorrect;

    await stagehand.close();

    return {
      _success: isRegulationsCorrect,
      regulationsData: allottees,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    console.error("Error or timeout occurred:", error);

    await stagehand.close();

    return {
      _success: false,
      error: JSON.parse(JSON.stringify(error, null, 2)),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

--------------------------------------------------------------------------------
/evals/tasks/extract_repo_name.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";

/**
 * Schema-less extract: asks for the repository title of facebook/react using
 * only an instruction string and passes when the result is exactly "react".
 */
export const extract_repo_name: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  try {
    await stagehand.page.goto("https://github.com/facebook/react");

    const { extraction } = await stagehand.page.extract(
      "extract the title of the Github repository. Do not include the owner of the repository.",
    );

    logger.log({
      message: "Extracted repo title",
      level: 1,
      auxiliary: {
        repo_name: {
          value: extraction,
          // `extraction` is compared to the string "react" below, so label it
          // as a string; "object" is used elsewhere only for JSON-encoded values.
          type: "string",
        },
      },
    });

    await stagehand.close();

    return {
      _success: extraction === "react",
      extraction,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    console.error("Error or timeout occurred:", error);

    await stagehand.close();

    return {
      _success: false,
      error: JSON.parse(JSON.stringify(error, null, 2)),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

--------------------------------------------------------------------------------
/evals/tasks/extract_rockauto.ts:
--------------------------------------------------------------------------------
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Extracts the part numbers of the economy coolant/antifreeze products from
 * the rockauto eval site.
 *
 * Passes when exactly five products are extracted and every expected part
 * number is among them. Each failure mode is logged with expected and actual
 * details.
 */
export const extract_rockauto: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  useTextExtract,
}) => {
  await stagehand.page.goto(
    "https://browserbase.github.io/stagehand-eval-sites/sites/rockauto/",
  );
  // Fixed 5 s pause before extracting.
  await new Promise((resolve) => setTimeout(resolve, 5000));
  const result = await stagehand.page.extract({
    instruction:
      "Extract the part number of all the coolant and antifreeze products in the 'economy' category. " +
      "Do not include the manufacturer name. Do not include products from the premium category.",
    schema: z.object({
      coolant_products: z.array(
        z.object({
          part_number: z.string(),
        }),
      ),
    }),
    useTextExtract,
  });

  await stagehand.close();

  const coolantProducts = result.coolant_products;
  const expectedPartNumbers = [
    "GREEN5050GAL",
    "719009",
    "AF3300",
    "AF3100",
    "MV5050GAL",
  ];
  const expectedLength = expectedPartNumbers.length;

  if (coolantProducts.length !== expectedLength) {
    logger.error({
      message: "Incorrect number of coolant products extracted",
      level: 0,
      auxiliary: {
        expected: {
          value: expectedLength.toString(),
          type: "integer",
        },
        actual: {
          value: coolantProducts.length.toString(),
          type: "integer",
        },
      },
    });
    return {
      _success: false,
      error: "Incorrect number of coolant products extracted",
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  // Expected part numbers that do not appear in the extraction.
  const missingParts = expectedPartNumbers.filter(
    (expectedPart) =>
      !coolantProducts.some((p) => p.part_number === expectedPart),
  );

  if (missingParts.length > 0) {
    logger.error({
      message: "Missing expected part number(s)",
      level: 0,
      auxiliary: {
        missingParts: {
          value: JSON.stringify(missingParts),
          type: "object",
        },
        actualExtracted: {
          value: JSON.stringify(coolantProducts),
          type: "object",
        },
      },
    });
    return {
      _success: false,
      error: `One or more expected part numbers were not found: ${missingParts.join(", ")}`,
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: true,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
  };
};

--------------------------------------------------------------------------------
/evals/tasks/extract_single_link.ts:
-------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const extract_single_link: EvalFunction = async ({ 5 | logger, 6 | debugUrl, 7 | sessionUrl, 8 | stagehand, 9 | }) => { 10 | try { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/", 13 | ); 14 | 15 | const extraction = await stagehand.page.extract({ 16 | instruction: "extract the link to the 'contact us' page", 17 | schema: z.object({ 18 | link: z.string().url(), 19 | }), 20 | }); 21 | 22 | await stagehand.close(); 23 | const extractedLink = extraction.link; 24 | const expectedLink = 25 | "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/#contact"; 26 | 27 | if (extractedLink === expectedLink) { 28 | return { 29 | _success: true, 30 | debugUrl, 31 | sessionUrl, 32 | logs: logger.getLogs(), 33 | }; 34 | } 35 | return { 36 | _success: false, 37 | reason: `Extracted link: ${extractedLink} does not match expected link: ${expectedLink}`, 38 | debugUrl, 39 | sessionUrl, 40 | logs: logger.getLogs(), 41 | }; 42 | } catch (error) { 43 | await stagehand.close(); 44 | return { 45 | _success: false, 46 | error: JSON.parse(JSON.stringify(error, null, 2)), 47 | debugUrl, 48 | sessionUrl, 49 | logs: logger.getLogs(), 50 | }; 51 | } 52 | }; 53 | -------------------------------------------------------------------------------- /evals/tasks/extract_snowshoeing_destinations.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const extract_snowshoeing_destinations: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) => { 11 | try { 12 | await stagehand.page.goto( 13 | "https://www.cbisland.com/blog/10-snowshoeing-adventures-on-cape-breton-island/", 14 | ); 15 | 
16 | await stagehand.page.act({ action: "accept the cookies" }); 17 | 18 | const snowshoeing_regions = await stagehand.page.extract({ 19 | instruction: 20 | "Extract all the snowshoeing regions and the names of the trails within each region.", 21 | schema: z.object({ 22 | snowshoeing_regions: z.array( 23 | z.object({ 24 | region_name: z 25 | .string() 26 | .describe("The name of the snowshoeing region"), 27 | trails: z 28 | .array( 29 | z.object({ 30 | trail_name: z.string().describe("The name of the trail"), 31 | }), 32 | ) 33 | .describe("The list of trails available in this region."), 34 | }), 35 | ), 36 | }), 37 | useTextExtract, 38 | }); 39 | 40 | logger.log({ 41 | message: "Extracted destinations and trails", 42 | level: 1, 43 | auxiliary: { 44 | destinations: { 45 | value: JSON.stringify(snowshoeing_regions), 46 | type: "object", 47 | }, 48 | }, 49 | }); 50 | 51 | await stagehand.close(); 52 | 53 | const _success = snowshoeing_regions.snowshoeing_regions.length === 10; 54 | 55 | return { 56 | _success, 57 | snowshoeing_regions, 58 | debugUrl, 59 | sessionUrl, 60 | logs: logger.getLogs(), 61 | }; 62 | } catch (error) { 63 | logger.error({ 64 | message: "Error in extract_snowshoeing_destinations function", 65 | level: 0, 66 | auxiliary: { 67 | error: { 68 | value: error.message, 69 | type: "string", 70 | }, 71 | trace: { 72 | value: error.stack, 73 | type: "string", 74 | }, 75 | }, 76 | }); 77 | return { 78 | _success: false, 79 | error: JSON.parse(JSON.stringify(error, null, 2)), 80 | debugUrl, 81 | sessionUrl, 82 | logs: logger.getLogs(), 83 | }; 84 | } finally { 85 | await stagehand.context.close().catch(() => {}); 86 | } 87 | }; 88 | -------------------------------------------------------------------------------- /evals/tasks/extract_zillow.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { EvalFunction } from "../../types/evals"; 3 | 4 | export const extract_zillow: EvalFunction = 
async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://browserbase.github.io/stagehand-eval-sites/sites/zillow/", 13 | ); 14 | // timeout for 5 seconds 15 | await stagehand.page.waitForTimeout(5000); 16 | const real_estate_listings = await stagehand.page.extract({ 17 | instruction: 18 | "Extract EACH AND EVERY HOME PRICE AND ADDRESS ON THE PAGE. DO NOT MISS ANY OF THEM.", 19 | schema: z.object({ 20 | listings: z.array( 21 | z.object({ 22 | price: z.string().describe("The price of the home"), 23 | trails: z.string().describe("The address of the home"), 24 | }), 25 | ), 26 | }), 27 | useTextExtract, 28 | }); 29 | 30 | await stagehand.close(); 31 | const listings = real_estate_listings.listings; 32 | const expectedLength = 38; 33 | 34 | if (listings.length < expectedLength) { 35 | logger.error({ 36 | message: "Incorrect number of listings extracted", 37 | level: 0, 38 | auxiliary: { 39 | expected: { 40 | value: expectedLength.toString(), 41 | type: "integer", 42 | }, 43 | actual: { 44 | value: listings.length.toString(), 45 | type: "integer", 46 | }, 47 | }, 48 | }); 49 | return { 50 | _success: false, 51 | error: "Incorrect number of listings extracted", 52 | logs: logger.getLogs(), 53 | debugUrl, 54 | sessionUrl, 55 | }; 56 | } 57 | 58 | return { 59 | _success: true, 60 | logs: logger.getLogs(), 61 | debugUrl, 62 | sessionUrl, 63 | }; 64 | }; 65 | -------------------------------------------------------------------------------- /evals/tasks/google_flights.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { ObserveResult } from "@/types/stagehand"; 3 | 4 | /** 5 | * This eval attempts to click on an element that should not pass the playwright actionability check 6 | * which happens by default if you call locator.click (more information here: 7 | * 
https://playwright.dev/docs/actionability) 8 | * 9 | * If this eval passes, it means that we have correctly set {force: true} in performPlaywrightMethod, 10 | * and the click was successful even though the target element (found by the xpath) did not 11 | * pass the actionability check. 12 | */ 13 | 14 | export const google_flights: EvalFunction = async ({ 15 | debugUrl, 16 | sessionUrl, 17 | stagehand, 18 | logger, 19 | }) => { 20 | await stagehand.page.goto( 21 | "https://browserbase.github.io/stagehand-eval-sites/sites/google-flights/", 22 | ); 23 | 24 | const observeResult: ObserveResult = { 25 | selector: 26 | "xpath=/html/body/c-wiz[2]/div/div[2]/c-wiz/div[1]/c-wiz/div[2]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/div/div[1]", 27 | description: "the first departing flight", 28 | method: "click", 29 | arguments: [], 30 | }; 31 | await stagehand.page.act(observeResult); 32 | 33 | const expectedUrl = 34 | "https://browserbase.github.io/stagehand-eval-sites/sites/google-flights/return-flight.html"; 35 | const currentUrl = stagehand.page.url(); 36 | 37 | await stagehand.close(); 38 | 39 | if (currentUrl === expectedUrl) { 40 | return { 41 | _success: true, 42 | currentUrl, 43 | debugUrl, 44 | sessionUrl, 45 | logs: logger.getLogs(), 46 | }; 47 | } 48 | return { 49 | _success: false, 50 | error: "The current URL does not match expected.", 51 | logs: logger.getLogs(), 52 | debugUrl, 53 | sessionUrl, 54 | }; 55 | }; 56 | -------------------------------------------------------------------------------- /evals/tasks/history.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const history: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://docs.stagehand.dev"); 10 | 11 | await stagehand.page.act("click on the 'Quickstart' tab"); 12 | 13 | await stagehand.page.extract("Extract the title of the 
page"); 14 | 15 | await stagehand.page.observe("Find all links on the page"); 16 | 17 | const history = stagehand.history; 18 | 19 | const hasCorrectNumberOfEntries = history.length === 4; 20 | 21 | const hasNavigateEntry = history[0].method === "navigate"; 22 | const hasActEntry = history[1].method === "act"; 23 | const hasExtractEntry = history[2].method === "extract"; 24 | const hasObserveEntry = history[3].method === "observe"; 25 | 26 | const allEntriesHaveTimestamps = history.every( 27 | (entry) => 28 | typeof entry.timestamp === "string" && entry.timestamp.length > 0, 29 | ); 30 | const allEntriesHaveResults = history.every( 31 | (entry) => entry.result !== undefined, 32 | ); 33 | 34 | await stagehand.close(); 35 | 36 | const success = 37 | hasCorrectNumberOfEntries && 38 | hasNavigateEntry && 39 | hasActEntry && 40 | hasExtractEntry && 41 | hasObserveEntry && 42 | allEntriesHaveTimestamps && 43 | allEntriesHaveResults; 44 | 45 | return { 46 | _success: success, 47 | debugUrl, 48 | sessionUrl, 49 | logs: logger.getLogs(), 50 | }; 51 | }; 52 | -------------------------------------------------------------------------------- /evals/tasks/homedepot.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const homedepot: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) => { 11 | try { 12 | await stagehand.page.goto("https://www.homedepot.com/"); 13 | await stagehand.page.act("search for gas grills"); 14 | await stagehand.page.act("click on the best selling gas grill"); 15 | await stagehand.page.act("click on the Product Details"); 16 | await stagehand.page.act("find the Primary Burner BTU"); 17 | 18 | const productSpecs = await stagehand.page.extract({ 19 | instruction: "Extract the Primary exact Burner BTU of the product", 20 | schema: z.object({ 21 | productSpecs: z 22 | .array( 23 
| z.object({ 24 | burnerBTU: z.string().describe("Primary Burner BTU exact value"), 25 | }), 26 | ) 27 | .describe("Gas grill Primary Burner BTU exact value"), 28 | }), 29 | useTextExtract, 30 | }); 31 | 32 | logger.log({ 33 | message: `gas grill primary burner BTU`, 34 | level: 1, 35 | auxiliary: { 36 | productSpecs: { 37 | value: JSON.stringify(productSpecs), 38 | type: "object", 39 | }, 40 | }, 41 | }); 42 | 43 | if ( 44 | !productSpecs || 45 | !productSpecs.productSpecs || 46 | productSpecs.productSpecs.length !== 1 47 | ) { 48 | await stagehand.close(); 49 | 50 | return { 51 | _success: false, 52 | productSpecs, 53 | debugUrl, 54 | sessionUrl, 55 | logs: logger.getLogs(), 56 | }; 57 | } 58 | 59 | const hasFourZerosAndOne4 = 60 | (productSpecs.productSpecs[0].burnerBTU.match(/0/g) || []).length === 4 && 61 | (productSpecs.productSpecs[0].burnerBTU.match(/4/g) || []).length === 1; 62 | 63 | await stagehand.close(); 64 | 65 | return { 66 | _success: hasFourZerosAndOne4, 67 | productSpecs, 68 | debugUrl, 69 | sessionUrl, 70 | logs: logger.getLogs(), 71 | }; 72 | } catch (error) { 73 | logger.error({ 74 | message: "error in homedepot function", 75 | level: 0, 76 | auxiliary: { 77 | error: { 78 | value: error.message, 79 | type: "string", 80 | }, 81 | trace: { 82 | value: error.stack, 83 | type: "string", 84 | }, 85 | }, 86 | }); 87 | 88 | await stagehand.close(); 89 | 90 | return { 91 | _success: false, 92 | error: JSON.parse(JSON.stringify(error, null, 2)), 93 | debugUrl, 94 | sessionUrl, 95 | logs: logger.getLogs(), 96 | }; 97 | } 98 | }; 99 | -------------------------------------------------------------------------------- /evals/tasks/imdb_movie_details.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const imdb_movie_details: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) 
=> { 11 | await stagehand.page.goto("https://www.imdb.com/title/tt0111161/", { 12 | waitUntil: "domcontentloaded", 13 | }); 14 | await stagehand.page.act({ 15 | action: "click on the movie ratings", 16 | }); 17 | 18 | const movieDetails = await stagehand.page.extract({ 19 | instruction: "Extract the list of countries with the most ratings.", 20 | schema: z.object({ 21 | countries: z 22 | .array(z.string()) 23 | .describe("List of countries with the most ratings"), 24 | }), 25 | useTextExtract, 26 | }); 27 | 28 | await stagehand.close(); 29 | 30 | const expectedCountries = [ 31 | "United States", 32 | "United Kingdom", 33 | "Turkey", 34 | "India", 35 | "Germany", 36 | ]; 37 | 38 | if (!movieDetails.countries || movieDetails.countries.length !== 5) { 39 | logger.error({ 40 | message: "Failed to extract exactly five countries", 41 | level: 0, 42 | auxiliary: { 43 | expected: { 44 | value: JSON.stringify(expectedCountries), 45 | type: "object", 46 | }, 47 | actual: { 48 | value: JSON.stringify(movieDetails.countries || []), 49 | type: "object", 50 | }, 51 | }, 52 | }); 53 | 54 | return { 55 | _success: false, 56 | error: "Incorrect number of countries extracted", 57 | logs: logger.getLogs(), 58 | debugUrl, 59 | sessionUrl, 60 | }; 61 | } 62 | 63 | const missingCountries = expectedCountries.filter( 64 | (country) => !movieDetails.countries.includes(country), 65 | ); 66 | 67 | if (missingCountries.length > 0) { 68 | logger.error({ 69 | message: "Extracted countries do not match expected countries", 70 | level: 0, 71 | auxiliary: { 72 | missing: { 73 | value: JSON.stringify(missingCountries), 74 | type: "object", 75 | }, 76 | extracted: { 77 | value: JSON.stringify(movieDetails.countries), 78 | type: "object", 79 | }, 80 | }, 81 | }); 82 | 83 | return { 84 | _success: false, 85 | error: "Extracted countries do not match expected countries", 86 | logs: logger.getLogs(), 87 | debugUrl, 88 | sessionUrl, 89 | }; 90 | } 91 | 92 | return { 93 | _success: true, 94 | countries: 
movieDetails.countries, 95 | logs: logger.getLogs(), 96 | debugUrl, 97 | sessionUrl, 98 | }; 99 | }; 100 | -------------------------------------------------------------------------------- /evals/tasks/instructions.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const instructions: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | try { 10 | const page = stagehand.page; 11 | 12 | await page.goto("https://docs.browserbase.com/"); 13 | 14 | await page.act({ 15 | action: "secret12345", 16 | }); 17 | 18 | await page.waitForLoadState("domcontentloaded"); 19 | 20 | const url = page.url(); 21 | 22 | const isCorrectUrl = 23 | url === "https://docs.browserbase.com/introduction/what-is-browserbase"; 24 | 25 | return { 26 | _success: isCorrectUrl, 27 | debugUrl, 28 | sessionUrl, 29 | logs: logger.getLogs(), 30 | }; 31 | } catch (error) { 32 | console.error("Error or timeout occurred:", error); 33 | 34 | return { 35 | _success: false, 36 | error: JSON.parse(JSON.stringify(error, null, 2)), 37 | debugUrl, 38 | sessionUrl, 39 | logs: logger.getLogs(), 40 | }; 41 | } finally { 42 | await stagehand.close(); 43 | } 44 | }; 45 | -------------------------------------------------------------------------------- /evals/tasks/ionwave.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const ionwave: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://elpasotexas.ionwave.net/Login.aspx"); 10 | 11 | await stagehand.page.act({ 12 | action: 'Click on "Closed Bids"', 13 | }); 14 | 15 | const expectedUrl = 16 | "https://elpasotexas.ionwave.net/SourcingEvents.aspx?SourceType=2"; 17 | const currentUrl = stagehand.page.url(); 18 | 19 | await stagehand.close(); 20 | 21 | return 
{ 22 | _success: currentUrl.startsWith(expectedUrl), 23 | currentUrl, 24 | debugUrl, 25 | sessionUrl, 26 | logs: logger.getLogs(), 27 | }; 28 | }; 29 | -------------------------------------------------------------------------------- /evals/tasks/ionwave_observe.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const ionwave_observe: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://elpasotexas.ionwave.net/Login.aspx"); 10 | 11 | const observations = await stagehand.page.observe({ onlyVisible: true }); 12 | 13 | if (observations.length === 0) { 14 | await stagehand.close(); 15 | return { 16 | _success: false, 17 | observations, 18 | debugUrl, 19 | sessionUrl, 20 | logs: logger.getLogs(), 21 | }; 22 | } 23 | 24 | const expectedLocator = `div.rowLinks:nth-child(27) > div:nth-child(1) > a:nth-child(1)`; 25 | 26 | const expectedResult = await stagehand.page 27 | .locator(expectedLocator) 28 | .first() 29 | .innerText(); 30 | 31 | let foundMatch = false; 32 | for (const observation of observations) { 33 | try { 34 | const observationResult = await stagehand.page 35 | .locator(observation.selector) 36 | .first() 37 | .innerText(); 38 | 39 | if (observationResult === expectedResult) { 40 | foundMatch = true; 41 | break; 42 | } 43 | } catch (error) { 44 | console.warn( 45 | `Failed to check observation with selector ${observation.selector}:`, 46 | error.message, 47 | ); 48 | continue; 49 | } 50 | } 51 | 52 | await stagehand.close(); 53 | 54 | return { 55 | _success: foundMatch, 56 | expected: expectedResult, 57 | observations, 58 | debugUrl, 59 | sessionUrl, 60 | logs: logger.getLogs(), 61 | }; 62 | }; 63 | -------------------------------------------------------------------------------- /evals/tasks/nextChunk.ts: -------------------------------------------------------------------------------- 1 
import { Stagehand } from "@/dist";
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: asks Stagehand to scroll the apartments.com "all filters" modal down
 * by one chunk and checks that the modal's `scrollTop` grew by roughly one
 * container height (within 20px).
 *
 * The task builds its own Stagehand instance from `stagehandConfig` (with
 * `domSettleTimeoutMs` forced to 3000) and closes it even when a step throws.
 *
 * @returns `_success: true` only when the container could be measured and the
 *   scroll delta is within the tolerance; `message` explains the outcome.
 */
export const nextChunk: EvalFunction = async ({
  logger,
  stagehandConfig,
  debugUrl,
  sessionUrl,
}) => {
  const stagehand = new Stagehand({
    ...stagehandConfig,
    domSettleTimeoutMs: 3000,
  });
  await stagehand.init();

  let initialScrollTop = 0;
  let chunkHeight = 0;
  let newScrollTop = 0;

  try {
    await stagehand.page.goto("https://www.apartments.com/san-francisco-ca/");
    await stagehand.page.act({
      action: "click on the all filters button",
    });

    ({ initialScrollTop, chunkHeight } = await stagehand.page.evaluate(() => {
      const container = document.querySelector<HTMLElement>(
        "#advancedFilters > div",
      );
      if (!container) {
        console.warn(
          "Could not find #advancedFilters > div. Returning 0 for measurements.",
        );
        return { initialScrollTop: 0, chunkHeight: 0 };
      }
      return {
        initialScrollTop: container.scrollTop,
        chunkHeight: container.getBoundingClientRect().height,
      };
    }));

    await stagehand.page.act({
      action: "scroll down one chunk on the filters modal",
    });

    // Give the scroll time to finish before measuring again.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    newScrollTop = await stagehand.page.evaluate(() => {
      const container = document.querySelector<HTMLElement>(
        "#advancedFilters > div",
      );
      return container?.scrollTop ?? 0;
    });
  } finally {
    // The browser is owned by this task, so release it on every path.
    await stagehand.close();
  }

  // A missing (or zero-height) container yields 0/0/0 measurements, which
  // would otherwise satisfy the threshold check and report a false success.
  if (chunkHeight <= 0) {
    return {
      _success: false,
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
      message:
        "Could not measure the filters modal: #advancedFilters > div was not found or has no height.",
    };
  }

  const actualDiff = newScrollTop - initialScrollTop;
  const threshold = 20; // allowable difference in px
  const scrolledOneChunk = Math.abs(actualDiff - chunkHeight) <= threshold;

  return {
    _success: scrolledOneChunk,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
    message: scrolledOneChunk
      ? `Successfully scrolled ~one chunk: expected ~${chunkHeight}, got ${actualDiff}`
      : `Scroll difference expected ~${chunkHeight} but only scrolled ${actualDiff}.`,
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/nonsense_action.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Converts a caught value into plain data that survives serialisation.
 * `Error` fields are non-enumerable, so `JSON.stringify(error)` alone yields `{}`.
 */
const toLoggableError = (error: unknown) => {
  if (error instanceof Error) {
    return { name: error.name, message: error.message, stack: error.stack };
  }
  try {
    return JSON.parse(JSON.stringify(error));
  } catch {
    // e.g. `undefined` or a circular structure
    return String(error);
  }
};

/**
 * Eval: gives `act` an instruction that is not an action ("what is the capital
 * of the moon?") on homedepot.com and passes when `act` reports failure.
 *
 * Any thrown error is reported as a failed eval; the Stagehand instance is
 * closed in `finally`.
 */
export const nonsense_action: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  try {
    await stagehand.page.goto("https://www.homedepot.com/");

    const result = await stagehand.page.act({
      action: "what is the capital of the moon?",
    });

    return {
      _success: !result.success, // We expect this to fail
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`Error in nonsense_action function: ${message}`);
    return {
      _success: false,
      error: toLoggableError(error),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_amazon_add_to_cart.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: on the hosted Amazon replica, observes the "Add to Cart" and then the
 * "Proceed to checkout" buttons, replays the first suggestion of each through
 * `act`, and passes when the page ends up on the sign-in URL.
 */
export const observe_amazon_add_to_cart: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto(
    "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/",
  );

  await stagehand.page.waitForTimeout(5000);

  const addToCartCandidates = await stagehand.page.observe({
    instruction: "Find and click the 'Add to Cart' button",
    onlyVisible: false,
    returnAction: true,
  });

  console.log(addToCartCandidates);

  // Replay the first observed suggestion by handing it straight to act().
  if (addToCartCandidates.length > 0) {
    await stagehand.page.act(addToCartCandidates[0]);
  }

  await stagehand.page.waitForTimeout(2000);

  const checkoutCandidates = await stagehand.page.observe({
    instruction: "Find and click the 'Proceed to checkout' button",
  });

  // Same replay step for the checkout button.
  if (checkoutCandidates.length > 0) {
    await stagehand.page.act(checkoutCandidates[0]);
  }
  await stagehand.page.waitForTimeout(2000);

  const currentUrl = stagehand.page.url();
  const expectedUrlPrefix =
    "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html";

  await stagehand.close();

  return {
    _success: currentUrl.startsWith(expectedUrlPrefix),
    currentUrl,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_github.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: observes "the scrollable element that holds the repos file tree" on a
 * GitHub tree page and passes when any observation resolves to the same DOM
 * node as one of the known candidate selectors.
 *
 * @returns `matchedLocator` is the candidate selector that matched, or `null`.
 */
export const observe_github: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto("https://github.com/numpy/numpy/tree/main/numpy");

  const observations = await stagehand.page.observe({
    instruction: "find the scrollable element that holds the repos file tree.",
  });

  if (observations.length === 0) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const possibleLocators = [
    `#repos-file-tree > div.Box-sc-g0xbh4-0.jbQqON > div > div > div > nav > ul`,
    `#repos-file-tree > div.Box-sc-g0xbh4-0.jbQqON > div > div > div > nav`,
    `#repos-file-tree > div.Box-sc-g0xbh4-0.jbQqON`,
  ];

  const possibleHandles = [];
  for (const locatorStr of possibleLocators) {
    const locator = stagehand.page.locator(locatorStr).first();
    // elementHandle() waits for a match and throws on timeout, so a candidate
    // that is simply absent must be skipped up front rather than awaited.
    if ((await locator.count()) === 0) {
      continue;
    }
    const handle = await locator.elementHandle();
    if (handle) {
      possibleHandles.push({ locatorStr, handle });
    }
  }

  let foundMatch = false;
  let matchedLocator: string | null = null;

  for (const observation of observations) {
    try {
      const observationHandle = await stagehand.page
        .locator(observation.selector)
        .first()
        .elementHandle();
      if (!observationHandle) {
        continue;
      }

      for (const { locatorStr, handle: candidateHandle } of possibleHandles) {
        // Identity comparison runs in the page so both handles are real nodes.
        const isSameNode = await observationHandle.evaluate(
          (node, otherNode) => node === otherNode,
          candidateHandle,
        );
        if (isSameNode) {
          foundMatch = true;
          matchedLocator = locatorStr;
          break;
        }
      }

      if (foundMatch) {
        break;
      }
    } catch (error: unknown) {
      // A bad selector from one observation should not abort the whole eval.
      console.warn(
        `Failed to check observation with selector ${observation.selector}:`,
        error instanceof Error ? error.message : error,
      );
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    matchedLocator,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_iframes1.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: observes "the main header of the page" on the Tucows phishing form and
 * passes when any observation resolves to the same DOM node as one of the
 * known candidates (the embedded iframe or the hero `h1`).
 *
 * @returns `matchedLocator` is the candidate selector that matched, or `null`.
 */
export const observe_iframes1: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");

  const observations = await stagehand.page.observe({
    instruction: "find the main header of the page",
  });

  if (observations.length === 0) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const possibleLocators = [
    `#primary > div.singlePage > section > div > div > article > div > iframe`,
    `#primary > div.heroBanner > section > div > h1`,
  ];

  const possibleHandles = [];
  for (const locatorStr of possibleLocators) {
    const locator = stagehand.page.locator(locatorStr).first();
    // elementHandle() waits for a match and throws on timeout, so a candidate
    // that is simply absent must be skipped up front rather than awaited.
    if ((await locator.count()) === 0) {
      continue;
    }
    const handle = await locator.elementHandle();
    if (handle) {
      possibleHandles.push({ locatorStr, handle });
    }
  }

  let foundMatch = false;
  let matchedLocator: string | null = null;

  for (const observation of observations) {
    try {
      const observationHandle = await stagehand.page
        .locator(observation.selector)
        .first()
        .elementHandle();
      if (!observationHandle) {
        continue;
      }

      for (const { locatorStr, handle: candidateHandle } of possibleHandles) {
        // Identity comparison runs in the page so both handles are real nodes.
        const isSameNode = await observationHandle.evaluate(
          (node, otherNode) => node === otherNode,
          candidateHandle,
        );
        if (isSameNode) {
          foundMatch = true;
          matchedLocator = locatorStr;
          break;
        }
      }

      if (foundMatch) {
        break;
      }
    } catch (error: unknown) {
      // A bad selector from one observation should not abort the whole eval.
      console.warn(
        `Failed to check observation with selector ${observation.selector}:`,
        error instanceof Error ? error.message : error,
      );
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    matchedLocator,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_simple_google_search.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";
import { performPlaywrightMethod } from "@/lib/a11y/utils";

/**
 * Eval: drives a Google search for "OpenAI" using only `observe` suggestions
 * executed through `performPlaywrightMethod`, and passes when the final URL
 * starts with the expected search URL.
 */
export const observe_simple_google_search: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  /**
   * Observes with `returnAction: true`, logs the suggestions, and executes the
   * first one (if any) directly against the page.
   */
  const observeAndPerform = async (instruction: string) => {
    const suggestions = await stagehand.page.observe({
      instruction,
      onlyVisible: false,
      returnAction: true,
    });
    console.log(suggestions);

    if (suggestions.length > 0) {
      const action = suggestions[0];
      await performPlaywrightMethod(
        stagehand.page,
        stagehand.logger,
        action.method,
        action.arguments,
        // The selector is passed on without its "xpath=" prefix.
        action.selector.replace("xpath=", ""),
      );
    }
  };

  await stagehand.page.goto("https://www.google.com");

  await observeAndPerform("Find the search bar and enter 'OpenAI'");
  await stagehand.page.waitForTimeout(5000);

  await observeAndPerform("Click the search button in the suggestions dropdown");
  await stagehand.page.waitForTimeout(5000);

  const expectedUrl = "https://www.google.com/search?q=OpenAI";
  const currentUrl = stagehand.page.url();

  await stagehand.close();

  return {
    _success: currentUrl.startsWith(expectedUrl),
    currentUrl,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_taxes.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/** Fewest observations accepted for the 'Income' section before the eval fails. */
const MIN_INCOME_OBSERVATIONS = 13;

/**
 * Eval: observes the form inputs under the 'Income' section of the 1040.com
 * estimator. Fails outright when fewer than 13 observations come back;
 * otherwise passes when some observation's inner text equals the inner text
 * of `#tpWages`.
 */
export const observe_taxes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto("https://file.1040.com/estimate/");

  const observations = await stagehand.page.observe({
    instruction: "Find all the form input elements under the 'Income' section",
  });

  // Covers the empty case as well: zero observations is also below the minimum.
  if (observations.length < MIN_INCOME_OBSERVATIONS) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const expectedLocator = `#tpWages`;

  // NOTE(review): matching is by inner text, not node identity; confirm that
  // #tpWages yields a distinctive value.
  const expectedResult = await stagehand.page
    .locator(expectedLocator)
    .first()
    .innerText();

  let foundMatch = false;
  for (const observation of observations) {
    try {
      const observationResult = await stagehand.page
        .locator(observation.selector)
        .first()
        .innerText();

      if (observationResult === expectedResult) {
        foundMatch = true;
        break;
      }
    } catch (error: unknown) {
      // A bad selector from one observation should not abort the whole eval.
      console.warn(
        `Failed to check observation with selector ${observation.selector}:`,
        error instanceof Error ? error.message : error,
      );
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    expected: expectedResult,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_vantechjournal.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: on page 8 of the Vantech Journal archive, observes "the button that
 * takes us to the 11th page" and passes when any observation resolves to the
 * same DOM node as `a.rounded-lg:nth-child(8)`.
 *
 * @returns `expected` is the expected selector string (plain data, safe to
 *   serialise after the browser is closed).
 */
export const observe_vantechjournal: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto("https://vantechjournal.com/archive?page=8");
  await stagehand.page.waitForTimeout(1000);

  const observations = await stagehand.page.observe({
    instruction: "find the button that takes us to the 11th page",
  });

  if (observations.length === 0) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const expectedLocator = `a.rounded-lg:nth-child(8)`;

  // Resolve the expected node once. elementHandle() waits and throws when
  // nothing matches, so doing this per observation would cost one timeout each.
  let expectedHandle = null;
  try {
    expectedHandle = await stagehand.page
      .locator(expectedLocator)
      .elementHandle();
  } catch (error: unknown) {
    console.warn(
      `Failed to resolve expected locator ${expectedLocator}:`,
      error instanceof Error ? error.message : error,
    );
  }

  let foundMatch = false;

  if (expectedHandle) {
    for (const observation of observations) {
      try {
        const observationHandle = await stagehand.page
          .locator(observation.selector)
          .first()
          .elementHandle();

        if (!observationHandle) {
          // Couldn't get a handle for this observation, skip it
          continue;
        }

        // Identity comparison runs in the page so both handles are real nodes.
        const isSameNode = await observationHandle.evaluate(
          (node, otherNode) => node === otherNode,
          expectedHandle,
        );

        if (isSameNode) {
          foundMatch = true;
          break;
        }
      } catch (error: unknown) {
        // A bad selector from one observation should not abort the whole eval.
        console.warn(
          `Failed to check observation with selector ${observation.selector}:`,
          error instanceof Error ? error.message : error,
        );
      }
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    // Report the selector text, not the live Locator object.
    expected: expectedLocator,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/observe_yc_startup.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: on the YC companies directory, observes the container that holds the
 * startup links and passes when any observation resolves to the same DOM node
 * as one of the known candidate selectors.
 *
 * @returns `matchedLocator` is the candidate selector that matched, or `null`.
 */
export const observe_yc_startup: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto("https://www.ycombinator.com/companies");
  await stagehand.page.waitForLoadState("networkidle");

  const observations = await stagehand.page.observe({
    instruction:
      "Click the container element that holds links to each of the startup companies. The companies each have a name, a description, and a link to their website.",
  });

  if (observations.length === 0) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const possibleLocators = [
    `div._rightCol_i9oky_592`,
    `div._section_i9oky_163._results_i9oky_343`,
  ];

  const possibleHandles = [];
  for (const locatorStr of possibleLocators) {
    const locator = stagehand.page.locator(locatorStr).first();
    // elementHandle() waits for a match and throws on timeout, so a candidate
    // that is simply absent must be skipped up front rather than awaited.
    if ((await locator.count()) === 0) {
      continue;
    }
    const handle = await locator.elementHandle();
    if (handle) {
      possibleHandles.push({ locatorStr, handle });
    }
  }

  let foundMatch = false;
  let matchedLocator: string | null = null;

  for (const observation of observations) {
    try {
      const observationHandle = await stagehand.page
        .locator(observation.selector)
        .first()
        .elementHandle();
      if (!observationHandle) {
        continue;
      }

      for (const { locatorStr, handle: candidateHandle } of possibleHandles) {
        // Identity comparison runs in the page so both handles are real nodes.
        const isSameNode = await observationHandle.evaluate(
          (node, otherNode) => node === otherNode,
          candidateHandle,
        );
        if (isSameNode) {
          foundMatch = true;
          matchedLocator = locatorStr;
          break;
        }
      }

      if (foundMatch) {
        break;
      }
    } catch (error: unknown) {
      // A bad selector from one observation should not abort the whole eval.
      console.warn(
        `Failed to check observation with selector ${observation.selector}:`,
        error instanceof Error ? error.message : error,
      );
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    matchedLocator,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/panamcs.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";

/**
 * Eval: on the hosted panamcs replica, observes "click the 'about us' link"
 * and passes when some observation's inner text equals the inner text of the
 * first menu link (`#menu > li:nth-child(1) > a`).
 */
export const panamcs: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  await stagehand.page.goto(
    "https://browserbase.github.io/stagehand-eval-sites/sites/panamcs/",
  );

  const observations = await stagehand.page.observe(
    "click the 'about us' link",
  );

  if (observations.length === 0) {
    await stagehand.close();
    return {
      _success: false,
      observations,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  const expectedLocator = `#menu > li:nth-child(1) > a`;

  const expectedResult = await stagehand.page
    .locator(expectedLocator)
    .first()
    .innerText();

  let foundMatch = false;
  for (const observation of observations) {
    try {
      const observationResult = await stagehand.page
        .locator(observation.selector)
        .first()
        .innerText();

      if (observationResult === expectedResult) {
        foundMatch = true;
        break;
      }
    } catch (error: unknown) {
      // A bad selector from one observation should not abort the whole eval.
      console.warn(
        `Failed to check observation with selector ${observation.selector}:`,
        error instanceof Error ? error.message : error,
      );
    }
  }

  await stagehand.close();

  return {
    _success: foundMatch,
    expected: expectedResult,
    observations,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

// --------------------------------------------------------------------------------
// /evals/tasks/peeler_complex.ts:
// --------------------------------------------------------------------------------
import type { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Converts a caught value into plain data that survives serialisation.
 * `Error` fields are non-enumerable, so `JSON.stringify(error)` alone yields `{}`.
 */
const toPlainError = (error: unknown) => {
  if (error instanceof Error) {
    return { name: error.name, message: error.message, stack: error.stack };
  }
  try {
    return JSON.parse(JSON.stringify(error));
  } catch {
    // e.g. `undefined` or a circular structure
    return String(error);
  }
};

/**
 * Eval: on chefstoys.com, closes the popup, searches for "peeler", opens the
 * first "OXO" peeler and extracts its price; passes when the price is 11.99.
 *
 * Any thrown error is logged through `logger.error` and reported as a failed
 * eval; the Stagehand instance is closed on both paths.
 */
export const peeler_complex: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  useTextExtract,
}) => {
  try {
    await stagehand.page.goto(`https://chefstoys.com/`, { timeout: 60000 });
    await stagehand.page.waitForLoadState("networkidle");

    await stagehand.page.act("find the button to close the popup");
    await stagehand.page.act({
      action: "search for %search_query%",
      variables: {
        search_query: "peeler",
      },
    });

    await stagehand.page.act({
      action: 'click on the first "OXO" brand peeler',
    });

    const { price } = await stagehand.page.extract({
      instruction: "get the price of the peeler",
      schema: z.object({ price: z.number().nullable() }),
      useTextExtract,
    });

    await stagehand.close();

    return {
      _success: price === 11.99,
      price,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    const errorDetails = toPlainError(error);

    logger.error({
      message: "error in peeler_complex function",
      level: 0,
      auxiliary: {
        error: {
          value: JSON.stringify(errorDetails, null, 2),
          type: "object",
        },
        trace: {
          value: error instanceof Error ? (error.stack ?? "") : "",
          type: "string",
        },
      },
    });

    await stagehand.close();

    return {
      _success: false,
      error: errorDetails,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

// --------------------------------------------------------------------------------
/evals/tasks/peeler_simple.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { StagehandEnvironmentError } from "@/types/stagehandErrors"; 3 | 4 | const env: "BROWSERBASE" | "LOCAL" = 5 | process.env.EVAL_ENV?.toLowerCase() === "browserbase" 6 | ? "BROWSERBASE" 7 | : "LOCAL"; 8 | 9 | export const peeler_simple: EvalFunction = async ({ 10 | debugUrl, 11 | sessionUrl, 12 | stagehand, 13 | logger, 14 | }) => { 15 | if (env === "BROWSERBASE") { 16 | throw new StagehandEnvironmentError( 17 | "BROWSERBASE", 18 | "LOCAL", 19 | "peeler_simple eval", 20 | ); 21 | } 22 | 23 | await stagehand.page.goto(`file://${process.cwd()}/evals/assets/peeler.html`); 24 | await stagehand.page.act({ action: "add the peeler to cart" }); 25 | 26 | const successMessageLocator = stagehand.page.locator( 27 | 'text="Congratulations, you have 1 A in your cart"', 28 | ); 29 | const isVisible = await successMessageLocator.isVisible(); 30 | 31 | await stagehand.close(); 32 | 33 | return { 34 | _success: isVisible, 35 | debugUrl, 36 | sessionUrl, 37 | logs: logger.getLogs(), 38 | }; 39 | }; 40 | -------------------------------------------------------------------------------- /evals/tasks/prevChunk.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const prevChunk: EvalFunction = async ({ 5 | logger, 6 | stagehandConfig, 7 | debugUrl, 8 | sessionUrl, 9 | }) => { 10 | const stagehand = new Stagehand({ 11 | ...stagehandConfig, 12 | domSettleTimeoutMs: 3000, 13 | }); 14 | await stagehand.init(); 15 | 16 | await stagehand.page.goto( 17 | "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", 18 | ); 19 | await new Promise((resolve) => setTimeout(resolve, 2000)); 20 | const { initialScrollTop, chunkHeight } = await stagehand.page.evaluate( 21 | () => { 22 | 
const halfPage = document.body.scrollHeight / 2; 23 | 24 | window.scrollTo({ 25 | top: halfPage, 26 | left: 0, 27 | behavior: "instant", 28 | }); 29 | 30 | const chunk = window.innerHeight; 31 | 32 | return { 33 | initialScrollTop: window.scrollY, 34 | chunkHeight: chunk, 35 | }; 36 | }, 37 | ); 38 | await new Promise((resolve) => setTimeout(resolve, 2000)); 39 | await stagehand.page.act({ 40 | action: "scroll up one chunk", 41 | }); 42 | 43 | await new Promise((resolve) => setTimeout(resolve, 5000)); 44 | 45 | const finalScrollTop = await stagehand.page.evaluate(() => window.scrollY); 46 | 47 | await stagehand.close(); 48 | 49 | const actualDiff = initialScrollTop - finalScrollTop; 50 | const threshold = 20; // px tolerance 51 | const scrolledOneChunk = Math.abs(actualDiff - chunkHeight) <= threshold; 52 | 53 | const evaluationResult = scrolledOneChunk 54 | ? { 55 | _success: true, 56 | logs: logger.getLogs(), 57 | debugUrl, 58 | sessionUrl, 59 | message: `Successfully scrolled ~one chunk UP: expected ~${chunkHeight}, got ${actualDiff}.`, 60 | } 61 | : { 62 | _success: false, 63 | logs: logger.getLogs(), 64 | debugUrl, 65 | sessionUrl, 66 | message: `Scroll difference expected ~${chunkHeight} but only scrolled ${actualDiff}.`, 67 | }; 68 | 69 | return evaluationResult; 70 | }; 71 | -------------------------------------------------------------------------------- /evals/tasks/radio_btn.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const radio_btn: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://browserbase.github.io/stagehand-eval-sites/sites/paneer-pizza/", 11 | ); 12 | 13 | await stagehand.page.act({ 14 | action: "click the 'medium' option", 15 | }); 16 | 17 | // confirm that the Medium radio is now checked 18 | const radioBtnClicked = await stagehand.page 19 | 
.locator('input[type="radio"][name="Pizza"][value="Medium"]') 20 | .isChecked(); 21 | 22 | await stagehand.close(); 23 | 24 | return { 25 | _success: radioBtnClicked, 26 | debugUrl, 27 | sessionUrl, 28 | logs: logger.getLogs(), 29 | }; 30 | }; 31 | -------------------------------------------------------------------------------- /evals/tasks/rakuten_jp.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const rakuten_jp: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://www.rakuten.co.jp/"); 10 | await stagehand.page.act({ action: "click on online supermarket" }); 11 | 12 | await stagehand.page.act({ action: "if there is a popup, close it" }); 13 | 14 | await stagehand.page.act({ 15 | action: "navigate to Inageya Online Supermarket", 16 | }); 17 | await stagehand.page.act({ action: "click the search bar input" }); 18 | await stagehand.page.act({ action: "search for '香菜'" }); 19 | 20 | const url = stagehand.page.url(); 21 | const successUrl = 22 | "https://netsuper.rakuten.co.jp/inageya/search/?keyword=%E9%A6%99%E8%8F%9C"; 23 | 24 | await stagehand.close(); 25 | 26 | return { 27 | _success: url === successUrl, 28 | debugUrl, 29 | sessionUrl, 30 | logs: logger.getLogs(), 31 | }; 32 | }; 33 | -------------------------------------------------------------------------------- /evals/tasks/sciquest.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const sciquest: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) => { 11 | await stagehand.page.goto( 12 | "https://bids.sciquest.com/apps/Router/PublicEvent?tab=PHX_NAV_SourcingAllOpps&CustomerOrg=StateOfUtah", 13 | ); 14 | 15 | await stagehand.page.act({ 16 | action: 
'Click on the "Closed" tab', 17 | }); 18 | 19 | const result = await stagehand.page.extract({ 20 | instruction: 21 | "Extract the total number of results that the search produced. Not the number of results displayed on the page.", 22 | schema: z.object({ 23 | total_results: z.string(), 24 | }), 25 | useTextExtract, 26 | }); 27 | 28 | await stagehand.close(); 29 | 30 | const { total_results } = result; 31 | 32 | const expectedNumber = 12637; 33 | const extractedNumber = parseInt(total_results.replace(/[^\d]/g, ""), 10); 34 | 35 | const isWithinRange = 36 | extractedNumber >= expectedNumber - 1000 && 37 | extractedNumber <= expectedNumber + 1000; 38 | 39 | if (!isWithinRange) { 40 | logger.error({ 41 | message: "Total number of results is not within the expected range", 42 | level: 0, 43 | auxiliary: { 44 | expected: { 45 | value: `${expectedNumber} ± 1000`, 46 | type: "string", 47 | }, 48 | actual: { 49 | value: extractedNumber.toString(), 50 | type: "integer", 51 | }, 52 | }, 53 | }); 54 | return { 55 | _success: false, 56 | error: "Total number of results is not within the expected range", 57 | extractedNumber, 58 | debugUrl, 59 | sessionUrl, 60 | logs: logger.getLogs(), 61 | }; 62 | } 63 | 64 | return { 65 | _success: true, 66 | extractedNumber, 67 | debugUrl, 68 | sessionUrl, 69 | logs: logger.getLogs(), 70 | }; 71 | }; 72 | -------------------------------------------------------------------------------- /evals/tasks/scroll_50.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const scroll_50: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", 11 | ); 12 | await stagehand.page.act({ 13 | action: "Scroll 50% down the page", 14 | }); 15 | 16 | await new Promise((resolve) => setTimeout(resolve, 5000)); 17 | 18 | // Get the 
current scroll position and total scroll height 19 | const scrollInfo = await stagehand.page.evaluate(() => { 20 | return { 21 | scrollTop: window.scrollY + window.innerHeight / 2, 22 | scrollHeight: document.documentElement.scrollHeight, 23 | }; 24 | }); 25 | 26 | await stagehand.close(); 27 | 28 | const halfwayScroll = scrollInfo.scrollHeight / 2; 29 | const halfwayReached = Math.abs(scrollInfo.scrollTop - halfwayScroll) <= 200; 30 | const evaluationResult = halfwayReached 31 | ? { 32 | _success: true, 33 | logs: logger.getLogs(), 34 | debugUrl, 35 | sessionUrl, 36 | } 37 | : { 38 | _success: false, 39 | logs: logger.getLogs(), 40 | debugUrl, 41 | sessionUrl, 42 | message: `Scroll position (${scrollInfo.scrollTop}px) is not halfway down the page (${halfwayScroll}px).`, 43 | }; 44 | 45 | return evaluationResult; 46 | }; 47 | -------------------------------------------------------------------------------- /evals/tasks/scroll_75.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | import { EvalFunction } from "@/types/evals"; 3 | 4 | export const scroll_75: EvalFunction = async ({ 5 | logger, 6 | stagehandConfig, 7 | debugUrl, 8 | sessionUrl, 9 | }) => { 10 | const stagehand = new Stagehand({ 11 | ...stagehandConfig, 12 | domSettleTimeoutMs: 3000, 13 | }); 14 | await stagehand.init(); 15 | 16 | await stagehand.page.goto( 17 | "https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", 18 | ); 19 | await stagehand.page.act({ 20 | action: "Scroll 75% down the page", 21 | }); 22 | 23 | await new Promise((resolve) => setTimeout(resolve, 5000)); 24 | 25 | // Get the current scroll position and total scroll height 26 | const scrollInfo = await stagehand.page.evaluate(() => { 27 | return { 28 | scrollTop: window.scrollY + window.innerHeight * 0.75, 29 | scrollHeight: document.documentElement.scrollHeight, 30 | }; 31 | }); 32 | 33 | await stagehand.close(); 34 | 35 | const 
threeQuartersScroll = scrollInfo.scrollHeight * 0.75; 36 | const threeQuartersReached = 37 | Math.abs(scrollInfo.scrollTop - threeQuartersScroll) <= 200; 38 | const evaluationResult = threeQuartersReached 39 | ? { 40 | _success: true, 41 | logs: logger.getLogs(), 42 | debugUrl, 43 | sessionUrl, 44 | } 45 | : { 46 | _success: false, 47 | logs: logger.getLogs(), 48 | debugUrl, 49 | sessionUrl, 50 | message: `Scroll position (${scrollInfo.scrollTop}px) is not three quarters down the page (${threeQuartersScroll}px).`, 51 | }; 52 | 53 | return evaluationResult; 54 | }; 55 | -------------------------------------------------------------------------------- /evals/tasks/simple_google_search.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const simple_google_search: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://www.google.com"); 10 | 11 | await stagehand.page.act({ 12 | action: 'type "OpenAI" into the search bar', 13 | }); 14 | 15 | await stagehand.page.act("click the search button"); 16 | 17 | const expectedUrl = "https://www.google.com/search?q=OpenAI"; 18 | const currentUrl = stagehand.page.url(); 19 | 20 | await stagehand.close(); 21 | 22 | return { 23 | _success: currentUrl.startsWith(expectedUrl), 24 | currentUrl, 25 | debugUrl, 26 | sessionUrl, 27 | logs: logger.getLogs(), 28 | }; 29 | }; 30 | -------------------------------------------------------------------------------- /evals/tasks/stock_x.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const stock_x: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto( 10 | "https://stockx.com/air-jordan-3-retro-black-cement-2024", 11 | ); 12 | 13 | await 
stagehand.page.waitForTimeout(3000); 14 | 15 | await stagehand.page.act({ 16 | action: "click on Jordan 3 Retro Crimson in the related products", 17 | }); 18 | 19 | await stagehand.page.waitForTimeout(2000); 20 | const currentUrl = stagehand.page.url(); 21 | const expectedUrlPrefix = "https://stockx.com/jordan-3-retro-crimson"; 22 | 23 | await stagehand.close(); 24 | 25 | return { 26 | _success: currentUrl.startsWith(expectedUrlPrefix), 27 | currentUrl, 28 | debugUrl, 29 | sessionUrl, 30 | logs: logger.getLogs(), 31 | }; 32 | }; 33 | -------------------------------------------------------------------------------- /evals/tasks/vanta_h.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const vanta_h: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://www.vanta.com/"); 10 | 11 | const observations = await stagehand.page.observe( 12 | "click the buy now button if it is available", 13 | ); 14 | 15 | await stagehand.close(); 16 | 17 | // we should have no saved observation since the element shouldn't exist 18 | return { 19 | _success: observations.length === 0, 20 | observations, 21 | debugUrl, 22 | sessionUrl, 23 | logs: logger.getLogs(), 24 | }; 25 | }; 26 | -------------------------------------------------------------------------------- /evals/tasks/vantechjournal.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const vantechjournal: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto("https://vantechjournal.com/"); 10 | 11 | await stagehand.page.act({ 12 | action: "click on page 8. 
do not click the next button", 13 | }); 14 | 15 | const expectedUrl = "https://vantechjournal.com/archive?page=8"; 16 | const currentUrl = stagehand.page.url(); 17 | 18 | await stagehand.close(); 19 | 20 | return { 21 | _success: currentUrl === expectedUrl, 22 | currentUrl, 23 | expectedUrl, 24 | debugUrl, 25 | sessionUrl, 26 | logs: logger.getLogs(), 27 | }; 28 | }; 29 | -------------------------------------------------------------------------------- /evals/tasks/wichita.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | import { z } from "zod"; 3 | 4 | export const wichita: EvalFunction = async ({ 5 | debugUrl, 6 | sessionUrl, 7 | stagehand, 8 | logger, 9 | useTextExtract, 10 | }) => { 11 | await stagehand.page.goto("https://www.wichitafallstx.gov/Bids.aspx"); 12 | 13 | await stagehand.page.act({ 14 | action: 'Click on "Show Closed/Awarded/Cancelled bids"', 15 | }); 16 | 17 | const result = await stagehand.page.extract({ 18 | instruction: "Extract the total number of bids that the search produced.", 19 | schema: z.object({ 20 | total_results: z.string(), 21 | }), 22 | useTextExtract, 23 | }); 24 | 25 | await stagehand.close(); 26 | 27 | const { total_results } = result; 28 | 29 | const expectedNumber = 405; 30 | const extractedNumber = parseInt(total_results.replace(/[^\d]/g, ""), 10); 31 | 32 | const isWithinRange = 33 | extractedNumber >= expectedNumber - 10 && 34 | extractedNumber <= expectedNumber + 10; 35 | 36 | if (!isWithinRange) { 37 | logger.error({ 38 | message: "Total number of results is not within the expected range", 39 | level: 0, 40 | auxiliary: { 41 | expected: { 42 | value: `${expectedNumber} ± 10`, 43 | type: "string", 44 | }, 45 | actual: { 46 | value: extractedNumber.toString(), 47 | type: "integer", 48 | }, 49 | }, 50 | }); 51 | return { 52 | _success: false, 53 | error: "Total number of results is not within the expected range", 54 | extractedNumber, 55 
| debugUrl, 56 | sessionUrl, 57 | logs: logger.getLogs(), 58 | }; 59 | } 60 | 61 | return { 62 | _success: true, 63 | extractedNumber, 64 | debugUrl, 65 | sessionUrl, 66 | logs: logger.getLogs(), 67 | }; 68 | }; 69 | -------------------------------------------------------------------------------- /evals/tasks/wikipedia.ts: -------------------------------------------------------------------------------- 1 | import { EvalFunction } from "@/types/evals"; 2 | 3 | export const wikipedia: EvalFunction = async ({ 4 | debugUrl, 5 | sessionUrl, 6 | stagehand, 7 | logger, 8 | }) => { 9 | await stagehand.page.goto(`https://en.wikipedia.org/wiki/Baseball`); 10 | await stagehand.page.act({ 11 | action: 'click the "hit and run" link in this article', 12 | timeoutMs: 360_000, 13 | }); 14 | 15 | const url = "https://en.wikipedia.org/wiki/Hit_and_run_(baseball)"; 16 | const currentUrl = stagehand.page.url(); 17 | 18 | await stagehand.close(); 19 | 20 | return { 21 | _success: currentUrl === url, 22 | expected: url, 23 | actual: currentUrl, 24 | debugUrl, 25 | sessionUrl, 26 | logs: logger.getLogs(), 27 | }; 28 | }; 29 | -------------------------------------------------------------------------------- /examples/actionable_observe_example.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is meant to be used as a scratchpad for trying out actionable observe. 
3 | * To create a Stagehand project with best practices and configuration, run: 4 | * 5 | * npx create-browser-app@latest my-browser-app 6 | */ 7 | 8 | import { Stagehand } from "@/dist"; 9 | import stagehandConfig from "@/stagehand.config"; 10 | 11 | async function example() { 12 | const stagehand = new Stagehand(stagehandConfig); 13 | await stagehand.init(); 14 | await stagehand.page.goto("https://www.apartments.com/san-francisco-ca/"); 15 | 16 | await new Promise((resolve) => setTimeout(resolve, 3000)); 17 | const observations1 = await stagehand.page.observe({ 18 | instruction: "find the 'all filters' button", 19 | }); 20 | await stagehand.page.act(observations1[0]); 21 | 22 | await new Promise((resolve) => setTimeout(resolve, 3000)); 23 | const observations2 = await stagehand.page.observe({ 24 | instruction: "find the '1+' button in the 'beds' section", 25 | }); 26 | await stagehand.page.act(observations2[0]); 27 | 28 | await new Promise((resolve) => setTimeout(resolve, 3000)); 29 | const observations3 = await stagehand.page.observe({ 30 | instruction: "find the 'apartments' button in the 'home type' section", 31 | }); 32 | await stagehand.page.act(observations3[0]); 33 | 34 | await new Promise((resolve) => setTimeout(resolve, 3000)); 35 | const observations4 = await stagehand.page.observe({ 36 | instruction: "find the pet policy dropdown to click on.", 37 | }); 38 | await stagehand.page.act(observations4[0]); 39 | 40 | await new Promise((resolve) => setTimeout(resolve, 3000)); 41 | const observations5 = await stagehand.page.observe({ 42 | instruction: "find the 'Dog Friendly' option to click on", 43 | }); 44 | await stagehand.page.act(observations5[0]); 45 | 46 | await new Promise((resolve) => setTimeout(resolve, 3000)); 47 | const observations6 = await stagehand.page.observe({ 48 | instruction: "find the 'see results' section", 49 | }); 50 | await stagehand.page.act(observations6[0]); 51 | 52 | const currentUrl = await stagehand.page.url(); 53 | await 
stagehand.close(); 54 | if ( 55 | currentUrl.includes( 56 | "https://www.apartments.com/apartments/san-francisco-ca/min-1-bedrooms-pet-friendly-dog/", 57 | ) 58 | ) { 59 | console.log("✅ Success! we made it to the correct page"); 60 | } else { 61 | console.log( 62 | "❌ Whoops, looks like we didn't make it to the correct page. " + 63 | "\nThanks for testing out this new Stagehand feature!" + 64 | "\nReach us on Slack if you have any feedback/questions/suggestions!", 65 | ); 66 | } 67 | } 68 | 69 | (async () => { 70 | await example(); 71 | })(); 72 | -------------------------------------------------------------------------------- /examples/ai_sdk_example.ts: -------------------------------------------------------------------------------- 1 | import { openai } from "@ai-sdk/openai"; 2 | import { Stagehand } from "@/dist"; 3 | import { AISdkClient } from "./external_clients/aisdk"; 4 | import StagehandConfig from "@/stagehand.config"; 5 | import { z } from "zod"; 6 | 7 | async function example() { 8 | const stagehand = new Stagehand({ 9 | ...StagehandConfig, 10 | llmClient: new AISdkClient({ 11 | model: openai("gpt-4o"), 12 | }), 13 | }); 14 | 15 | await stagehand.init(); 16 | await stagehand.page.goto("https://news.ycombinator.com"); 17 | 18 | const { story } = await stagehand.page.extract({ 19 | instruction: "extract the title of the top story on the page", 20 | schema: z.object({ 21 | story: z.string().describe("the top story on the page"), 22 | }), 23 | }); 24 | 25 | console.log("The top story is:", story); 26 | 27 | await stagehand.page.act("click the first story"); 28 | 29 | await stagehand.close(); 30 | } 31 | 32 | (async () => { 33 | await example(); 34 | })(); 35 | -------------------------------------------------------------------------------- /examples/debugUrl.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | 3 | async function debug(url: string) { 4 | const stagehand = new 
Stagehand({ 5 | env: "LOCAL", 6 | verbose: 2, 7 | localBrowserLaunchOptions: { 8 | headless: true, 9 | }, 10 | }); 11 | await stagehand.init(); 12 | await stagehand.page.goto(url); 13 | } 14 | 15 | (async () => { 16 | const url = process.argv.find((arg) => arg.startsWith("--url=")); 17 | if (!url) { 18 | console.error("No URL flag provided. Usage: --url=https://example.com"); 19 | process.exit(1); 20 | } 21 | const targetUrl = url.slice("--url=".length); // strip only the flag prefix so any "=" inside the URL (query strings) is kept 22 | console.log(`Navigating to: ${targetUrl}`); 23 | await debug(targetUrl); 24 | })(); 25 | -------------------------------------------------------------------------------- /examples/example.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is meant to be used as a scratchpad for developing new evals. 3 | * To create a Stagehand project with best practices and configuration, run: 4 | * 5 | * npx create-browser-app@latest my-browser-app 6 | */ 7 | 8 | import { Stagehand } from "@/dist"; 9 | import StagehandConfig from "@/stagehand.config"; 10 | 11 | async function example() { 12 | const stagehand = new Stagehand({ 13 | ...StagehandConfig, 14 | }); 15 | await stagehand.init(); 16 | await stagehand.page.goto("https://docs.stagehand.dev"); 17 | /** 18 | * Add your code here! 
19 | */ 20 | await stagehand.close(); 21 | } 22 | 23 | (async () => { 24 | await example(); 25 | })(); 26 | -------------------------------------------------------------------------------- /examples/external_client.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | import { z } from "zod"; 3 | import { CustomOpenAIClient } from "./external_clients/customOpenAI"; 4 | import StagehandConfig from "@/stagehand.config"; 5 | import OpenAI from "openai"; 6 | 7 | async function example() { 8 | const stagehand = new Stagehand({ 9 | ...StagehandConfig, 10 | llmClient: new CustomOpenAIClient({ 11 | modelName: "gpt-4o-mini", 12 | client: new OpenAI({ 13 | apiKey: process.env.OPENAI_API_KEY, 14 | }), 15 | }), 16 | }); 17 | 18 | await stagehand.init(); 19 | await stagehand.page.goto("https://news.ycombinator.com"); 20 | await stagehand.page.act("click on the 'new' link"); 21 | 22 | const headlines = await stagehand.page.extract({ 23 | instruction: "Extract the top 3 stories from the Hacker News homepage.", 24 | schema: z.object({ 25 | stories: z.array( 26 | z.object({ 27 | title: z.string(), 28 | url: z.string(), 29 | points: z.number(), 30 | }), 31 | ), 32 | }), 33 | }); 34 | 35 | console.log(headlines); 36 | 37 | await stagehand.close(); 38 | } 39 | 40 | (async () => { 41 | await example(); 42 | })(); 43 | -------------------------------------------------------------------------------- /examples/external_clients/langchain.ts: -------------------------------------------------------------------------------- 1 | import { BaseChatModel } from "@langchain/core/language_models/chat_models"; 2 | import { CreateChatCompletionOptions, LLMClient, AvailableModel } from "@/dist"; 3 | import { zodToJsonSchema } from "zod-to-json-schema"; 4 | import { 5 | AIMessage, 6 | BaseMessageLike, 7 | HumanMessage, 8 | SystemMessage, 9 | } from "@langchain/core/messages"; 10 | import { ChatCompletion } from 
"openai/resources"; 11 | 12 | export class LangchainClient extends LLMClient { 13 | public type = "langchainClient" as const; 14 | private model: BaseChatModel; 15 | 16 | constructor(model: BaseChatModel) { 17 | super(model.name as AvailableModel); 18 | this.model = model; 19 | } 20 | 21 | async createChatCompletion({ 22 | options, 23 | }: CreateChatCompletionOptions): Promise { 24 | const formattedMessages: BaseMessageLike[] = options.messages.map( 25 | (message) => { 26 | if (Array.isArray(message.content)) { 27 | if (message.role === "system") { 28 | return new SystemMessage( 29 | message.content 30 | .map((c) => ("text" in c ? c.text : "")) 31 | .join("\n"), 32 | ); 33 | } 34 | 35 | const content = message.content.map((content) => 36 | "image_url" in content 37 | ? { type: "image", image: content.image_url.url } 38 | : { type: "text", text: content.text }, 39 | ); 40 | 41 | if (message.role === "user") return new HumanMessage({ content }); 42 | 43 | const textOnlyParts = content.map((part) => ({ 44 | type: "text" as const, 45 | text: part.type === "image" ? 
"[Image]" : part.text, 46 | })); 47 | 48 | return new AIMessage({ content: textOnlyParts }); 49 | } 50 | 51 | return { 52 | role: message.role, 53 | content: message.content, 54 | }; 55 | }, 56 | ); 57 | 58 | if (options.response_model) { 59 | const responseSchema = zodToJsonSchema(options.response_model.schema, { 60 | $refStrategy: "none", 61 | }); 62 | const structuredModel = this.model.withStructuredOutput(responseSchema); 63 | const response = await structuredModel.invoke(formattedMessages); 64 | 65 | return { 66 | data: response, 67 | usage: { 68 | prompt_tokens: 0, // Langchain doesn't provide token counts by default 69 | completion_tokens: 0, 70 | total_tokens: 0, 71 | }, 72 | } as T; 73 | } 74 | 75 | const modelWithTools = this.model.bindTools(options.tools); 76 | const response = await modelWithTools.invoke(formattedMessages); 77 | 78 | return { 79 | data: response, 80 | usage: { 81 | prompt_tokens: 0, // Langchain doesn't provide token counts by default 82 | completion_tokens: 0, 83 | total_tokens: 0, 84 | }, 85 | } as T; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/google_enter.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is meant to be used as a scratchpad for developing new evals. 
3 | * To create a Stagehand project with best practices and configuration, run: 4 | * 5 | * npx create-browser-app@latest my-browser-app 6 | */ 7 | 8 | import { Stagehand } from "@/dist"; 9 | import StagehandConfig from "@/stagehand.config"; 10 | 11 | async function example() { 12 | const stagehand = new Stagehand({ 13 | ...StagehandConfig, 14 | }); 15 | await stagehand.init(); 16 | const page = stagehand.page; 17 | await page.goto("https://google.com"); 18 | await page.act("type in 'Browserbase'"); 19 | await page.act("press enter"); 20 | await stagehand.close(); 21 | } 22 | 23 | (async () => { 24 | await example(); 25 | })(); 26 | -------------------------------------------------------------------------------- /examples/instructions.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This example shows how to use custom instructions with Stagehand. 3 | */ 4 | import { Stagehand } from "@/dist"; 5 | import StagehandConfig from "@/stagehand.config"; 6 | 7 | async function example() { 8 | const stagehand = new Stagehand({ 9 | ...StagehandConfig, 10 | systemPrompt: 11 | "if the users says `secret12345`, click on the 'getting started' tab. 
additionally, if the user says to type something, translate their input into french and type it.", 12 | }); 13 | await stagehand.init(); 14 | 15 | const page = stagehand.page; 16 | 17 | await page.goto("https://docs.browserbase.com/"); 18 | 19 | await page.act({ 20 | action: "secret12345", 21 | }); 22 | 23 | await page.act({ 24 | action: "search for 'how to use browserbase'", 25 | }); 26 | 27 | await stagehand.close(); 28 | } 29 | 30 | (async () => { 31 | await example(); 32 | })(); 33 | -------------------------------------------------------------------------------- /examples/langchain.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { Stagehand } from "@/dist"; 3 | import StagehandConfig from "@/stagehand.config"; 4 | import { LangchainClient } from "./external_clients/langchain"; 5 | import { ChatOpenAI } from "@langchain/openai"; 6 | 7 | async function example() { 8 | const stagehand = new Stagehand({ 9 | ...StagehandConfig, 10 | llmClient: new LangchainClient( 11 | new ChatOpenAI({ 12 | model: "gpt-4o", 13 | }), 14 | ), 15 | }); 16 | 17 | await stagehand.init(); 18 | await stagehand.page.goto("https://news.ycombinator.com"); 19 | 20 | const { story } = await stagehand.page.extract({ 21 | schema: z.object({ 22 | story: z.string().describe("the top story on the page"), 23 | }), 24 | }); 25 | 26 | console.log("The top story is:", story); 27 | 28 | await stagehand.page.act("click the first story"); 29 | 30 | await stagehand.close(); 31 | } 32 | 33 | (async () => { 34 | await example(); 35 | })(); 36 | -------------------------------------------------------------------------------- /examples/operator-example.ts: -------------------------------------------------------------------------------- 1 | import { LogLine, Stagehand } from "@/dist"; 2 | import dotenv from "dotenv"; 3 | import StagehandConfig from "@/stagehand.config"; 4 | import chalk from "chalk"; 5 | 6 | // Load environment 
variables 7 | dotenv.config(); 8 | 9 | const INSTRUCTION = 10 | "Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores."; 11 | 12 | async function main() { 13 | console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`); 14 | 15 | // Initialize Stagehand 16 | const stagehand = new Stagehand({ 17 | ...StagehandConfig, 18 | logger: ({ level, message, timestamp }: LogLine) => { 19 | console.log({ level, message, timestamp }); 20 | }, 21 | }); 22 | 23 | await stagehand.init(); 24 | 25 | try { 26 | const agent = stagehand.agent(); 27 | 28 | // Execute the agent 29 | console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`); 30 | 31 | const result = await agent.execute({ 32 | instruction: INSTRUCTION, 33 | maxSteps: 20, 34 | }); 35 | 36 | console.log(`${chalk.green("✓")} Execution complete`); 37 | console.log(`${chalk.yellow("⤷")} Result:`); 38 | console.log(JSON.stringify(result, null, 2)); 39 | console.log(chalk.white(result.message)); 40 | } catch (error) { 41 | console.log(`${chalk.red("✗")} Error: ${error}`); 42 | } finally { 43 | await stagehand.close(); 44 | } 45 | } 46 | 47 | main(); 48 | -------------------------------------------------------------------------------- /examples/parameterizeApiKey.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | import { z } from "zod"; 3 | 4 | /** 5 | * This example shows how to parameterize the API key for the LLM provider. 6 | * 7 | * In order to best demonstrate, unset the OPENAI_API_KEY environment variable and 8 | * set the USE_OPENAI_API_KEY environment variable to your OpenAI API key. 
9 | * 10 | * export USE_OPENAI_API_KEY=$OPENAI_API_KEY 11 | * unset OPENAI_API_KEY 12 | */ 13 | 14 | async function example() { 15 | const stagehand = new Stagehand({ 16 | env: "LOCAL", 17 | verbose: 1, 18 | enableCaching: false, 19 | modelName: "gpt-4o", 20 | modelClientOptions: { 21 | apiKey: process.env.USE_OPENAI_API_KEY, 22 | }, 23 | }); 24 | 25 | await stagehand.init(); 26 | await stagehand.page.goto("https://github.com/browserbase/stagehand"); 27 | await stagehand.page.act({ action: "click on the contributors" }); 28 | const contributor = await stagehand.page.extract({ 29 | instruction: "extract the top contributor", 30 | schema: z.object({ 31 | username: z.string(), 32 | url: z.string(), 33 | }), 34 | }); 35 | console.log(`Our favorite contributor is ${contributor.username}`); 36 | await stagehand.close(); // close the session once the result has been printed, as the other examples do 37 | } 38 | 39 | (async () => { 40 | await example(); 41 | })(); 42 | -------------------------------------------------------------------------------- /examples/popup.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is meant to be used as a scratchpad for developing new evals. 
3 | * To create a Stagehand project with best practices and configuration, run: 4 | * 5 | * npx create-browser-app@latest my-browser-app 6 | */ 7 | 8 | import { ObserveResult, Stagehand } from "@/dist"; 9 | import StagehandConfig from "@/stagehand.config"; 10 | 11 | async function example() { 12 | const stagehand = new Stagehand(StagehandConfig); 13 | await stagehand.init(); 14 | 15 | const page = await stagehand.page; 16 | 17 | let observePromise: Promise; 18 | 19 | page.on("popup", async (newPage) => { 20 | observePromise = newPage.observe({ 21 | instruction: "return all the next possible actions from the page", 22 | }); 23 | }); 24 | 25 | await page.goto( 26 | "https://docs.browserbase.com/integrations/crew-ai/introduction", 27 | ); 28 | 29 | await page.click( 30 | "#content-area > div.relative.mt-8.prose.prose-gray.dark\\:prose-invert > p:nth-child(2) > a", 31 | ); 32 | 33 | await page.waitForTimeout(5000); 34 | 35 | if (observePromise) { 36 | const observeResult = await observePromise; 37 | 38 | console.log("Observed", observeResult.length, "actions"); 39 | } 40 | 41 | await stagehand.close(); 42 | } 43 | 44 | (async () => { 45 | await example(); 46 | })(); 47 | -------------------------------------------------------------------------------- /examples/try_wordle.ts: -------------------------------------------------------------------------------- 1 | import { Stagehand } from "@/dist"; 2 | import StagehandConfig from "@/stagehand.config"; 3 | 4 | async function example() { 5 | const stagehand = new Stagehand({ 6 | ...StagehandConfig, 7 | }); 8 | await stagehand.init(); 9 | const page = stagehand.page; 10 | await page.goto("https://www.nytimes.com/games/wordle/index.html"); 11 | await page.act("click 'Continue'"); 12 | await page.act("click 'Play'"); 13 | await page.act("click cross sign on top right of 'How To Play' card"); 14 | const word = "WORDS"; 15 | for (const letter of word) { 16 | await page.act(`press ${letter}`); 17 | } 18 | await page.act("press 
enter"); 19 | await stagehand.close(); 20 | } 21 | 22 | (async () => { 23 | await example(); 24 | })(); 25 | -------------------------------------------------------------------------------- /lib/agent/AgentClient.ts: -------------------------------------------------------------------------------- 1 | import { 2 | AgentAction, 3 | AgentResult, 4 | AgentType, 5 | AgentExecutionOptions, 6 | } from "@/types/agent"; 7 | 8 | /** 9 | * Abstract base class for agent clients 10 | * This provides a common interface for all agent implementations 11 | */ 12 | export abstract class AgentClient { 13 | public type: AgentType; 14 | public modelName: string; 15 | public clientOptions: Record; 16 | public userProvidedInstructions?: string; 17 | 18 | constructor( 19 | type: AgentType, 20 | modelName: string, 21 | userProvidedInstructions?: string, 22 | ) { 23 | this.type = type; 24 | this.modelName = modelName; 25 | this.userProvidedInstructions = userProvidedInstructions; 26 | this.clientOptions = {}; 27 | } 28 | 29 | abstract execute(options: AgentExecutionOptions): Promise; 30 | 31 | abstract captureScreenshot( 32 | options?: Record, 33 | ): Promise; 34 | 35 | abstract setViewport(width: number, height: number): void; 36 | 37 | abstract setCurrentUrl(url: string): void; 38 | 39 | abstract setScreenshotProvider(provider: () => Promise): void; 40 | 41 | abstract setActionHandler( 42 | handler: (action: AgentAction) => Promise, 43 | ): void; 44 | } 45 | -------------------------------------------------------------------------------- /lib/agent/AgentProvider.ts: -------------------------------------------------------------------------------- 1 | import { LogLine } from "@/types/log"; 2 | import { AgentClient } from "./AgentClient"; 3 | import { AgentType } from "@/types/agent"; 4 | import { OpenAICUAClient } from "./OpenAICUAClient"; 5 | import { AnthropicCUAClient } from "./AnthropicCUAClient"; 6 | import { 7 | UnsupportedModelError, 8 | UnsupportedModelProviderError, 9 | } from 
"@/types/stagehandErrors"; 10 | 11 | // Map model names to their provider types 12 | const modelToAgentProviderMap: Record = { 13 | "computer-use-preview": "openai", 14 | "claude-3-5-sonnet-20240620": "anthropic", 15 | "claude-3-7-sonnet-20250219": "anthropic", // Add newer Claude models 16 | }; 17 | 18 | /** 19 | * Provider for agent clients 20 | * This class is responsible for creating the appropriate agent client 21 | * based on the provider type 22 | */ 23 | export class AgentProvider { 24 | private logger: (message: LogLine) => void; 25 | 26 | /** 27 | * Create a new agent provider 28 | */ 29 | constructor(logger: (message: LogLine) => void) { 30 | this.logger = logger; 31 | } 32 | 33 | getClient( 34 | modelName: string, 35 | clientOptions?: Record, 36 | userProvidedInstructions?: string, 37 | ): AgentClient { 38 | const type = AgentProvider.getAgentProvider(modelName); 39 | this.logger({ 40 | category: "agent", 41 | message: `Getting agent client for type: ${type}, model: ${modelName}`, 42 | level: 2, 43 | }); 44 | 45 | try { 46 | switch (type) { 47 | case "openai": 48 | return new OpenAICUAClient( 49 | type, 50 | modelName, 51 | userProvidedInstructions, 52 | clientOptions, 53 | ); 54 | case "anthropic": 55 | return new AnthropicCUAClient( 56 | type, 57 | modelName, 58 | userProvidedInstructions, 59 | clientOptions, 60 | ); 61 | default: 62 | throw new UnsupportedModelProviderError( 63 | ["openai", "anthropic"], 64 | "Computer Use Agent", 65 | ); 66 | } 67 | } catch (error) { 68 | const errorMessage = 69 | error instanceof Error ? 
error.message : String(error); 70 | this.logger({ 71 | category: "agent", 72 | message: `Error creating agent client: ${errorMessage}`, 73 | level: 0, 74 | }); 75 | throw error; 76 | } 77 | } 78 | 79 | static getAgentProvider(modelName: string): AgentType { 80 | // Only the map's own keys count; `in` would also accept inherited names such as "toString" 81 | if (Object.keys(modelToAgentProviderMap).includes(modelName)) { 82 | return modelToAgentProviderMap[modelName]; 83 | } 84 | 85 | throw new UnsupportedModelError( 86 | Object.keys(modelToAgentProviderMap), 87 | "Computer Use Agent", 88 | ); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /lib/agent/StagehandAgent.ts: -------------------------------------------------------------------------------- 1 | import { LogLine } from "@/types/log"; 2 | import { 3 | AgentExecuteOptions, 4 | AgentResult, 5 | AgentExecutionOptions, 6 | } from "@/types/agent"; 7 | import { AgentClient } from "./AgentClient"; 8 | 9 | /** 10 | * Main interface for agent operations in Stagehand 11 | * This class provides methods for executing tasks with an agent 12 | */ 13 | export class StagehandAgent { 14 | private client: AgentClient; 15 | private logger: (message: LogLine) => void; 16 | 17 | constructor(client: AgentClient, logger: (message: LogLine) => void) { 18 | this.client = client; 19 | this.logger = logger; 20 | } 21 | 22 | async execute( 23 | optionsOrInstruction: AgentExecuteOptions | string, 24 | ): Promise { 25 | const options = 26 | typeof optionsOrInstruction === "string" 27 | ? 
{ instruction: optionsOrInstruction } 28 | : optionsOrInstruction; 29 | 30 | this.logger({ 31 | category: "agent", 32 | message: `Executing agent task: ${options.instruction}`, 33 | level: 1, 34 | }); 35 | 36 | const executionOptions: AgentExecutionOptions = { 37 | options, 38 | logger: this.logger, 39 | retries: 3, 40 | }; 41 | 42 | return await this.client.execute(executionOptions); 43 | } 44 | 45 | getModelName(): string { 46 | return this.client.modelName; 47 | } 48 | 49 | getAgentType(): string { 50 | return this.client.type; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /lib/cache.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | const observationsPath = "./.cache/observations.json"; 3 | const actionsPath = "./.cache/actions.json"; 4 | 5 | /** 6 | * A file system cache to skip inference when repeating steps 7 | * It also acts as the source of truth for identifying previously seen actions and observations 8 | */ 9 | class Cache { 10 | disabled: boolean; 11 | 12 | constructor({ disabled = false } = {}) { 13 | this.disabled = disabled; 14 | if (!this.disabled) { 15 | this.initCache(); 16 | } 17 | } 18 | 19 | readObservations() { 20 | if (this.disabled) { 21 | return {}; 22 | } 23 | try { 24 | return JSON.parse(fs.readFileSync(observationsPath, "utf8")); 25 | } catch (error) { 26 | console.error("Error reading from observations.json", error); 27 | return {}; 28 | } 29 | } 30 | 31 | readActions() { 32 | if (this.disabled) { 33 | return {}; 34 | } 35 | try { 36 | return JSON.parse(fs.readFileSync(actionsPath, "utf8")); 37 | } catch (error) { 38 | console.error("Error reading from actions.json", error); 39 | return {}; 40 | } 41 | } 42 | 43 | writeObservations({ 44 | key, 45 | value, 46 | }: { 47 | key: string; 48 | value: { id: string; result: string }; 49 | }) { 50 | if (this.disabled) { 51 | return; 52 | } 53 | 54 | const observations = 
this.readObservations(); 55 | observations[key] = value; 56 | fs.writeFileSync(observationsPath, JSON.stringify(observations, null, 2)); 57 | } 58 | 59 | writeActions({ 60 | key, 61 | value, 62 | }: { 63 | key: string; 64 | value: { id: string; result: string }; 65 | }) { 66 | if (this.disabled) { 67 | return; 68 | } 69 | 70 | const actions = this.readActions(); 71 | actions[key] = value; 72 | fs.writeFileSync(actionsPath, JSON.stringify(actions, null, 2)); 73 | } 74 | 75 | evictCache() { 76 | throw new Error("implement me"); 77 | } 78 | 79 | private initCache() { 80 | if (this.disabled) { 81 | return; 82 | } 83 | const cacheDir = ".cache"; 84 | 85 | if (!fs.existsSync(cacheDir)) { 86 | fs.mkdirSync(cacheDir); 87 | } 88 | if (!fs.existsSync(actionsPath)) { 89 | fs.writeFileSync(actionsPath, JSON.stringify({})); 90 | } 91 | 92 | if (!fs.existsSync(observationsPath)) { 93 | fs.writeFileSync(observationsPath, JSON.stringify({})); 94 | } 95 | } 96 | } 97 | 98 | export default Cache; 99 | -------------------------------------------------------------------------------- /lib/cache/LLMCache.ts: -------------------------------------------------------------------------------- 1 | import { BaseCache, CacheEntry } from "./BaseCache"; 2 | 3 | export class LLMCache extends BaseCache { 4 | constructor( 5 | logger: (message: { 6 | category?: string; 7 | message: string; 8 | level?: number; 9 | }) => void, 10 | cacheDir?: string, 11 | cacheFile?: string, 12 | ) { 13 | super(logger, cacheDir, cacheFile || "llm_calls.json"); 14 | } 15 | 16 | /** 17 | * Overrides the get method to track used hashes by requestId. 18 | * @param options - The options used to generate the cache key. 19 | * @param requestId - The identifier for the current request. 20 | * @returns The cached data if available, otherwise null. 
*/
  public async get<T>(
    options: Record<string, unknown>,
    requestId: string,
  ): Promise<T | null> {
    // NOTE(review): `<T>`, `Record<string, unknown>` and `Promise<T | null>` are
    // reconstructed; the flattened digest dropped every type argument. Confirm
    // against the base cache class.
    const data = await super.get(options, requestId);
    return data as T | null; // TODO: remove this cast
  }

  /**
   * Overrides `set`: delegates storage to the base class, then logs that a
   * new response was saved.
   * @param options - The options used to generate the cache key.
   * @param data - The data to be cached.
   * @param requestId - The identifier for the current request.
   */
  public async set(
    options: Record<string, unknown>,
    data: unknown,
    requestId: string,
  ): Promise<void> {
    await super.set(options, data, requestId);
    this.logger({
      category: "llm_cache",
      message: "Cache miss - saved new response",
      level: 1,
    });
  }
}

// --------------------------------------------------------------------------------
// /lib/dom/DomChunk.ts:
// --------------------------------------------------------------------------------
/** One processed slice of the DOM. */
export interface DomChunk {
  startOffset: number;
  endOffset: number;
  outputString: string;
  // NOTE(review): type arguments reconstructed as <number, string[]>; confirm.
  selectorMap: Record<number, string[]>;
}

// --------------------------------------------------------------------------------
// /lib/dom/containerFactory.ts:
// --------------------------------------------------------------------------------
import { StagehandContainer } from "./StagehandContainer";
import { GlobalPageContainer } from "./GlobalPageContainer";
import { ElementContainer } from "./ElementContainer";

/**
 * Decide which container to create.
 *
 * @param obj - The window, or a specific element.
 * @returns A `GlobalPageContainer` when `obj` is a `Window`, otherwise an
 *   `ElementContainer` wrapping `obj`.
 */
export function createStagehandContainer(
  obj: Window | HTMLElement,
): StagehandContainer {
  if (obj instanceof Window) {
    return new GlobalPageContainer();
  } else {
    return new ElementContainer(obj);
  }
}

// --------------------------------------------------------------------------------
// /lib/dom/genDomScripts.ts:
// --------------------------------------------------------------------------------
/**
 * We have a collection of typescript functions that we need to run in the browser.
 * First, we build them into a single js file
 * Second, due to framework differences we need to get our script content as a string to avoid pathing issues due to file routing in frameworks like Next.js
 * Playwright allows us to pass in script content directly as a string instead of reading a file from a path
 * https://github.com/browserbase/stagehand/issues/180
 *
 * We can't rely on the normal build process for stagehand, because we need our script content as a string so that the import *just works*
 */
import fs from "fs";
import path from "path";
import esbuild from "esbuild";

fs.mkdirSync(path.join(__dirname, "./build"), { recursive: true });

esbuild.buildSync({
  entryPoints: [path.join(__dirname, "index.ts")],
  bundle: true,
  outdir: path.join(__dirname, "build"),
});

const scriptContent = fs.readFileSync(
  path.join(__dirname, "./build/index.js"),
  "utf8",
);

const output = `export const scriptContent = ${JSON.stringify(scriptContent)};`;

fs.writeFileSync(path.join(__dirname, "./build/scriptContent.ts"), output);

// --------------------------------------------------------------------------------
// /lib/dom/global.d.ts:
// --------------------------------------------------------------------------------
import { StagehandContainer } from "./StagehandContainer";

export {};
declare global {
  // NOTE(review): the type arguments below (`Array<number>`,
  // `Record<number, string[]>`, `Promise<void>`, `Promise<string[]>`) are
  // reconstructed; the digest had stripped them. Confirm against the repo.
  interface Window {
    __stagehandInjected?: boolean;
    chunkNumber: number;
    showChunks?: boolean;
    processDom: (chunksSeen: Array<number>) => Promise<{
      outputString: string;
      selectorMap: Record<number, string[]>;
      chunk: number;
      chunks: number[];
    }>;
    processAllOfDom: (xpath?: string) => Promise<{
      outputString: string;
      selectorMap: Record<number, string[]>;
    }>;
    createStagehandContainer: (obj: Window | HTMLElement) => StagehandContainer;
    waitForDomSettle: () => Promise<void>;
    __playwright?: unknown;
    __pw_manual?: unknown;
    __PW_inspect?: unknown;
    storeDOM: (xpath?: string) => string;
    restoreDOM: (storedDOM: string, xpath?: string) => void;
    createTextBoundingBoxes: (xpath?: string) => void;
    getElementBoundingBoxes: (xpath: string) => Array<{
      text: string;
      top: number;
      left: number;
      width: number;
      height: number;
    }>;
    getScrollableElementXpaths: (topN?: number) => Promise<string[]>;
    getNodeFromXpath: (xpath: string) => Node | null;
    waitForElementScrollEnd: (element: HTMLElement) => Promise<void>;
  }
}

// --------------------------------------------------------------------------------
// /lib/dom/index.ts:
// --------------------------------------------------------------------------------
export * from "./process";
export * from "./utils";

// --------------------------------------------------------------------------------
// /lib/dom/utils.ts:
// --------------------------------------------------------------------------------
import { StagehandDomProcessError } from "@/types/stagehandErrors";

/**
 * Resolves once `document.body` has gone 2 seconds without a child-list
 * mutation anywhere in its subtree. Every observed mutation restarts the
 * 2-second timer.
 */
export async function waitForDomSettle() {
  return new Promise<void>((resolve) => {
    const observer = new MutationObserver(() => {
      clearTimeout(timeout);
      timeout = createTimeout();
    });
    const createTimeout = () => {
      return setTimeout(() => {
        // Stop observing before resolving; otherwise the observer outlives the
        // promise and keeps re-arming timers on every later mutation.
        observer.disconnect();
        resolve();
      }, 2000);
    };
    let timeout = createTimeout();
    observer.observe(window.document.body, { childList: true, subtree: true });
  });
}

/** Returns 75% of the window's inner height, rounded up. */
export function calculateViewportHeight() {
  return Math.ceil(window.innerHeight * 0.75);
}

/**
 * Tests if the element actually responds to .scrollTo(...)
 * and that scrollTop changes as expected.
 *
 * The element is probed 100px downward; if that has no effect and the element
 * is not at the top, it is probed upward so that an element already scrolled
 * to its end is not misreported. The original scroll position is restored on
 * success.
 *
 * @returns `true` when a probe moved `scrollTop`; `false` (with a console
 *   warning) otherwise or when probing throws.
 */
export function canElementScroll(elem: HTMLElement): boolean {
  // Quick check if scrollTo is a function
  if (typeof elem.scrollTo !== "function") {
    console.warn("canElementScroll: .scrollTo is not a function.");
    return false;
  }

  try {
    const originalTop = elem.scrollTop;

    // try to scroll down
    elem.scrollTo({
      top: originalTop + 100,
      left: 0,
      behavior: "instant",
    });

    // Already at the bottom? Then a downward probe cannot move; try upward.
    if (elem.scrollTop === originalTop && originalTop > 0) {
      elem.scrollTo({
        top: Math.max(0, originalTop - 100),
        left: 0,
        behavior: "instant",
      });
    }

    // If scrollTop never changed, consider it unscrollable
    if (elem.scrollTop === originalTop) {
      throw new StagehandDomProcessError("scrollTop did not change");
    }

    // Scroll back to original place
    elem.scrollTo({
      top: originalTop,
      left: 0,
      behavior: "instant",
    });

    return true;
  } catch (error) {
    console.warn("canElementScroll error:", (error as Error).message || error);
    return false;
  }
}

/**
 * Evaluates `xpath` against the document element and returns the first
 * matching node in document order, or `null` when nothing matches.
 */
export function getNodeFromXpath(xpath: string) {
  return document.evaluate(
    xpath,
    document.documentElement,
    null,
    XPathResult.FIRST_ORDERED_NODE_TYPE,
    null,
  ).singleNodeValue;
}

/**
 * Resolves once `element` has emitted no `scroll` event for `idleMs`
 * milliseconds. The idle timer is armed immediately, so an element that never
 * scrolls still resolves after `idleMs`.
 */
export function waitForElementScrollEnd(
  element: HTMLElement,
  idleMs = 100,
): Promise<void> {
  return new Promise<void>((resolve) => {
    let scrollEndTimer: number | undefined;

    const handleScroll = () => {
      clearTimeout(scrollEndTimer);
      scrollEndTimer = window.setTimeout(() => {
        element.removeEventListener("scroll", handleScroll);
        resolve();
      }, idleMs);
    };

    element.addEventListener("scroll", handleScroll, { passive: true });
    handleScroll();
  });
}
-------------------------------------------------------------------------------- /media/create-browser-app.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browserbase/stagehand/74f9339bac3404078bfcc47fecda44cc4b0f3876/media/create-browser-app.gif -------------------------------------------------------------------------------- /media/github_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browserbase/stagehand/74f9339bac3404078bfcc47fecda44cc4b0f3876/media/github_demo.gif -------------------------------------------------------------------------------- /stagehand.config.ts: -------------------------------------------------------------------------------- 1 | import type { ConstructorParams } from "@/dist"; 2 | import dotenv from "dotenv"; 3 | dotenv.config(); 4 | 5 | const StagehandConfig: ConstructorParams = { 6 | verbose: 2 /* Verbosity level for logging: 0 = silent, 1 = info, 2 = all */, 7 | domSettleTimeoutMs: 30_000 /* Timeout for DOM to settle in milliseconds */, 8 | 9 | // LLM configuration 10 | modelName: "gpt-4o" /* Name of the model to use */, 11 | modelClientOptions: { 12 | apiKey: process.env.OPENAI_API_KEY, 13 | } /* Configuration options for the model client */, 14 | 15 | // Browser configuration 16 | env: 17 | process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID 18 | ? 
"BROWSERBASE" 19 | : "LOCAL", 20 | apiKey: process.env.BROWSERBASE_API_KEY /* API key for authentication */, 21 | projectId: process.env.BROWSERBASE_PROJECT_ID /* Project identifier */, 22 | browserbaseSessionID: 23 | undefined /* Session ID for resuming Browserbase sessions */, 24 | browserbaseSessionCreateParams: { 25 | projectId: process.env.BROWSERBASE_PROJECT_ID!, 26 | browserSettings: { 27 | blockAds: true, 28 | viewport: { 29 | width: 1024, 30 | height: 768, 31 | }, 32 | }, 33 | }, 34 | localBrowserLaunchOptions: { 35 | headless: false, 36 | viewport: { 37 | width: 1024, 38 | height: 768, 39 | }, 40 | } /* Configuration options for the local browser */, 41 | }; 42 | export default StagehandConfig; 43 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "esModuleInterop": true, 5 | "allowSyntheticDefaultImports": true, 6 | "target": "es6", 7 | "noImplicitAny": true, 8 | "moduleResolution": "node", 9 | "sourceMap": true, 10 | "outDir": "dist", 11 | "baseUrl": ".", 12 | "paths": { 13 | "*": ["node_modules/*", "lib/types/*"], 14 | "@/*": ["./*"] 15 | }, 16 | "skipLibCheck": true, 17 | "declaration": true 18 | }, 19 | "exclude": ["node_modules", "dist", ".eslintrc.cjs"] 20 | } 21 | -------------------------------------------------------------------------------- /types/act.ts: -------------------------------------------------------------------------------- 1 | import { LLMClient } from "../lib/llm/LLMClient"; 2 | import { Locator } from "@playwright/test"; 3 | import { Logger } from "@/types/log"; 4 | import { StagehandPage } from "@/lib/StagehandPage"; 5 | 6 | // WARNING: This is NOT to be confused with the ActParams type used in `page.act()`. 7 | // This is the type for the parameters passed to the `act` command in `inference.ts`. 
// page.act() params/result types are defined in `types/stagehand.ts`.
export interface ActCommandParams {
  action: string;
  steps?: string;
  domElements: string;
  llmClient: LLMClient;
  retries?: number;
  logger: (message: { category?: string; message: string }) => void;
  requestId: string;
  // NOTE(review): type arguments reconstructed as <string, string>; the digest
  // had stripped them. Confirm.
  variables?: Record<string, string>;
  userProvidedInstructions?: string;
}

// WARNING: This is NOT to be confused with the ActResult type used in `page.act()`.
// This is the type for the result of the `act` command in `inference.ts`.
// page.act() params/result types are defined in `types/stagehand.ts`.
export interface ActCommandResult {
  method: string;
  element: number;
  args: unknown[];
  completed: boolean;
  step: string;
  why?: string;
}

// We can use this enum to list the actions supported in performPlaywrightMethod
export enum SupportedPlaywrightAction {
  CLICK = "click",
  FILL = "fill",
  TYPE = "type",
  PRESS = "press",
  SCROLL = "scrollTo",
  NEXT_CHUNK = "nextChunk",
  PREV_CHUNK = "prevChunk",
}

/**
 * A context object to hold all parameters that might be needed by
 * any of the methods in the `methodHandlerMap`
 */
export interface MethodHandlerContext {
  method: string;
  locator: Locator;
  xpath: string;
  args: unknown[];
  logger: Logger;
  stagehandPage: StagehandPage;
  initialUrl: string;
  domSettleTimeoutMs?: number;
}

// --------------------------------------------------------------------------------
// /types/api.ts:
// --------------------------------------------------------------------------------
import Browserbase from "@browserbasehq/sdk";
import { LogLine } from "./log";

/** Constructor arguments for the Stagehand API client. */
export interface StagehandAPIConstructorParams {
  apiKey: string;
  projectId: string;
  logger: (message: LogLine) => void;
}

/** A single method invocation: method name plus its arguments. */
export interface ExecuteActionParams {
  method: "act" | "extract" | "observe" | "navigate" | "end" | "agentExecute";
  args?: unknown;
  params?: unknown;
}

/** Parameters for starting a session. */
export interface StartSessionParams {
  modelName: string;
  modelApiKey: string;
  domSettleTimeoutMs: number;
  verbose: number;
  debugDom: boolean;
  systemPrompt?: string;
  browserbaseSessionCreateParams?: Browserbase.Sessions.SessionCreateParams;
  selfHeal?: boolean;
  waitForCaptchaSolves?: boolean;
  actionTimeoutMs?: number;
  browserbaseSessionID?: string;
}

export interface StartSessionResult {
  sessionId: string;
}

// NOTE(review): the `<T>` parameters on SuccessResponse / ApiResponse are
// reconstructed from the `data: T` member; the digest had stripped them.
export interface SuccessResponse<T> {
  success: true;
  data: T;
}

export interface ErrorResponse {
  success: false;
  message: string;
}

/** Discriminated on `success`. */
export type ApiResponse<T> = SuccessResponse<T> | ErrorResponse;

// --------------------------------------------------------------------------------
// /types/browser.ts:
// --------------------------------------------------------------------------------
import { Browser, BrowserContext } from "./page";

/** Browser handles and session metadata for an environment. */
export interface BrowserResult {
  env: "LOCAL" | "BROWSERBASE";
  browser?: Browser;
  context: BrowserContext;
  debugUrl?: string;
  sessionUrl?: string;
  contextPath?: string;
  sessionId?: string;
}

// --------------------------------------------------------------------------------
// /types/context.ts:
// --------------------------------------------------------------------------------
import type { BrowserContext as PlaywrightContext } from "@playwright/test";
import { Page } from "../types/page";

/** Accessibility-tree node whose scalar fields are wrapped as `{ value }`. */
export interface AXNode {
  role?: { value: string };
  name?: { value: string };
  description?: { value: string };
  value?: { value: string };
  nodeId: string;
  backendDOMNodeId?: number;
  parentId?: string;
  childIds?: string[];
  properties?: {
    name: string;
    value: {
      type: string;
      value?: string;
    };
  }[];
}

/** Accessibility node with plain string fields and optional nested children. */
export type AccessibilityNode = {
  role: string;
  name?: string;
  description?: string;
  value?: string;
  children?: AccessibilityNode[];
  childIds?: string[];
  parentId?: string;
  nodeId?: string;
  backendDOMNodeId?: number;
  properties?: {
    name: string;
    value: {
      type: string;
      value?: string;
    };
  }[];
};

export interface TreeResult {
  tree: AccessibilityNode[];
  simplified: string;
  iframes?: AccessibilityNode[];
  // NOTE(review): type arguments reconstructed as <string, string>; confirm.
  idToUrl: Record<string, string>;
}

// NOTE(review): the `Omit<…>` arguments and `Promise<Page>` are reconstructed
// from the two members redeclared below; confirm against the repo.
export interface EnhancedContext
  extends Omit<PlaywrightContext, "newPage" | "pages"> {
  newPage(): Promise<Page>;
  pages(): Page[];
}

// --------------------------------------------------------------------------------
// /types/evals.ts:
// --------------------------------------------------------------------------------
import { z } from "zod";
import type { AvailableModel } from "../types/model";
import type { LogLine } from "../types/log";
import type { EvalCase } from "braintrust";
import { Stagehand } from "@/dist";
import { ConstructorParams } from "@/dist";
import { EvalLogger } from "@/evals/logger";

/** Input handed to every eval task. */
export type StagehandInitResult = {
  stagehand: Stagehand;
  logger: EvalLogger;
  debugUrl: string;
  sessionUrl: string;
  useTextExtract: boolean;
  stagehandConfig: ConstructorParams;
  modelName: AvailableModel;
};

/** Signature of an eval task. */
export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{
  _success: boolean;
  logs: LogLine[];
  debugUrl: string;
  sessionUrl: string;
  error?: unknown;
}>;

export const EvalCategorySchema = z.enum([
  "observe",
  "act",
  "combination",
  "extract",
  "experimental",
  "text_extract",
  "targeted_extract",
  "regression",
  "regression_llm_providers",
  "llm_clients",
  "agent",
]);

// NOTE(review): `z.infer`'s type argument was stripped in the digest; restored
// as the schema declared immediately above this line in the same file.
export type EvalCategory = z.infer<typeof EvalCategorySchema>;
export interface EvalInput {
  name: string;
  modelName: AvailableModel;
}

export interface Testcase
  extends EvalCase<
    EvalInput,
    unknown,
    { model: AvailableModel; test: string }
  > {
  input: EvalInput;
  name: string;
  tags: string[];
  metadata: { model: AvailableModel; test: string };
  expected: unknown;
}

export interface SummaryResult {
  input: EvalInput;
  output: { _success: boolean };
  name: string;
  score: number;
}

// NOTE(review): the three type parameters are reconstructed from the member
// types that reference them; the digest had stripped the parameter list.
export interface EvalArgs<TInput, TOutput, TExpected> {
  input: TInput;
  output: TOutput;
  expected: TExpected;
  metadata?: { model: AvailableModel; test: string };
}

export interface EvalResult {
  name: string;
  score: number;
}

export type LogLineEval = LogLine & {
  parsedAuxiliary?: string | object;
};

// --------------------------------------------------------------------------------
// /types/evaluator.ts:
// --------------------------------------------------------------------------------
export interface EvaluateOptions {
  /**
   * The question to ask about the task state
   */
  question: string;
  /**
   * Custom system prompt for the evaluator
   */
  systemPrompt?: string;
  /**
   * Delay in milliseconds before taking the screenshot
   * @default 1000
   */
  screenshotDelayMs?: number;
  /**
   * Whether to throw an error if the response is not a clear YES or NO
   * @default false
   */
  strictResponse?: boolean;
}

export interface BatchEvaluateOptions {
  /**
   * Array of questions to evaluate
   */
  questions: string[];
  /**
   * Custom system prompt for the evaluator
   */
  systemPrompt?: string;
  /**
   * Delay in milliseconds before taking the screenshot
   * @default 1000
   */
  screenshotDelayMs?: number;
  /**
   * Whether to throw an error if any response is not a clear YES or NO
   * @default false
   */
  strictResponse?: boolean;
  /**
   * The reasoning behind the evaluation
   */
  reasoning?: string;
}

/**
 * Result of an evaluation
 */
export interface EvaluationResult {
  /**
   * The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected)
   */
  evaluation: "YES" | "NO" | "INVALID";
  /**
   * The reasoning behind the evaluation
   */
  reasoning: string;
}

// --------------------------------------------------------------------------------
// /types/inference.ts:
// --------------------------------------------------------------------------------
import { LLMClient } from "../lib/llm/LLMClient";
import { LLMProvider } from "../lib/llm/LLMProvider";

export interface VerifyActCompletionParams {
  goal: string;
  steps: string;
  llmProvider: LLMProvider;
  llmClient: LLMClient;
  domElements?: string;
  logger: (message: { category?: string; message: string }) => void;
  requestId: string;
}

// --------------------------------------------------------------------------------
// /types/llm.ts:
// --------------------------------------------------------------------------------
export interface LLMTool {
  type: "function";
  name: string;
  description: string;
  // NOTE(review): type arguments reconstructed as <string, unknown>; confirm.
  parameters: Record<string, unknown>;
}

// --------------------------------------------------------------------------------
// /types/log.ts:
// --------------------------------------------------------------------------------
export type LogLevel = 0 | 1 | 2;

/**
 * Mapping between numeric log levels and their names
 *
 * 0 - error/warn - Critical issues or important warnings
 * 1 - info - Standard information messages
 * 2 - debug - Detailed information for debugging
 */
export const LOG_LEVEL_NAMES: Record<LogLevel, string> = {
  0: "error",
  1: "info",
  2: "debug",
};

export type LogLine = {
  id?: string;
  category?: string;
  message: string;
  level?: LogLevel;
  timestamp?: string;
  auxiliary?: {
    [key: string]: {
      value: string;
      type: "object" | "string" | "html" | "integer" | "float" | "boolean";
    };
  };
};

export type Logger = (logLine: LogLine) => void;

// --------------------------------------------------------------------------------
// /types/model.ts:
// --------------------------------------------------------------------------------
import type { ClientOptions as AnthropicClientOptions } from "@anthropic-ai/sdk";
import type { ClientOptions as OpenAIClientOptions } from "openai";
import { z } from "zod";

export const AvailableModelSchema = z.enum([
  "gpt-4.1",
  "gpt-4.1-mini",
  "gpt-4.1-nano",
  "o4-mini",
  "o3",
  "o3-mini",
  "o1",
  "o1-mini",
  "gpt-4o",
  "gpt-4o-mini",
  "gpt-4o-2024-08-06",
  "gpt-4.5-preview",
  "o1-preview",
  "claude-3-5-sonnet-latest",
  "claude-3-5-sonnet-20241022",
  "claude-3-5-sonnet-20240620",
  "claude-3-7-sonnet-latest",
  "claude-3-7-sonnet-20250219",
  "cerebras-llama-3.3-70b",
  "cerebras-llama-3.1-8b",
  "groq-llama-3.3-70b-versatile",
  "groq-llama-3.3-70b-specdec",
  "gemini-1.5-flash",
  "gemini-1.5-pro",
  "gemini-1.5-flash-8b",
  "gemini-2.0-flash-lite",
  "gemini-2.0-flash",
  "gemini-2.5-flash-preview-04-17",
  "gemini-2.5-pro-preview-03-25",
]);

export type AvailableModel = z.infer<typeof AvailableModelSchema>;

export type ModelProvider =
  | "openai"
  | "anthropic"
  | "cerebras"
  | "groq"
  | "google";

export type ClientOptions = OpenAIClientOptions | AnthropicClientOptions;

// NOTE(review): both `Record` type arguments reconstructed as
// <string, unknown>; confirm.
export interface AnthropicJsonSchemaObject {
  definitions?: {
    MySchema?: { properties?: Record<string, unknown>; required?: string[] };
  };
  properties?: Record<string, unknown>;
  required?: string[];
}

// --------------------------------------------------------------------------------
// /types/operator.ts:
// --------------------------------------------------------------------------------
import { z } from "zod";

// NOTE(review): line breaks inside the two multi-line template strings are
// kept where the digest showed them, but the leading whitespace of each
// continuation line was lost in flattening and is reconstructed. Confirm
// against the repo if exact prompt bytes matter.
export const operatorResponseSchema = z.object({
  reasoning: z
    .string()
    .describe(
      "The reasoning for the step taken. If this step's method is `close`, the goal was to extract data, and the task was successful, state the data that was extracted.",
    ),
  method: z.enum([
    "act",
    "extract",
    "goto",
    "close",
    "wait",
    "navback",
    "refresh",
  ])
    .describe(`The action to perform on the page based off of the goal and the current state of the page.
      goto: Navigate to a specific URL.
      act: Perform an action on the page.
      extract: Extract data from the page.
      close: The task is complete, close the browser.
      wait: Wait for a period of time.
      navback: Navigate back to the previous page. Do not navigate back if you are already on the first page.
      refresh: Refresh the page.`),
  parameters: z
    .string()
    .describe(
      `The parameter for the action. Only pass in a parameter for the following methods:
        - act: The action to perform. e.g. "click on the submit button" or "type [email] into the email input field and press enter"
        - extract: The data to extract. e.g. "the title of the article". If you want to extract all of the text on the page, leave this undefined.
        - wait: The amount of time to wait in milliseconds.
        - goto: The URL to navigate to. e.g. "https://www.google.com"
        The other methods do not require a parameter.`,
    )
    .optional(),
  taskComplete: z
    .boolean()
    .describe(
      "Whether the task is complete. If true, the task is complete and no more steps are needed. If you chose to close the task because the goal is not achievable, set this to false.",
    ),
});

export type OperatorResponse = z.infer<typeof operatorResponseSchema>;

export const operatorSummarySchema = z.object({
  answer: z.string().describe("The final answer to the original instruction."),
});

export type OperatorSummary = z.infer<typeof operatorSummarySchema>;

// --------------------------------------------------------------------------------
// /types/page.ts:
// --------------------------------------------------------------------------------
import type {
  Browser as PlaywrightBrowser,
  BrowserContext as PlaywrightContext,
  Page as PlaywrightPage,
} from "@playwright/test";
import { z } from "zod";
import type {
  ActOptions,
  ActResult,
  ExtractOptions,
  ExtractResult,
  ObserveOptions,
  ObserveResult,
} from "./stagehand";

export const defaultExtractSchema = z.object({
  extraction: z.string(),
});

export const pageTextSchema = z.object({
  page_text: z.string(),
});

// NOTE(review): every type argument in this interface (the `Omit<…>` base, the
// `Promise<…>` results and the `extract` type parameters) is reconstructed;
// the digest showed only `Promise;` / `Promise>;`. Confirm against the repo.
export interface Page extends Omit<PlaywrightPage, "on"> {
  act(action: string): Promise<ActResult>;
  act(options: ActOptions): Promise<ActResult>;
  act(observation: ObserveResult): Promise<ActResult>;

  extract<T extends z.AnyZodObject = typeof defaultExtractSchema>(
    instruction: string,
  ): Promise<ExtractResult<T>>;
  extract<T extends z.AnyZodObject>(
    options: ExtractOptions<T>,
  ): Promise<ExtractResult<T>>;
  extract(): Promise<ExtractResult<typeof pageTextSchema>>;

  observe(): Promise<ObserveResult[]>;
  observe(instruction: string): Promise<ObserveResult[]>;
  observe(options?: ObserveOptions): Promise<ObserveResult[]>;

  // Adds a "popup" overload whose listener receives this `Page` type, on top
  // of Playwright's own `on` signatures.
  on: {
    (event: "popup", listener: (page: Page) => unknown): Page;
  } & PlaywrightPage["on"];
}

// Empty type for now, but will be used in the future
export type BrowserContext = PlaywrightContext;

// Empty type for now, but will be used in the future
export type Browser = PlaywrightBrowser;

// --------------------------------------------------------------------------------
// /types/playwright.ts:
// --------------------------------------------------------------------------------
/** Error type named "PlaywrightCommandException". */
export class PlaywrightCommandException extends Error {
  constructor(message: string) {
    super(message);
    this.name = "PlaywrightCommandException";
  }
}

/** Error type named "PlaywrightCommandMethodNotSupportedException". */
export class PlaywrightCommandMethodNotSupportedException extends Error {
  constructor(message: string) {
    super(message);
    this.name = "PlaywrightCommandMethodNotSupportedException";
  }
}

/** Navigation options: timeout, wait condition and referer. */
export interface GotoOptions {
  timeout?: number;
  waitUntil?: "load" | "domcontentloaded" | "networkidle" | "commit";
  referer?: string;
}

// --------------------------------------------------------------------------------
// /types/stagehandApiErrors.ts:
// --------------------------------------------------------------------------------
/**
 * Base class for Stagehand API errors. `name` is taken from the concrete
 * class, so each subclass reports its own class name.
 */
export class StagehandAPIError extends Error {
  constructor(message: string) {
    super(message);
    this.name = this.constructor.name;
  }
}

/** Falls back to "Unauthorized request" when no (or an empty) message is given. */
export class StagehandAPIUnauthorizedError extends StagehandAPIError {
  constructor(message?: string) {
    super(message || "Unauthorized request");
  }
}

// The three subclasses below inherit the base `(message: string)` constructor;
// a constructor that only forwards to `super` adds nothing.
export class StagehandHttpError extends StagehandAPIError {}

export class StagehandServerError extends StagehandAPIError {}

/** Always carries the fixed message "Response body is null". */
export class StagehandResponseBodyError extends StagehandAPIError {
  constructor() {
    super("Response body is null");
  }
}

export class StagehandResponseParseError extends StagehandAPIError {}

// --------------------------------------------------------------------------------
// /types/textannotation.ts:
// --------------------------------------------------------------------------------
/** A piece of text with its bottom-left anchor (raw and normalized) and size. */
export type TextAnnotation = {
  text: string;
  bottom_left: { x: number; y: number };
  bottom_left_normalized: { x: number; y: number };
  width: number;
  height: number;
};
// --------------------------------------------------------------------------------