├── .prettierrc ├── results ├── .gitkeep └── result-2025-12-07-16-48-32.json ├── .bunversion ├── .cocominify ├── GEMINI.md ├── .env.example ├── .prettierignore ├── .cocoignore ├── vitest-setup.js ├── svelte.config.js ├── lib ├── tools │ ├── index.ts │ ├── result-write.ts │ └── test-component.ts ├── test-discovery.ts ├── utils.ts ├── report.ts ├── pricing.ts ├── utils.test.ts ├── output-test-runner.ts ├── verify-references.ts ├── pricing.test.ts ├── report-styles.ts └── report-template.ts ├── CLAUDE.md ├── verify-references.ts ├── scripts └── install.sh ├── vitest.config.ts ├── tests ├── snippets │ ├── Reference.svelte │ ├── prompt.md │ └── test.ts ├── counter │ ├── Reference.svelte │ ├── prompt.md │ └── test.ts └── derived-by │ ├── Reference.svelte │ ├── prompt.md │ └── test.ts ├── .gitignore ├── .github └── workflows │ └── test.yml ├── tsconfig.json ├── eslint.config.js ├── generate-report.ts ├── patches └── @ai-sdk+mcp+0.0.11.patch ├── package.json ├── README.md ├── AGENTS.md └── index.ts /.prettierrc: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.bunversion: -------------------------------------------------------------------------------- 1 | bun-v1.3.4 -------------------------------------------------------------------------------- /.cocominify: -------------------------------------------------------------------------------- 1 | tests/ 2 | data/ -------------------------------------------------------------------------------- /GEMINI.md: -------------------------------------------------------------------------------- 1 | @AGENTS.md 2 | -------------------------------------------------------------------------------- /.env.example: 
-------------------------------------------------------------------------------- 1 | VERCEL_OIDC_TOKEN="create with vercel cli" -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | # Ignore artifacts: 2 | tests 3 | results -------------------------------------------------------------------------------- /.cocoignore: -------------------------------------------------------------------------------- 1 | .env.example 2 | results/ 3 | .claude 4 | patches -------------------------------------------------------------------------------- /vitest-setup.js: -------------------------------------------------------------------------------- 1 | import "@testing-library/jest-dom/vitest"; 2 | -------------------------------------------------------------------------------- /svelte.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | compilerOptions: { 3 | runes: true, 4 | }, 5 | }; 6 | -------------------------------------------------------------------------------- /lib/tools/index.ts: -------------------------------------------------------------------------------- 1 | export { resultWriteTool } from "./result-write.ts"; 2 | export { testComponentTool } from "./test-component.ts"; 3 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
/**
 * Tool the agent calls to submit its finished Svelte component.
 *
 * The handler only logs how many lines were received and acknowledges the
 * call; it does not store or validate the submitted code.
 */
export const resultWriteTool = tool({
  description:
    "Write your final Svelte component code. Call this when you have completed implementing the component and are ready to submit.",
  inputSchema: z.object({
    content: z.string().describe("The complete Svelte component code"),
  }),
  async execute({ content }) {
    // Line count is for the progress log only.
    const lineCount = content.split("\n").length;
    console.log(`[ResultWrite] Received ${lineCount} lines of code`);
    return { success: true };
  },
});
- uses: actions/checkout@v4 15 | 16 | - name: Get Bun version 17 | id: bun-version 18 | run: echo "version=$(cat .bunversion | sed 's/bun-v//')" >> $GITHUB_OUTPUT 19 | 20 | - uses: oven-sh/setup-bun@v2 21 | with: 22 | bun-version: ${{ steps.bun-version.outputs.version }} 23 | 24 | - name: Install dependencies 25 | run: bun install 26 | 27 | - name: Run ESLint 28 | run: bun run lint 29 | 30 | - name: TypeScript type check 31 | run: bun run tsc 32 | 33 | - name: Run tests 34 | run: bun test 35 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | // Environment setup & latest features 4 | "lib": ["ESNext"], 5 | "target": "ESNext", 6 | "module": "Preserve", 7 | "moduleDetection": "force", 8 | "jsx": "react-jsx", 9 | "allowJs": true, 10 | 11 | // Bundler mode 12 | "moduleResolution": "bundler", 13 | "allowImportingTsExtensions": true, 14 | "verbatimModuleSyntax": true, 15 | "noEmit": true, 16 | 17 | // Best practices 18 | "strict": true, 19 | "skipLibCheck": true, 20 | "noFallthroughCasesInSwitch": true, 21 | "noUncheckedIndexedAccess": true, 22 | "noImplicitOverride": true, 23 | 24 | // Some stricter flags (disabled by default) 25 | "noUnusedLocals": false, 26 | "noUnusedParameters": false, 27 | "noPropertyAccessFromIndexSignature": false 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import tseslint from "@typescript-eslint/eslint-plugin"; 2 | import parser from "@typescript-eslint/parser"; 3 | 4 | export default [ 5 | { 6 | files: ["**/*.ts"], 7 | languageOptions: { 8 | parser, 9 | parserOptions: { 10 | project: "./tsconfig.json", 11 | }, 12 | }, 13 | plugins: { 14 | "@typescript-eslint": tseslint, 15 | }, 16 | rules: { 17 | 
"@typescript-eslint/naming-convention": [ 18 | "error", 19 | { 20 | selector: "variable", 21 | format: ["camelCase"], 22 | leadingUnderscore: "allow", 23 | trailingUnderscore: "allow", 24 | }, 25 | { 26 | selector: "variable", 27 | modifiers: ["const"], 28 | format: ["camelCase", "UPPER_CASE"], 29 | leadingUnderscore: "allow", 30 | trailingUnderscore: "allow", 31 | }, 32 | ], 33 | }, 34 | }, 35 | ]; 36 | -------------------------------------------------------------------------------- /tests/snippets/prompt.md: -------------------------------------------------------------------------------- 1 | # Snippet Component Task 2 | 3 | Create a simple Svelte 5 component that demonstrates the basic use of snippets. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a component with a hardcoded array of 3 book titles (strings) - "The Lord of the Rings", "To Kill a Mockingbird", and "1984" 8 | 2. Create a snippet called `title` that takes a book title string as a parameter 9 | 3. The snippet should display the book title in a `<span>` element with `data-testid="book-title"` 10 | 4. Use the `{@render ...}` syntax to render the snippet for each book title in a list 11 | 5. Each rendered title should be wrapped in a `<li>` element with `data-testid="book-item"` 12 | 13 | ## Example HTML structure: 14 | 15 | ```html 16 | <ul> 17 | <li data-testid="book-item"><span data-testid="book-title">The Lord of the Rings</span></li> 18 | <li data-testid="book-item"><span data-testid="book-title">To Kill a Mockingbird</span></li> 19 | <li data-testid="book-item"><span data-testid="book-title">1984</span></li> 20 | </ul> 21 | ``` 22 | 23 | Please implement this component using Svelte 5 runes. 24 | -------------------------------------------------------------------------------- /tests/counter/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 14 | 15 |
    16 | 17 | {count} 18 | 19 |
    20 | 21 | 49 | -------------------------------------------------------------------------------- /tests/counter/prompt.md: -------------------------------------------------------------------------------- 1 | # Counter Component Task 2 | 3 | Create a Svelte 5 component called Counter that implements a simple counter with increment and decrement functionality. 4 | 5 | ## Requirements: 6 | 7 | 1. Use Svelte 5's `$state` for reactivity 8 | 2. The counter should start at 0 9 | 3. Include a decrement button with the `data-testid="decrement-button"` attribute 10 | 4. Include an increment button with the `data-testid="increment-button"` attribute 11 | 5. Display the current count with the `data-testid="count-value"` attribute 12 | 6. Clicking increment should increase the count by 1 13 | 7. Clicking decrement should decrease the count by 1 14 | 8. Style the counter with a CSS class "counter" 15 | 16 | Example structure: 17 | 18 | ```html 19 |
    20 | 21 | 0 22 | 23 |
    24 | ``` 25 | 26 | Please implement this component using Svelte 5 syntax. Make sure you only return one component. 27 | -------------------------------------------------------------------------------- /generate-report.ts: -------------------------------------------------------------------------------- 1 | import { generateReport } from "./lib/report.ts"; 2 | import { readdirSync } from "node:fs"; 3 | 4 | function getAllResultFiles(): string[] { 5 | const resultsDir = "results"; 6 | const files = readdirSync(resultsDir); 7 | 8 | const resultFiles = files.filter( 9 | (file) => file.startsWith("result-") && file.endsWith(".json"), 10 | ); 11 | 12 | if (resultFiles.length === 0) { 13 | throw new Error("No result files found in results/ directory"); 14 | } 15 | 16 | resultFiles.sort((a, b) => b.localeCompare(a)); 17 | 18 | return resultFiles.map((file) => `${resultsDir}/${file}`); 19 | } 20 | 21 | const resultFiles = getAllResultFiles(); 22 | 23 | console.log(`Found ${resultFiles.length} result file(s) to process\n`); 24 | 25 | for (const jsonPath of resultFiles) { 26 | const htmlPath = jsonPath.replace(/\.json$/, ".html"); 27 | console.log(`Generating report: ${jsonPath} -> ${htmlPath}`); 28 | await generateReport(jsonPath, htmlPath, false); 29 | } 30 | 31 | console.log(`\nSuccessfully generated ${resultFiles.length} HTML report(s)`); 32 | -------------------------------------------------------------------------------- /patches/@ai-sdk+mcp+0.0.11.patch: -------------------------------------------------------------------------------- 1 | diff --git a/node_modules/@ai-sdk/mcp/dist/index.mjs b/node_modules/@ai-sdk/mcp/dist/index.mjs 2 | index 0000000..0000000 100644 3 | --- a/node_modules/@ai-sdk/mcp/dist/index.mjs 4 | +++ b/node_modules/@ai-sdk/mcp/dist/index.mjs 5 | @@ -1350,7 +1350,7 @@ var HttpMCPTransport = class { 6 | const { done, value } = await reader.read(); 7 | if (done) return; 8 | const { event, data } = value; 9 | - if (event === "message") { 10 | + if 
(!event || event === "message") { 11 | try { 12 | const msg = JSONRPCMessageSchema.parse(JSON.parse(data)); 13 | (_a4 = this.onmessage) == null ? void 0 : _a4.call(this, msg); 14 | @@ -1475,7 +1475,7 @@ var HttpMCPTransport = class { 15 | if (id) { 16 | this.lastInboundEventId = id; 17 | } 18 | - if (event === "message") { 19 | + if (!event || event === "message") { 20 | try { 21 | const msg = JSONRPCMessageSchema.parse(JSON.parse(data)); 22 | (_a4 = this.onmessage) == null ? void 0 : _a4.call(this, msg); 23 | -------------------------------------------------------------------------------- /tests/derived-by/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 |
    24 | 25 | 26 | 27 |
    28 |

    Words: {textStats.wordCount}

    29 |

    Characters: {textStats.charCount}

    30 |

    31 | Status: {textStats.isLongText ? "Long text" : "Short text"} 32 |

    33 |
    34 |
    35 | 36 | 51 | -------------------------------------------------------------------------------- /tests/snippets/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import BookList from "./Component.svelte"; 4 | 5 | describe("Simple Snippet component", () => { 6 | test("renders the correct number of book titles", () => { 7 | render(BookList); 8 | 9 | const bookItems = screen.getAllByTestId("book-item"); 10 | expect(bookItems.length).toBe(3); 11 | 12 | const bookTitles = screen.getAllByTestId("book-title"); 13 | expect(bookTitles.length).toBe(3); 14 | }); 15 | 16 | test("displays correct book titles", () => { 17 | render(BookList); 18 | 19 | const bookTitles = screen.getAllByTestId("book-title"); 20 | 21 | expect(bookTitles[0]).toHaveTextContent("The Lord of the Rings"); 22 | expect(bookTitles[1]).toHaveTextContent("To Kill a Mockingbird"); 23 | expect(bookTitles[2]).toHaveTextContent("1984"); 24 | }); 25 | 26 | test("has the correct structure for each book item", () => { 27 | render(BookList); 28 | 29 | const bookItems = screen.getAllByTestId("book-item"); 30 | 31 | bookItems.forEach((item) => { 32 | expect(item.tagName).toBe("LI"); 33 | 34 | const title = item.querySelector('[data-testid="book-title"]'); 35 | expect(title).toBeInTheDocument(); 36 | expect(title?.tagName).toBe("SPAN"); 37 | }); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /tests/derived-by/prompt.md: -------------------------------------------------------------------------------- 1 | # $derived.by Component Task 2 | 3 | Create a Svelte 5 component that demonstrates the `$derived.by` rune for complex derivations. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a text input field that allows the user to enter text 8 | 2. 
Use `$state` to store the current text value, starting with an empty string 9 | 3. Use `$derived.by` to calculate: 10 | - The number of words in the text 11 | - The number of characters in the text 12 | - Whether the text is considered "long" (more than 15 words) 13 | 4. Display all these derived values below the input field 14 | 5. Include a "Clear" button that resets the text to an empty string 15 | 16 | Elements should have these data-testid attributes: 17 | 18 | - "text-input" for the input field 19 | - "word-count" for displaying the word count 20 | - "char-count" for displaying the character count 21 | - "length-indicator" for displaying whether the text is long 22 | - "clear-button" for the clear button 23 | 24 | Example structure: 25 | 26 | ```html 27 |
    28 | 29 | 30 |
    31 |

    Words: 0

    32 |

    Characters: 0

    33 |

    Status: Short text

    34 |
    35 |
    36 | ``` 37 | 38 | Please implement this component using Svelte 5 runes. 39 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-sdk-bench", 3 | "module": "index.ts", 4 | "type": "module", 5 | "private": true, 6 | "scripts": { 7 | "start": "bun run vercel:env:pull && bun ./index.ts", 8 | "postinstall": "patch-package", 9 | "update": "bun update --interactive", 10 | "verify-tests": "bun run verify-references.ts", 11 | "prettier": "prettier --write .", 12 | "lint": "eslint .", 13 | "lint:fix": "eslint . --fix", 14 | "tsc": "tsc --noEmit", 15 | "vercel:link": "vercel link", 16 | "vercel:env:pull": "vercel env pull .env.local --yes" 17 | }, 18 | "devDependencies": { 19 | "@sveltejs/vite-plugin-svelte": "^5.0.3", 20 | "@testing-library/jest-dom": "^6.6.3", 21 | "@types/bun": "^1.3.3", 22 | "@typescript-eslint/eslint-plugin": "^8.49.0", 23 | "@typescript-eslint/parser": "^8.49.0", 24 | "eslint": "^9.39.2", 25 | "jsdom": "^26.1.0", 26 | "patch-package": "^8.0.1", 27 | "prettier": "^3.7.4" 28 | }, 29 | "peerDependencies": { 30 | "typescript": "^5.9.3" 31 | }, 32 | "dependencies": { 33 | "@ai-sdk/anthropic": "^2.0.53", 34 | "@ai-sdk/mcp": "0.0.11", 35 | "@ai-sdk/openai": "^2.0.77", 36 | "@ai-sdk/openai-compatible": "^1.0.28", 37 | "@clack/prompts": "^0.11.0", 38 | "@openrouter/ai-sdk-provider": "^1.4.1", 39 | "@testing-library/svelte": "^5.2.9", 40 | "@testing-library/user-event": "^14.6.1", 41 | "ai": "^5.0.108", 42 | "vercel": "^49.1.2", 43 | "vitest": "^4.0.15", 44 | "zod": "^4.1.13" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /lib/test-discovery.ts: -------------------------------------------------------------------------------- 1 | import { readdirSync, statSync, existsSync, readFileSync } from "node:fs"; 2 | import { join } from "node:path"; 3 | 4 | export 
/**
 * Scan `<cwd>/tests` for benchmark test directories.
 *
 * A sub-directory counts as a test when it contains `Reference.svelte`,
 * `test.ts` and `prompt.md`; incomplete directories are skipped with a
 * warning naming the missing files. `Component.svelte` is only recorded as a
 * path and is not required to exist.
 *
 * @returns One definition per complete test directory, sorted by name. Any
 *   filesystem error is logged and whatever was collected so far is returned.
 */
export function discoverTests() {
  const testsDir = join(process.cwd(), "tests");
  const definitions = [];

  try {
    for (const entry of readdirSync(testsDir)) {
      const entryPath = join(testsDir, entry);
      if (!statSync(entryPath).isDirectory()) {
        continue;
      }

      const referenceFile = join(entryPath, "Reference.svelte");
      const testFile = join(entryPath, "test.ts");
      const promptFile = join(entryPath, "prompt.md");
      const componentFile = join(entryPath, "Component.svelte");

      // Names of the required files that are absent from this directory.
      const missing = [
        { label: "Reference.svelte", path: referenceFile },
        { label: "test.ts", path: testFile },
        { label: "prompt.md", path: promptFile },
      ]
        .filter((required) => !existsSync(required.path))
        .map((required) => required.label);

      if (missing.length > 0) {
        console.warn(`⚠️ Skipping ${entry}: missing ${missing.join(", ")}`);
        continue;
      }

      definitions.push({
        name: entry,
        directory: entryPath,
        referenceFile,
        componentFile,
        testFile,
        promptFile,
        prompt: readFileSync(promptFile, "utf-8"),
      });
    }
  } catch (error) {
    console.error("Error discovering tests:", error);
  }

  definitions.sort((a, b) => a.name.localeCompare(b.name));

  return definitions;
}
/**
 * Build the "test my component" tool for one benchmark test.
 *
 * The returned tool runs the submitted component through
 * `runTestVerification`, cleans up the test environment afterwards, and
 * reports pass/fail counts (plus per-test failure messages on failure).
 * Errors thrown while running are converted into a `success: false` result
 * instead of propagating.
 *
 * @param test Definition of the test the component is checked against.
 */
export function testComponentTool(test: TestDefinition) {
  const inputSchema = z.object({
    content: z
      .string()
      .describe("The complete Svelte component code to test"),
  });

  return tool({
    description:
      "Test your Svelte component against the test suite. Use this to verify your implementation and get feedback on any failing tests before submitting with ResultWrite. Returns detailed information about which tests passed or failed.",
    inputSchema,
    async execute({ content }) {
      const lineCount = content.split("\n").length;
      console.log(`[TestComponent] Testing ${lineCount} lines of code...`);

      try {
        const result = await runTestVerification(test, content);

        cleanupTestEnvironment(test.name);

        // Fields shared by the success and failure payloads.
        const summary = {
          passed: result.numPassed,
          failed: result.numFailed,
          total: result.numTests,
          duration: result.duration,
        };

        if (!result.passed) {
          console.log(
            `[TestComponent] ✗ ${result.numFailed}/${result.numTests} tests failed`,
          );
          return {
            success: false,
            message: `${result.numFailed} of ${result.numTests} tests failed`,
            ...summary,
            error: result.error,
            failedTests: result.failedTests?.map((failure) => ({
              name: failure.fullName,
              error: failure.errorMessage,
            })),
          };
        }

        console.log(`[TestComponent] ✓ All ${result.numTests} tests passed`);
        return {
          success: true,
          message: `All ${result.numTests} tests passed!`,
          ...summary,
        };
      } catch (error) {
        // Still clean up when the run itself blew up.
        cleanupTestEnvironment(test.name);
        console.log(`[TestComponent] ✗ Error running tests`);
        const reason = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          message: "Failed to run tests",
          error: reason,
        };
      }
    },
  });
}
); 34 | 35 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 24"); 36 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 134"); 37 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Long text"); 38 | }); 39 | 40 | test("clear button resets the text", async () => { 41 | const user = userEvent.setup(); 42 | render(TextAnalyzer); 43 | 44 | const input = screen.getByTestId("text-input"); 45 | const clearButton = screen.getByTestId("clear-button"); 46 | 47 | // Enter some text 48 | await user.type(input, "Hello world"); 49 | 50 | // Verify counts 51 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 2"); 52 | 53 | // Click the clear button 54 | await user.click(clearButton); 55 | 56 | // Verify everything is reset 57 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 0"); 58 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 0"); 59 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text"); 60 | expect(input).toHaveValue(""); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /tests/counter/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import Counter from "./Component.svelte"; // Your component that is written using the ResultWrite tool 5 | 6 | describe("Counter component", () => { 7 | test("renders with initial count of 0", () => { 8 | render(Counter); 9 | 10 | // Use data-testid to get elements 11 | const countElement = screen.getByTestId("count-value"); 12 | const decrementButton = screen.getByTestId("decrement-button"); 13 | const incrementButton = screen.getByTestId("increment-button"); 14 | 15 | // Check initial state 16 | 
expect(countElement).toHaveTextContent("0"); 17 | expect(decrementButton).toBeInTheDocument(); 18 | expect(incrementButton).toBeInTheDocument(); 19 | }); 20 | 21 | test("increments the count when + button is clicked", async () => { 22 | const user = userEvent.setup(); 23 | render(Counter); 24 | 25 | const incrementButton = screen.getByTestId("increment-button"); 26 | const countElement = screen.getByTestId("count-value"); 27 | 28 | // Initial count should be 0 29 | expect(countElement).toHaveTextContent("0"); 30 | 31 | // Click the increment button 32 | await user.click(incrementButton); 33 | 34 | // Count should now be 1 35 | expect(countElement).toHaveTextContent("1"); 36 | }); 37 | 38 | test("decrements the count when - button is clicked", async () => { 39 | const user = userEvent.setup(); 40 | render(Counter); 41 | 42 | const decrementButton = screen.getByTestId("decrement-button"); 43 | const countElement = screen.getByTestId("count-value"); 44 | 45 | // Initial count should be 0 46 | expect(countElement).toHaveTextContent("0"); 47 | 48 | // Click the decrement button 49 | await user.click(decrementButton); 50 | 51 | // Count should now be -1 52 | expect(countElement).toHaveTextContent("-1"); 53 | }); 54 | 55 | test("handles multiple clicks correctly", async () => { 56 | const user = userEvent.setup(); 57 | render(Counter); 58 | 59 | const decrementButton = screen.getByTestId("decrement-button"); 60 | const incrementButton = screen.getByTestId("increment-button"); 61 | const countElement = screen.getByTestId("count-value"); 62 | 63 | // Increment twice 64 | await user.click(incrementButton); 65 | await user.click(incrementButton); 66 | expect(countElement).toHaveTextContent("2"); 67 | 68 | // Decrement once 69 | await user.click(decrementButton); 70 | expect(countElement).toHaveTextContent("1"); 71 | }); 72 | }); 73 | -------------------------------------------------------------------------------- /lib/utils.ts: 
/**
 * Make a model identifier safe for use in a file name: every character other
 * than an ASCII letter, digit or `.` becomes `-`.
 */
export function sanitizeModelName(modelName: string) {
  return modelName.replaceAll(/[^a-zA-Z0-9.]/g, "-");
}

/**
 * Build `<prefix>-<YYYY-MM-DD-HH-mm-ss>[-<model>].<extension>` using the UTC
 * fields of `now`.
 *
 * @param prefix Leading part of the file name.
 * @param extension File extension without the dot.
 * @param modelName Optional model identifier; sanitized and appended when
 *   non-empty.
 * @param now Moment to stamp the name with (defaults to the current time).
 */
export function getTimestampedFilename(
  prefix: string,
  extension: string,
  modelName?: string,
  now: Date = new Date(),
) {
  const twoDigits = (value: number) => String(value).padStart(2, "0");

  // The year is deliberately left unpadded; every other field is zero-padded.
  const timestamp = [
    now.getUTCFullYear(),
    twoDigits(now.getUTCMonth() + 1),
    twoDigits(now.getUTCDate()),
    twoDigits(now.getUTCHours()),
    twoDigits(now.getUTCMinutes()),
    twoDigits(now.getUTCSeconds()),
  ].join("-");

  let modelSuffix = "";
  if (modelName) {
    modelSuffix = `-${sanitizeModelName(modelName)}`;
  }

  return `${prefix}-${timestamp}${modelSuffix}.${extension}`;
}

/** Report whether `str` begins with `http://` or `https://`. */
export function isHttpUrl(str: string) {
  return ["http://", "https://"].some((scheme) => str.startsWith(scheme));
}

/**
 * Find the component code submitted through the `ResultWrite` tool.
 *
 * Steps are scanned in order and the first `tool-call` block named
 * `ResultWrite` decides the result.
 *
 * @returns That call's `input.content`, or `null` when the call carries no
 *   content or no such call exists.
 */
export function extractResultWriteContent(steps: unknown[]) {
  for (const step of steps) {
    const { content: blocks } = step as {
      content?: Array<{
        type: string;
        toolName?: string;
        input?: { content: string };
      }>;
    };
    if (!blocks) {
      continue;
    }

    for (const block of blocks) {
      const isResultWrite =
        block.type === "tool-call" && block.toolName === "ResultWrite";
      if (isResultWrite) {
        return block.input?.content ?? null;
      }
    }
  }
  return null;
}
/**
 * Sum token usage over every step of every test and price the totals.
 *
 * Each usage counter is treated as 0 when absent, so a step with incomplete
 * usage data cannot turn the totals (and therefore the costs) into `NaN`.
 * Previously only `cachedInputTokens` had that guard.
 *
 * @param tests Test results whose `steps[].usage` counters are accumulated.
 * @param pricing Pricing passed through to `calculateCost`.
 * @returns The costs computed by `calculateCost` together with the summed
 *   input, output and cached-input token counts.
 */
export function calculateTotalCost(
  tests: SingleTestResult[],
  pricing: ModelPricing,
): TotalCostInfo {
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  let totalCachedInputTokens = 0;

  for (const test of tests) {
    for (const step of test.steps) {
      // NOTE(review): the local `Usage` type declares these as numbers, but
      // the existing guard on cachedInputTokens suggests recorded data may
      // omit them — all three are guarded the same way here.
      totalInputTokens += step.usage.inputTokens ?? 0;
      totalOutputTokens += step.usage.outputTokens ?? 0;
      totalCachedInputTokens += step.usage.cachedInputTokens ?? 0;
    }
  }

  const costResult = calculateCost(
    pricing,
    totalInputTokens,
    totalOutputTokens,
    totalCachedInputTokens,
  );

  return {
    inputCost: costResult.inputCost,
    outputCost: costResult.outputCost,
    cacheReadCost: costResult.cacheReadCost,
    totalCost: costResult.totalCost,
    inputTokens: totalInputTokens,
    outputTokens: totalOutputTokens,
    cachedInputTokens: totalCachedInputTokens,
  };
}
| role: "user" | "assistant"; 34 | content: ContentBlock[]; 35 | } 36 | 37 | interface Usage { 38 | inputTokens: number; 39 | outputTokens: number; 40 | totalTokens: number; 41 | cachedInputTokens: number; 42 | } 43 | 44 | interface RequestBody { 45 | model: string; 46 | max_tokens: number; 47 | messages: Message[]; 48 | } 49 | 50 | interface ResponseBody { 51 | id: string; 52 | timestamp: string; 53 | modelId: string; 54 | [key: string]: unknown; 55 | } 56 | 57 | interface Step { 58 | content: ContentBlock[]; 59 | finishReason: string; 60 | usage: Usage; 61 | request: { 62 | body: RequestBody; 63 | }; 64 | response: ResponseBody; 65 | [key: string]: unknown; 66 | } 67 | 68 | export interface PricingInfo { 69 | inputCostPerMTok: number; 70 | outputCostPerMTok: number; 71 | cacheReadCostPerMTok?: number; 72 | } 73 | 74 | export interface TotalCostInfo { 75 | inputCost: number; 76 | outputCost: number; 77 | cacheReadCost: number; 78 | totalCost: number; 79 | inputTokens: number; 80 | outputTokens: number; 81 | cachedInputTokens: number; 82 | } 83 | 84 | interface Metadata { 85 | mcpEnabled: boolean; 86 | mcpServerUrl: string | null; 87 | mcpTransportType?: string | null; 88 | timestamp: string; 89 | model: string; 90 | pricingKey?: string | null; 91 | pricing?: PricingInfo | null; 92 | totalCost?: TotalCostInfo | null; 93 | } 94 | 95 | export interface SingleTestResult { 96 | testName: string; 97 | prompt: string; 98 | steps: Step[]; 99 | resultWriteContent: string | null; 100 | verification: TestVerificationResult | null; 101 | } 102 | 103 | export interface MultiTestResultData { 104 | tests: SingleTestResult[]; 105 | metadata: Metadata; 106 | } 107 | 108 | interface LegacyResultData { 109 | steps: Step[]; 110 | resultWriteContent?: string | null; 111 | metadata?: Metadata; 112 | } 113 | 114 | export async function generateReport( 115 | resultPath: string, 116 | outputPath: string, 117 | openBrowser = true, 118 | ) { 119 | try { 120 | const jsonContent = await 
readFile(resultPath, "utf-8"); 121 | const data = JSON.parse(jsonContent); 122 | 123 | let html; 124 | 125 | if ("tests" in data && Array.isArray(data.tests)) { 126 | html = generateMultiTestHtml(data as MultiTestResultData); 127 | } else { 128 | const legacyData = data as LegacyResultData; 129 | const multiTestData = { 130 | tests: [ 131 | { 132 | testName: "Legacy Test", 133 | prompt: "Static prompt (legacy format)", 134 | steps: legacyData.steps, 135 | resultWriteContent: legacyData.resultWriteContent ?? null, 136 | verification: null, 137 | }, 138 | ], 139 | metadata: legacyData.metadata ?? { 140 | mcpEnabled: false, 141 | mcpServerUrl: null, 142 | timestamp: new Date().toISOString(), 143 | model: "unknown", 144 | }, 145 | }; 146 | html = generateMultiTestHtml(multiTestData); 147 | } 148 | 149 | await writeFile(outputPath, html, "utf-8"); 150 | 151 | console.log(`✓ Report generated successfully: ${outputPath}`); 152 | 153 | if (openBrowser) { 154 | Bun.spawn(["open", outputPath]); 155 | } 156 | } catch (error) { 157 | console.error("Error generating report:", error); 158 | throw error; 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /lib/pricing.ts: -------------------------------------------------------------------------------- 1 | export interface ModelPricing { 2 | inputCostPerToken: number; 3 | outputCostPerToken: number; 4 | cacheReadInputTokenCost?: number; 5 | cacheCreationInputTokenCost?: number; 6 | } 7 | 8 | export interface CostCalculation { 9 | inputCost: number; 10 | outputCost: number; 11 | cacheReadCost: number; 12 | totalCost: number; 13 | inputTokens: number; 14 | outputTokens: number; 15 | cachedInputTokens: number; 16 | } 17 | 18 | export interface ModelPricingDisplay { 19 | inputCostPerMTok: number; 20 | outputCostPerMTok: number; 21 | cacheReadCostPerMTok?: number; 22 | } 23 | 24 | export interface ModelPricingLookup { 25 | pricing: ModelPricing; 26 | matchedKey: string; 27 | } 28 | 29 
/** Prices as delivered by the gateway: decimal strings, every field optional. */
export interface GatewayPricing {
  input?: string;
  output?: string;
  cachedInputTokens?: string;
  cacheCreationInputTokens?: string;
}

/** A model entry as delivered by the gateway. */
export interface GatewayModel {
  id: string;
  name: string;
  description?: string;
  pricing?: GatewayPricing;
  specification?: {
    specificationVersion: string;
    provider: string;
    modelId: string;
  };
  modelType: string;
}

/** Parses a gateway price string; a missing or empty value yields NaN. */
function parsePrice(value?: string): number {
  return value ? Number.parseFloat(value) : Number.NaN;
}

/**
 * Converts a gateway model's string prices into numeric per-token prices.
 *
 * @returns The parsed pricing, or `null` when the model carries no pricing.
 *          Cache prices are included only when they parse to a number.
 * @throws Error when the input or output price is missing or not numeric.
 */
export function extractPricingFromGatewayModel(
  model: GatewayModel,
): ModelPricing | null {
  if (!model.pricing) {
    return null;
  }

  const { pricing } = model;

  const inputCost = parsePrice(pricing.input);
  const outputCost = parsePrice(pricing.output);

  if (Number.isNaN(inputCost) || Number.isNaN(outputCost)) {
    throw new Error(
      `Invalid pricing for model ${model.id}: input and output pricing must be valid numbers.`,
    );
  }

  const result: ModelPricing = {
    inputCostPerToken: inputCost,
    outputCostPerToken: outputCost,
  };

  const cacheRead = parsePrice(pricing.cachedInputTokens);
  if (!Number.isNaN(cacheRead)) {
    result.cacheReadInputTokenCost = cacheRead;
  }

  const cacheCreation = parsePrice(pricing.cacheCreationInputTokens);
  if (!Number.isNaN(cacheCreation)) {
    result.cacheCreationInputTokenCost = cacheCreation;
  }

  return result;
}

/**
 * Builds a model-id → pricing lookup for a list of gateway models.
 *
 * Models without pricing are stored with a `null` value so the map still
 * records that the model id is known.
 *
 * @throws Error when any model carries invalid input/output pricing.
 */
export function buildPricingMap(
  models: GatewayModel[],
): Map<string, ModelPricingLookup | null> {
  const map = new Map<string, ModelPricingLookup | null>();

  for (const model of models) {
    const pricing = extractPricingFromGatewayModel(model);
    map.set(model.id, pricing ? { pricing, matchedKey: model.id } : null);
  }

  return map;
}

/**
 * Looks up pricing by exact model id.
 *
 * @returns The stored entry, or `null` when the id is unknown or unpriced.
 */
export function lookupPricingFromMap(
  modelId: string,
  pricingMap: Map<string, ModelPricingLookup | null>,
): ModelPricingLookup | null {
  return pricingMap.get(modelId) ?? null;
}

/**
 * Converts per-token prices into per-million-token prices for display.
 *
 * `cacheReadCostPerMTok` is `undefined` when no cache-read price is set.
 */
export function getModelPricingDisplay(
  pricing: ModelPricing,
): ModelPricingDisplay {
  return {
    inputCostPerMTok: pricing.inputCostPerToken * 1_000_000,
    outputCostPerMTok: pricing.outputCostPerToken * 1_000_000,
    cacheReadCostPerMTok:
      pricing.cacheReadInputTokenCost !== undefined
        ? pricing.cacheReadInputTokenCost * 1_000_000
        : undefined,
  };
}

/**
 * Prices a token usage record.
 *
 * Cached tokens are subtracted from `inputTokens` and billed at the
 * cache-read price (0 when the pricing has none).
 *
 * @param pricing - Per-token prices.
 * @param inputTokens - Input tokens, including the cached ones.
 * @param outputTokens - Output tokens.
 * @param cachedInputTokens - Portion of the input served from cache.
 * @returns The cost breakdown plus the token counts exactly as passed in.
 */
export function calculateCost(
  pricing: ModelPricing,
  inputTokens: number,
  outputTokens: number,
  cachedInputTokens: number = 0,
): CostCalculation {
  // Clamp at 0: a cached count larger than the input count would otherwise
  // yield a negative input cost and understate the total.
  const uncachedInputTokens = Math.max(0, inputTokens - cachedInputTokens);
  const inputCost = uncachedInputTokens * pricing.inputCostPerToken;

  const outputCost = outputTokens * pricing.outputCostPerToken;

  const cacheReadCost =
    cachedInputTokens * (pricing.cacheReadInputTokenCost ?? 0);

  return {
    inputCost,
    outputCost,
    cacheReadCost,
    totalCost: inputCost + outputCost + cacheReadCost,
    inputTokens,
    outputTokens,
    cachedInputTokens,
  };
}

/**
 * Formats a dollar amount with more decimals the smaller it is:
 * six below $0.01, four below $1, two otherwise. Exactly 0 is "$0.00".
 */
export function formatCost(cost: number): string {
  if (cost === 0) return "$0.00";
  if (cost < 0.01) {
    return `$${cost.toFixed(6)}`;
  }
  if (cost < 1) {
    return `$${cost.toFixed(4)}`;
  }
  return `$${cost.toFixed(2)}`;
}

/**
 * Formats a per-million-token price: four decimals below $0.01, two
 * otherwise. Exactly 0 is "$0".
 */
export function formatMTokCost(costPerMTok: number): string {
  if (costPerMTok === 0) return "$0";
  if (costPerMTok < 0.01) {
    return `$${costPerMTok.toFixed(4)}`;
  }
  return `$${costPerMTok.toFixed(2)}`;
}
using the Vercel AI Gateway. Automatically discovers and runs all tests in the `tests/` directory, verifying LLM-generated Svelte components against test suites. 4 | 5 | ## Installation 6 | 7 | To install dependencies: 8 | 9 | ```bash 10 | ./scripts/install.sh # installs the correct bun version 11 | bun install 12 | ``` 13 | 14 | ## Setup 15 | 16 | Configure access to the Vercel AI Gateway (credentials are stored in `.env`): 17 | 18 | 1. Install the Vercel CLI if you haven't already 19 | 2. Run `bun run vercel:link` and link the benchmark to a project that has AI Gateway enabled 20 | 3. Run the benchmark with `bun run dev` 21 | 22 | ### Required API Keys 23 | 24 | All models are accessed through the Vercel AI Gateway, so only one credential is required: 25 | 26 | - `VERCEL_OIDC_TOKEN`: The OIDC token used to authenticate with the Vercel AI Gateway 27 | 28 | ## Usage 29 | 30 | To run the benchmark: 31 | 32 | ```bash 33 | bun run index.ts 34 | ``` 35 | 36 | ### Interactive CLI 37 | 38 | The benchmark features an interactive CLI that will prompt you for configuration: 39 | 40 | 1. **Model Selection**: Choose one or more models from the Vercel AI Gateway 41 | - Select from available models in your configured providers 42 | - Optionally add custom model IDs 43 | - Can test multiple models in a single run 44 | 45 | 2. **MCP Integration**: Choose your MCP configuration 46 | - **No MCP Integration**: Run without external tools 47 | - **MCP over HTTP**: Use an HTTP-based MCP server (default: `https://mcp.svelte.dev/mcp`) 48 | - **MCP over StdIO**: Use a local MCP server via command (default: `npx -y @sveltejs/mcp`) 49 | - Option to provide a custom MCP server URL or command 50 | 51 | 3. **TestComponent Tool**: Enable/disable the testing tool for models 52 | - Allows models to run tests during component development 53 | - Enabled by default 54 | 55 | ### Benchmark Workflow 56 | 57 | After configuration, the benchmark will: 58 | 59 | 1. Discover all tests in the `tests/` directory 60 | 2.
For each selected model and test: 61 | - Run the AI agent with the test's prompt 62 | - Extract the generated Svelte component 63 | - Verify the component against the test suite 64 | 3. Generate a combined report with all results 65 | 66 | ### Results and Reports 67 | 68 | Results are saved to the `results/` directory with timestamped filenames: 69 | 70 | - `results/result-2024-12-07-14-30-45.json` - Full execution trace with all test results 71 | - `results/result-2024-12-07-14-30-45.html` - Interactive HTML report with expandable test sections 72 | 73 | The HTML report includes: 74 | 75 | - Summary bar showing passed/failed/skipped counts 76 | - Expandable sections for each test 77 | - Step-by-step execution trace 78 | - Generated component code 79 | - Test verification results with pass/fail details 80 | - Token usage statistics 81 | - MCP status badge 82 | - Dark/light theme toggle 83 | 84 | To regenerate an HTML report from a JSON file: 85 | 86 | ```bash 87 | # Regenerate most recent result 88 | bun run generate-report.ts 89 | 90 | # Regenerate specific result 91 | bun run generate-report.ts results/result-2024-12-07-14-30-45.json 92 | ``` 93 | 94 | ## Test Structure 95 | 96 | Each test in the `tests/` directory should have: 97 | 98 | ``` 99 | tests/ 100 | {test-name}/ 101 | Reference.svelte - Reference implementation (known-good solution) 102 | test.ts - Vitest test file (imports "./Component.svelte") 103 | prompt.md - Prompt for the AI agent 104 | ``` 105 | 106 | The benchmark: 107 | 108 | 1. Reads the prompt from `prompt.md` 109 | 2. Asks the agent to generate a component 110 | 3. Writes the generated component to a temporary location 111 | 4. Runs the tests against the generated component 112 | 5. 
Reports pass/fail status 113 | 114 | ## Verifying Reference Implementations 115 | 116 | To verify that all reference implementations pass their tests: 117 | 118 | ```bash 119 | bun run verify-tests 120 | ``` 121 | 122 | This copies each `Reference.svelte` to `Component.svelte` temporarily and runs the tests. 123 | 124 | ## MCP Integration 125 | 126 | The tool supports optional integration with MCP (Model Context Protocol) servers through the interactive CLI. When running the benchmark, you'll be prompted to choose: 127 | 128 | - **No MCP Integration**: Run without external tools 129 | - **MCP over HTTP**: Connect to an HTTP-based MCP server 130 | - Default: `https://mcp.svelte.dev/mcp` 131 | - Option to provide a custom URL 132 | - **MCP over StdIO**: Connect to a local MCP server via command 133 | - Default: `npx -y @sveltejs/mcp` 134 | - Option to provide a custom command 135 | 136 | MCP status, transport type, and server configuration are documented in both the JSON metadata and displayed as a badge in the HTML report. 137 | 138 | ## Exit Codes 139 | 140 | - `0`: All tests passed 141 | - `1`: One or more tests failed 142 | 143 | ## Documentation 144 | 145 | See [AGENTS.md](AGENTS.md) for detailed documentation on: 146 | 147 | - Architecture and components 148 | - Environment variables and model configuration 149 | - MCP integration details 150 | - Development commands 151 | - Multi-test result format 152 | 153 | --- 154 | 155 | This project was created using `bun init` in bun v1.3.3. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime. 
// ---- lib/utils.test.ts ----

import { describe, it, expect } from "vitest";
import {
  sanitizeModelName,
  getTimestampedFilename,
  calculateTotalCost,
} from "./utils.ts";
import type { ModelPricing } from "./pricing.ts";
import type { SingleTestResult } from "./report.ts";

describe("sanitizeModelName", () => {
  it("replaces slashes with dashes", () => {
    expect(sanitizeModelName("anthropic/claude-sonnet-4")).toBe(
      "anthropic-claude-sonnet-4",
    );
  });

  it("replaces special characters with dashes", () => {
    expect(sanitizeModelName("model@version")).toBe("model-version");
    expect(sanitizeModelName("model_name")).toBe("model-name");
    expect(sanitizeModelName("model name")).toBe("model-name");
  });

  it("preserves dots", () => {
    expect(sanitizeModelName("gpt-4.0")).toBe("gpt-4.0");
    expect(sanitizeModelName("model.v1.2.3")).toBe("model.v1.2.3");
  });

  it("preserves alphanumeric characters", () => {
    expect(sanitizeModelName("gpt4o")).toBe("gpt4o");
    expect(sanitizeModelName("claude3")).toBe("claude3");
  });

  it("handles multiple consecutive special characters", () => {
    expect(sanitizeModelName("model///name")).toBe("model---name");
    expect(sanitizeModelName("model@#$name")).toBe("model---name");
  });
});

describe("getTimestampedFilename", () => {
  // A fixed instant keeps the expected filenames deterministic.
  const fixedDate = new Date("2025-12-12T14:30:45Z");

  it("generates filename without model name", () => {
    const result = getTimestampedFilename(
      "result",
      "json",
      undefined,
      fixedDate,
    );
    expect(result).toBe("result-2025-12-12-14-30-45.json");
  });

  it("generates filename with simple model name", () => {
    const result = getTimestampedFilename(
      "result",
      "json",
      "gpt-4o",
      fixedDate,
    );
    expect(result).toBe("result-2025-12-12-14-30-45-gpt-4o.json");
  });

  it("generates filename with model name containing slashes", () => {
    const result = getTimestampedFilename(
      "result",
      "json",
      "anthropic/claude-sonnet-4",
      fixedDate,
    );
    expect(result).toBe(
      "result-2025-12-12-14-30-45-anthropic-claude-sonnet-4.json",
    );
  });

  it("generates filename with model name containing special characters", () => {
    const result = getTimestampedFilename(
      "result",
      "html",
      "model@v1.2.3",
      fixedDate,
    );
    expect(result).toBe("result-2025-12-12-14-30-45-model-v1.2.3.html");
  });

  it("handles different file extensions", () => {
    const result = getTimestampedFilename(
      "output",
      "txt",
      "test-model",
      fixedDate,
    );
    expect(result).toBe("output-2025-12-12-14-30-45-test-model.txt");
  });

  it("pads single-digit months and days", () => {
    const earlyDate = new Date("2025-01-05T08:09:07Z");
    const result = getTimestampedFilename(
      "result",
      "json",
      undefined,
      earlyDate,
    );
    expect(result).toBe("result-2025-01-05-08-09-07.json");
  });
});

describe("calculateTotalCost", () => {
  const pricing: ModelPricing = {
    inputCostPerToken: 1.0 / 1_000_000,
    outputCostPerToken: 2.0 / 1_000_000,
    cacheReadInputTokenCost: 0.1 / 1_000_000,
  };

  it("calculates zero cost for empty results", () => {
    const tests: SingleTestResult[] = [];
    const result = calculateTotalCost(tests, pricing);

    expect(result).toEqual({
      inputCost: 0,
      outputCost: 0,
      cacheReadCost: 0,
      totalCost: 0,
      inputTokens: 0,
      outputTokens: 0,
      cachedInputTokens: 0,
    });
  });

  it("aggregates usage from multiple steps and tests", () => {
    const tests: SingleTestResult[] = [
      {
        testName: "test1",
        prompt: "p1",
        resultWriteContent: null,
        verification: null,
        steps: [
          {
            usage: {
              inputTokens: 100,
              outputTokens: 50,
              cachedInputTokens: 10,
            },
          } as any,
          {
            usage: {
              inputTokens: 200,
              outputTokens: 100,
              cachedInputTokens: 0,
            },
          } as any,
        ],
      },
      {
        testName: "test2",
        prompt: "p2",
        resultWriteContent: null,
        verification: null,
        steps: [
          {
            usage: {
              inputTokens: 300,
              outputTokens: 150,
              cachedInputTokens: 20,
            },
          } as any,
        ],
      },
    ];

    // Total Input: 100 + 200 + 300 = 600
    // Total Output: 50 + 100 + 150 = 300
    // Total Cached: 10 + 0 + 20 = 30
    // Uncached Input: 600 - 30 = 570

    // Costs (per Token):
    // Input: 570 * (1.0 / 1e6) = 0.00057
    // Output: 300 * (2.0 / 1e6) = 0.0006
    // Cache: 30 * (0.1 / 1e6) = 0.000003
    // Total: 0.00057 + 0.0006 + 0.000003 = 0.001173

    const result = calculateTotalCost(tests, pricing);

    // Token counts are integers and can be compared exactly.
    expect(result).toMatchObject({
      inputTokens: 600,
      outputTokens: 300,
      cachedInputTokens: 30,
    });

    // Costs are products and sums of binary floats, so they are compared
    // with a tolerance instead of relying on exact rounding.
    expect(result.inputCost).toBeCloseTo(0.00057, 12);
    expect(result.outputCost).toBeCloseTo(0.0006, 12);
    expect(result.cacheReadCost).toBeCloseTo(0.000003, 12);
    expect(result.totalCost).toBeCloseTo(0.001173, 12);
  });
});
// ---- lib/output-test-runner.ts ----

import {
  mkdirSync,
  rmSync,
  existsSync,
  writeFileSync,
  copyFileSync,
} from "node:fs";
import { join } from "node:path";
import { startVitest } from "vitest/node";
import type { TestDefinition } from "./test-discovery.ts";

// Scratch directory, resolved against the working directory at import time.
const OUTPUTS_DIR = join(process.cwd(), "outputs");

/** One failing test: its " > "-joined name and collected error text. */
export interface FailedTest {
  fullName: string;
  errorMessage: string;
}

/** Outcome of verifying one generated component against its test file. */
export interface TestVerificationResult {
  testName: string;
  passed: boolean;
  numTests: number;
  numPassed: number;
  numFailed: number;
  duration: number;
  error?: string;
  failedTests?: FailedTest[];
}

/** Recreates the outputs directory from scratch, discarding old contents. */
export function setupOutputsDirectory() {
  if (existsSync(OUTPUTS_DIR)) {
    rmSync(OUTPUTS_DIR, { recursive: true, force: true });
  }
  mkdirSync(OUTPUTS_DIR, { recursive: true });
}

/** Removes the outputs directory if it exists. */
export function cleanupOutputsDirectory() {
  if (existsSync(OUTPUTS_DIR)) {
    rmSync(OUTPUTS_DIR, { recursive: true, force: true });
  }
}

/**
 * Creates a fresh `outputs/<test name>` directory holding the generated
 * component as `Component.svelte` and a copy of the test file as `test.ts`.
 *
 * @returns The path of the prepared directory.
 */
export function prepareTestEnvironment(
  test: TestDefinition,
  componentCode: string,
) {
  const testDir = join(OUTPUTS_DIR, test.name);

  if (existsSync(testDir)) {
    rmSync(testDir, { recursive: true, force: true });
  }
  mkdirSync(testDir, { recursive: true });

  const componentPath = join(testDir, "Component.svelte");
  writeFileSync(componentPath, componentCode, "utf-8");

  const testFilePath = join(testDir, "test.ts");
  copyFileSync(test.testFile, testFilePath);

  return testDir;
}

/** Removes the `outputs/<testName>` directory if it exists. */
export function cleanupTestEnvironment(testName: string) {
  const testDir = join(OUTPUTS_DIR, testName);
  if (existsSync(testDir)) {
    rmSync(testDir, { recursive: true, force: true });
  }
}

/** Extracts a printable message from an arbitrary thrown value. */
function toErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Builds the result reported when no test could be evaluated at all. */
function failedVerification(
  testName: string,
  startTime: number,
  error: string,
): TestVerificationResult {
  return {
    testName,
    passed: false,
    numTests: 0,
    numPassed: 0,
    numFailed: 0,
    duration: Date.now() - startTime,
    error,
  };
}

/**
 * Collects the names of a test's ancestors, outermost first.
 *
 * Walks `parent` links for as long as the node has a `name` property and
 * skips ancestors whose name is empty.
 */
function ancestorNames(start: unknown): string[] {
  const names: string[] = [];
  let node = start;
  while (node && typeof node === "object" && "name" in node) {
    const { name } = node as { name?: string };
    if (name) {
      names.unshift(name);
    }
    node = "parent" in node ? (node as { parent?: unknown }).parent : undefined;
  }
  return names;
}

/**
 * Runs a test's vitest file against generated component code.
 *
 * The component and test file are written into a per-test directory under
 * the outputs directory, vitest is run once (no watch mode), and the recorded
 * state is summarized. Never rejects: any thrown error is reported through
 * the `error` field of a failed result.
 *
 * @param test - Test definition providing the name and test file path.
 * @param componentCode - Source written to `Component.svelte`.
 * @returns Pass/fail status, counts, duration in ms, and failure details.
 */
export async function runTestVerification(
  test: TestDefinition,
  componentCode: string,
): Promise<TestVerificationResult> {
  const startTime = Date.now();

  try {
    const testDir = prepareTestEnvironment(test, componentCode);
    const testFilePath = join(testDir, "test.ts");

    const vitest = await startVitest("test", [testFilePath], {
      watch: false,
      reporters: ["verbose"],
    });

    if (!vitest) {
      return failedVerification(test.name, startTime, "Failed to start vitest");
    }

    await vitest.close();

    const testModules = vitest.state.getTestModules();
    const failedTests: FailedTest[] = [];
    const allErrors: string[] = [];

    for (const error of vitest.state.getUnhandledErrors()) {
      allErrors.push(toErrorMessage(error));
    }

    if (!testModules || testModules.length === 0) {
      return failedVerification(
        test.name,
        startTime,
        allErrors.length > 0 ? allErrors.join("\n") : "No test modules found",
      );
    }

    let passed = true;
    let numTests = 0;
    let numFailed = 0;

    for (const module of testModules) {
      if (!module.ok()) {
        passed = false;
      }

      for (const error of module.errors()) {
        if (error.message) {
          allErrors.push(error.message);
        }
      }

      if (!module.children) {
        continue;
      }

      try {
        const tests = Array.from(module.children.allTests());
        numTests += tests.length;

        for (const t of tests) {
          const result = t.result();
          if (result.state !== "failed") {
            continue;
          }

          numFailed++;

          const ancestors = ancestorNames(t.parent);
          const fullName =
            ancestors.length > 0
              ? `${ancestors.join(" > ")} > ${t.name}`
              : t.name;

          const errorMessages: string[] = [];
          for (const testError of result.errors ?? []) {
            if (testError.message) {
              errorMessages.push(testError.message);
              allErrors.push(testError.message);
            }
          }

          failedTests.push({
            fullName,
            errorMessage:
              errorMessages.join("\n") || "No error message available",
          });
        }
      } catch (err) {
        // A module whose tests cannot be enumerated counts as a failure.
        console.error(`Error processing module tests for ${test.name}:`, err);
        allErrors.push(toErrorMessage(err));
        passed = false;
      }
    }

    // Every test that did not fail counts as passed here.
    const numPassed = numTests - numFailed;

    return {
      testName: test.name,
      passed: passed && numFailed === 0,
      numTests,
      numPassed,
      numFailed,
      duration: Date.now() - startTime,
      failedTests: failedTests.length > 0 ? failedTests : undefined,
      // Only the first collected error is surfaced, and only on failure.
      error: allErrors.length > 0 && !passed ? allErrors[0] : undefined,
    };
  } catch (error) {
    return failedVerification(test.name, startTime, toErrorMessage(error));
  }
}
// ---- lib/verify-references.ts ----

import {
  readdirSync,
  statSync,
  copyFileSync,
  unlinkSync,
  existsSync,
} from "node:fs";
import { join } from "node:path";
import { startVitest } from "vitest/node";

/** Paths that make up one test suite under `tests/`. */
interface TestDefinition {
  name: string;
  directory: string;
  referenceFile: string;
  componentFile: string;
  testFile: string;
  promptFile: string;
}

/** One failing test: its " > "-joined name and collected error text. */
interface FailedTest {
  fullName: string;
  errorMessage: string;
}

/** Outcome of running one suite against its reference implementation. */
interface TestResult {
  testName: string;
  passed: boolean;
  numTests: number;
  numPassed: number;
  numFailed: number;
  duration: number;
  error?: string;
  failedTests?: FailedTest[];
}

/**
 * Discovers test suites in `<cwd>/tests`.
 *
 * Each sub-directory containing both `Reference.svelte` and `test.ts`
 * becomes a definition; other sub-directories are skipped with a warning.
 * A filesystem error is logged and ends discovery early, returning whatever
 * was collected up to that point.
 */
export function loadTestDefinitions() {
  const testsDir = join(process.cwd(), "tests");
  const definitions: TestDefinition[] = [];

  try {
    for (const entry of readdirSync(testsDir)) {
      const entryPath = join(testsDir, entry);

      if (!statSync(entryPath).isDirectory()) {
        continue;
      }

      const referenceFile = join(entryPath, "Reference.svelte");
      const testFile = join(entryPath, "test.ts");
      const promptFile = join(entryPath, "prompt.md");
      const componentFile = join(entryPath, "Component.svelte");

      if (existsSync(referenceFile) && existsSync(testFile)) {
        definitions.push({
          name: entry,
          directory: entryPath,
          referenceFile,
          componentFile,
          testFile,
          promptFile,
        });
      } else {
        console.warn(
          `⚠️ Skipping ${entry}: missing Reference.svelte or test.ts`,
        );
      }
    }
  } catch (error) {
    console.error("Error loading test definitions:", error);
  }

  return definitions;
}

/** Copies the suite's reference file to its component file path. */
export function copyReferenceToComponent(testDef: TestDefinition) {
  copyFileSync(testDef.referenceFile, testDef.componentFile);
}

/** Deletes the suite's component file if present; failures only warn. */
export function cleanupComponent(testDef: TestDefinition) {
  if (existsSync(testDef.componentFile)) {
    try {
      unlinkSync(testDef.componentFile);
    } catch (error) {
      console.warn(`⚠️ Failed to cleanup ${testDef.componentFile}:`, error);
    }
  }
}

/** Extracts a printable message from an arbitrary thrown value. */
function toErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Builds the result reported when no test could be evaluated at all. */
function failedResult(
  testName: string,
  startTime: number,
  error: string,
): TestResult {
  return {
    testName,
    passed: false,
    numTests: 0,
    numPassed: 0,
    numFailed: 0,
    duration: Date.now() - startTime,
    error,
  };
}

/**
 * Collects the names of a test's ancestors, outermost first.
 *
 * Walks `parent` links for as long as the node has a `name` property and
 * skips ancestors whose name is empty.
 */
function ancestorNames(start: unknown): string[] {
  const names: string[] = [];
  let node = start;
  while (node && typeof node === "object" && "name" in node) {
    const { name } = node as { name?: string };
    if (name) {
      names.unshift(name);
    }
    node = "parent" in node ? (node as { parent?: unknown }).parent : undefined;
  }
  return names;
}

/**
 * Runs one suite's vitest file once (no watch mode) and summarizes it.
 *
 * Never rejects: any thrown error is reported through the `error` field of
 * a failed result.
 */
export async function runTest(testDef: TestDefinition): Promise<TestResult> {
  const startTime = Date.now();

  try {
    const vitest = await startVitest("test", [testDef.testFile], {
      watch: false,
      reporters: ["verbose"],
    });

    if (!vitest) {
      return failedResult(testDef.name, startTime, "Failed to start vitest");
    }

    await vitest.close();

    const testModules = vitest.state.getTestModules();
    const failedTests: FailedTest[] = [];
    const allErrors: string[] = [];

    for (const error of vitest.state.getUnhandledErrors()) {
      allErrors.push(toErrorMessage(error));
    }

    if (!testModules || testModules.length === 0) {
      return failedResult(
        testDef.name,
        startTime,
        allErrors.length > 0 ? allErrors.join("\n") : "No test modules found",
      );
    }

    let passed = true;
    let numTests = 0;
    let numFailed = 0;

    for (const module of testModules) {
      if (!module.ok()) {
        passed = false;
      }

      for (const error of module.errors()) {
        if (error.message) {
          allErrors.push(error.message);
        }
      }

      if (!module.children) {
        continue;
      }

      try {
        const tests = Array.from(module.children.allTests());
        numTests += tests.length;

        for (const t of tests) {
          const result = t.result();
          if (result.state !== "failed") {
            continue;
          }

          numFailed++;

          const ancestors = ancestorNames(t.parent);
          const fullName =
            ancestors.length > 0
              ? `${ancestors.join(" > ")} > ${t.name}`
              : t.name;

          const errorMessages: string[] = [];
          for (const testError of result.errors ?? []) {
            if (testError.message) {
              errorMessages.push(testError.message);
              allErrors.push(testError.message);
            }
          }

          failedTests.push({
            fullName,
            errorMessage:
              errorMessages.join("\n") || "No error message available",
          });
        }
      } catch (err) {
        // A module whose tests cannot be enumerated counts as a failure.
        console.error(
          `Error processing module tests for ${testDef.name}:`,
          err,
        );
        allErrors.push(toErrorMessage(err));
        passed = false;
      }
    }

    // Every test that did not fail counts as passed here.
    const numPassed = numTests - numFailed;

    return {
      testName: testDef.name,
      passed: passed && numFailed === 0,
      numTests,
      numPassed,
      numFailed,
      duration: Date.now() - startTime,
      failedTests: failedTests.length > 0 ? failedTests : undefined,
      // Only the first collected error is surfaced, and only on failure.
      error: allErrors.length > 0 && !passed ? allErrors[0] : undefined,
    };
  } catch (error) {
    return failedResult(testDef.name, startTime, toErrorMessage(error));
  }
}

/** Prints a per-suite status line, failed test names, and overall totals. */
export function printSummary(results: TestResult[]) {
  console.log("\n=== Test Verification Summary ===\n");

  const totalSuites = results.length;
  const passedSuites = results.filter((r) => r.passed).length;

  for (const result of results) {
    const status = result.passed ? "✓ PASSED" : "✗ FAILED";
    const testInfo = `${result.numPassed}/${result.numTests} tests`;
    const durationInfo = `${result.duration}ms`;

    console.log(`${result.testName}: ${status} (${testInfo}, ${durationInfo})`);

    if (result.error) {
      console.log(`Error: ${result.error}`);
    }

    if (!result.passed && result.failedTests && result.failedTests.length > 0) {
      console.log(" Failed tests:");
      for (const failed of result.failedTests) {
        console.log(`✗ ${failed.fullName}`);
      }
    }
  }

  console.log(`\nTotal: ${passedSuites}/${totalSuites} suites passed`);

  if (passedSuites === totalSuites) {
    console.log("All reference implementations verified successfully!");
  } else {
    console.log(`${totalSuites - passedSuites} suite(s) failed.`);
  }
}

/**
 * Verifies every discovered suite against its reference implementation.
 *
 * For each suite the reference is copied to the component path, the tests
 * are run, and the component file is removed again even when the run fails.
 * A suite that cannot be set up is recorded as failed instead of aborting
 * the remaining suites.
 *
 * @returns 0 when every suite passed; 1 when any failed or none were found.
 */
export async function verifyAllReferences() {
  console.log("Discovering test suites...");
  const tests = loadTestDefinitions();
  console.log(`Found ${tests.length} test suite(s)\n`);

  if (tests.length === 0) {
    console.log("No test suites found in tests/ directory");
    return 1;
  }

  const results: TestResult[] = [];

  for (const test of tests) {
    console.log(`Running tests/${test.name}...`);
    const startTime = Date.now();

    try {
      copyReferenceToComponent(test);
      console.log(" ✓ Copied Reference.svelte → Component.svelte");

      const result = await runTest(test);
      results.push(result);

      if (result.passed) {
        console.log(`✓ All tests passed (${result.duration}ms)`);
      } else {
        console.log(
          `✗ Tests failed (${result.numFailed}/${result.numTests} failed)`,
        );
        if (result.error) {
          console.log(`Error: ${result.error}`);
        }
        if (result.failedTests && result.failedTests.length > 0) {
          console.log("\n Failed tests:");
          for (const failed of result.failedTests) {
            console.log(`✗ ${failed.fullName}`);
            for (const line of failed.errorMessage.split("\n")) {
              if (line.trim()) {
                console.log(` ${line}`);
              }
            }
          }
          console.log();
        }
      }
    } catch (error) {
      // Keep going: one suite that cannot be set up must not hide the
      // results of the others or skip the final summary.
      const message = toErrorMessage(error);
      console.log(`Error: ${message}`);
      results.push(failedResult(test.name, startTime, message));
    } finally {
      cleanupComponent(test);
      console.log(" ✓ Cleaned up Component.svelte\n");
    }
  }

  printSummary(results);

  const allPassed = results.every((r) => r.passed);
  return allPassed ? 0 : 1;
}
${failed.fullName}`); 307 | const errorLines = failed.errorMessage.split("\n"); 308 | for (const line of errorLines) { 309 | if (line.trim()) { 310 | console.log(` ${line}`); 311 | } 312 | } 313 | } 314 | console.log(); 315 | } 316 | } 317 | } finally { 318 | cleanupComponent(test); 319 | console.log(" ✓ Cleaned up Component.svelte\n"); 320 | } 321 | } 322 | 323 | printSummary(results); 324 | 325 | const allPassed = results.every((r) => r.passed); 326 | return allPassed ? 0 : 1; 327 | } 328 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | ## Project Overview 2 | 3 | AI SDK benchmarking tool built with Vercel AI SDK and Bun runtime. Tests AI agents with MCP (Model Context Protocol) server integration using the Vercel AI Gateway. Automatically discovers and runs all tests in the `tests/` directory and verifies LLM-generated Svelte components against test suites. 4 | 5 | ## Development Commands 6 | 7 | ```bash 8 | # Install dependencies (runs patch-package automatically) 9 | bun install 10 | 11 | # Run the main benchmark (interactive CLI) 12 | bun run start 13 | 14 | # Verify reference implementations against test suites 15 | bun run verify-tests 16 | 17 | # Generate HTML reports from all result JSON files 18 | bun run generate-report.ts 19 | 20 | # Generate HTML report from specific result file 21 | bun run generate-report.ts results/result-2024-12-07-14-30-45.json 22 | 23 | # Run unit tests for lib modules 24 | bun run test:self 25 | 26 | # Run TypeScript type checking 27 | bun tsc --noEmit 28 | 29 | # Format code with Prettier 30 | bun run prettier 31 | ``` 32 | 33 | ## Environment Variables 34 | 35 | ### Vercel AI Gateway 36 | 37 | The benchmark uses the Vercel AI Gateway for model access. Configuration: 38 | 39 | 1. Link to a Vercel project with AI Gateway enabled: `bun run vercel:link` 40 | 2. 
Pull environment variables: `bun run vercel:env:pull` 41 | 42 | Required environment variable: 43 | 44 | - `VERCEL_OIDC_TOKEN`: OIDC token for Vercel AI Gateway authentication 45 | 46 | ### MCP Server Configuration 47 | 48 | MCP integration is configured via the interactive CLI at runtime. Options: 49 | 50 | - **No MCP Integration**: Agent runs with built-in tools only 51 | - **MCP over HTTP**: Uses HTTP transport (default: `https://mcp.svelte.dev/mcp`) 52 | - **MCP over StdIO**: Uses local command (default: `npx -y @sveltejs/mcp`) 53 | 54 | ## Architecture 55 | 56 | ### Directory Structure 57 | 58 | ``` 59 | ├── index.ts # Main entry point with interactive CLI 60 | ├── lib/ 61 | │ ├── pricing.ts # Cost calculation from gateway pricing 62 | │ ├── pricing.test.ts # Unit tests for pricing module 63 | │ ├── test-discovery.ts # Test suite discovery and prompt building 64 | │ ├── test-discovery.test.ts # Unit tests for test discovery 65 | │ ├── output-test-runner.ts # Vitest runner for component verification 66 | │ ├── output-test-runner.test.ts # Unit tests for output runner 67 | │ ├── verify-references.ts # Reference implementation verification 68 | │ ├── report.ts # Report generation orchestration 69 | │ ├── report-template.ts # HTML report template generation 70 | │ ├── report-styles.ts # CSS styles for HTML reports 71 | │ └── tools/ 72 | │ ├── index.ts # Tool exports 73 | │ ├── result-write.ts # ResultWrite tool for final output 74 | │ ├── result-write.test.ts # Unit tests for ResultWrite tool 75 | │ ├── test-component.ts # TestComponent tool for iterative testing 76 | │ └── test-component.test.ts # Unit tests for TestComponent tool 77 | ├── tests/ # Benchmark test suites 78 | │ └── {test-name}/ 79 | │ ├── Reference.svelte # Reference implementation 80 | │ ├── test.ts # Vitest test file 81 | │ └── prompt.md # Agent prompt 82 | ├── results/ # Benchmark results (JSON + HTML) 83 | ├── outputs/ # Temporary directory for test verification 84 | └── patches/ # Patches for 
dependencies 85 | ``` 86 | 87 | ### Test Suite Structure 88 | 89 | Benchmark test suites in `tests/` directory: 90 | 91 | ``` 92 | tests/ 93 | {test-name}/ 94 | Reference.svelte - Reference implementation of the component 95 | test.ts - Vitest test file (imports "./Component.svelte") 96 | prompt.md - Prompt for AI agents to implement the component 97 | ``` 98 | 99 | **Benchmark Workflow:** 100 | 101 | 1. `index.ts` presents interactive CLI for model/MCP selection 102 | 2. Discovers all test suites in `tests/` 103 | 3. For each selected model and test: 104 | - Loads `prompt.md` and builds agent prompt 105 | - Agent generates component code using available tools 106 | - Agent calls `ResultWrite` tool with the component code 107 | - Component is written to `outputs/{test-name}/Component.svelte` 108 | - Test file is copied to `outputs/{test-name}/test.ts` 109 | - Vitest runs tests against the generated component 110 | - Results are collected (pass/fail, error messages) 111 | - Output directory is cleaned up 112 | 4. All results are saved to timestamped JSON file 113 | 5. HTML report is generated with expandable sections for each test 114 | 115 | ### Agent Tools 116 | 117 | **ResultWrite** (`lib/tools/result-write.ts`): 118 | 119 | - Called when agent completes component implementation 120 | - Signals the agent to stop (via `stopWhen` configuration) 121 | - Accepts `content` parameter with Svelte component code 122 | 123 | **TestComponent** (`lib/tools/test-component.ts`): 124 | 125 | - Optional tool for iterative development 126 | - Runs component against test suite before final submission 127 | - Returns pass/fail status and detailed error messages 128 | - Enabled/disabled via interactive CLI 129 | 130 | ### Interactive CLI 131 | 132 | The benchmark uses `@clack/prompts` for an interactive CLI that prompts for: 133 | 134 | 1. **Model Selection**: Multi-select from Vercel AI Gateway available models 135 | 2. **MCP Integration**: Choose HTTP, StdIO, or no MCP 136 | 3. 
**TestComponent Tool**: Enable/disable iterative testing tool 137 | 4. **Pricing Confirmation**: Review and confirm cost calculation settings 138 | 139 | ### Pricing System 140 | 141 | The pricing module (`lib/pricing.ts`) handles cost calculation: 142 | 143 | - Extracts pricing from Vercel AI Gateway model metadata 144 | - Calculates costs based on input/output/cached tokens 145 | - Supports cache read billing at reduced rates 146 | - Displays costs in reports with per-million-token rates 147 | 148 | Key functions: 149 | 150 | - `extractPricingFromGatewayModel()`: Parse gateway model pricing 151 | - `buildPricingMap()`: Build lookup map from gateway models 152 | - `calculateCost()`: Calculate total cost from token usage 153 | - `formatCost()` / `formatMTokCost()`: Format costs for display 154 | 155 | ### Key Technologies 156 | 157 | - **Vercel AI SDK v5**: Agent framework with tool calling 158 | - **Vercel AI Gateway**: Unified access to multiple AI providers 159 | - **@ai-sdk/mcp**: MCP client integration (with custom patch) 160 | - **@clack/prompts**: Interactive CLI prompts 161 | - **Bun Runtime**: JavaScript runtime (not Node.js) 162 | - **Vitest**: Test framework for component testing 163 | - **@testing-library/svelte**: Testing utilities for Svelte components 164 | 165 | ### MCP Integration 166 | 167 | The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`: 168 | 169 | - Patch location: `patches/@ai-sdk+mcp+0.0.11.patch` 170 | - Fixes: Handles missing event types in HTTP SSE responses 171 | - Supports both HTTP and StdIO transports 172 | - Configuration via interactive CLI at runtime 173 | 174 | ### Data Flow 175 | 176 | 1. Interactive CLI collects configuration (models, MCP, tools) 177 | 2. Gateway provides available models and pricing 178 | 3. Test discovery scans `tests/` directory 179 | 4. For each model and test: 180 | a. Agent receives prompt with access to tools (built-in + optional MCP) 181 | b. 
Agent iterates through steps, calling tools as needed 182 | c. Agent stops when `ResultWrite` tool is called 183 | d. Component is written to `outputs/{test-name}/Component.svelte` 184 | e. Vitest runs test file against the generated component 185 | f. Test results are collected (pass/fail, error details) 186 | g. Output directory is cleaned up 187 | 5. Results aggregated with pricing calculations 188 | 6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json` 189 | 7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html` 190 | 8. Report automatically opens in default browser 191 | 192 | ### Output Files 193 | 194 | All results are saved in the `results/` directory with timestamped filenames: 195 | 196 | - **JSON files**: `result-2024-12-07-14-30-45.json` - Complete execution trace 197 | - **HTML files**: `result-2024-12-07-14-30-45.html` - Interactive visualization 198 | 199 | **Multi-Test Result JSON Structure:** 200 | 201 | ```json 202 | { 203 | "tests": [ 204 | { 205 | "testName": "counter", 206 | "prompt": "# Counter Component Task...", 207 | "steps": [...], 208 | "resultWriteContent": "...", 209 | "verification": { 210 | "testName": "counter", 211 | "passed": true, 212 | "numTests": 4, 213 | "numPassed": 4, 214 | "numFailed": 0, 215 | "duration": 150, 216 | "failedTests": [] 217 | } 218 | } 219 | ], 220 | "metadata": { 221 | "mcpEnabled": true, 222 | "mcpServerUrl": "https://mcp.svelte.dev/mcp", 223 | "mcpTransportType": "HTTP", 224 | "timestamp": "2024-12-07T14:30:45.123Z", 225 | "model": "anthropic/claude-sonnet-4", 226 | "pricingKey": "anthropic/claude-sonnet-4", 227 | "pricing": { 228 | "inputCostPerMTok": 3, 229 | "outputCostPerMTok": 15, 230 | "cacheReadCostPerMTok": 0.3 231 | }, 232 | "totalCost": { 233 | "inputCost": 0.003, 234 | "outputCost": 0.015, 235 | "cacheReadCost": 0.0003, 236 | "totalCost": 0.0183, 237 | "inputTokens": 2000, 238 | "outputTokens": 1000, 239 | "cachedInputTokens": 1000 240 | } 241 | } 242 | } 243 | ``` 244 |
245 | ## Unit Tests 246 | 247 | Unit tests for library modules live next to their sources as `*.test.ts` files in `lib/` and `lib/tools/`: 248 | 249 | - `lib/pricing.test.ts` - Pricing extraction, calculation, formatting 250 | - `lib/test-discovery.test.ts` - Test suite discovery and prompt building 251 | - `lib/output-test-runner.test.ts` - Output directory management 252 | - `lib/tools/result-write.test.ts` - ResultWrite tool behavior 253 | - `lib/tools/test-component.test.ts` - TestComponent tool behavior 254 | 255 | Run unit tests with: `bun run test:self` 256 | 257 | ## TypeScript Configuration 258 | 259 | - **Runtime**: Bun (not Node.js) 260 | - **Module System**: ESNext with `module: "Preserve"` and `moduleResolution: "bundler"` 261 | - **Strict Mode**: Enabled with additional checks: 262 | - `noUncheckedIndexedAccess: true` - array/index access always includes undefined 263 | - `noImplicitOverride: true` - override keyword required 264 | - `noFallthroughCasesInSwitch: true` 265 | - **Import Extensions**: `.ts` extensions allowed in imports 266 | - **No Emit**: TypeScript compilation not required for Bun runtime 267 | 268 | ## Important Notes 269 | 270 | - The MCP client import uses a direct path to the patched module: `./node_modules/@ai-sdk/mcp/dist/index.mjs` 271 | - Agent stops execution when the `ResultWrite` tool is called (configured via `stopWhen` option) 272 | - Agent also stops after 10 steps maximum (configured via `stepCountIs(10)`) 273 | - The `outputs/` directory is used temporarily for test verification and is cleaned up after each test 274 | - HTML reports include expandable sections for each test with full step details 275 | - Test verification results show pass/fail status and failed test details 276 | - Token usage includes cached token counts when available 277 | - All result files are saved with timestamps to preserve historical benchmarks 278 | - MCP integration can be configured via interactive CLI without code changes 279 | - MCP status is clearly indicated in both the JSON metadata
and HTML report with a visual badge 280 | - Exit code is 0 if all tests pass, 1 if any tests fail 281 | - Pricing is fetched from Vercel AI Gateway model metadata at runtime 282 | -------------------------------------------------------------------------------- /lib/pricing.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect } from "vitest"; 2 | import { 3 | extractPricingFromGatewayModel, 4 | buildPricingMap, 5 | lookupPricingFromMap, 6 | calculateCost, 7 | formatCost, 8 | formatMTokCost, 9 | getModelPricingDisplay, 10 | type ModelPricing, 11 | type GatewayModel, 12 | } from "./pricing.ts"; 13 | 14 | describe("extractPricingFromGatewayModel", () => { 15 | it("should extract pricing from a gateway model with all fields", () => { 16 | const model: GatewayModel = { 17 | id: "anthropic/claude-opus-4.5", 18 | name: "Claude Opus 4.5", 19 | pricing: { 20 | input: "0.000005", 21 | output: "0.000025", 22 | cachedInputTokens: "0.0000005", 23 | cacheCreationInputTokens: "0.00000625", 24 | }, 25 | modelType: "language", 26 | }; 27 | 28 | const pricing = extractPricingFromGatewayModel(model); 29 | 30 | expect(pricing).not.toBeNull(); 31 | expect(pricing!.inputCostPerToken).toBe(0.000005); 32 | expect(pricing!.outputCostPerToken).toBe(0.000025); 33 | expect(pricing!.cacheReadInputTokenCost).toBe(0.0000005); 34 | expect(pricing!.cacheCreationInputTokenCost).toBe(0.00000625); 35 | }); 36 | 37 | it("should extract pricing with only input and output", () => { 38 | const model: GatewayModel = { 39 | id: "openai/gpt-4o", 40 | name: "GPT-4o", 41 | pricing: { 42 | input: "0.000003", 43 | output: "0.000015", 44 | }, 45 | modelType: "language", 46 | }; 47 | 48 | const pricing = extractPricingFromGatewayModel(model); 49 | 50 | expect(pricing).not.toBeNull(); 51 | expect(pricing!.inputCostPerToken).toBe(0.000003); 52 | expect(pricing!.outputCostPerToken).toBe(0.000015); 53 | 
expect(pricing!.cacheReadInputTokenCost).toBeUndefined(); 54 | }); 55 | 56 | it("should return null for model without pricing", () => { 57 | const model: GatewayModel = { 58 | id: "local/model", 59 | name: "Local Model", 60 | modelType: "language", 61 | }; 62 | 63 | const pricing = extractPricingFromGatewayModel(model); 64 | expect(pricing).toBeNull(); 65 | }); 66 | 67 | it("should throw error for model with empty pricing object", () => { 68 | const model: GatewayModel = { 69 | id: "local/model", 70 | name: "Local Model", 71 | pricing: {}, 72 | modelType: "language", 73 | }; 74 | 75 | expect(() => extractPricingFromGatewayModel(model)).toThrowError( 76 | /Invalid pricing/, 77 | ); 78 | }); 79 | 80 | it("should throw error for invalid pricing values", () => { 81 | const model: GatewayModel = { 82 | id: "test/model", 83 | name: "Test Model", 84 | pricing: { 85 | input: "invalid", 86 | output: "0.000015", 87 | }, 88 | modelType: "language", 89 | }; 90 | 91 | expect(() => extractPricingFromGatewayModel(model)).toThrowError( 92 | /Invalid pricing/, 93 | ); 94 | }); 95 | }); 96 | 97 | describe("buildPricingMap", () => { 98 | it("should build a map from gateway models", () => { 99 | const models: GatewayModel[] = [ 100 | { 101 | id: "anthropic/claude-sonnet-4", 102 | name: "Claude Sonnet 4", 103 | pricing: { input: "0.000003", output: "0.000015" }, 104 | modelType: "language", 105 | }, 106 | { 107 | id: "openai/gpt-4o", 108 | name: "GPT-4o", 109 | pricing: { input: "0.000005", output: "0.000015" }, 110 | modelType: "language", 111 | }, 112 | { 113 | id: "local/model", 114 | name: "Local Model", 115 | modelType: "language", 116 | }, 117 | ]; 118 | 119 | const map = buildPricingMap(models); 120 | 121 | expect(map.size).toBe(3); 122 | expect(map.get("anthropic/claude-sonnet-4")).not.toBeNull(); 123 | expect(map.get("openai/gpt-4o")).not.toBeNull(); 124 | expect(map.get("local/model")).toBeNull(); 125 | }); 126 | }); 127 | 128 | describe("lookupPricingFromMap", () => { 129 | 
it("should return pricing lookup for existing model", () => { 130 | const models: GatewayModel[] = [ 131 | { 132 | id: "anthropic/claude-sonnet-4", 133 | name: "Claude Sonnet 4", 134 | pricing: { input: "0.000003", output: "0.000015" }, 135 | modelType: "language", 136 | }, 137 | ]; 138 | 139 | const map = buildPricingMap(models); 140 | const lookup = lookupPricingFromMap("anthropic/claude-sonnet-4", map); 141 | 142 | expect(lookup).not.toBeNull(); 143 | expect(lookup!.matchedKey).toBe("anthropic/claude-sonnet-4"); 144 | expect(lookup!.pricing.inputCostPerToken).toBe(0.000003); 145 | }); 146 | 147 | it("should return null for non-existent model", () => { 148 | const map = buildPricingMap([]); 149 | const lookup = lookupPricingFromMap("non/existent", map); 150 | expect(lookup).toBeNull(); 151 | }); 152 | }); 153 | 154 | describe("calculateCost", () => { 155 | const basePricing: ModelPricing = { 156 | inputCostPerToken: 0.000003, // $3 per MTok 157 | outputCostPerToken: 0.000015, // $15 per MTok 158 | }; 159 | 160 | const pricingWithCache: ModelPricing = { 161 | ...basePricing, 162 | cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok (10% of input) 163 | }; 164 | 165 | describe("basic cost calculation", () => { 166 | it("should calculate cost with no cached tokens", () => { 167 | const result = calculateCost(basePricing, 1000, 500, 0); 168 | 169 | expect(result.inputTokens).toBe(1000); 170 | expect(result.outputTokens).toBe(500); 171 | expect(result.cachedInputTokens).toBe(0); 172 | expect(result.inputCost).toBe(0.003); // 1000 * $3/MTok 173 | expect(result.outputCost).toBeCloseTo(0.0075); // 500 * $15/MTok 174 | expect(result.cacheReadCost).toBe(0); 175 | expect(result.totalCost).toBe(0.0105); 176 | }); 177 | 178 | it("should default cachedInputTokens to 0", () => { 179 | const result = calculateCost(basePricing, 1000, 500); 180 | 181 | expect(result.cachedInputTokens).toBe(0); 182 | expect(result.inputCost).toBe(0.003); 183 | }); 184 | }); 185 | 186 | 
describe("cached token billing", () => { 187 | it("should bill cached tokens at reduced rate", () => { 188 | // 1000 input tokens, 800 are cached 189 | const result = calculateCost(pricingWithCache, 1000, 500, 800); 190 | 191 | expect(result.inputTokens).toBe(1000); 192 | expect(result.cachedInputTokens).toBe(800); 193 | // Uncached: 200 tokens * $3/MTok = $0.0006 194 | expect(result.inputCost).toBeCloseTo(0.0006); 195 | // Cached: 800 tokens * $0.30/MTok = $0.00024 196 | expect(result.cacheReadCost).toBeCloseTo(0.00024); 197 | // Output: 500 * $15/MTok = $0.0075 198 | expect(result.outputCost).toBeCloseTo(0.0075); 199 | expect(result.totalCost).toBeCloseTo(0.00834); 200 | }); 201 | 202 | it("should treat cached tokens as free when no cache rate specified", () => { 203 | // Using basePricing which has no cacheReadInputTokenCost 204 | const result = calculateCost(basePricing, 1000, 500, 800); 205 | 206 | // Only 200 uncached tokens should be billed 207 | expect(result.inputCost).toBeCloseTo(0.0006); 208 | expect(result.cacheReadCost).toBe(0); 209 | }); 210 | 211 | it("should handle all tokens being cached", () => { 212 | const result = calculateCost(pricingWithCache, 1000, 500, 1000); 213 | 214 | expect(result.inputCost).toBe(0); 215 | expect(result.cacheReadCost).toBe(0.0003); // 1000 * $0.30/MTok 216 | }); 217 | }); 218 | 219 | describe("edge cases", () => { 220 | it("should handle zero tokens", () => { 221 | const result = calculateCost(basePricing, 0, 0, 0); 222 | 223 | expect(result.inputCost).toBe(0); 224 | expect(result.outputCost).toBe(0); 225 | expect(result.cacheReadCost).toBe(0); 226 | expect(result.totalCost).toBe(0); 227 | }); 228 | 229 | it("should handle large token counts", () => { 230 | const result = calculateCost(basePricing, 1_000_000, 500_000, 0); 231 | 232 | expect(result.inputCost).toBe(3); // 1M * $3/MTok 233 | expect(result.outputCost).toBe(7.5); // 500K * $15/MTok 234 | expect(result.totalCost).toBe(10.5); 235 | }); 236 | 237 | it("should 
handle pricing with zero costs", () => { 238 | const freePricing: ModelPricing = { 239 | inputCostPerToken: 0, 240 | outputCostPerToken: 0, 241 | }; 242 | const result = calculateCost(freePricing, 1000, 500, 0); 243 | 244 | expect(result.totalCost).toBe(0); 245 | }); 246 | }); 247 | }); 248 | 249 | describe("formatCost", () => { 250 | it('should format zero as "$0.00"', () => { 251 | expect(formatCost(0)).toBe("$0.00"); 252 | }); 253 | 254 | it("should format very small costs with 6 decimal places", () => { 255 | expect(formatCost(0.000123)).toBe("$0.000123"); 256 | expect(formatCost(0.001)).toBe("$0.001000"); 257 | expect(formatCost(0.0099)).toBe("$0.009900"); 258 | }); 259 | 260 | it("should format small costs with 4 decimal places", () => { 261 | expect(formatCost(0.01)).toBe("$0.0100"); 262 | expect(formatCost(0.1234)).toBe("$0.1234"); 263 | expect(formatCost(0.99)).toBe("$0.9900"); 264 | }); 265 | 266 | it("should format costs >= $1 with 2 decimal places", () => { 267 | expect(formatCost(1)).toBe("$1.00"); 268 | expect(formatCost(1.234)).toBe("$1.23"); 269 | expect(formatCost(10.5)).toBe("$10.50"); 270 | expect(formatCost(100)).toBe("$100.00"); 271 | }); 272 | }); 273 | 274 | describe("formatMTokCost", () => { 275 | it('should format zero as "$0"', () => { 276 | expect(formatMTokCost(0)).toBe("$0"); 277 | }); 278 | 279 | it("should format very small per-MTok costs with 4 decimal places", () => { 280 | expect(formatMTokCost(0.001)).toBe("$0.0010"); 281 | expect(formatMTokCost(0.0099)).toBe("$0.0099"); 282 | }); 283 | 284 | it("should format per-MTok costs >= $0.01 with 2 decimal places", () => { 285 | expect(formatMTokCost(0.01)).toBe("$0.01"); 286 | expect(formatMTokCost(0.3)).toBe("$0.30"); 287 | expect(formatMTokCost(3)).toBe("$3.00"); 288 | expect(formatMTokCost(15)).toBe("$15.00"); 289 | }); 290 | }); 291 | 292 | describe("getModelPricingDisplay", () => { 293 | it("should convert per-token costs to per-MTok", () => { 294 | const pricing: ModelPricing = { 
295 | inputCostPerToken: 0.000003, // $3 per MTok 296 | outputCostPerToken: 0.000015, // $15 per MTok 297 | }; 298 | 299 | const display = getModelPricingDisplay(pricing); 300 | 301 | expect(display.inputCostPerMTok).toBe(3); 302 | expect(display.outputCostPerMTok).toBe(15); 303 | expect(display.cacheReadCostPerMTok).toBeUndefined(); 304 | }); 305 | 306 | it("should include cache read cost when available", () => { 307 | const pricing: ModelPricing = { 308 | inputCostPerToken: 0.000003, 309 | outputCostPerToken: 0.000015, 310 | cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok 311 | }; 312 | 313 | const display = getModelPricingDisplay(pricing); 314 | 315 | expect(display.inputCostPerMTok).toBe(3); 316 | expect(display.outputCostPerMTok).toBe(15); 317 | expect(display.cacheReadCostPerMTok).toBe(0.3); 318 | }); 319 | 320 | it("should handle zero costs", () => { 321 | const pricing: ModelPricing = { 322 | inputCostPerToken: 0, 323 | outputCostPerToken: 0, 324 | }; 325 | 326 | const display = getModelPricingDisplay(pricing); 327 | 328 | expect(display.inputCostPerMTok).toBe(0); 329 | expect(display.outputCostPerMTok).toBe(0); 330 | }); 331 | 332 | it("should preserve explicit zero cost for cache read", () => { 333 | const pricing: ModelPricing = { 334 | inputCostPerToken: 0.000003, 335 | outputCostPerToken: 0.000015, 336 | cacheReadInputTokenCost: 0, 337 | }; 338 | 339 | const display = getModelPricingDisplay(pricing); 340 | 341 | expect(display.inputCostPerMTok).toBe(3); 342 | expect(display.outputCostPerMTok).toBe(15); 343 | expect(display.cacheReadCostPerMTok).toBe(0); 344 | }); 345 | }); 346 | -------------------------------------------------------------------------------- /lib/report-styles.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * CSS styles for the HTML report 3 | */ 4 | export function getReportStyles(): string { 5 | return ` 6 | :root { 7 | --bg: #f8f8f8; 8 | --surface: #ffffff; 9 | --text: #24292e; 
10 | --text-muted: #6a737d; 11 | --border: #e1e4e8; 12 | --success: #238636; 13 | --error: #cf222e; 14 | --warning: #9a6700; 15 | --tool: #8250df; 16 | --mcp-enabled: #0969da; 17 | --mcp-disabled: #6a737d; 18 | --notice-bg: #ddf4ff; 19 | --notice-border: #54aeff; 20 | --passed-bg: #dafbe1; 21 | --passed-border: #238636; 22 | --failed-bg: #ffebe9; 23 | --failed-border: #cf222e; 24 | --skipped-bg: #fff8c5; 25 | --skipped-border: #9a6700; 26 | } 27 | 28 | [data-theme="dark"] { 29 | --bg: #0d1117; 30 | --surface: #161b22; 31 | --text: #e6edf3; 32 | --text-muted: #8b949e; 33 | --border: #30363d; 34 | --success: #3fb950; 35 | --error: #f85149; 36 | --warning: #d29922; 37 | --tool: #a371f7; 38 | --mcp-enabled: #58a6ff; 39 | --mcp-disabled: #8b949e; 40 | --notice-bg: #1c2d41; 41 | --notice-border: #388bfd; 42 | --passed-bg: #1a3d24; 43 | --passed-border: #3fb950; 44 | --failed-bg: #3d1a1a; 45 | --failed-border: #f85149; 46 | --skipped-bg: #3d3514; 47 | --skipped-border: #d29922; 48 | } 49 | 50 | * { 51 | margin: 0; 52 | padding: 0; 53 | box-sizing: border-box; 54 | } 55 | 56 | html { 57 | background: var(--bg); 58 | color: var(--text); 59 | font-family: 'JetBrains Mono', 'SF Mono', 'Monaco', 'Menlo', monospace; 60 | font-size: 13px; 61 | line-height: 1.4; 62 | } 63 | 64 | body { 65 | padding: 12px; 66 | max-width: 1200px; 67 | margin: 0 auto; 68 | } 69 | 70 | header { 71 | background: var(--surface); 72 | border: 1px solid var(--border); 73 | border-radius: 4px; 74 | padding: 12px; 75 | margin-bottom: 12px; 76 | } 77 | 78 | .header-top { 79 | display: flex; 80 | justify-content: space-between; 81 | align-items: center; 82 | margin-bottom: 8px; 83 | } 84 | 85 | h1 { 86 | font-size: 16px; 87 | font-weight: 600; 88 | display: flex; 89 | align-items: center; 90 | gap: 8px; 91 | } 92 | 93 | .meta { 94 | font-size: 12px; 95 | color: var(--text-muted); 96 | } 97 | 98 | .summary-bar { 99 | display: flex; 100 | gap: 16px; 101 | padding-top: 8px; 102 | border-top: 1px solid 
var(--border); 103 | margin-top: 8px; 104 | } 105 | 106 | .summary-item { 107 | display: flex; 108 | align-items: center; 109 | gap: 4px; 110 | font-size: 12px; 111 | } 112 | 113 | .summary-item.passed { color: var(--success); } 114 | .summary-item.failed { color: var(--error); } 115 | .summary-item.skipped { color: var(--warning); } 116 | 117 | .mcp-badge { 118 | font-size: 11px; 119 | padding: 2px 6px; 120 | border-radius: 3px; 121 | font-weight: 500; 122 | white-space: nowrap; 123 | } 124 | 125 | .mcp-badge.enabled { 126 | background: var(--mcp-enabled); 127 | color: white; 128 | } 129 | 130 | .mcp-badge.disabled { 131 | background: var(--bg); 132 | border: 1px solid var(--border); 133 | color: var(--text-muted); 134 | } 135 | 136 | .mcp-notice { 137 | background: var(--notice-bg); 138 | border: 1px solid var(--notice-border); 139 | border-radius: 4px; 140 | padding: 10px 12px; 141 | margin-bottom: 12px; 142 | display: flex; 143 | align-items: center; 144 | gap: 8px; 145 | font-size: 13px; 146 | } 147 | 148 | .notice-icon { 149 | font-size: 16px; 150 | flex-shrink: 0; 151 | } 152 | 153 | .notice-text { 154 | color: var(--text); 155 | line-height: 1.5; 156 | } 157 | 158 | .theme-toggle { 159 | background: none; 160 | border: 1px solid var(--border); 161 | border-radius: 3px; 162 | color: var(--text); 163 | cursor: pointer; 164 | padding: 4px 8px; 165 | font-size: 16px; 166 | } 167 | 168 | .theme-toggle:hover { 169 | background: var(--border); 170 | } 171 | 172 | /* Test Section Styles */ 173 | .test-section { 174 | background: var(--surface); 175 | border: 1px solid var(--border); 176 | border-radius: 4px; 177 | margin-bottom: 12px; 178 | } 179 | 180 | .test-section.passed { 181 | border-left: 3px solid var(--success); 182 | } 183 | 184 | .test-section.failed { 185 | border-left: 3px solid var(--error); 186 | } 187 | 188 | .test-section.skipped { 189 | border-left: 3px solid var(--warning); 190 | } 191 | 192 | .test-header { 193 | display: flex; 194 | 
align-items: center; 195 | gap: 8px; 196 | padding: 12px; 197 | cursor: pointer; 198 | user-select: none; 199 | list-style: none; 200 | } 201 | 202 | .test-header::-webkit-details-marker { 203 | display: none; 204 | } 205 | 206 | .test-header:hover { 207 | background: var(--bg); 208 | } 209 | 210 | .test-status { 211 | font-size: 16px; 212 | font-weight: bold; 213 | } 214 | 215 | .test-status.passed { color: var(--success); } 216 | .test-status.failed { color: var(--error); } 217 | .test-status.skipped { color: var(--warning); } 218 | 219 | .test-name { 220 | font-weight: 600; 221 | font-size: 14px; 222 | } 223 | 224 | .test-meta { 225 | margin-left: auto; 226 | color: var(--text-muted); 227 | font-size: 12px; 228 | } 229 | 230 | .test-content { 231 | padding: 12px; 232 | border-top: 1px solid var(--border); 233 | } 234 | 235 | .test-content h4 { 236 | font-size: 13px; 237 | font-weight: 600; 238 | margin-bottom: 8px; 239 | color: var(--text-muted); 240 | } 241 | 242 | .prompt-section { 243 | margin-bottom: 16px; 244 | } 245 | 246 | .prompt-section summary { 247 | cursor: pointer; 248 | padding: 8px; 249 | background: var(--bg); 250 | border: 1px solid var(--border); 251 | border-radius: 3px; 252 | font-weight: 600; 253 | } 254 | 255 | .prompt-text { 256 | padding: 12px; 257 | background: var(--bg); 258 | border: 1px solid var(--border); 259 | border-top: none; 260 | border-radius: 0 0 3px 3px; 261 | white-space: pre-wrap; 262 | font-size: 12px; 263 | max-height: 300px; 264 | overflow-y: auto; 265 | } 266 | 267 | .steps-section { 268 | margin-bottom: 16px; 269 | } 270 | 271 | .output-section { 272 | margin-bottom: 16px; 273 | } 274 | 275 | .token-summary { 276 | background: var(--bg); 277 | border: 1px solid var(--border); 278 | border-radius: 4px; 279 | padding: 12px; 280 | margin-bottom: 16px; 281 | display: flex; 282 | align-items: center; 283 | justify-content: space-between; 284 | } 285 | 286 | .token-summary h4 { 287 | margin: 0; 288 | font-size: 13px; 289 | 
font-weight: 600; 290 | color: var(--text-muted); 291 | } 292 | 293 | .token-count { 294 | font-size: 16px; 295 | font-weight: 600; 296 | color: var(--text); 297 | } 298 | 299 | .component-preview { 300 | position: relative; 301 | } 302 | 303 | .expand-button { 304 | display: block; 305 | width: 100%; 306 | padding: 8px; 307 | margin-top: 8px; 308 | background: var(--bg); 309 | border: 1px solid var(--border); 310 | border-radius: 3px; 311 | color: var(--text); 312 | font-size: 12px; 313 | font-weight: 600; 314 | cursor: pointer; 315 | font-family: inherit; 316 | transition: background 0.2s; 317 | } 318 | 319 | .expand-button:hover { 320 | background: var(--border); 321 | } 322 | 323 | .verification-section { 324 | margin-top: 16px; 325 | } 326 | 327 | /* Step Styles */ 328 | .step { 329 | background: var(--surface); 330 | border: 1px solid var(--border); 331 | border-radius: 4px; 332 | margin-bottom: 8px; 333 | } 334 | 335 | .step-header { 336 | display: flex; 337 | align-items: center; 338 | gap: 8px; 339 | padding: 8px 12px; 340 | cursor: pointer; 341 | user-select: none; 342 | list-style: none; 343 | } 344 | 345 | .step-header::-webkit-details-marker { 346 | display: none; 347 | } 348 | 349 | .step-header:hover { 350 | background: var(--bg); 351 | } 352 | 353 | .step-num { 354 | font-weight: 600; 355 | } 356 | 357 | .line { 358 | flex: 1; 359 | height: 1px; 360 | background: var(--border); 361 | } 362 | 363 | .tokens { 364 | color: var(--text-muted); 365 | cursor: help; 366 | border-bottom: 1px dotted var(--text-muted); 367 | } 368 | 369 | .output { 370 | color: var(--text); 371 | cursor: help; 372 | border-bottom: 1px dotted var(--text-muted); 373 | } 374 | 375 | .reason { 376 | color: var(--text-muted); 377 | font-size: 12px; 378 | } 379 | 380 | .step-content { 381 | padding: 12px; 382 | border-top: 1px solid var(--border); 383 | } 384 | 385 | .text { 386 | white-space: pre-wrap; 387 | margin-bottom: 8px; 388 | padding-left: 8px; 389 | border-left: 2px solid 
var(--border); 390 | } 391 | 392 | .tool, 393 | .result { 394 | margin: 8px 0; 395 | border: 1px solid var(--border); 396 | border-radius: 3px; 397 | } 398 | 399 | .tool summary, 400 | .result summary { 401 | padding: 4px 8px; 402 | cursor: pointer; 403 | user-select: none; 404 | list-style: none; 405 | } 406 | 407 | .tool summary::-webkit-details-marker, 408 | .result summary::-webkit-details-marker { 409 | display: none; 410 | } 411 | 412 | .tool summary:hover, 413 | .result summary:hover { 414 | background: var(--bg); 415 | } 416 | 417 | .arrow { 418 | color: var(--tool); 419 | } 420 | 421 | .tool-name { 422 | font-weight: 600; 423 | } 424 | 425 | .status { 426 | font-weight: 600; 427 | } 428 | 429 | .status.success { 430 | color: var(--success); 431 | } 432 | 433 | .status.error { 434 | color: var(--error); 435 | } 436 | 437 | .result.error { 438 | border-color: var(--error); 439 | } 440 | 441 | .input, 442 | .output { 443 | padding: 8px; 444 | background: var(--bg); 445 | border-top: 1px solid var(--border); 446 | overflow-x: auto; 447 | font-size: 12px; 448 | } 449 | 450 | .code { 451 | padding: 8px; 452 | background: var(--bg); 453 | border: 1px solid var(--border); 454 | border-radius: 3px; 455 | overflow-x: auto; 456 | font-size: 12px; 457 | white-space: pre-wrap; 458 | } 459 | 460 | .code-preview, 461 | .code-full { 462 | max-height: none; 463 | overflow-y: visible; 464 | } 465 | 466 | /* Verification Styles */ 467 | .verification-result { 468 | padding: 12px; 469 | border-radius: 4px; 470 | border: 1px solid var(--border); 471 | } 472 | 473 | .verification-result.passed { 474 | background: var(--passed-bg); 475 | border-color: var(--passed-border); 476 | } 477 | 478 | .verification-result.failed { 479 | background: var(--failed-bg); 480 | border-color: var(--failed-border); 481 | } 482 | 483 | .verification-result.skipped { 484 | background: var(--skipped-bg); 485 | border-color: var(--skipped-border); 486 | } 487 | 488 | .verification-header { 489 | 
display: flex; 490 | align-items: center; 491 | gap: 8px; 492 | } 493 | 494 | .verification-icon { 495 | font-size: 18px; 496 | font-weight: bold; 497 | } 498 | 499 | .verification-result.passed .verification-icon { color: var(--success); } 500 | .verification-result.failed .verification-icon { color: var(--error); } 501 | .verification-result.skipped .verification-icon { color: var(--warning); } 502 | 503 | .verification-text { 504 | font-weight: 600; 505 | } 506 | 507 | .verification-stats { 508 | margin-left: auto; 509 | color: var(--text-muted); 510 | font-size: 12px; 511 | } 512 | 513 | .verification-error { 514 | margin-top: 8px; 515 | padding: 8px; 516 | background: var(--bg); 517 | border-radius: 3px; 518 | font-size: 12px; 519 | color: var(--error); 520 | } 521 | 522 | .failed-tests-details { 523 | margin-top: 12px; 524 | } 525 | 526 | .failed-tests-details summary { 527 | cursor: pointer; 528 | font-weight: 600; 529 | padding: 4px 0; 530 | } 531 | 532 | .failed-tests-list { 533 | list-style: none; 534 | margin-top: 8px; 535 | } 536 | 537 | .failed-test { 538 | margin-bottom: 12px; 539 | padding: 8px; 540 | background: var(--bg); 541 | border-radius: 3px; 542 | } 543 | 544 | .failed-test-name { 545 | font-weight: 600; 546 | margin-bottom: 4px; 547 | color: var(--error); 548 | } 549 | 550 | .failed-test-error { 551 | font-size: 11px; 552 | white-space: pre-wrap; 553 | max-height: 200px; 554 | overflow-y: auto; 555 | background: var(--surface); 556 | padding: 8px; 557 | border-radius: 3px; 558 | } 559 | 560 | @media (max-width: 768px) { 561 | body { 562 | padding: 8px; 563 | } 564 | } 565 | `; 566 | } 567 | -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai"; 2 | import { experimental_createMCPClient as createMCPClient } from 
"./node_modules/@ai-sdk/mcp/dist/index.mjs"; 3 | import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs"; 4 | import { writeFileSync, mkdirSync, existsSync } from "node:fs"; 5 | import { 6 | generateReport, 7 | type SingleTestResult, 8 | } from "./lib/report.ts"; 9 | import { 10 | getTimestampedFilename, 11 | isHttpUrl, 12 | extractResultWriteContent, 13 | calculateTotalCost, 14 | } from "./lib/utils.ts"; 15 | import { 16 | discoverTests, 17 | buildAgentPrompt, 18 | type TestDefinition, 19 | } from "./lib/test-discovery.ts"; 20 | import { 21 | setupOutputsDirectory, 22 | cleanupOutputsDirectory, 23 | cleanupTestEnvironment, 24 | runTestVerification, 25 | } from "./lib/output-test-runner.ts"; 26 | import { resultWriteTool, testComponentTool } from "./lib/tools/index.ts"; 27 | import { 28 | buildPricingMap, 29 | lookupPricingFromMap, 30 | getModelPricingDisplay, 31 | formatCost, 32 | formatMTokCost, 33 | type ModelPricingLookup, 34 | type GatewayModel, 35 | } from "./lib/pricing.ts"; 36 | import type { LanguageModel } from "ai"; 37 | import { 38 | intro, 39 | multiselect, 40 | isCancel, 41 | cancel, 42 | text, 43 | select, 44 | confirm, 45 | note, 46 | } from "@clack/prompts"; 47 | import { gateway } from "ai"; 48 | 49 | async function validateAndConfirmPricing( 50 | models: string[], 51 | pricingMap: Map, 52 | ) { 53 | const lookups = new Map(); 54 | 55 | for (const modelId of models) { 56 | const lookup = lookupPricingFromMap(modelId, pricingMap); 57 | lookups.set(modelId, lookup); 58 | } 59 | 60 | const modelsWithPricing = models.filter((m) => lookups.get(m) !== null); 61 | const modelsWithoutPricing = models.filter((m) => lookups.get(m) === null); 62 | 63 | if (modelsWithoutPricing.length === 0) { 64 | const pricingLines = models.map((modelId) => { 65 | const lookup = lookups.get(modelId)!; 66 | const display = getModelPricingDisplay(lookup.pricing); 67 | return `${modelId}\n → 
${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`; 68 | }); 69 | 70 | note(pricingLines.join("\n\n"), "💰 Pricing Found"); 71 | 72 | const usePricing = await confirm({ 73 | message: "Enable cost calculation?", 74 | initialValue: true, 75 | }); 76 | 77 | if (isCancel(usePricing)) { 78 | cancel("Operation cancelled."); 79 | process.exit(0); 80 | } 81 | 82 | return { enabled: usePricing, lookups }; 83 | } else { 84 | const lines: string[] = []; 85 | 86 | if (modelsWithoutPricing.length > 0) { 87 | lines.push("No pricing found for:"); 88 | for (const modelId of modelsWithoutPricing) { 89 | lines.push(` ✗ ${modelId}`); 90 | } 91 | } 92 | 93 | if (modelsWithPricing.length > 0) { 94 | lines.push(""); 95 | lines.push("Pricing available for:"); 96 | for (const modelId of modelsWithPricing) { 97 | const lookup = lookups.get(modelId)!; 98 | const display = getModelPricingDisplay(lookup.pricing); 99 | lines.push( 100 | ` ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in)`, 101 | ); 102 | } 103 | } 104 | 105 | lines.push(""); 106 | lines.push("Cost calculation will be disabled."); 107 | 108 | note(lines.join("\n"), "⚠️ Pricing Incomplete"); 109 | 110 | const proceed = await confirm({ 111 | message: "Continue without pricing?", 112 | initialValue: true, 113 | }); 114 | 115 | if (isCancel(proceed) || !proceed) { 116 | cancel("Operation cancelled."); 117 | process.exit(0); 118 | } 119 | 120 | return { enabled: false, lookups }; 121 | } 122 | } 123 | 124 | async function selectOptions() { 125 | intro("🚀 Svelte AI Bench"); 126 | 127 | const availableModels = await gateway.getAvailableModels(); 128 | 129 | const gatewayModels = availableModels.models as GatewayModel[]; 130 | const pricingMap = buildPricingMap(gatewayModels); 131 | 132 | const models = await multiselect({ 133 | message: "Select model(s) to benchmark", 134 | options: [{ value: "custom", label: "Custom" }].concat( 135 | 
availableModels.models.reduce>( 136 | (arr, model) => { 137 | if (model.modelType === "language") { 138 | arr.push({ value: model.id, label: model.name }); 139 | } 140 | return arr; 141 | }, 142 | [], 143 | ), 144 | ), 145 | }); 146 | 147 | if (isCancel(models)) { 148 | cancel("Operation cancelled."); 149 | process.exit(0); 150 | } 151 | 152 | if (models.includes("custom")) { 153 | const customModel = await text({ 154 | message: "Enter custom model id", 155 | }); 156 | if (isCancel(customModel)) { 157 | cancel("Operation cancelled."); 158 | process.exit(0); 159 | } 160 | models.push(customModel); 161 | } 162 | 163 | const selectedModels = models.filter((model) => model !== "custom"); 164 | 165 | const pricing = await validateAndConfirmPricing(selectedModels, pricingMap); 166 | 167 | const mcpIntegration = await select({ 168 | message: "Which MCP integration to use?", 169 | options: [ 170 | { value: "none", label: "No MCP Integration" }, 171 | { value: "http", label: "MCP over HTTP" }, 172 | { value: "stdio", label: "MCP over StdIO" }, 173 | ], 174 | }); 175 | 176 | if (isCancel(mcpIntegration)) { 177 | cancel("Operation cancelled."); 178 | process.exit(0); 179 | } 180 | 181 | let mcp: string | undefined = undefined; 182 | 183 | if (mcpIntegration !== "none") { 184 | const custom = await confirm({ 185 | message: "Do you want to provide a custom MCP server/command?", 186 | initialValue: false, 187 | }); 188 | 189 | if (isCancel(custom)) { 190 | cancel("Operation cancelled."); 191 | process.exit(0); 192 | } 193 | 194 | if (custom) { 195 | const customMcp = await text({ 196 | message: "Insert custom url or command", 197 | }); 198 | if (isCancel(customMcp)) { 199 | cancel("Operation cancelled."); 200 | process.exit(0); 201 | } 202 | 203 | mcp = customMcp; 204 | } else { 205 | mcp = 206 | mcpIntegration === "http" 207 | ? 
"https://mcp.svelte.dev/mcp" 208 | : "npx -y @sveltejs/mcp"; 209 | } 210 | } 211 | 212 | const testingTool = await confirm({ 213 | message: "Do you want to provide the testing tool to the model?", 214 | initialValue: true, 215 | }); 216 | 217 | if (isCancel(testingTool)) { 218 | cancel("Operation cancelled."); 219 | process.exit(0); 220 | } 221 | 222 | return { 223 | models: selectedModels, 224 | mcp, 225 | testingTool, 226 | pricing, 227 | }; 228 | } 229 | 230 | function parseCommandString(commandString: string): { 231 | command: string; 232 | args: string[]; 233 | } { 234 | const parts = commandString.trim().split(/\s+/); 235 | const command = parts[0] ?? ""; 236 | const args = parts.slice(1); 237 | return { command, args }; 238 | } 239 | 240 | async function runSingleTest( 241 | test: TestDefinition, 242 | model: LanguageModel, 243 | mcpClient: Awaited> | null, 244 | testComponentEnabled: boolean, 245 | testIndex: number, 246 | totalTests: number, 247 | ): Promise { 248 | console.log(`\n[${testIndex + 1}/${totalTests}] Running test: ${test.name}`); 249 | console.log("─".repeat(50)); 250 | 251 | const prompt = buildAgentPrompt(test); 252 | 253 | try { 254 | const tools = { 255 | ResultWrite: resultWriteTool, 256 | ...(testComponentEnabled && { TestComponent: testComponentTool(test) }), 257 | ...(mcpClient ? await mcpClient.tools() : {}), 258 | }; 259 | 260 | let stepCounter = 0; 261 | const agent = new Agent({ 262 | model, 263 | stopWhen: [hasToolCall("ResultWrite"), stepCountIs(10)], 264 | tools, 265 | onStepFinish: (step) => { 266 | if (process.env.VERBOSE_LOGGING !== "true") { 267 | return; 268 | } 269 | stepCounter++; 270 | console.log(` Step ${stepCounter}:`); 271 | if (step.text) { 272 | const preview = 273 | step.text.length > 100 274 | ? step.text.slice(0, 100) + "..." 
275 | : step.text; 276 | console.log(`💬 Text: ${preview}`); 277 | } 278 | if (step.toolCalls && step.toolCalls.length > 0) { 279 | for (const call of step.toolCalls) { 280 | if (call) { 281 | console.log(`🔧 Tool call: ${call.toolName}`); 282 | } 283 | } 284 | } 285 | if (step.toolResults && step.toolResults.length > 0) { 286 | for (const result of step.toolResults) { 287 | if (result && "output" in result) { 288 | const resultStr = JSON.stringify(result.output); 289 | const preview = 290 | resultStr.length > 80 291 | ? resultStr.slice(0, 80) + "..." 292 | : resultStr; 293 | console.log(`📤 Tool result: ${preview}`); 294 | } 295 | } 296 | } 297 | }, 298 | }); 299 | 300 | console.log(" ⏳ Running agent..."); 301 | if (testComponentEnabled) { 302 | console.log(" 📋 TestComponent tool is available"); 303 | } 304 | const result = await agent.generate({ prompt }); 305 | 306 | const resultWriteContent = extractResultWriteContent(result.steps); 307 | 308 | if (!resultWriteContent) { 309 | console.log(" ⚠️ No ResultWrite output found"); 310 | return { 311 | testName: test.name, 312 | prompt: test.prompt, 313 | steps: result.steps as unknown as SingleTestResult["steps"], 314 | resultWriteContent: null, 315 | verification: null, 316 | }; 317 | } 318 | 319 | console.log(" ✓ Component generated"); 320 | 321 | console.log(" ⏳ Verifying against tests..."); 322 | const verification = await runTestVerification(test, resultWriteContent); 323 | 324 | if (verification.passed) { 325 | console.log( 326 | `✓ All tests passed (${verification.numPassed}/${verification.numTests})`, 327 | ); 328 | } else { 329 | console.log( 330 | `✗ Tests failed (${verification.numFailed}/${verification.numTests} failed)`, 331 | ); 332 | if (verification.failedTests) { 333 | for (const ft of verification.failedTests) { 334 | console.log(`- ${ft.fullName}`); 335 | } 336 | } 337 | } 338 | 339 | cleanupTestEnvironment(test.name); 340 | 341 | return { 342 | testName: test.name, 343 | prompt: test.prompt, 344 | 
steps: result.steps as unknown as SingleTestResult["steps"], 345 | resultWriteContent, 346 | verification, 347 | }; 348 | } catch (error) { 349 | console.error(`✗ Error running test: ${error}`); 350 | return { 351 | testName: test.name, 352 | prompt: test.prompt, 353 | steps: [], 354 | resultWriteContent: null, 355 | verification: { 356 | testName: test.name, 357 | passed: false, 358 | numTests: 0, 359 | numPassed: 0, 360 | numFailed: 0, 361 | duration: 0, 362 | error: error instanceof Error ? error.message : String(error), 363 | }, 364 | }; 365 | } 366 | } 367 | 368 | async function main() { 369 | const { models, mcp, testingTool, pricing } = await selectOptions(); 370 | 371 | const mcpServerUrl = mcp; 372 | const mcpEnabled = !!mcp; 373 | 374 | const testComponentEnabled = testingTool; 375 | 376 | const isHttpTransport = mcpServerUrl && isHttpUrl(mcpServerUrl); 377 | const mcpTransportType = isHttpTransport ? "HTTP" : "StdIO"; 378 | 379 | console.log("\n╔════════════════════════════════════════════════════╗"); 380 | console.log("║ SvelteBench 2.0 - Multi-Test ║"); 381 | console.log("╚════════════════════════════════════════════════════╝"); 382 | 383 | console.log("\n📋 Models:"); 384 | for (const modelId of models) { 385 | const lookup = pricing.lookups.get(modelId); 386 | if (pricing.enabled && lookup) { 387 | const display = getModelPricingDisplay(lookup.pricing); 388 | console.log(` ${modelId}`); 389 | console.log( 390 | ` 💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, 391 | ); 392 | } else { 393 | console.log(` ${modelId}`); 394 | } 395 | } 396 | 397 | console.log(`\n💰 Pricing: ${pricing.enabled ? "Enabled" : "Disabled"}`); 398 | 399 | console.log(`🔌 MCP Integration: ${mcpEnabled ? 
"Enabled" : "Disabled"}`); 400 | if (mcpEnabled) { 401 | console.log(` Transport: ${mcpTransportType}`); 402 | if (isHttpTransport) { 403 | console.log(` URL: ${mcpServerUrl}`); 404 | } else { 405 | console.log(` Command: ${mcpServerUrl}`); 406 | } 407 | } 408 | 409 | console.log( 410 | `🧪 TestComponent Tool: ${testComponentEnabled ? "Enabled" : "Disabled"}`, 411 | ); 412 | 413 | console.log("\n📁 Discovering tests..."); 414 | const tests = discoverTests(); 415 | console.log( 416 | ` Found ${tests.length} test(s): ${tests.map((t) => t.name).join(", ")}`, 417 | ); 418 | 419 | if (tests.length === 0) { 420 | console.error("No tests found in tests/ directory"); 421 | process.exit(1); 422 | } 423 | 424 | setupOutputsDirectory(); 425 | 426 | let mcpClient = null; 427 | if (mcpEnabled) { 428 | if (isHttpTransport) { 429 | mcpClient = await createMCPClient({ 430 | transport: { 431 | type: "http", 432 | url: mcpServerUrl, 433 | }, 434 | }); 435 | } else { 436 | const { command, args } = parseCommandString(mcpServerUrl!); 437 | mcpClient = await createMCPClient({ 438 | transport: new StdioMCPTransport({ 439 | command, 440 | args, 441 | }), 442 | }); 443 | } 444 | } 445 | 446 | let totalFailed = 0; 447 | 448 | for (const modelId of models) { 449 | console.log("\n" + "═".repeat(50)); 450 | console.log(`🤖 Running benchmark for model: ${modelId}`); 451 | console.log("═".repeat(50)); 452 | 453 | const pricingLookup = pricing.enabled 454 | ? (pricing.lookups.get(modelId) ?? 
null) 455 | : null; 456 | 457 | if (pricingLookup) { 458 | const display = getModelPricingDisplay(pricingLookup.pricing); 459 | console.log( 460 | `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, 461 | ); 462 | } 463 | 464 | const model = gateway.languageModel(modelId); 465 | 466 | const testResults = []; 467 | const startTime = Date.now(); 468 | 469 | for (let i = 0; i < tests.length; i++) { 470 | const test = tests[i]; 471 | if (!test) continue; 472 | const result = await runSingleTest( 473 | test, 474 | model, 475 | mcpClient, 476 | testComponentEnabled, 477 | i, 478 | tests.length, 479 | ); 480 | testResults.push(result); 481 | } 482 | 483 | const totalDuration = Date.now() - startTime; 484 | 485 | console.log("\n" + "═".repeat(50)); 486 | console.log("📊 Test Summary"); 487 | console.log("═".repeat(50)); 488 | 489 | const passed = testResults.filter((r) => r.verification?.passed).length; 490 | const failed = testResults.filter( 491 | (r) => r.verification && !r.verification.passed, 492 | ).length; 493 | totalFailed += failed; 494 | const skipped = testResults.filter((r) => !r.verification).length; 495 | 496 | for (const result of testResults) { 497 | const status = result.verification 498 | ? result.verification.passed 499 | ? "✓" 500 | : "✗" 501 | : "⊘"; 502 | const statusText = result.verification 503 | ? result.verification.passed 504 | ? 
"PASSED" 505 | : "FAILED" 506 | : "SKIPPED"; 507 | console.log(`${status} ${result.testName}: ${statusText}`); 508 | } 509 | 510 | console.log("─".repeat(50)); 511 | console.log( 512 | `Total: ${passed} passed, ${failed} failed, ${skipped} skipped (${(totalDuration / 1000).toFixed(1)}s)`, 513 | ); 514 | 515 | let totalCost = null; 516 | let pricingInfo = null; 517 | 518 | if (pricingLookup) { 519 | totalCost = calculateTotalCost(testResults, pricingLookup.pricing); 520 | const pricingDisplay = getModelPricingDisplay(pricingLookup.pricing); 521 | pricingInfo = { 522 | inputCostPerMTok: pricingDisplay.inputCostPerMTok, 523 | outputCostPerMTok: pricingDisplay.outputCostPerMTok, 524 | cacheReadCostPerMTok: pricingDisplay.cacheReadCostPerMTok, 525 | }; 526 | 527 | console.log("\n💰 Cost Summary"); 528 | console.log("─".repeat(50)); 529 | console.log( 530 | `Input tokens: ${totalCost.inputTokens.toLocaleString()} (${formatCost(totalCost.inputCost)})`, 531 | ); 532 | console.log( 533 | `Output tokens: ${totalCost.outputTokens.toLocaleString()} (${formatCost(totalCost.outputCost)})`, 534 | ); 535 | if (totalCost.cachedInputTokens > 0) { 536 | console.log( 537 | `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()} (${formatCost(totalCost.cacheReadCost)})`, 538 | ); 539 | } 540 | console.log(`Total cost: ${formatCost(totalCost.totalCost)}`); 541 | } 542 | 543 | const resultsDir = "results"; 544 | if (!existsSync(resultsDir)) { 545 | mkdirSync(resultsDir, { recursive: true }); 546 | } 547 | 548 | const jsonFilename = getTimestampedFilename("result", "json", modelId); 549 | const htmlFilename = getTimestampedFilename("result", "html", modelId); 550 | const jsonPath = `${resultsDir}/${jsonFilename}`; 551 | const htmlPath = `${resultsDir}/${htmlFilename}`; 552 | 553 | const resultData = { 554 | tests: testResults, 555 | metadata: { 556 | mcpEnabled, 557 | mcpServerUrl: mcpEnabled ? mcpServerUrl! : null, 558 | mcpTransportType: mcpEnabled ? 
mcpTransportType : null, 559 | timestamp: new Date().toISOString(), 560 | model: modelId, 561 | pricingKey: pricingLookup?.matchedKey ?? null, 562 | pricing: pricingInfo, 563 | totalCost, 564 | }, 565 | }; 566 | 567 | writeFileSync(jsonPath, JSON.stringify(resultData, null, 2)); 568 | console.log(`\n✓ Results saved to ${jsonPath}`); 569 | 570 | await generateReport(jsonPath, htmlPath); 571 | } 572 | 573 | cleanupOutputsDirectory(); 574 | 575 | process.exit(totalFailed > 0 ? 1 : 0); 576 | } 577 | 578 | main().catch((error) => { 579 | console.error("Fatal error:", error); 580 | process.exit(1); 581 | }); 582 | -------------------------------------------------------------------------------- /lib/report-template.ts: -------------------------------------------------------------------------------- 1 | import type { TestVerificationResult } from "./output-test-runner.ts"; 2 | import type { MultiTestResultData, SingleTestResult } from "./report.ts"; 3 | import { getReportStyles } from "./report-styles.ts"; 4 | import { formatCost, formatMTokCost } from "./pricing.ts"; 5 | 6 | interface TextBlock { 7 | type: "text"; 8 | text: string; 9 | } 10 | 11 | interface ToolCallBlock { 12 | type: "tool-call"; 13 | toolCallId: string; 14 | toolName: string; 15 | input: Record; 16 | dynamic?: boolean; 17 | } 18 | 19 | interface ToolResultBlock { 20 | type: "tool-result"; 21 | toolCallId: string; 22 | toolName: string; 23 | input: Record; 24 | output: { 25 | content: Array<{ type: string; text?: string; [key: string]: unknown }>; 26 | isError?: boolean; 27 | }; 28 | dynamic?: boolean; 29 | } 30 | 31 | type ContentBlock = TextBlock | ToolCallBlock | ToolResultBlock; 32 | 33 | interface Step { 34 | content: ContentBlock[]; 35 | finishReason: string; 36 | usage: { 37 | inputTokens: number; 38 | outputTokens: number; 39 | totalTokens: number; 40 | cachedInputTokens: number; 41 | }; 42 | [key: string]: unknown; 43 | } 44 | 45 | function escapeHtml(text: string) { 46 | const map: Record = { 47 
| "&": "&", 48 | "<": "<", 49 | ">": ">", 50 | '"': """, 51 | "'": "'", 52 | }; 53 | let result = ""; 54 | for (const char of text) { 55 | result += map[char] ?? char; 56 | } 57 | return result; 58 | } 59 | 60 | function formatTimestamp(timestamp: string) { 61 | const date = new Date(timestamp); 62 | return date.toLocaleString("en-US", { 63 | year: "numeric", 64 | month: "short", 65 | day: "numeric", 66 | hour: "2-digit", 67 | minute: "2-digit", 68 | second: "2-digit", 69 | }); 70 | } 71 | 72 | function getFirstLines(code: string, numLines: number) { 73 | const lines = code.split("\n"); 74 | return lines.slice(0, numLines).join("\n"); 75 | } 76 | 77 | function renderContentBlock(block: ContentBlock) { 78 | if (block.type === "text") { 79 | return `
    ${escapeHtml(block.text)}
    `; 80 | } else if (block.type === "tool-call") { 81 | const inputJson = JSON.stringify(block.input, null, 2); 82 | return `
    83 | ${escapeHtml(block.toolName)} 84 |
    ${escapeHtml(inputJson)}
    85 |
    `; 86 | } else if (block.type === "tool-result") { 87 | const outputText = JSON.stringify(block.output, null, 2); 88 | const isError = 89 | block.output && 90 | typeof block.output === "object" && 91 | "error" in block.output; 92 | const statusIcon = isError ? "✗" : "✓"; 93 | return `
    94 | ${statusIcon} Output 95 |
    ${escapeHtml(outputText)}
    96 |
    `; 97 | } 98 | return ""; 99 | } 100 | 101 | function renderVerificationResult( 102 | verification: TestVerificationResult | null, 103 | ) { 104 | if (!verification) { 105 | return `
    106 | 107 | Test verification not run 108 |
    `; 109 | } 110 | 111 | const statusClass = verification.passed ? "passed" : "failed"; 112 | const statusIcon = verification.passed ? "✓" : "✗"; 113 | const statusText = verification.passed ? "All tests passed" : "Tests failed"; 114 | 115 | let failedTestsHtml = ""; 116 | if (verification.failedTests && verification.failedTests.length > 0) { 117 | const failedItems = verification.failedTests 118 | .map( 119 | (ft) => `
  • 120 |
    ${escapeHtml(ft.fullName)}
    121 |
    ${escapeHtml(ft.errorMessage)}
    122 |
  • `, 123 | ) 124 | .join(""); 125 | failedTestsHtml = `
    126 | Failed Tests (${verification.failedTests.length}) 127 |
      ${failedItems}
    128 |
    `; 129 | } 130 | 131 | let errorHtml = ""; 132 | if (verification.error) { 133 | errorHtml = `
    Error: ${escapeHtml(verification.error)}
    `; 134 | } 135 | 136 | return `
    137 |
    138 | ${statusIcon} 139 | ${statusText} 140 | ${verification.numPassed}/${verification.numTests} tests (${verification.duration}ms) 141 |
    142 | ${errorHtml} 143 | ${failedTestsHtml} 144 |
    `; 145 | } 146 | 147 | function renderSteps(steps: Step[]) { 148 | return steps 149 | .map((step, index) => { 150 | const assistantContentHtml = 151 | step.content.map((block) => renderContentBlock(block)).join("") || 152 | '
    No response
    '; 153 | 154 | const cachedInfo = 155 | step.usage.cachedInputTokens > 0 156 | ? `, ${step.usage.cachedInputTokens.toLocaleString()}⚡` 157 | : ""; 158 | 159 | const inputTokens = step.usage.inputTokens; 160 | const cachedTokens = step.usage.cachedInputTokens; 161 | const uncachedInputTokens = inputTokens - cachedTokens; 162 | 163 | return ` 164 |
    165 | 166 | Step ${index + 1} 167 | 168 | ${step.usage.totalTokens.toLocaleString()} tok 169 | (${step.usage.outputTokens.toLocaleString()}↑${cachedInfo}) 170 | ${step.finishReason} 171 | 172 |
    173 | ${assistantContentHtml} 174 |
    175 |
    `; 176 | }) 177 | .join("\n"); 178 | } 179 | 180 | function renderTestSection(test: SingleTestResult, index: number) { 181 | const totalTokens = test.steps.reduce( 182 | (sum, step) => sum + step.usage.totalTokens, 183 | 0, 184 | ); 185 | const stepCount = test.steps.length; 186 | const verificationStatus = test.verification 187 | ? test.verification.passed 188 | ? "passed" 189 | : "failed" 190 | : "skipped"; 191 | const verificationIcon = test.verification 192 | ? test.verification.passed 193 | ? "✓" 194 | : "✗" 195 | : "⊘"; 196 | 197 | const stepsHtml = renderSteps(test.steps); 198 | const verificationHtml = renderVerificationResult(test.verification); 199 | 200 | const componentId = `component-${test.testName.replace(/[^a-zA-Z0-9]/g, "-")}`; 201 | 202 | const resultWriteHtml = test.resultWriteContent 203 | ? `
    204 |
    205 |

    Total Tokens Used

    206 |
    ${totalTokens.toLocaleString()} tokens
    207 |
    208 |

    Generated Component

    209 |
    210 |
    ${escapeHtml(getFirstLines(test.resultWriteContent, 5))}
    211 | 212 | 216 |
    217 |
    ` 218 | : ""; 219 | 220 | return ` 221 |
    222 | 223 | ${verificationIcon} 224 | ${escapeHtml(test.testName)} 225 | ${stepCount} steps · ${totalTokens.toLocaleString()} tokens 226 | 227 |
    228 |
    229 | Prompt 230 |
    ${escapeHtml(test.prompt)}
    231 |
    232 | 233 |
    234 |

    Agent Steps

    235 | ${stepsHtml} 236 |
    237 | 238 | ${resultWriteHtml} 239 | 240 |
    241 |

    Test Verification

    242 | ${verificationHtml} 243 |
    244 |
    245 |
    `; 246 | } 247 | 248 | function renderPricingSection(data: MultiTestResultData) { 249 | const { metadata } = data; 250 | const { pricing, totalCost, pricingKey } = metadata; 251 | 252 | if (!pricing && !totalCost) { 253 | return ""; 254 | } 255 | 256 | let pricingInfoHtml = ""; 257 | if (pricing) { 258 | const pricingKeyDisplay = pricingKey 259 | ? `${escapeHtml(pricingKey)}` 260 | : ""; 261 | 262 | pricingInfoHtml = ` 263 |
    264 | Model Pricing: 265 | ${pricingKeyDisplay} 266 | ${formatMTokCost(pricing.inputCostPerMTok)}/MTok in 267 | · 268 | ${formatMTokCost(pricing.outputCostPerMTok)}/MTok out 269 | ${pricing.cacheReadCostPerMTok !== undefined ? `·${formatMTokCost(pricing.cacheReadCostPerMTok)}/MTok cached` : ""} 270 |
    271 | `; 272 | } 273 | 274 | let costBreakdownHtml = ""; 275 | if (totalCost) { 276 | const uncachedInputTokens = 277 | totalCost.inputTokens - totalCost.cachedInputTokens; 278 | 279 | costBreakdownHtml = ` 280 |
    281 |
    282 | Input tokens: 283 | ${uncachedInputTokens.toLocaleString()} 284 | ${formatCost(totalCost.inputCost)} 285 |
    286 |
    287 | Output tokens: 288 | ${totalCost.outputTokens.toLocaleString()} 289 | ${formatCost(totalCost.outputCost)} 290 |
    291 | ${ 292 | totalCost.cachedInputTokens > 0 293 | ? ` 294 |
    295 | Cached tokens: 296 | ${totalCost.cachedInputTokens.toLocaleString()} ⚡ 297 | ${formatCost(totalCost.cacheReadCost)} 298 |
    299 | ` 300 | : "" 301 | } 302 |
    303 | Total Cost: 304 | 305 | ${formatCost(totalCost.totalCost)} 306 |
    307 |
    308 | `; 309 | } 310 | 311 | return ` 312 |
    313 |
    314 | 💰 315 | Cost Summary 316 |
    317 | ${pricingInfoHtml} 318 | ${costBreakdownHtml} 319 |
    320 | `; 321 | } 322 | 323 | function getPricingStyles() { 324 | return ` 325 | .pricing-section { 326 | background: var(--surface); 327 | border: 1px solid var(--border); 328 | border-radius: 4px; 329 | padding: 12px; 330 | margin-bottom: 12px; 331 | } 332 | 333 | .pricing-header { 334 | display: flex; 335 | align-items: center; 336 | gap: 8px; 337 | margin-bottom: 12px; 338 | font-weight: 600; 339 | } 340 | 341 | .pricing-icon { 342 | font-size: 16px; 343 | } 344 | 345 | .pricing-title { 346 | font-size: 14px; 347 | } 348 | 349 | .pricing-rates { 350 | display: flex; 351 | align-items: center; 352 | gap: 8px; 353 | font-size: 12px; 354 | color: var(--text-muted); 355 | margin-bottom: 12px; 356 | padding-bottom: 12px; 357 | border-bottom: 1px solid var(--border); 358 | flex-wrap: wrap; 359 | } 360 | 361 | .rate-label { 362 | font-weight: 500; 363 | } 364 | 365 | .pricing-key { 366 | font-family: 'JetBrains Mono', monospace; 367 | background: var(--bg); 368 | padding: 2px 6px; 369 | border-radius: 3px; 370 | border: 1px solid var(--border); 371 | color: var(--text); 372 | font-size: 11px; 373 | } 374 | 375 | .rate-value { 376 | font-family: 'JetBrains Mono', monospace; 377 | } 378 | 379 | .rate-separator { 380 | color: var(--border); 381 | } 382 | 383 | .cost-breakdown { 384 | display: flex; 385 | flex-direction: column; 386 | gap: 6px; 387 | } 388 | 389 | .cost-row { 390 | display: grid; 391 | grid-template-columns: 120px 1fr auto; 392 | gap: 8px; 393 | align-items: center; 394 | font-size: 13px; 395 | } 396 | 397 | .cost-row.cached { 398 | color: var(--text-muted); 399 | } 400 | 401 | .cost-row.total { 402 | margin-top: 8px; 403 | padding-top: 8px; 404 | border-top: 1px solid var(--border); 405 | font-weight: 600; 406 | } 407 | 408 | .cost-label { 409 | color: var(--text-muted); 410 | } 411 | 412 | .cost-row.total .cost-label { 413 | color: var(--text); 414 | } 415 | 416 | .cost-tokens { 417 | font-family: 'JetBrains Mono', monospace; 418 | text-align: right; 
419 | } 420 | 421 | .cost-value { 422 | font-family: 'JetBrains Mono', monospace; 423 | font-weight: 500; 424 | text-align: right; 425 | min-width: 80px; 426 | } 427 | 428 | .cost-row.total .cost-value { 429 | color: var(--success); 430 | font-size: 15px; 431 | } 432 | `; 433 | } 434 | 435 | export function generateMultiTestHtml(data: MultiTestResultData) { 436 | const metadata = data.metadata; 437 | const totalTests = data.tests.length; 438 | const passedTests = data.tests.filter((t) => t.verification?.passed).length; 439 | const failedTests = data.tests.filter( 440 | (t) => t.verification && !t.verification.passed, 441 | ).length; 442 | const skippedTests = data.tests.filter((t) => !t.verification).length; 443 | 444 | const totalTokens = data.tests.reduce( 445 | (sum, test) => 446 | sum + test.steps.reduce((s, step) => s + step.usage.totalTokens, 0), 447 | 0, 448 | ); 449 | 450 | const mcpBadge = metadata.mcpEnabled 451 | ? metadata.mcpTransportType === "StdIO" 452 | ? `MCP ✓ (StdIO: ${escapeHtml(metadata.mcpServerUrl || "")})` 453 | : `MCP ✓ (${escapeHtml(metadata.mcpServerUrl || "")})` 454 | : `MCP ✗`; 455 | 456 | const mcpNotice = !metadata.mcpEnabled 457 | ? ` 458 |
    459 | ℹ️ 460 | MCP integration was not used in this benchmark. The agent ran with built-in tools only. 461 |
    ` 462 | : ""; 463 | 464 | const costDisplay = metadata.totalCost 465 | ? `${formatCost(metadata.totalCost.totalCost)}` 466 | : ""; 467 | 468 | const overallStatus = 469 | failedTests === 0 && skippedTests === 0 470 | ? "all-passed" 471 | : failedTests > 0 472 | ? "has-failures" 473 | : "has-skipped"; 474 | 475 | const testsHtml = data.tests 476 | .map((test, index) => renderTestSection(test, index)) 477 | .join("\n"); 478 | 479 | const pricingHtml = renderPricingSection(data); 480 | 481 | const styles = 482 | getReportStyles() + 483 | getPricingStyles() + 484 | ` 485 | .cost-badge { 486 | background: var(--success); 487 | color: white; 488 | font-size: 11px; 489 | padding: 2px 6px; 490 | border-radius: 3px; 491 | font-weight: 500; 492 | font-family: 'JetBrains Mono', monospace; 493 | } 494 | `; 495 | 496 | return ` 497 | 498 | 499 | 500 | 501 | SvelteBench 2.0 - Multi-Test Report 502 | 503 | 504 | 505 |
    506 |
    507 |
    508 |

    SvelteBench 2.0 ${mcpBadge} ${costDisplay}

    509 |
    ${escapeHtml(metadata.model)} · ${totalTests} tests · ${totalTokens.toLocaleString()} tokens · ${formatTimestamp(metadata.timestamp)}
    510 |
    511 | 512 |
    513 |
    514 |
    ✓ ${passedTests} passed
    515 |
    ✗ ${failedTests} failed
    516 | ${skippedTests > 0 ? `
    ⊘ ${skippedTests} skipped
    ` : ""} 517 |
    518 |
    519 | 520 | ${mcpNotice} 521 | 522 | ${pricingHtml} 523 | 524 | ${testsHtml} 525 | 526 | 557 | 558 | `; 559 | } 560 | -------------------------------------------------------------------------------- /results/result-2025-12-07-16-48-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps": [ 3 | { 4 | "content": [ 5 | { 6 | "type": "reasoning", 7 | "text": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 8 | "providerMetadata": { 9 | "openrouter": { 10 | "reasoning_details": [ 11 | { 12 | "type": "reasoning.summary", 13 | "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. 
\n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 14 | "format": "xai-responses-v1", 15 | "index": 0 16 | } 17 | ] 18 | } 19 | } 20 | }, 21 | { 22 | "type": "reasoning", 23 | "text": "[REDACTED]", 24 | "providerMetadata": { 25 | "openrouter": { 26 | "reasoning_details": [ 27 | { 28 | "type": "reasoning.encrypted", 29 | "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6d
s48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id", 30 | "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8", 31 | "format": "xai-responses-v1", 32 | "index": 0 33 | } 34 | ] 35 | } 36 | } 37 | }, 38 | { 39 | "type": "tool-call", 40 | "toolCallId": "call_22619064", 41 | "toolName": "ResultWrite", 42 | "input": { 43 | "content": "\n\n\n{count}\n" 44 | }, 45 | "providerMetadata": { 46 | "openrouter": { 47 | "reasoning_details": [ 48 | { 49 | "type": "reasoning.summary", 50 | "summary": "The user asked: \"Can you build a counter component in svelte? 
Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 51 | "format": "xai-responses-v1", 52 | "index": 0 53 | }, 54 | { 55 | "type": "reasoning.encrypted", 56 | "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyI
PANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id", 57 | "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8", 58 | "format": "xai-responses-v1", 59 | "index": 0 60 | } 61 | ] 62 | } 63 | } 64 | }, 65 | { 66 | "type": "tool-result", 67 | "toolCallId": "call_22619064", 68 | "toolName": "ResultWrite", 69 | "input": { 70 | "content": "\n\n\n{count}\n" 71 | }, 72 | "output": { 73 | "success": true 74 | }, 75 | "dynamic": false 76 | } 77 | ], 78 | "finishReason": "tool-calls", 79 | "usage": { 80 | "inputTokens": 431, 81 | "outputTokens": 540, 82 | "totalTokens": 971, 83 | "reasoningTokens": 463, 84 | 
"cachedInputTokens": 320 85 | }, 86 | "warnings": [], 87 | "request": { 88 | "body": { 89 | "model": "x-ai/grok-code-fast-1", 90 | "messages": [ 91 | { 92 | "role": "user", 93 | "content": "Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done." 94 | } 95 | ], 96 | "tools": [ 97 | { 98 | "type": "function", 99 | "function": { 100 | "name": "ResultWrite", 101 | "description": "Write content to a result file", 102 | "parameters": { 103 | "$schema": "http://json-schema.org/draft-07/schema#", 104 | "type": "object", 105 | "properties": { 106 | "content": { 107 | "description": "The content to write to the result file", 108 | "type": "string" 109 | } 110 | }, 111 | "required": [ 112 | "content" 113 | ], 114 | "additionalProperties": false 115 | } 116 | } 117 | } 118 | ], 119 | "tool_choice": "auto" 120 | } 121 | }, 122 | "response": { 123 | "id": "gen-1765122501-2azq1uAhq0dB8bfw8LqK", 124 | "timestamp": "2025-12-07T15:48:32.009Z", 125 | "modelId": "x-ai/grok-code-fast-1", 126 | "headers": { 127 | "access-control-allow-origin": "*", 128 | "cf-ray": "9aa52ab04e8ef9aa-ARN", 129 | "connection": "keep-alive", 130 | "content-encoding": "gzip", 131 | "content-type": "application/json", 132 | "date": "Sun, 07 Dec 2025 15:48:21 GMT", 133 | "permissions-policy": "payment=(self \"https://checkout.stripe.com\" \"https://connect-js.stripe.com\" \"https://js.stripe.com\" \"https://*.js.stripe.com\" \"https://hooks.stripe.com\")", 134 | "referrer-policy": "no-referrer, strict-origin-when-cross-origin", 135 | "server": "cloudflare", 136 | "transfer-encoding": "chunked", 137 | "vary": "Accept-Encoding", 138 | "x-content-type-options": "nosniff" 139 | }, 140 | "messages": [ 141 | { 142 | "role": "assistant", 143 | "content": [ 144 | { 145 | "type": "reasoning", 146 | "text": "The user asked: \"Can you build a counter component in svelte? 
Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 147 | "providerOptions": { 148 | "openrouter": { 149 | "reasoning_details": [ 150 | { 151 | "type": "reasoning.summary", 152 | "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 153 | "format": "xai-responses-v1", 154 | "index": 0 155 | } 156 | ] 157 | } 158 | } 159 | }, 160 | { 161 | "type": "reasoning", 162 | "text": "[REDACTED]", 163 | "providerOptions": { 164 | "openrouter": { 165 | "reasoning_details": [ 166 | { 167 | "type": "reasoning.encrypted", 168 | "data": 
"f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6Bn
L5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id", 169 | "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8", 170 | "format": "xai-responses-v1", 171 | "index": 0 172 | } 173 | ] 174 | } 175 | } 176 | }, 177 | { 178 | "type": "tool-call", 179 | "toolCallId": "call_22619064", 180 | "toolName": "ResultWrite", 181 | "input": { 182 | "content": "\n\n\n{count}\n" 183 | }, 184 | "providerOptions": { 185 | "openrouter": { 186 | "reasoning_details": [ 187 | { 188 | "type": "reasoning.summary", 189 | "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. 
\n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 190 | "format": "xai-responses-v1", 191 | "index": 0 192 | }, 193 | { 194 | "type": "reasoning.encrypted", 195 | "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX
6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id", 196 | "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8", 197 | "format": "xai-responses-v1", 198 | "index": 0 199 | } 200 | ] 201 | } 202 | } 203 | } 204 | ] 205 | }, 206 | { 207 | "role": "tool", 208 | "content": [ 209 | { 210 | "type": "tool-result", 211 | "toolCallId": "call_22619064", 212 | "toolName": "ResultWrite", 213 | "output": { 214 | "type": "json", 215 | "value": { 216 | "success": true 217 | } 218 | } 219 | } 220 | ] 221 | } 222 | ] 223 | }, 224 | "providerMetadata": { 225 | "openrouter": { 226 | "provider": "xAI", 227 | "reasoning_details": [ 228 | { 229 | "type": "reasoning.summary", 230 | "summary": "The user asked: \"Can you build a counter component in svelte? 
Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request \n- The user asked for a counter component to be built in Svelte. \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.", 231 | "format": "xai-responses-v1", 232 | "index": 0 233 | }, 234 | { 235 | "type": "reasoning.encrypted", 236 | "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4
g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id", 237 | "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8", 238 | "format": "xai-responses-v1", 239 | "index": 0 240 | } 241 | ], 242 | "usage": { 243 | "promptTokens": 431, 244 | "promptTokensDetails": { 245 | "cachedTokens": 320 246 | }, 247 | "completionTokens": 540, 248 | "completionTokensDetails": { 249 | "reasoningTokens": 463 250 | }, 251 | "totalTokens": 971, 252 | "cost": 0.000830214 253 | } 254 | } 255 | } 256 | } 257 | ], 258 | "resultWriteContent": "\n\n\n{count}\n", 259 | "metadata": { 260 | "mcpEnabled": false, 261 | 
"mcpServerUrl": null, 262 | "timestamp": "2025-12-07T15:48:32.016Z", 263 | "model": "openrouter/x-ai/grok-code-fast-1" 264 | } 265 | } --------------------------------------------------------------------------------