├── .prettierrc
├── results
    ├── .gitkeep
    └── result-2025-12-07-16-48-32.json
├── .bunversion
├── .cocominify
├── GEMINI.md
├── .env.example
├── .prettierignore
├── .cocoignore
├── vitest-setup.js
├── svelte.config.js
├── lib
    ├── tools
    │   ├── index.ts
    │   ├── result-write.ts
    │   └── test-component.ts
    ├── test-discovery.ts
    ├── utils.ts
    ├── report.ts
    ├── pricing.ts
    ├── utils.test.ts
    ├── output-test-runner.ts
    ├── verify-references.ts
    ├── pricing.test.ts
    ├── report-styles.ts
    └── report-template.ts
├── CLAUDE.md
├── verify-references.ts
├── scripts
    └── install.sh
├── vitest.config.ts
├── tests
    ├── snippets
    │   ├── Reference.svelte
    │   ├── prompt.md
    │   └── test.ts
    ├── counter
    │   ├── Reference.svelte
    │   ├── prompt.md
    │   └── test.ts
    └── derived-by
    │   ├── Reference.svelte
    │   ├── prompt.md
    │   └── test.ts
├── .gitignore
├── .github
    └── workflows
    │   └── test.yml
├── tsconfig.json
├── eslint.config.js
├── generate-report.ts
├── patches
    └── @ai-sdk+mcp+0.0.11.patch
├── package.json
├── README.md
├── AGENTS.md
└── index.ts


/.prettierrc:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.bunversion:
--------------------------------------------------------------------------------
1 | bun-v1.3.4


--------------------------------------------------------------------------------
/.cocominify:
--------------------------------------------------------------------------------
1 | tests/
2 | data/


--------------------------------------------------------------------------------
/GEMINI.md:
--------------------------------------------------------------------------------
1 | @AGENTS.md
2 | 


--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | VERCEL_OIDC_TOKEN="create with vercel cli"


--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | # Ignore artifacts:
2 | tests
3 | results


--------------------------------------------------------------------------------
/.cocoignore:
--------------------------------------------------------------------------------
1 | .env.example
2 | results/
3 | .claude
4 | patches


--------------------------------------------------------------------------------
/vitest-setup.js:
--------------------------------------------------------------------------------
1 | import "@testing-library/jest-dom/vitest";
2 | 


--------------------------------------------------------------------------------
/svelte.config.js:
--------------------------------------------------------------------------------
1 | export default {
2 |   compilerOptions: {
3 |     runes: true,
4 |   },
5 | };
6 | 


--------------------------------------------------------------------------------
/lib/tools/index.ts:
--------------------------------------------------------------------------------
1 | export { resultWriteTool } from "./result-write.ts";
2 | export { testComponentTool } from "./test-component.ts";
3 | 


--------------------------------------------------------------------------------
/CLAUDE.md:
--------------------------------------------------------------------------------
1 | # CLAUDE.md
2 | 
3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4 | 
5 | @AGENTS.md
6 | 


--------------------------------------------------------------------------------
/verify-references.ts:
--------------------------------------------------------------------------------
1 | import { verifyAllReferences } from "./lib/verify-references.ts";
2 | 
3 | // CLI entry point: print a banner, then run the reference verification.
4 | console.log("=== Reference Implementation Verification ===\n");
5 | 
6 | // The value resolved by verifyAllReferences() becomes the process exit status.
7 | process.exit(await verifyAllReferences());
8 | 


--------------------------------------------------------------------------------
/scripts/install.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Read the Bun version from .bunversion file
 4 | if [ -f ".bunversion" ]; then
 5 |   BUN_VERSION=$(cat .bunversion | tr -d '[:space:]')
 6 |   echo "Installing Bun version: $BUN_VERSION"
 7 |   curl -fsSL https://bun.com/install | bash -s "$BUN_VERSION"
 8 | else
 9 |   echo "Error: .bunversion file not found"
10 |   exit 1
11 | fi
12 | 


--------------------------------------------------------------------------------
/vitest.config.ts:
--------------------------------------------------------------------------------
 1 | import { defineConfig } from "vitest/config";
 2 | import { svelte } from "@sveltejs/vite-plugin-svelte";
 3 | import { svelteTesting } from "@testing-library/svelte/vite";
 4 | 
 5 | export default defineConfig({
 6 |   plugins: [svelte(), svelteTesting()],
 7 |   test: {
 8 |     environment: "jsdom",
 9 |     setupFiles: ["./vitest-setup.js"],
10 |     include: ["**/*.{test,spec}.{js,ts,jsx,tsx}", "**/test.ts"],
11 |   },
12 | });
13 | 


--------------------------------------------------------------------------------
/lib/tools/result-write.ts:
--------------------------------------------------------------------------------
 1 | import { tool } from "ai";
 2 | import { z } from "zod";
 3 | 
 4 | /** Input accepted by the ResultWrite tool: the finished component source. */
 5 | const resultWriteInput = z.object({
 6 |   content: z.string().describe("The complete Svelte component code"),
 7 | });
 8 | 
 9 | /**
10 |  * Tool the model calls to hand in its final Svelte component.
11 |  *
12 |  * The handler only logs how many lines were received and acknowledges the
13 |  * call; it does not store or inspect the submitted code itself.
14 |  */
15 | export const resultWriteTool = tool({
16 |   description:
17 |     "Write your final Svelte component code. Call this when you have completed implementing the component and are ready to submit.",
18 |   inputSchema: resultWriteInput,
19 |   execute: async ({ content }) => {
20 |     const lineCount = content.split("\n").length;
21 |     console.log(`[ResultWrite] Received ${lineCount} lines of code`);
22 |     return { success: true };
23 |   },
24 | });
25 | 


--------------------------------------------------------------------------------
/tests/snippets/Reference.svelte:
--------------------------------------------------------------------------------
 1 | <svelte:options runes={true} />
 2 | 
 3 | <script>
 4 |   const bookTitles = ["The Lord of the Rings", "To Kill a Mockingbird", "1984"];
 5 | </script>
 6 | 
 7 | {#snippet title(bookTitle)}
 8 |   <span data-testid="book-title">{bookTitle}</span>
 9 | {/snippet}
10 | 
11 | <ul>
12 |   {#each bookTitles as bookTitle}
13 |     <li data-testid="book-item">
14 |       {@render title(bookTitle)}
15 |     </li>
16 |   {/each}
17 | </ul>
18 | 
19 | <style>
20 |   ul {
21 |     list-style-type: none;
22 |     padding: 0;
23 |   }
24 | 
25 |   li {
26 |     margin: 0.5rem 0;
27 |     padding: 0.5rem;
28 |     border-bottom: 1px solid #eee;
29 |   }
30 | 
31 |   span {
32 |     font-weight: bold;
33 |   }
34 | </style>
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # dependencies (bun install)
 2 | node_modules
 3 | 
 4 | # output
 5 | out
 6 | dist
 7 | *.tgz
 8 | 
 9 | # test outputs directory
10 | outputs
11 | 
12 | # code coverage
13 | coverage
14 | *.lcov
15 | 
16 | # logs
17 | logs
18 | *.log
19 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
20 | 
21 | # dotenv environment variable files
22 | .env
23 | .env.development.local
24 | .env.test.local
25 | .env.production.local
26 | .env.local
27 | 
28 | # caches
29 | .eslintcache
30 | .cache
31 | *.tsbuildinfo
32 | 
33 | # IntelliJ based IDEs
34 | .idea
35 | 
36 | # Finder (MacOS) folder config
37 | .DS_Store
38 | 
39 | # results directory (ignore HTML files but keep JSON and .gitkeep)
40 | results/*
41 | !results/.gitkeep
42 | !results/*.json
43 | .vercel
44 | .env*.local
45 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 |   pull_request:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | jobs:
10 |   test:
11 |     name: Run Tests
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 | 
16 |       - name: Get Bun version
17 |         id: bun-version
18 |         run: echo "version=$(cat .bunversion | sed 's/bun-v//')" >> $GITHUB_OUTPUT
19 | 
20 |       - uses: oven-sh/setup-bun@v2
21 |         with:
22 |           bun-version: ${{ steps.bun-version.outputs.version }}
23 | 
24 |       - name: Install dependencies
25 |         run: bun install
26 | 
27 |       - name: Run ESLint
28 |         run: bun run lint
29 | 
30 |       - name: TypeScript type check
31 |         run: bun run tsc
32 | 
33 |       - name: Run tests
34 |         run: bun test
35 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     // Environment setup & latest features
 4 |     "lib": ["ESNext"],
 5 |     "target": "ESNext",
 6 |     "module": "Preserve",
 7 |     "moduleDetection": "force",
 8 |     "jsx": "react-jsx",
 9 |     "allowJs": true,
10 | 
11 |     // Bundler mode
12 |     "moduleResolution": "bundler",
13 |     "allowImportingTsExtensions": true,
14 |     "verbatimModuleSyntax": true,
15 |     "noEmit": true,
16 | 
17 |     // Best practices
18 |     "strict": true,
19 |     "skipLibCheck": true,
20 |     "noFallthroughCasesInSwitch": true,
21 |     "noUncheckedIndexedAccess": true,
22 |     "noImplicitOverride": true,
23 | 
24 |     // Some stricter flags (disabled by default)
25 |     "noUnusedLocals": false,
26 |     "noUnusedParameters": false,
27 |     "noPropertyAccessFromIndexSignature": false
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/eslint.config.js:
--------------------------------------------------------------------------------
 1 | import tseslint from "@typescript-eslint/eslint-plugin";
 2 | import parser from "@typescript-eslint/parser";
 3 | 
 4 | export default [
 5 |   {
 6 |     files: ["**/*.ts"],
 7 |     languageOptions: {
 8 |       parser,
 9 |       parserOptions: {
10 |         project: "./tsconfig.json",
11 |       },
12 |     },
13 |     plugins: {
14 |       "@typescript-eslint": tseslint,
15 |     },
16 |     rules: {
17 |       "@typescript-eslint/naming-convention": [
18 |         "error",
19 |         {
20 |           selector: "variable",
21 |           format: ["camelCase"],
22 |           leadingUnderscore: "allow",
23 |           trailingUnderscore: "allow",
24 |         },
25 |         {
26 |           selector: "variable",
27 |           modifiers: ["const"],
28 |           format: ["camelCase", "UPPER_CASE"],
29 |           leadingUnderscore: "allow",
30 |           trailingUnderscore: "allow",
31 |         },
32 |       ],
33 |     },
34 |   },
35 | ];
36 | 


--------------------------------------------------------------------------------
/tests/snippets/prompt.md:
--------------------------------------------------------------------------------
 1 | # Snippet Component Task
 2 | 
 3 | Create a simple Svelte 5 component that demonstrates the basic use of snippets.
 4 | 
 5 | ## Requirements:
 6 | 
 7 | 1. Create a component with a hardcoded array of 3 book titles (strings) - "The Lord of the Rings", "To Kill a Mockingbird", and "1984"
 8 | 2. Create a snippet called `title` that takes a book title string as a parameter
 9 | 3. The snippet should display the book title in a `<span>` element with `data-testid="book-title"`
10 | 4. Use the `{@render ...}` syntax to render the snippet for each book title in a list
11 | 5. Each rendered title should be wrapped in a `<li>` element with `data-testid="book-item"`
12 | 
13 | ## Example HTML structure:
14 | 
15 | ```html
16 | <ul>
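17 |   <li data-testid="book-item"><span data-testid="book-title">The Lord of the Rings</span></li>
18 |   <li data-testid="book-item"><span data-testid="book-title">To Kill a Mockingbird</span></li>
19 |   <li data-testid="book-item"><span data-testid="book-title">1984</span></li>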
20 | </ul>
21 | ```
22 | 
23 | Please implement this component using Svelte 5 runes.
24 | 


--------------------------------------------------------------------------------
/tests/counter/Reference.svelte:
--------------------------------------------------------------------------------
 1 | <svelte:options runes={true} />
 2 | 
 3 | <script>
 4 |   let count = $state(0);
 5 | 
 6 |   function increment() {
 7 |     count++;
 8 |   }
 9 | 
10 |   function decrement() {
11 |     count--;
12 |   }
13 | </script>
14 | 
15 | <div class="counter">
16 |   <button data-testid="decrement-button" onclick={decrement}>-</button>
17 |   <span data-testid="count-value">{count}</span>
18 |   <button data-testid="increment-button" onclick={increment}>+</button>
19 | </div>
20 | 
21 | <style>
22 |   .counter {
23 |     display: flex;
24 |     align-items: center;
25 |     gap: 1rem;
26 |     padding: 1rem;
27 |   }
28 | 
29 |   button {
30 |     background-color: #e2e8f0;
31 |     border: none;
32 |     border-radius: 0.25rem;
33 |     padding: 0.5rem 1rem;
34 |     font-size: 1.25rem;
35 |     cursor: pointer;
36 |   }
37 | 
38 |   button:hover {
39 |     background-color: #cbd5e0;
40 |   }
41 | 
42 |   span {
43 |     font-size: 1.5rem;
44 |     font-weight: bold;
45 |     min-width: 2rem;
46 |     text-align: center;
47 |   }
48 | </style>
49 | 


--------------------------------------------------------------------------------
/tests/counter/prompt.md:
--------------------------------------------------------------------------------
 1 | # Counter Component Task
 2 | 
 3 | Create a Svelte 5 component called Counter that implements a simple counter with increment and decrement functionality.
 4 | 
 5 | ## Requirements:
 6 | 
 7 | 1. Use Svelte 5's `$state` for reactivity
 8 | 2. The counter should start at 0
 9 | 3. Include a decrement button with the `data-testid="decrement-button"` attribute
10 | 4. Include an increment button with the `data-testid="increment-button"` attribute
11 | 5. Display the current count with the `data-testid="count-value"` attribute
12 | 6. Clicking increment should increase the count by 1
13 | 7. Clicking decrement should decrease the count by 1
14 | 8. Style the counter with a CSS class "counter"
15 | 
16 | Example structure:
17 | 
18 | ```html
19 | <div class="counter">
20 |   <button data-testid="decrement-button">-</button>
21 |   <span data-testid="count-value">0</span>
22 |   <button data-testid="increment-button">+</button>
23 | </div>
24 | ```
25 | 
26 | Please implement this component using Svelte 5 syntax. Make sure you only return one component.
27 | 


--------------------------------------------------------------------------------
/generate-report.ts:
--------------------------------------------------------------------------------
 1 | import { generateReport } from "./lib/report.ts";
 2 | import { readdirSync } from "node:fs";
 3 | 
 4 | /**
 5 |  * Lists every `result-*.json` file in the `results/` directory.
 6 |  *
 7 |  * @returns Paths of the form `results/<file>`, ordered by file name in
 8 |  *   descending order.
 9 |  * @throws Error when the directory contains no matching file.
10 |  */
11 | function getAllResultFiles(): string[] {
12 |   const resultsDir = "results";
13 | 
14 |   const matches: string[] = [];
15 |   for (const name of readdirSync(resultsDir)) {
16 |     if (name.startsWith("result-") && name.endsWith(".json")) {
17 |       matches.push(name);
18 |     }
19 |   }
20 | 
21 |   if (matches.length === 0) {
22 |     throw new Error("No result files found in results/ directory");
23 |   }
24 | 
25 |   // Reverse name order; presumably puts the newest timestamped result first.
26 |   matches.sort((left, right) => right.localeCompare(left));
27 | 
28 |   return matches.map((name) => `${resultsDir}/${name}`);
29 | }
30 | 
31 | const resultFiles = getAllResultFiles();
32 | const total = resultFiles.length;
33 | 
34 | console.log(`Found ${total} result file(s) to process\n`);
35 | 
36 | // Each report is written next to its source, with .json swapped for .html.
37 | // Reports are generated one at a time, in the order returned above.
38 | for (const jsonPath of resultFiles) {
39 |   const htmlPath = jsonPath.replace(/\.json$/, ".html");
40 |   console.log(`Generating report: ${jsonPath} -> ${htmlPath}`);
41 |   await generateReport(jsonPath, htmlPath, false);
42 | }
43 | 
44 | console.log(`\nSuccessfully generated ${total} HTML report(s)`);
45 | 


--------------------------------------------------------------------------------
/patches/@ai-sdk+mcp+0.0.11.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/node_modules/@ai-sdk/mcp/dist/index.mjs b/node_modules/@ai-sdk/mcp/dist/index.mjs
 2 | index 0000000..0000000 100644
 3 | --- a/node_modules/@ai-sdk/mcp/dist/index.mjs
 4 | +++ b/node_modules/@ai-sdk/mcp/dist/index.mjs
 5 | @@ -1350,7 +1350,7 @@ var HttpMCPTransport = class {
 6 |                  const { done, value } = await reader.read();
 7 |                  if (done) return;
 8 |                  const { event, data } = value;
 9 | -                if (event === "message") {
10 | +                if (!event || event === "message") {
11 |                    try {
12 |                      const msg = JSONRPCMessageSchema.parse(JSON.parse(data));
13 |                      (_a4 = this.onmessage) == null ? void 0 : _a4.call(this, msg);
14 | @@ -1475,7 +1475,7 @@ var HttpMCPTransport = class {
15 |              if (id) {
16 |                this.lastInboundEventId = id;
17 |              }
18 | -            if (event === "message") {
19 | +            if (!event || event === "message") {
20 |                try {
21 |                  const msg = JSONRPCMessageSchema.parse(JSON.parse(data));
22 |                  (_a4 = this.onmessage) == null ? void 0 : _a4.call(this, msg);
23 | 


--------------------------------------------------------------------------------
/tests/derived-by/Reference.svelte:
--------------------------------------------------------------------------------
 1 | <svelte:options runes={true} />
 2 | 
 3 | <script>
 4 |   let text = $state("");
 5 | 
 6 |   let textStats = $derived.by(() => {
 7 |     const wordCount = text.trim() === "" ? 0 : text.trim().split(/\s+/).length;
 8 |     const charCount = text.length;
 9 |     const isLongText = wordCount > 15;
10 | 
11 |     return {
12 |       wordCount,
13 |       charCount,
14 |       isLongText,
15 |     };
16 |   });
17 | 
18 |   function clearText() {
19 |     text = "";
20 |   }
21 | </script>
22 | 
23 | <div>
24 |   <input data-testid="text-input" type="text" bind:value={text} placeholder="Type some text..." />
25 |   <button data-testid="clear-button" onclick={clearText}> Clear </button>
26 | 
27 |   <div>
28 |     <p data-testid="word-count">Words: {textStats.wordCount}</p>
29 |     <p data-testid="char-count">Characters: {textStats.charCount}</p>
30 |     <p data-testid="length-indicator">
31 |       Status: {textStats.isLongText ? "Long text" : "Short text"}
32 |     </p>
33 |   </div>
34 | </div>
35 | 
36 | <style>
37 |   div {
38 |     margin: 1rem 0;
39 |   }
40 | 
41 |   input {
42 |     padding: 0.5rem;
43 |     width: 300px;
44 |   }
45 | 
46 |   button {
47 |     margin-left: 0.5rem;
48 |     padding: 0.5rem 1rem;
49 |   }
50 | </style>
51 | 


--------------------------------------------------------------------------------
/tests/snippets/test.ts:
--------------------------------------------------------------------------------
 1 | import { render, screen } from "@testing-library/svelte";
 2 | import { expect, test, describe } from "vitest";
 3 | import BookList from "./Component.svelte";
 4 | 
 5 | describe("Simple Snippet component", () => {
 6 |   test("renders the correct number of book titles", () => {
 7 |     render(BookList);
 8 | 
 9 |     const bookItems = screen.getAllByTestId("book-item");
10 |     expect(bookItems.length).toBe(3);
11 | 
12 |     const bookTitles = screen.getAllByTestId("book-title");
13 |     expect(bookTitles.length).toBe(3);
14 |   });
15 | 
16 |   test("displays correct book titles", () => {
17 |     render(BookList);
18 | 
19 |     const bookTitles = screen.getAllByTestId("book-title");
20 | 
21 |     expect(bookTitles[0]).toHaveTextContent("The Lord of the Rings");
22 |     expect(bookTitles[1]).toHaveTextContent("To Kill a Mockingbird");
23 |     expect(bookTitles[2]).toHaveTextContent("1984");
24 |   });
25 | 
26 |   test("has the correct structure for each book item", () => {
27 |     render(BookList);
28 | 
29 |     const bookItems = screen.getAllByTestId("book-item");
30 | 
31 |     bookItems.forEach((item) => {
32 |       expect(item.tagName).toBe("LI");
33 | 
34 |       const title = item.querySelector('[data-testid="book-title"]');
35 |       expect(title).toBeInTheDocument();
36 |       expect(title?.tagName).toBe("SPAN");
37 |     });
38 |   });
39 | });
40 | 


--------------------------------------------------------------------------------
/tests/derived-by/prompt.md:
--------------------------------------------------------------------------------
 1 | # $derived.by Component Task
 2 | 
 3 | Create a Svelte 5 component that demonstrates the `$derived.by` rune for complex derivations.
 4 | 
 5 | ## Requirements:
 6 | 
 7 | 1. Create a text input field that allows the user to enter text
 8 | 2. Use `$state` to store the current text value, starting with an empty string
 9 | 3. Use `$derived.by` to calculate:
10 |    - The number of words in the text
11 |    - The number of characters in the text
12 |    - Whether the text is considered "long" (more than 15 words)
13 | 4. Display all these derived values below the input field
14 | 5. Include a "Clear" button that resets the text to an empty string
15 | 
16 | Elements should have these data-testid attributes:
17 | 
18 | - "text-input" for the input field
19 | - "word-count" for displaying the word count
20 | - "char-count" for displaying the character count
21 | - "length-indicator" for displaying whether the text is long
22 | - "clear-button" for the clear button
23 | 
24 | Example structure:
25 | 
26 | ```html
27 | <div>
28 |   <input data-testid="text-input" type="text" />
29 |   <button data-testid="clear-button">Clear</button>
30 |   <div>
31 |     <p data-testid="word-count">Words: 0</p>
32 |     <p data-testid="char-count">Characters: 0</p>
33 |     <p data-testid="length-indicator">Status: Short text</p>
34 |   </div>
35 | </div>
36 | ```
37 | 
38 | Please implement this component using Svelte 5 runes.
39 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ai-sdk-bench",
 3 |   "module": "index.ts",
 4 |   "type": "module",
 5 |   "private": true,
 6 |   "scripts": {
 7 |     "start": "bun run vercel:env:pull && bun ./index.ts",
 8 |     "postinstall": "patch-package",
 9 |     "update": "bun update --interactive",
10 |     "verify-tests": "bun run verify-references.ts",
11 |     "prettier": "prettier --write .",
12 |     "lint": "eslint .",
13 |     "lint:fix": "eslint . --fix",
14 |     "tsc": "tsc --noEmit",
15 |     "vercel:link": "vercel link",
16 |     "vercel:env:pull": "vercel env pull .env.local --yes"
17 |   },
18 |   "devDependencies": {
19 |     "@sveltejs/vite-plugin-svelte": "^5.0.3",
20 |     "@testing-library/jest-dom": "^6.6.3",
21 |     "@types/bun": "^1.3.3",
22 |     "@typescript-eslint/eslint-plugin": "^8.49.0",
23 |     "@typescript-eslint/parser": "^8.49.0",
24 |     "eslint": "^9.39.2",
25 |     "jsdom": "^26.1.0",
26 |     "patch-package": "^8.0.1",
27 |     "prettier": "^3.7.4"
28 |   },
29 |   "peerDependencies": {
30 |     "typescript": "^5.9.3"
31 |   },
32 |   "dependencies": {
33 |     "@ai-sdk/anthropic": "^2.0.53",
34 |     "@ai-sdk/mcp": "0.0.11",
35 |     "@ai-sdk/openai": "^2.0.77",
36 |     "@ai-sdk/openai-compatible": "^1.0.28",
37 |     "@clack/prompts": "^0.11.0",
38 |     "@openrouter/ai-sdk-provider": "^1.4.1",
39 |     "@testing-library/svelte": "^5.2.9",
40 |     "@testing-library/user-event": "^14.6.1",
41 |     "ai": "^5.0.108",
42 |     "vercel": "^49.1.2",
43 |     "vitest": "^4.0.15",
44 |     "zod": "^4.1.13"
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/lib/test-discovery.ts:
--------------------------------------------------------------------------------
 1 | import { readdirSync, statSync, existsSync, readFileSync } from "node:fs";
 2 | import { join } from "node:path";
 3 | 
 4 | /** One benchmark test discovered under `tests/<name>/`. */
 5 | export interface TestDefinition {
 6 |   /** Name of the test's directory. */
 7 |   name: string;
 8 |   /** Path of the test's directory. */
 9 |   directory: string;
10 |   /** Path of the `Reference.svelte` file in the directory. */
11 |   referenceFile: string;
12 |   /** Path of `Component.svelte`; its existence is not checked during discovery. */
13 |   componentFile: string;
14 |   /** Path of the `test.ts` file in the directory. */
15 |   testFile: string;
16 |   /** Path of the `prompt.md` file in the directory. */
17 |   promptFile: string;
18 |   /** UTF-8 contents of `prompt.md`. */
19 |   prompt: string;
20 | }
21 | 
22 | /**
23 |  * Scans `<cwd>/tests` for test directories.
24 |  *
25 |  * A sub-directory counts as a test when it contains `Reference.svelte`,
26 |  * `test.ts` and `prompt.md`; directories missing any of them are skipped
27 |  * with a warning. File-system errors are logged rather than thrown, so the
28 |  * result holds whatever was collected before the failure.
29 |  *
30 |  * @returns The discovered tests, sorted by name.
31 |  */
32 | export function discoverTests(): TestDefinition[] {
33 |   const testsDir = join(process.cwd(), "tests");
34 |   const definitions: TestDefinition[] = [];
35 | 
36 |   try {
37 |     for (const entry of readdirSync(testsDir)) {
38 |       const entryPath = join(testsDir, entry);
39 |       if (!statSync(entryPath).isDirectory()) continue;
40 | 
41 |       const referenceFile = join(entryPath, "Reference.svelte");
42 |       const testFile = join(entryPath, "test.ts");
43 |       const promptFile = join(entryPath, "prompt.md");
44 | 
45 |       // Probe each required file once; the same list drives the skip warning.
46 |       const missing: string[] = [];
47 |       if (!existsSync(referenceFile)) missing.push("Reference.svelte");
48 |       if (!existsSync(testFile)) missing.push("test.ts");
49 |       if (!existsSync(promptFile)) missing.push("prompt.md");
50 | 
51 |       if (missing.length > 0) {
52 |         console.warn(`⚠️  Skipping ${entry}: missing ${missing.join(", ")}`);
53 |         continue;
54 |       }
55 | 
56 |       definitions.push({
57 |         name: entry,
58 |         directory: entryPath,
59 |         referenceFile,
60 |         componentFile: join(entryPath, "Component.svelte"),
61 |         testFile,
62 |         promptFile,
63 |         prompt: readFileSync(promptFile, "utf-8"),
64 |       });
65 |     }
66 |   } catch (error) {
67 |     console.error("Error discovering tests:", error);
68 |   }
69 | 
70 |   definitions.sort((a, b) => a.name.localeCompare(b.name));
71 | 
72 |   return definitions;
73 | }
74 | 
75 | /**
76 |  * Builds the prompt sent to the agent for a test: the test's own prompt
77 |  * followed by an instruction to submit the result via the ResultWrite tool.
78 |  */
79 | export function buildAgentPrompt(test: TestDefinition): string {
80 |   return `${test.prompt}
81 | 
82 | IMPORTANT: When you have finished implementing the component, use the ResultWrite tool to output your final Svelte component code. Only output the component code itself, no explanations or markdown formatting.`;
83 | }
84 | 


--------------------------------------------------------------------------------
/lib/tools/test-component.ts:
--------------------------------------------------------------------------------
 1 | import { tool } from "ai";
 2 | import { z } from "zod";
 3 | import type { TestDefinition } from "../test-discovery.ts";
 4 | import {
 5 |   runTestVerification,
 6 |   cleanupTestEnvironment,
 7 | } from "../output-test-runner.ts";
 8 | 
 9 | /**
10 |  * Creates the TestComponent tool bound to a single test.
11 |  *
12 |  * The tool passes the submitted component to `runTestVerification` and
13 |  * reports pass/fail counts (plus per-test failures) back to the caller.
14 |  * Errors thrown while running the tests are returned as a failure result
15 |  * rather than rethrown.
16 |  *
17 |  * @param test - The test whose suite the component is checked against.
18 |  */
19 | export function testComponentTool(test: TestDefinition) {
20 |   return tool({
21 |     description:
22 |       "Test your Svelte component against the test suite. Use this to verify your implementation and get feedback on any failing tests before submitting with ResultWrite. Returns detailed information about which tests passed or failed.",
23 |     inputSchema: z.object({
24 |       content: z
25 |         .string()
26 |         .describe("The complete Svelte component code to test"),
27 |     }),
28 |     execute: async ({ content }) => {
29 |       const lines = content.split("\n").length;
30 |       console.log(`[TestComponent] Testing ${lines} lines of code...`);
31 | 
32 |       try {
33 |         const result = await runTestVerification(test, content);
34 | 
35 |         if (result.passed) {
36 |           console.log(`[TestComponent] ✓ All ${result.numTests} tests passed`);
37 |           return {
38 |             success: true,
39 |             message: `All ${result.numTests} tests passed!`,
40 |             passed: result.numPassed,
41 |             failed: result.numFailed,
42 |             total: result.numTests,
43 |             duration: result.duration,
44 |           };
45 |         }
46 | 
47 |         console.log(
48 |           `[TestComponent] ✗ ${result.numFailed}/${result.numTests} tests failed`,
49 |         );
50 |         return {
51 |           success: false,
52 |           message: `${result.numFailed} of ${result.numTests} tests failed`,
53 |           passed: result.numPassed,
54 |           failed: result.numFailed,
55 |           total: result.numTests,
56 |           duration: result.duration,
57 |           error: result.error,
58 |           failedTests: result.failedTests?.map((ft) => ({
59 |             name: ft.fullName,
60 |             error: ft.errorMessage,
61 |           })),
62 |         };
63 |       } catch (error) {
64 |         console.log(`[TestComponent] ✗ Error running tests`);
65 |         return {
66 |           success: false,
67 |           message: "Failed to run tests",
68 |           error: error instanceof Error ? error.message : String(error),
69 |         };
70 |       } finally {
71 |         // Single cleanup point: runs once on the pass, fail and error paths.
72 |         cleanupTestEnvironment(test.name);
73 |       }
74 |     },
75 |   });
76 | }
77 | 


--------------------------------------------------------------------------------
/tests/derived-by/test.ts:
--------------------------------------------------------------------------------
 1 | import { render, screen } from "@testing-library/svelte";
 2 | import { expect, test, describe } from "vitest";
 3 | import userEvent from "@testing-library/user-event";
 4 | import TextAnalyzer from "./Component.svelte";
 5 | 
 6 | describe("TextAnalyzer component", () => {
 7 |   test("renders with initial state", () => {
 8 |     render(TextAnalyzer);
 9 | 
10 |     expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 0");
11 |     expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 0");
12 |     expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text");
13 |   });
14 | 
15 |   test("updates counts when text is entered", async () => {
16 |     const user = userEvent.setup();
17 |     render(TextAnalyzer);
18 | 
19 |     const input = screen.getByTestId("text-input");
20 | 
21 |     // Enter a short text
22 |     await user.type(input, "Hello world");
23 | 
24 |     expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 2");
25 |     expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 11");
26 |     expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text");
27 | 
28 |     // Clear and enter a longer text
29 |     await user.clear(input);
30 |     await user.type(
31 |       input,
32 |       "This is a much longer text that should have more than fifteen words so that we can test the long text indicator functionality properly",
33 |     );
34 | 
35 |     expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 24");
36 |     expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 134");
37 |     expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Long text");
38 |   });
39 | 
40 |   test("clear button resets the text", async () => {
41 |     const user = userEvent.setup();
42 |     render(TextAnalyzer);
43 | 
44 |     const input = screen.getByTestId("text-input");
45 |     const clearButton = screen.getByTestId("clear-button");
46 | 
47 |     // Enter some text
48 |     await user.type(input, "Hello world");
49 | 
50 |     // Verify counts
51 |     expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 2");
52 | 
53 |     // Click the clear button
54 |     await user.click(clearButton);
55 | 
56 |     // Verify everything is reset
57 |     expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 0");
58 |     expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 0");
59 |     expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text");
60 |     expect(input).toHaveValue("");
61 |   });
62 | });
63 | 


--------------------------------------------------------------------------------
/tests/counter/test.ts:
--------------------------------------------------------------------------------
 1 | import { render, screen } from "@testing-library/svelte";
 2 | import { expect, test, describe } from "vitest";
 3 | import userEvent from "@testing-library/user-event";
 4 | import Counter from "./Component.svelte"; // Your component that is written using the ResultWrite tool
 5 | 
 6 | describe("Counter component", () => {
 7 |   test("renders with initial count of 0", () => {
 8 |     render(Counter);
 9 | 
10 |     // Use data-testid to get elements
11 |     const countElement = screen.getByTestId("count-value");
12 |     const decrementButton = screen.getByTestId("decrement-button");
13 |     const incrementButton = screen.getByTestId("increment-button");
14 | 
15 |     // Check initial state
16 |     expect(countElement).toHaveTextContent("0");
17 |     expect(decrementButton).toBeInTheDocument();
18 |     expect(incrementButton).toBeInTheDocument();
19 |   });
20 | 
21 |   test("increments the count when + button is clicked", async () => {
22 |     const user = userEvent.setup();
23 |     render(Counter);
24 | 
25 |     const incrementButton = screen.getByTestId("increment-button");
26 |     const countElement = screen.getByTestId("count-value");
27 | 
28 |     // Initial count should be 0
29 |     expect(countElement).toHaveTextContent("0");
30 | 
31 |     // Click the increment button
32 |     await user.click(incrementButton);
33 | 
34 |     // Count should now be 1
35 |     expect(countElement).toHaveTextContent("1");
36 |   });
37 | 
38 |   test("decrements the count when - button is clicked", async () => {
39 |     const user = userEvent.setup();
40 |     render(Counter);
41 | 
42 |     const decrementButton = screen.getByTestId("decrement-button");
43 |     const countElement = screen.getByTestId("count-value");
44 | 
45 |     // Initial count should be 0
46 |     expect(countElement).toHaveTextContent("0");
47 | 
48 |     // Click the decrement button
49 |     await user.click(decrementButton);
50 | 
51 |     // Count should now be -1
52 |     expect(countElement).toHaveTextContent("-1");
53 |   });
54 | 
55 |   test("handles multiple clicks correctly", async () => {
56 |     const user = userEvent.setup();
57 |     render(Counter);
58 | 
59 |     const decrementButton = screen.getByTestId("decrement-button");
60 |     const incrementButton = screen.getByTestId("increment-button");
61 |     const countElement = screen.getByTestId("count-value");
62 | 
63 |     // Increment twice
64 |     await user.click(incrementButton);
65 |     await user.click(incrementButton);
66 |     expect(countElement).toHaveTextContent("2");
67 | 
68 |     // Decrement once
69 |     await user.click(decrementButton);
70 |     expect(countElement).toHaveTextContent("1");
71 |   });
72 | });
73 | 


--------------------------------------------------------------------------------
/lib/utils.ts:
--------------------------------------------------------------------------------
 1 | import { calculateCost, type ModelPricing } from "./pricing.ts";
 2 | import type { SingleTestResult, TotalCostInfo } from "./report.ts";
 3 | 
 4 | export function sanitizeModelName(modelName: string) {
 5 |   return modelName.replace(/[^a-zA-Z0-9.]/g, "-");
 6 | }
 7 | 
 8 | export function getTimestampedFilename(
 9 |   prefix: string,
10 |   extension: string,
11 |   modelName?: string,
12 |   now: Date = new Date(),
13 | ) {
14 |   const year = now.getUTCFullYear();
15 |   const month = String(now.getUTCMonth() + 1).padStart(2, "0");
16 |   const day = String(now.getUTCDate()).padStart(2, "0");
17 |   const hours = String(now.getUTCHours()).padStart(2, "0");
18 |   const minutes = String(now.getUTCMinutes()).padStart(2, "0");
19 |   const seconds = String(now.getUTCSeconds()).padStart(2, "0");
20 | 
21 |   const timestamp = `${year}-${month}-${day}-${hours}-${minutes}-${seconds}`;
22 |   const modelSuffix = modelName ? `-${sanitizeModelName(modelName)}` : "";
23 | 
24 |   return `${prefix}-${timestamp}${modelSuffix}.${extension}`;
25 | }
26 | 
/**
 * Reports whether `str` begins with the `http://` or `https://` scheme.
 * The check is a case-sensitive prefix match; the rest of the string is not validated.
 */
export function isHttpUrl(str: string) {
  return /^https?:\/\//.test(str);
}
30 | 
/**
 * Finds the content passed to the `ResultWrite` tool in a list of agent steps.
 *
 * Steps are scanned in order. The first content block that is a `tool-call`
 * named `ResultWrite` decides the result, even if that call carries no content.
 *
 * The steps are untyped, so anything that does not have the expected shape
 * (a non-object step, a missing or non-array `content`, a null block) is
 * skipped instead of raising a TypeError.
 *
 * @param steps Agent steps; each is expected to carry a `content` array of blocks.
 * @returns The `input.content` of the first `ResultWrite` call, or `null`
 *          when there is no such call or it has no content.
 */
export function extractResultWriteContent(steps: unknown[]): string | null {
  for (const step of steps) {
    if (typeof step !== "object" || step === null) {
      continue;
    }

    const { content } = step as { content?: unknown };
    if (!Array.isArray(content)) {
      continue;
    }

    const blocks = content as Array<{
      type?: string;
      toolName?: string;
      input?: { content?: string };
    } | null>;

    for (const block of blocks) {
      if (block?.type === "tool-call" && block.toolName === "ResultWrite") {
        return block.input?.content ?? null;
      }
    }
  }
  return null;
}
53 | 
/**
 * Sums the token usage of every step of every test and prices the totals.
 *
 * A step whose `cachedInputTokens` is null or undefined counts as zero cached
 * tokens. The summed totals are priced by a single `calculateCost` call.
 *
 * @param tests   Test results whose step usage is aggregated.
 * @param pricing Per-token pricing handed to `calculateCost`.
 * @returns Cost breakdown together with the aggregated token counts.
 */
export function calculateTotalCost(
  tests: SingleTestResult[],
  pricing: ModelPricing,
) {
  const totals = { input: 0, output: 0, cached: 0 };

  const usages = tests.flatMap((test) => test.steps.map((step) => step.usage));
  for (const usage of usages) {
    totals.input += usage.inputTokens;
    totals.output += usage.outputTokens;
    totals.cached += usage.cachedInputTokens ?? 0;
  }

  const { inputCost, outputCost, cacheReadCost, totalCost } = calculateCost(
    pricing,
    totals.input,
    totals.output,
    totals.cached,
  );

  return {
    inputCost,
    outputCost,
    cacheReadCost,
    totalCost,
    inputTokens: totals.input,
    outputTokens: totals.output,
    cachedInputTokens: totals.cached,
  };
}
87 | 


--------------------------------------------------------------------------------
/lib/report.ts:
--------------------------------------------------------------------------------
  1 | import { readFile, writeFile } from "node:fs/promises";
  2 | import type { TestVerificationResult } from "./output-test-runner.ts";
  3 | import { generateMultiTestHtml } from "./report-template.ts";
  4 | 
/** Content block that carries plain text; identified by `type: "text"`. */
interface TextBlock {
  type: "text";
  text: string;
}
  9 | 
/** Content block describing a tool invocation; identified by `type: "tool-call"`. */
interface ToolCallBlock {
  type: "tool-call";
  // ToolResultBlock has a field of the same name — presumably used to pair a call with its result (confirm).
  toolCallId: string;
  toolName: string;
  // Arguments of the call; their shape depends on the tool.
  input: Record<string, unknown>;
  // NOTE(review): meaning of `dynamic` is not visible in this file.
  dynamic?: boolean;
}
 17 | 
/** Content block holding the outcome of a tool invocation; identified by `type: "tool-result"`. */
interface ToolResultBlock {
  type: "tool-result";
  toolCallId: string;
  toolName: string;
  input: Record<string, unknown>;
  output: {
    // Result parts; each has a `type`, optionally `text`, and may carry further keys.
    content: Array<{ type: string; text?: string; [key: string]: unknown }>;
    isError?: boolean;
  };
  // NOTE(review): meaning of `dynamic` is not visible in this file.
  dynamic?: boolean;
}
 29 | 
/** Any block that can appear in a message or step; discriminated by its `type` field. */
type ContentBlock = TextBlock | ToolCallBlock | ToolResultBlock;
 31 | 
/** A single conversation message: who sent it and its content blocks. */
interface Message {
  role: "user" | "assistant";
  content: ContentBlock[];
}
 36 | 
/** Token counts recorded for one step. */
interface Usage {
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
  cachedInputTokens: number;
}
 43 | 
/** Request body stored with a step: model, token limit and the messages sent. */
interface RequestBody {
  model: string;
  max_tokens: number;
  messages: Message[];
}
 49 | 
/** Response data stored with a step; fields beyond the three named ones are allowed. */
interface ResponseBody {
  id: string;
  timestamp: string;
  modelId: string;
  [key: string]: unknown;
}
 56 | 
/**
 * One recorded step of an agent run: the content produced, why it finished,
 * its token usage, and the request/response it belongs to. Extra keys are allowed.
 */
interface Step {
  content: ContentBlock[];
  finishReason: string;
  usage: Usage;
  request: {
    body: RequestBody;
  };
  response: ResponseBody;
  [key: string]: unknown;
}
 67 | 
/**
 * Model pricing as stored in result metadata.
 * Going by the field names the costs are per MTok (presumably one million tokens — confirm).
 */
export interface PricingInfo {
  inputCostPerMTok: number;
  outputCostPerMTok: number;
  cacheReadCostPerMTok?: number;
}
 73 | 
/** Cost breakdown and token totals stored in result metadata. */
export interface TotalCostInfo {
  inputCost: number;
  outputCost: number;
  cacheReadCost: number;
  totalCost: number;
  inputTokens: number;
  outputTokens: number;
  cachedInputTokens: number;
}
 83 | 
/** Run-level metadata of a result file: MCP configuration, timestamp, model and optional pricing/cost data. */
interface Metadata {
  mcpEnabled: boolean;
  mcpServerUrl: string | null;
  mcpTransportType?: string | null;
  timestamp: string;
  model: string;
  pricingKey?: string | null;
  pricing?: PricingInfo | null;
  totalCost?: TotalCostInfo | null;
}
 94 | 
/** Recorded outcome of one test: its name and prompt, the agent steps, the written result and its verification. */
export interface SingleTestResult {
  testName: string;
  prompt: string;
  steps: Step[];
  // Null when no result content was captured.
  resultWriteContent: string | null;
  // Null when no verification result is available (generateReport uses null for legacy data).
  verification: TestVerificationResult | null;
}
102 | 
/** Multi-test result format: all test results plus run metadata. This is the shape passed to `generateMultiTestHtml`. */
export interface MultiTestResultData {
  tests: SingleTestResult[];
  metadata: Metadata;
}
107 | 
/**
 * Older single-run result format without a `tests` array.
 * `generateReport` wraps it into a one-test `MultiTestResultData`.
 */
interface LegacyResultData {
  steps: Step[];
  resultWriteContent?: string | null;
  metadata?: Metadata;
}
113 | 
/**
 * Reads a benchmark result JSON file, renders it to HTML and writes the report to disk.
 *
 * Two input shapes are accepted: the multi-test format (an object with a
 * `tests` array) and the legacy single-run format, which is wrapped into a
 * one-test structure named "Legacy Test".
 *
 * @param resultPath  Path of the JSON result file to read.
 * @param outputPath  Path the HTML report is written to.
 * @param openBrowser When true (default), the written report is opened with the
 *                    platform's default handler. Failing to open it only logs a
 *                    warning, because the report itself has already been written.
 * @throws Re-throws any read, parse, render or write error after logging it.
 */
export async function generateReport(
  resultPath: string,
  outputPath: string,
  openBrowser = true,
) {
  try {
    const jsonContent = await readFile(resultPath, "utf-8");
    const data = JSON.parse(jsonContent);

    // `"tests" in data` would raise an opaque TypeError on null or primitives.
    if (typeof data !== "object" || data === null) {
      throw new Error(
        `Result file ${resultPath} does not contain a JSON object`,
      );
    }

    let reportData: MultiTestResultData;

    if ("tests" in data && Array.isArray(data.tests)) {
      reportData = data as MultiTestResultData;
    } else {
      const legacyData = data as LegacyResultData;
      reportData = {
        tests: [
          {
            testName: "Legacy Test",
            prompt: "Static prompt (legacy format)",
            steps: legacyData.steps,
            resultWriteContent: legacyData.resultWriteContent ?? null,
            verification: null,
          },
        ],
        metadata: legacyData.metadata ?? {
          mcpEnabled: false,
          mcpServerUrl: null,
          timestamp: new Date().toISOString(),
          model: "unknown",
        },
      };
    }

    const html = generateMultiTestHtml(reportData);

    await writeFile(outputPath, html, "utf-8");

    console.log(`✓ Report generated successfully: ${outputPath}`);

    if (openBrowser) {
      // `open` exists only on macOS; choose the opener for the current platform.
      let openCommand: string[];
      if (process.platform === "darwin") {
        openCommand = ["open", outputPath];
      } else if (process.platform === "win32") {
        // `start` is a cmd builtin; the empty string is its window-title argument.
        openCommand = ["cmd", "/c", "start", "", outputPath];
      } else {
        openCommand = ["xdg-open", outputPath];
      }

      try {
        Bun.spawn(openCommand);
      } catch (openError) {
        // Best effort: a missing opener must not fail report generation.
        console.warn(`Could not open report in browser: ${outputPath}`, openError);
      }
    }
  } catch (error) {
    console.error("Error generating report:", error);
    throw error;
  }
}
161 | 


--------------------------------------------------------------------------------
/lib/pricing.ts:
--------------------------------------------------------------------------------
/** Per-token prices of a model. The cache-related prices are optional. */
export interface ModelPricing {
  inputCostPerToken: number;
  outputCostPerToken: number;
  // Price per cached input token; `calculateCost` treats a missing value as 0.
  cacheReadInputTokenCost?: number;
  cacheCreationInputTokenCost?: number;
}
  7 | 
/** Cost breakdown plus the token counts it was computed from; same fields as the object `calculateCost` returns. */
export interface CostCalculation {
  inputCost: number;
  outputCost: number;
  cacheReadCost: number;
  totalCost: number;
  inputTokens: number;
  outputTokens: number;
  cachedInputTokens: number;
}
 17 | 
/** Prices scaled for display; `getModelPricingDisplay` produces these by multiplying per-token prices by 1,000,000. */
export interface ModelPricingDisplay {
  inputCostPerMTok: number;
  outputCostPerMTok: number;
  cacheReadCostPerMTok?: number;
}
 23 | 
/** A pricing entry together with the key it was stored under (`buildPricingMap` uses the model id). */
export interface ModelPricingLookup {
  pricing: ModelPricing;
  matchedKey: string;
}
 28 | 
/**
 * Pricing of a gateway model, with every price given as a string.
 * `extractPricingFromGatewayModel` converts the strings with `parseFloat`.
 */
export interface GatewayPricing {
  input?: string;
  output?: string;
  cachedInputTokens?: string;
  cacheCreationInputTokens?: string;
}
 35 | 
/** Description of a model as listed by the gateway; pricing and specification may be absent. */
export interface GatewayModel {
  id: string;
  name: string;
  description?: string;
  pricing?: GatewayPricing;
  specification?: {
    specificationVersion: string;
    provider: string;
    modelId: string;
  };
  modelType: string;
}
 48 | 
 49 | export function extractPricingFromGatewayModel(
 50 |   model: GatewayModel,
 51 | ) {
 52 |   if (!model.pricing) {
 53 |     return null;
 54 |   }
 55 | 
 56 |   const { pricing } = model;
 57 | 
 58 |   const inputCost = pricing.input ? parseFloat(pricing.input) : NaN;
 59 |   const outputCost = pricing.output ? parseFloat(pricing.output) : NaN;
 60 | 
 61 |   if (isNaN(inputCost) || isNaN(outputCost)) {
 62 |     throw new Error(
 63 |       `Invalid pricing for model ${model.id}: input and output pricing must be valid numbers.`,
 64 |     );
 65 |   }
 66 | 
 67 |   const result: ModelPricing = {
 68 |     inputCostPerToken: inputCost,
 69 |     outputCostPerToken: outputCost,
 70 |   };
 71 | 
 72 |   if (pricing.cachedInputTokens) {
 73 |     const cached = parseFloat(pricing.cachedInputTokens);
 74 |     if (!isNaN(cached)) {
 75 |       result.cacheReadInputTokenCost = cached;
 76 |     }
 77 |   }
 78 | 
 79 |   if (pricing.cacheCreationInputTokens) {
 80 |     const creation = parseFloat(pricing.cacheCreationInputTokens);
 81 |     if (!isNaN(creation)) {
 82 |       result.cacheCreationInputTokenCost = creation;
 83 |     }
 84 |   }
 85 | 
 86 |   return result;
 87 | }
 88 | 
 89 | export function buildPricingMap(
 90 |   models: GatewayModel[],
 91 | ) {
 92 |   const map = new Map<string, ModelPricingLookup | null>();
 93 | 
 94 |   for (const model of models) {
 95 |     const pricing = extractPricingFromGatewayModel(model);
 96 |     if (pricing) {
 97 |       map.set(model.id, {
 98 |         pricing,
 99 |         matchedKey: model.id,
100 |       });
101 |     } else {
102 |       map.set(model.id, null);
103 |     }
104 |   }
105 | 
106 |   return map;
107 | }
108 | 
/**
 * Looks up the pricing entry stored for `modelId`.
 *
 * @returns The stored entry, or `null` both when the id is unknown and when
 *          the id is stored with a `null` entry.
 */
export function lookupPricingFromMap(
  modelId: string,
  pricingMap: Map<string, ModelPricingLookup | null>,
) {
  const entry = pricingMap.get(modelId);
  if (entry === undefined) {
    return null;
  }
  return entry;
}
115 | 
/**
 * Scales per-token prices by 1,000,000 for display.
 *
 * The returned object always has a `cacheReadCostPerMTok` key; its value is
 * `undefined` when the model has no cache-read price.
 */
export function getModelPricingDisplay(
  pricing: ModelPricing,
) {
  const scale = 1_000_000;
  const cacheReadPrice = pricing.cacheReadInputTokenCost;

  let cacheReadCostPerMTok: number | undefined;
  if (cacheReadPrice !== undefined) {
    cacheReadCostPerMTok = cacheReadPrice * scale;
  }

  return {
    inputCostPerMTok: pricing.inputCostPerToken * scale,
    outputCostPerMTok: pricing.outputCostPerToken * scale,
    cacheReadCostPerMTok,
  };
}
128 | 
/**
 * Prices a token usage.
 *
 * Cached input tokens are subtracted from `inputTokens` before the input
 * price is applied and are billed at the cache-read price instead (0 when the
 * pricing has none).
 *
 * @param pricing           Per-token prices.
 * @param inputTokens       Input token count, including the cached ones.
 * @param outputTokens      Output token count.
 * @param cachedInputTokens Number of input tokens that were cached; defaults to 0.
 * @returns The three cost components, their sum, and the token counts as passed in.
 */
export function calculateCost(
  pricing: ModelPricing,
  inputTokens: number,
  outputTokens: number,
  cachedInputTokens: number = 0,
) {
  const cacheReadPrice = pricing.cacheReadInputTokenCost ?? 0;

  const inputCost =
    (inputTokens - cachedInputTokens) * pricing.inputCostPerToken;
  const outputCost = outputTokens * pricing.outputCostPerToken;
  const cacheReadCost = cachedInputTokens * cacheReadPrice;
  const totalCost = inputCost + outputCost + cacheReadCost;

  return {
    inputCost,
    outputCost,
    cacheReadCost,
    totalCost,
    inputTokens,
    outputTokens,
    cachedInputTokens,
  };
}
153 | 
/**
 * Formats a dollar amount, showing more decimals the smaller the amount is:
 * exactly zero is "$0.00", below 0.01 uses six decimals, below 1 uses four,
 * everything else two.
 */
export function formatCost(cost: number) {
  if (cost === 0) return "$0.00";

  let decimals = 2;
  if (cost < 0.01) {
    decimals = 6;
  } else if (cost < 1) {
    decimals = 4;
  }

  return `$${cost.toFixed(decimals)}`;
}
164 | 
/**
 * Formats a per-MTok price: exactly zero is "$0", below 0.01 uses four
 * decimals, everything else two.
 */
export function formatMTokCost(costPerMTok: number) {
  if (costPerMTok === 0) return "$0";

  const decimals = costPerMTok < 0.01 ? 4 : 2;
  return `$${costPerMTok.toFixed(decimals)}`;
}
172 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ai-sdk-bench
  2 | 
  3 | AI SDK benchmarking tool that tests AI agents with MCP (Model Context Protocol) integration using the Vercel AI Gateway. Automatically discovers and runs all tests in the `tests/` directory, verifying LLM-generated Svelte components against test suites.
  4 | 
  5 | ## Installation
  6 | 
  7 | To install dependencies:
  8 | 
  9 | ```bash
 10 | ./scripts/install.sh # installs the correct bun version
 11 | bun install
 12 | ```
 13 | 
 14 | ## Setup
 15 | 
 16 | Configure your API keys in `.env`:
 17 | 
 18 | 1. Install Vercel CLI if you haven't already
 19 | 2. Run `bun run vercel:link` and link the benchmark to a project that has AI Gateway enabled
 20 | 3. Run the benchmark with `bun run dev`
 21 | 
 22 | ### Required API Keys
 23 | 
 24 | You'll need at least one API key for the providers you want to test:
 25 | 
 26 | - `VERCEL_OIDC_TOKEN`: The OIDC token for the Vercel AI Gateway
 27 | 
 28 | ## Usage
 29 | 
 30 | To run the benchmark:
 31 | 
 32 | ```bash
 33 | bun run index.ts
 34 | ```
 35 | 
 36 | ### Interactive CLI
 37 | 
 38 | The benchmark features an interactive CLI that will prompt you for configuration:
 39 | 
 40 | 1. **Model Selection**: Choose one or more models from the Vercel AI Gateway
 41 |    - Select from available models in your configured providers
 42 |    - Optionally add custom model IDs
 43 |    - Can test multiple models in a single run
 44 | 
 45 | 2. **MCP Integration**: Choose your MCP configuration
 46 |    - **No MCP Integration**: Run without external tools
 47 |    - **MCP over HTTP**: Use HTTP-based MCP server (default: `https://mcp.svelte.dev/mcp`)
 48 |    - **MCP over StdIO**: Use local MCP server via command (default: `npx -y @sveltejs/mcp`)
 49 |    - Option to provide custom MCP server URL or command
 50 | 
 51 | 3. **TestComponent Tool**: Enable/disable the testing tool for models
 52 |    - Allows models to run tests during component development
 53 |    - Enabled by default
 54 | 
 55 | ### Benchmark Workflow
 56 | 
 57 | After configuration, the benchmark will:
 58 | 
 59 | 1. Discover all tests in `tests/` directory
 60 | 2. For each selected model and test:
 61 |    - Run the AI agent with the test's prompt
 62 |    - Extract the generated Svelte component
 63 |    - Verify the component against the test suite
 64 | 3. Generate a combined report with all results
 65 | 
 66 | ### Results and Reports
 67 | 
 68 | Results are saved to the `results/` directory with timestamped filenames:
 69 | 
 70 | - `results/result-2024-12-07-14-30-45.json` - Full execution trace with all test results
 71 | - `results/result-2024-12-07-14-30-45.html` - Interactive HTML report with expandable test sections
 72 | 
 73 | The HTML report includes:
 74 | 
 75 | - Summary bar showing passed/failed/skipped counts
 76 | - Expandable sections for each test
 77 | - Step-by-step execution trace
 78 | - Generated component code
 79 | - Test verification results with pass/fail details
 80 | - Token usage statistics
 81 | - MCP status badge
 82 | - Dark/light theme toggle
 83 | 
 84 | To regenerate an HTML report from a JSON file:
 85 | 
 86 | ```bash
 87 | # Regenerate most recent result
 88 | bun run generate-report.ts
 89 | 
 90 | # Regenerate specific result
 91 | bun run generate-report.ts results/result-2024-12-07-14-30-45.json
 92 | ```
 93 | 
 94 | ## Test Structure
 95 | 
 96 | Each test in the `tests/` directory should have:
 97 | 
 98 | ```
 99 | tests/
100 |   {test-name}/
101 |     Reference.svelte  - Reference implementation (known-good solution)
102 |     test.ts          - Vitest test file (imports "./Component.svelte")
103 |     prompt.md        - Prompt for the AI agent
104 | ```
105 | 
106 | The benchmark:
107 | 
108 | 1. Reads the prompt from `prompt.md`
109 | 2. Asks the agent to generate a component
110 | 3. Writes the generated component to a temporary location
111 | 4. Runs the tests against the generated component
112 | 5. Reports pass/fail status
113 | 
114 | ## Verifying Reference Implementations
115 | 
116 | To verify that all reference implementations pass their tests:
117 | 
118 | ```bash
119 | bun run verify-tests
120 | ```
121 | 
122 | This copies each `Reference.svelte` to `Component.svelte` temporarily and runs the tests.
123 | 
124 | ## MCP Integration
125 | 
126 | The tool supports optional integration with MCP (Model Context Protocol) servers through the interactive CLI. When running the benchmark, you'll be prompted to choose:
127 | 
128 | - **No MCP Integration**: Run without external tools
129 | - **MCP over HTTP**: Connect to an HTTP-based MCP server
130 |   - Default: `https://mcp.svelte.dev/mcp`
131 |   - Option to provide a custom URL
132 | - **MCP over StdIO**: Connect to a local MCP server via command
133 |   - Default: `npx -y @sveltejs/mcp`
134 |   - Option to provide a custom command
135 | 
136 | MCP status, transport type, and server configuration are documented in both the JSON metadata and displayed as a badge in the HTML report.
137 | 
138 | ## Exit Codes
139 | 
140 | - `0`: All tests passed
141 | - `1`: One or more tests failed
142 | 
143 | ## Documentation
144 | 
145 | See [AGENTS.md](AGENTS.md) for detailed documentation on:
146 | 
147 | - Architecture and components
148 | - Environment variables and model configuration
149 | - MCP integration details
150 | - Development commands
151 | - Multi-test result format
152 | 
153 | ---
154 | 
155 | This project was created using `bun init` in bun v1.3.3. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
156 | 


--------------------------------------------------------------------------------
/lib/utils.test.ts:
--------------------------------------------------------------------------------
  1 | import { describe, it, expect } from "vitest";
  2 | import {
  3 |   sanitizeModelName,
  4 |   getTimestampedFilename,
  5 |   calculateTotalCost,
  6 | } from "./utils.ts";
  7 | import type { ModelPricing } from "./pricing.ts";
  8 | import type { SingleTestResult } from "./report.ts";
  9 | 
 10 | describe("sanitizeModelName", () => {
 11 |   it("replaces slashes with dashes", () => {
 12 |     expect(sanitizeModelName("anthropic/claude-sonnet-4")).toBe(
 13 |       "anthropic-claude-sonnet-4",
 14 |     );
 15 |   });
 16 | 
 17 |   it("replaces special characters with dashes", () => {
 18 |     expect(sanitizeModelName("model@version")).toBe("model-version");
 19 |     expect(sanitizeModelName("model_name")).toBe("model-name");
 20 |     expect(sanitizeModelName("model name")).toBe("model-name");
 21 |   });
 22 | 
 23 |   it("preserves dots", () => {
 24 |     expect(sanitizeModelName("gpt-4.0")).toBe("gpt-4.0");
 25 |     expect(sanitizeModelName("model.v1.2.3")).toBe("model.v1.2.3");
 26 |   });
 27 | 
 28 |   it("preserves alphanumeric characters", () => {
 29 |     expect(sanitizeModelName("gpt4o")).toBe("gpt4o");
 30 |     expect(sanitizeModelName("claude3")).toBe("claude3");
 31 |   });
 32 | 
 33 |   it("handles multiple consecutive special characters", () => {
 34 |     expect(sanitizeModelName("model///name")).toBe("model---name");
 35 |     expect(sanitizeModelName("model@#$name")).toBe("model---name");
 36 |   });
 37 | });
 38 | 
 39 | describe("getTimestampedFilename", () => {
 40 |   const fixedDate = new Date("2025-12-12T14:30:45Z");
 41 | 
 42 |   it("generates filename without model name", () => {
 43 |     const result = getTimestampedFilename(
 44 |       "result",
 45 |       "json",
 46 |       undefined,
 47 |       fixedDate,
 48 |     );
 49 |     expect(result).toBe("result-2025-12-12-14-30-45.json");
 50 |   });
 51 | 
 52 |   it("generates filename with simple model name", () => {
 53 |     const result = getTimestampedFilename(
 54 |       "result",
 55 |       "json",
 56 |       "gpt-4o",
 57 |       fixedDate,
 58 |     );
 59 |     expect(result).toBe("result-2025-12-12-14-30-45-gpt-4o.json");
 60 |   });
 61 | 
 62 |   it("generates filename with model name containing slashes", () => {
 63 |     const result = getTimestampedFilename(
 64 |       "result",
 65 |       "json",
 66 |       "anthropic/claude-sonnet-4",
 67 |       fixedDate,
 68 |     );
 69 |     expect(result).toBe(
 70 |       "result-2025-12-12-14-30-45-anthropic-claude-sonnet-4.json",
 71 |     );
 72 |   });
 73 | 
 74 |   it("generates filename with model name containing special characters", () => {
 75 |     const result = getTimestampedFilename(
 76 |       "result",
 77 |       "html",
 78 |       "model@v1.2.3",
 79 |       fixedDate,
 80 |     );
 81 |     expect(result).toBe("result-2025-12-12-14-30-45-model-v1.2.3.html");
 82 |   });
 83 | 
 84 |   it("handles different file extensions", () => {
 85 |     const result = getTimestampedFilename(
 86 |       "output",
 87 |       "txt",
 88 |       "test-model",
 89 |       fixedDate,
 90 |     );
 91 |     expect(result).toBe("output-2025-12-12-14-30-45-test-model.txt");
 92 |   });
 93 | 
 94 |   it("pads single-digit months and days", () => {
 95 |     const earlyDate = new Date("2025-01-05T08:09:07Z");
 96 |     const result = getTimestampedFilename(
 97 |       "result",
 98 |       "json",
 99 |       undefined,
100 |       earlyDate,
101 |     );
102 |     expect(result).toBe("result-2025-01-05-08-09-07.json");
103 |   });
104 | });
105 | 
// Unit tests for calculateTotalCost: empty input and aggregation across steps and tests.
describe("calculateTotalCost", () => {
  const pricing: ModelPricing = {
    inputCostPerToken: 1.0 / 1_000_000,
    outputCostPerToken: 2.0 / 1_000_000,
    cacheReadInputTokenCost: 0.1 / 1_000_000,
  };

  // Minimal step stub: only `usage` is read by the function under test.
  const stepWithUsage = (
    inputTokens: number,
    outputTokens: number,
    cachedInputTokens: number,
  ) => ({ usage: { inputTokens, outputTokens, cachedInputTokens } }) as any;

  // Minimal test-result stub around the given steps.
  const resultWithSteps = (
    testName: string,
    prompt: string,
    steps: any[],
  ): SingleTestResult => ({
    testName,
    prompt,
    resultWriteContent: null,
    verification: {} as any,
    steps,
  });

  it("calculates zero cost for empty results", () => {
    const noTests: SingleTestResult[] = [];

    expect(calculateTotalCost(noTests, pricing)).toEqual({
      inputCost: 0,
      outputCost: 0,
      cacheReadCost: 0,
      totalCost: 0,
      inputTokens: 0,
      outputTokens: 0,
      cachedInputTokens: 0,
    });
  });

  it("aggregates usage from multiple steps and tests", () => {
    const tests: SingleTestResult[] = [
      resultWithSteps("test1", "p1", [
        stepWithUsage(100, 50, 10),
        stepWithUsage(200, 100, 0),
      ]),
      resultWithSteps("test2", "p2", [stepWithUsage(300, 150, 20)]),
    ];

    // Token totals:
    //   input  100 + 200 + 300 = 600
    //   output  50 + 100 + 150 = 300
    //   cached  10 +   0 +  20 = 30
    //   uncached input 600 - 30 = 570
    //
    // Costs at the per-token prices above:
    //   input  570 * (1.0 / 1e6) = 0.00057
    //   output 300 * (2.0 / 1e6) = 0.0006
    //   cache   30 * (0.1 / 1e6) = 0.000003
    //   total  0.00057 + 0.0006 + 0.000003 = 0.001173

    const totals = calculateTotalCost(tests, pricing);

    expect(totals).toEqual({
      inputCost: 0.00057,
      outputCost: 0.0006,
      cacheReadCost: 0.000003,
      totalCost: 0.001173,
      inputTokens: 600,
      outputTokens: 300,
      cachedInputTokens: 30,
    });
  });
});
193 | 


--------------------------------------------------------------------------------
/lib/output-test-runner.ts:
--------------------------------------------------------------------------------
  1 | import {
  2 |   mkdirSync,
  3 |   rmSync,
  4 |   existsSync,
  5 |   writeFileSync,
  6 |   copyFileSync,
  7 | } from "node:fs";
  8 | import { join } from "node:path";
  9 | import { startVitest } from "vitest/node";
 10 | import type { TestDefinition } from "./test-discovery.ts";
 11 | 
// Working directory for generated components and copied tests: "outputs" under the process's current working directory.
const OUTPUTS_DIR = join(process.cwd(), "outputs");
 13 | 
/** A single failing test case: its full name and the associated error message. */
export interface FailedTest {
  fullName: string;
  errorMessage: string;
}
 18 | 
/** Outcome of running one test's suite against a component: overall verdict, counts, duration and failure details. */
export interface TestVerificationResult {
  testName: string;
  passed: boolean;
  numTests: number;
  numPassed: number;
  numFailed: number;
  // Elapsed time; runTestVerification computes it as a difference of Date.now() values (milliseconds).
  duration: number;
  error?: string;
  failedTests?: FailedTest[];
}
 29 | 
 30 | export function setupOutputsDirectory() {
 31 |   if (existsSync(OUTPUTS_DIR)) {
 32 |     rmSync(OUTPUTS_DIR, { recursive: true, force: true });
 33 |   }
 34 |   mkdirSync(OUTPUTS_DIR, { recursive: true });
 35 | }
 36 | 
 37 | export function cleanupOutputsDirectory() {
 38 |   if (existsSync(OUTPUTS_DIR)) {
 39 |     rmSync(OUTPUTS_DIR, { recursive: true, force: true });
 40 |   }
 41 | }
 42 | 
 43 | export function prepareTestEnvironment(
 44 |   test: TestDefinition,
 45 |   componentCode: string,
 46 | ) {
 47 |   const testDir = join(OUTPUTS_DIR, test.name);
 48 | 
 49 |   if (existsSync(testDir)) {
 50 |     rmSync(testDir, { recursive: true, force: true });
 51 |   }
 52 |   mkdirSync(testDir, { recursive: true });
 53 | 
 54 |   const componentPath = join(testDir, "Component.svelte");
 55 |   writeFileSync(componentPath, componentCode, "utf-8");
 56 | 
 57 |   const testFilePath = join(testDir, "test.ts");
 58 |   copyFileSync(test.testFile, testFilePath);
 59 | 
 60 |   return testDir;
 61 | }
 62 | 
 63 | export function cleanupTestEnvironment(testName: string) {
 64 |   const testDir = join(OUTPUTS_DIR, testName);
 65 |   if (existsSync(testDir)) {
 66 |     rmSync(testDir, { recursive: true, force: true });
 67 |   }
 68 | }
 69 | 
 70 | /**
 71 |  * Stages `componentCode` next to the suite's test file, runs Vitest once (no
 72 |  * watch mode) and summarises the collected state.
 73 |  *
 74 |  * @param test - Test definition; `name` labels the result.
 75 |  * @param componentCode - Svelte source to verify.
 76 |  * @returns Counts, duration in ms, failed-test details and an error message
 77 |  *   when available; failures are reported in the result rather than thrown.
 78 |  */
 79 | export async function runTestVerification(
 80 |   test: TestDefinition,
 81 |   componentCode: string,
 82 | ) {
 83 |   const startTime = Date.now();
 84 | 
 85 |   try {
 86 |     const testDir = prepareTestEnvironment(test, componentCode);
 87 |     const testFilePath = join(testDir, "test.ts");
 88 |     const vitest = await startVitest("test", [testFilePath], {
 89 |       watch: false,
 90 |       reporters: ["verbose"],
 91 |     });
 92 | 
 93 |     if (!vitest) {
 94 |       return {
 95 |         testName: test.name,
 96 |         passed: false,
 97 |         numTests: 0,
 98 |         numPassed: 0,
 99 |         numFailed: 0,
100 |         duration: Date.now() - startTime,
101 |         error: "Failed to start vitest",
102 |       };
103 |     }
104 | 
105 |     await vitest.close();
106 | 
107 |     const testModules = vitest.state.getTestModules();
108 |     const failedTests = [];
109 |     const allErrors = [];
110 |     for (const error of vitest.state.getUnhandledErrors()) {
111 |       allErrors.push(error instanceof Error ? error.message : String(error));
112 |     }
113 | 
114 |     let passed = true;
115 |     let numTests = 0;
116 |     let numPassed = 0;
117 |     let numFailed = 0;
118 | 
119 |     if (!testModules || testModules.length === 0) {
120 |       return {
121 |         testName: test.name,
122 |         passed: false,
123 |         numTests: 0,
124 |         numPassed: 0,
125 |         numFailed: 0,
126 |         duration: Date.now() - startTime,
127 |         error:
128 |           allErrors.length > 0 ? allErrors.join("\n") : "No test modules found",
129 |       };
130 |     }
131 | 
132 |     for (const module of testModules) {
133 |       if (!module.ok()) {
134 |         passed = false;
135 |       }
136 |       for (const error of module.errors()) {
137 |         if (error.message) {
138 |           allErrors.push(error.message);
139 |         }
140 |       }
141 |       if (!module.children) {
142 |         continue;
143 |       }
144 | 
145 |       try {
146 |         const tests = Array.from(module.children.allTests());
147 |         numTests += tests.length;
148 |         for (const t of tests) {
149 |           const result = t.result();
150 |           // Count passes explicitly: skipped/pending tests are not passes.
151 |           if (result.state === "passed") {
152 |             numPassed++;
153 |           }
154 |           if (result.state === "failed") {
155 |             numFailed++;
156 |             // Collect enclosing parent names, outermost first.
157 |             const ancestorTitles: string[] = [];
158 |             let parent = t.parent;
159 |             while (parent && "name" in parent) {
160 |               if (parent.name) {
161 |                 ancestorTitles.unshift(parent.name);
162 |               }
163 |               parent = (
164 |                 "parent" in parent
165 |                   ? (parent as { parent?: unknown }).parent
166 |                   : undefined
167 |               ) as typeof parent;
168 |             }
169 |             const fullName =
170 |               ancestorTitles.length > 0
171 |                 ? `${ancestorTitles.join(" > ")} > ${t.name}`
172 |                 : t.name;
173 | 
174 |             const errorMessages: string[] = [];
175 |             if (result.errors) {
176 |               for (const testError of result.errors) {
177 |                 if (testError.message) {
178 |                   errorMessages.push(testError.message);
179 |                   allErrors.push(testError.message);
180 |                 }
181 |               }
182 |             }
183 |             failedTests.push({
184 |               fullName,
185 |               errorMessage:
186 |                 errorMessages.join("\n") || "No error message available",
187 |             });
188 |           }
189 |         }
190 |       } catch (err) {
191 |         console.error(`Error processing module tests for ${test.name}:`, err);
192 |         const errorMessage = err instanceof Error ? err.message : String(err);
193 |         allErrors.push(errorMessage);
194 |         passed = false;
195 |       }
196 |     }
197 | 
198 |     return {
199 |       testName: test.name,
200 |       passed: passed && numFailed === 0,
201 |       numTests,
202 |       numPassed,
203 |       numFailed,
204 |       duration: Date.now() - startTime,
205 |       failedTests: failedTests.length > 0 ? failedTests : undefined,
206 |       error: allErrors.length > 0 && !passed ? allErrors[0] : undefined,
207 |     };
208 |   } catch (error) {
209 |     return {
210 |       testName: test.name,
211 |       passed: false,
212 |       numTests: 0,
213 |       numPassed: 0,
214 |       numFailed: 0,
215 |       duration: Date.now() - startTime,
216 |       error: error instanceof Error ? error.message : String(error),
217 |     };
218 |   }
219 | }
220 | 


--------------------------------------------------------------------------------
/lib/verify-references.ts:
--------------------------------------------------------------------------------
  1 | import {
  2 |   readdirSync,
  3 |   statSync,
  4 |   copyFileSync,
  5 |   unlinkSync,
  6 |   existsSync,
  7 | } from "node:fs";
  8 | import { join } from "node:path";
  9 | import { startVitest } from "vitest/node";
 10 | 
 11 | interface TestDefinition { // One benchmark suite found under tests/.
 12 |   name: string; // suite directory name under tests/
 13 |   directory: string; // path of the suite directory
 14 |   referenceFile: string; // <directory>/Reference.svelte
 15 |   componentFile: string; // <directory>/Component.svelte, staged per run
 16 |   testFile: string; // <directory>/test.ts
 17 |   promptFile: string; // <directory>/prompt.md; not checked for existence
 18 | }
 19 | 
 20 | interface FailedTest { // One failing test case within a suite run.
 21 |   fullName: string; // parent names and test name joined with " > "
 22 |   errorMessage: string; // error messages joined with "\n", or a placeholder
 23 | }
 24 | 
 25 | interface TestResult { // Outcome of running one suite's test file.
 26 |   testName: string; // name of the suite the result belongs to
 27 |   passed: boolean; // true when the suite counts as passing
 28 |   numTests: number; // total tests found in the run
 29 |   numPassed: number; // tests counted as passed
 30 |   numFailed: number; // tests whose state was "failed"
 31 |   duration: number; // elapsed wall-clock time in milliseconds
 32 |   error?: string; // top-level error message, when one was captured
 33 |   failedTests?: FailedTest[]; // details for each failed test, if any
 34 | }
 35 | 
 36 | /** Scans `<cwd>/tests` and returns a definition for each valid suite. */
 37 | export function loadTestDefinitions() {
 38 |   const root = join(process.cwd(), "tests");
 39 |   const found = [];
 40 | 
 41 |   try {
 42 |     for (const name of readdirSync(root)) {
 43 |       const directory = join(root, name);
 44 |       if (!statSync(directory).isDirectory()) {
 45 |         continue;
 46 |       }
 47 |       const referenceFile = join(directory, "Reference.svelte");
 48 |       const testFile = join(directory, "test.ts");
 49 |       const promptFile = join(directory, "prompt.md");
 50 |       const componentFile = join(directory, "Component.svelte");
 51 | 
 52 |       // A suite needs both the reference component and the test file.
 53 |       if (!existsSync(referenceFile) || !existsSync(testFile)) {
 54 |         console.warn(
 55 |           `⚠️  Skipping ${name}: missing Reference.svelte or test.ts`,
 56 |         );
 57 |         continue;
 58 |       }
 59 |       found.push({
 60 |         name,
 61 |         directory,
 62 |         referenceFile,
 63 |         componentFile,
 64 |         testFile,
 65 |         promptFile,
 66 |       });
 67 |     }
 68 |   } catch (error) {
 69 |     // Log and fall through so callers still get what was collected.
 70 |     console.error("Error loading test definitions:", error);
 71 |   }
 72 | 
 73 |   return found;
 74 | }
 75 | 
 76 | export function copyReferenceToComponent(testDef: TestDefinition) {
 77 |   copyFileSync(testDef.referenceFile, testDef.componentFile); // stage the reference as the component under test
 78 | }
 79 | 
 80 | /** Best-effort removal of the suite's component file; failures only warn. */
 81 | export function cleanupComponent(testDef: TestDefinition) {
 82 |   if (!existsSync(testDef.componentFile)) return;
 83 |   try {
 84 |     unlinkSync(testDef.componentFile);
 85 |   } catch (error) {
 86 |     console.warn(`⚠️  Failed to cleanup ${testDef.componentFile}:`, error);
 87 |   }
 88 | }
 89 | 
 90 | /**
 91 |  * Runs a suite's Vitest file once (no watch mode) and summarises the outcome.
 92 |  *
 93 |  * @param testDef - Suite to run; `testFile` is executed, `name` labels the
 94 |  *   result.
 95 |  * @returns Counts, duration in ms, failed-test details and an error message
 96 |  *   when available; failures are reported in the result rather than thrown.
 97 |  */
 98 | export async function runTest(testDef: TestDefinition) {
 99 |   const startTime = Date.now();
100 | 
101 |   try {
102 |     const vitest = await startVitest("test", [testDef.testFile], {
103 |       watch: false,
104 |       reporters: ["verbose"],
105 |     });
106 | 
107 |     if (!vitest) {
108 |       return {
109 |         testName: testDef.name,
110 |         passed: false,
111 |         numTests: 0,
112 |         numPassed: 0,
113 |         numFailed: 0,
114 |         duration: Date.now() - startTime,
115 |         error: "Failed to start vitest",
116 |       };
117 |     }
118 | 
119 |     await vitest.close();
120 | 
121 |     const testModules = vitest.state.getTestModules();
122 |     const failedTests = [];
123 |     const allErrors = [];
124 |     for (const error of vitest.state.getUnhandledErrors()) {
125 |       allErrors.push(error instanceof Error ? error.message : String(error));
126 |     }
127 | 
128 |     let passed = true;
129 |     let numTests = 0;
130 |     let numPassed = 0;
131 |     let numFailed = 0;
132 | 
133 |     if (!testModules || testModules.length === 0) {
134 |       return {
135 |         testName: testDef.name,
136 |         passed: false,
137 |         numTests: 0,
138 |         numPassed: 0,
139 |         numFailed: 0,
140 |         duration: Date.now() - startTime,
141 |         error:
142 |           allErrors.length > 0 ? allErrors.join("\n") : "No test modules found",
143 |       };
144 |     }
145 | 
146 |     for (const module of testModules) {
147 |       if (!module.ok()) {
148 |         passed = false;
149 |       }
150 |       for (const error of module.errors()) {
151 |         if (error.message) {
152 |           allErrors.push(error.message);
153 |         }
154 |       }
155 |       if (!module.children) {
156 |         continue;
157 |       }
158 | 
159 |       try {
160 |         const tests = Array.from(module.children.allTests());
161 |         numTests += tests.length;
162 |         for (const t of tests) {
163 |           const result = t.result();
164 |           // Count passes explicitly: skipped/pending tests are not passes.
165 |           if (result.state === "passed") {
166 |             numPassed++;
167 |           }
168 |           if (result.state === "failed") {
169 |             numFailed++;
170 |             // Collect enclosing parent names, outermost first.
171 |             const ancestorTitles: string[] = [];
172 |             let parent = t.parent;
173 |             while (parent && "name" in parent) {
174 |               if (parent.name) {
175 |                 ancestorTitles.unshift(parent.name);
176 |               }
177 |               parent = (
178 |                 "parent" in parent
179 |                   ? (parent as { parent?: unknown }).parent
180 |                   : undefined
181 |               ) as typeof parent;
182 |             }
183 |             const fullName =
184 |               ancestorTitles.length > 0
185 |                 ? `${ancestorTitles.join(" > ")} > ${t.name}`
186 |                 : t.name;
187 | 
188 |             const errorMessages: string[] = [];
189 |             if (result.errors) {
190 |               for (const testError of result.errors) {
191 |                 if (testError.message) {
192 |                   errorMessages.push(testError.message);
193 |                   allErrors.push(testError.message);
194 |                 }
195 |               }
196 |             }
197 |             failedTests.push({
198 |               fullName,
199 |               errorMessage:
200 |                 errorMessages.join("\n") || "No error message available",
201 |             });
202 |           }
203 |         }
204 |       } catch (err) {
205 |         console.error(
206 |           `Error processing module tests for ${testDef.name}:`,
207 |           err,
208 |         );
209 |         const errorMessage = err instanceof Error ? err.message : String(err);
210 |         allErrors.push(errorMessage);
211 |         passed = false;
212 |       }
213 |     }
214 | 
215 |     return {
216 |       testName: testDef.name,
217 |       passed: passed && numFailed === 0,
218 |       numTests,
219 |       numPassed,
220 |       numFailed,
221 |       duration: Date.now() - startTime,
222 |       failedTests: failedTests.length > 0 ? failedTests : undefined,
223 |       error: allErrors.length > 0 && !passed ? allErrors[0] : undefined,
224 |     };
225 |   } catch (error) {
226 |     return {
227 |       testName: testDef.name,
228 |       passed: false,
229 |       numTests: 0,
230 |       numPassed: 0,
231 |       numFailed: 0,
232 |       duration: Date.now() - startTime,
233 |       error: error instanceof Error ? error.message : String(error),
234 |     };
235 |   }
236 | }
237 | 
238 | /** Logs one status line per suite, then the overall pass/fail totals. */
239 | export function printSummary(results: TestResult[]) {
240 |   console.log("\n=== Test Verification Summary ===\n");
241 | 
242 |   let passedSuites = 0;
243 |   for (const r of results) {
244 |     passedSuites += r.passed ? 1 : 0;
245 |     const status = r.passed ? "✓ PASSED" : "✗ FAILED";
246 |     console.log(
247 |       `${r.testName}: ${status} (${r.numPassed}/${r.numTests} tests, ${r.duration}ms)`,
248 |     );
249 | 
250 |     if (r.error) {
251 |       console.log(`Error: ${r.error}`);
252 |     }
253 | 
254 |     const failures = r.passed ? [] : (r.failedTests ?? []);
255 |     if (failures.length > 0) {
256 |       console.log("  Failed tests:");
257 |     }
258 |     for (const failure of failures) {
259 |       console.log(`✗ ${failure.fullName}`);
260 |     }
261 |   }
262 | 
263 |   const failedSuites = results.length - passedSuites;
264 |   console.log(`\nTotal: ${passedSuites}/${results.length} suites passed`);
265 |   console.log(
266 |     failedSuites === 0
267 |       ? "All reference implementations verified successfully!"
268 |       : `${failedSuites} suite(s) failed.`,
269 |   );
270 | }
271 | 
272 | /** Runs every suite against its reference; returns 0 if all pass, else 1. */
273 | export async function verifyAllReferences() {
274 |   console.log("Discovering test suites...");
275 |   const suites = loadTestDefinitions();
276 |   console.log(`Found ${suites.length} test suite(s)\n`);
277 | 
278 |   if (suites.length === 0) {
279 |     console.log("No test suites found in tests/ directory");
280 |     return 1;
281 |   }
282 | 
283 |   const results = [];
284 |   for (const suite of suites) {
285 |     console.log(`Running tests/${suite.name}...`);
286 |     try {
287 |       copyReferenceToComponent(suite);
288 |       console.log("  ✓ Copied Reference.svelte → Component.svelte");
289 | 
290 |       const outcome = await runTest(suite);
291 |       results.push(outcome);
292 | 
293 |       if (outcome.passed) {
294 |         console.log(`✓ All tests passed (${outcome.duration}ms)`);
295 |         continue;
296 |       }
297 | 
298 |       console.log(
299 |         `✗ Tests failed (${outcome.numFailed}/${outcome.numTests} failed)`,
300 |       );
301 |       if (outcome.error) {
302 |         console.log(`Error: ${outcome.error}`);
303 |       }
304 |       const failures = outcome.failedTests ?? [];
305 |       if (failures.length === 0) {
306 |         continue;
307 |       }
308 |       console.log("\n  Failed tests:");
309 |       for (const { fullName, errorMessage } of failures) {
310 |         console.log(`✗ ${fullName}`);
311 |         errorMessage
312 |           .split("\n")
313 |           .filter((line) => line.trim())
314 |           .forEach((line) => console.log(` ${line}`));
315 |       }
316 |       console.log();
317 |     } finally {
318 |       // Runs on `continue` too, so cleanup is always attempted.
319 |       cleanupComponent(suite);
320 |       console.log("  ✓ Cleaned up Component.svelte\n");
321 |     }
322 |   }
323 | 
324 |   printSummary(results);
325 | 
326 |   return results.every((r) => r.passed) ? 0 : 1;
327 | }
328 | 


--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
  1 | ## Project Overview
  2 | 
  3 | AI SDK benchmarking tool built with Vercel AI SDK and Bun runtime. Tests AI agents with MCP (Model Context Protocol) server integration using the Vercel AI Gateway. Automatically discovers and runs all tests in the `tests/` directory and verifies LLM-generated Svelte components against test suites.
  4 | 
  5 | ## Development Commands
  6 | 
  7 | ```bash
  8 | # Install dependencies (runs patch-package automatically)
  9 | bun install
 10 | 
 11 | # Run the main benchmark (interactive CLI)
 12 | bun run start
 13 | 
 14 | # Verify reference implementations against test suites
 15 | bun run verify-tests
 16 | 
 17 | # Generate HTML reports from all result JSON files
 18 | bun run generate-report.ts
 19 | 
 20 | # Generate HTML report from specific result file
 21 | bun run generate-report.ts results/result-2024-12-07-14-30-45.json
 22 | 
 23 | # Run unit tests for lib modules
 24 | bun run test:self
 25 | 
 26 | # Run TypeScript type checking
 27 | bun tsc --noEmit
 28 | 
 29 | # Format code with Prettier
 30 | bun run prettier
 31 | ```
 32 | 
 33 | ## Environment Variables
 34 | 
 35 | ### Vercel AI Gateway
 36 | 
 37 | The benchmark uses the Vercel AI Gateway for model access. Configuration:
 38 | 
 39 | 1. Link to a Vercel project with AI Gateway enabled: `bun run vercel:link`
 40 | 2. Pull environment variables: `bun run vercel:env:pull`
 41 | 
 42 | Required environment variable:
 43 | 
 44 | - `VERCEL_OIDC_TOKEN`: OIDC token for Vercel AI Gateway authentication
 45 | 
 46 | ### MCP Server Configuration
 47 | 
 48 | MCP integration is configured via the interactive CLI at runtime. Options:
 49 | 
 50 | - **No MCP Integration**: Agent runs with built-in tools only
 51 | - **MCP over HTTP**: Uses HTTP transport (default: `https://mcp.svelte.dev/mcp`)
 52 | - **MCP over StdIO**: Uses local command (default: `npx -y @sveltejs/mcp`)
 53 | 
 54 | ## Architecture
 55 | 
 56 | ### Directory Structure
 57 | 
 58 | ```
 59 | ├── index.ts                    # Main entry point with interactive CLI
 60 | ├── lib/
 61 | │   ├── pricing.ts              # Cost calculation from gateway pricing
 62 | │   ├── pricing.test.ts         # Unit tests for pricing module
 63 | │   ├── test-discovery.ts       # Test suite discovery and prompt building
 64 | │   ├── test-discovery.test.ts  # Unit tests for test discovery
 65 | │   ├── output-test-runner.ts   # Vitest runner for component verification
 66 | │   ├── output-test-runner.test.ts # Unit tests for output runner
 67 | │   ├── verify-references.ts    # Reference implementation verification
 68 | │   ├── report.ts               # Report generation orchestration
 69 | │   ├── report-template.ts      # HTML report template generation
 70 | │   ├── report-styles.ts        # CSS styles for HTML reports
 71 | │   └── tools/
 72 | │       ├── index.ts            # Tool exports
 73 | │       ├── result-write.ts     # ResultWrite tool for final output
 74 | │       ├── result-write.test.ts # Unit tests for ResultWrite tool
 75 | │       ├── test-component.ts   # TestComponent tool for iterative testing
 76 | │       └── test-component.test.ts # Unit tests for TestComponent tool
 77 | ├── tests/                      # Benchmark test suites
 78 | │   └── {test-name}/
 79 | │       ├── Reference.svelte    # Reference implementation
 80 | │       ├── test.ts             # Vitest test file
 81 | │       └── prompt.md           # Agent prompt
 82 | ├── results/                    # Benchmark results (JSON + HTML)
 83 | ├── outputs/                    # Temporary directory for test verification
 84 | └── patches/                    # Patches for dependencies
 85 | ```
 86 | 
 87 | ### Test Suite Structure
 88 | 
 89 | Benchmark test suites in `tests/` directory:
 90 | 
 91 | ```
 92 | tests/
 93 |   {test-name}/
 94 |     Reference.svelte  - Reference implementation of the component
 95 |     test.ts          - Vitest test file (imports "./Component.svelte")
 96 |     prompt.md        - Prompt for AI agents to implement the component
 97 | ```
 98 | 
 99 | **Benchmark Workflow:**
100 | 
101 | 1. `index.ts` presents interactive CLI for model/MCP selection
102 | 2. Discovers all test suites in `tests/`
103 | 3. For each selected model and test:
104 |    - Loads `prompt.md` and builds agent prompt
105 |    - Agent generates component code using available tools
106 |    - Agent calls `ResultWrite` tool with the component code
107 |    - Component is written to `outputs/{test-name}/Component.svelte`
108 |    - Test file is copied to `outputs/{test-name}/test.ts`
109 |    - Vitest runs tests against the generated component
110 |    - Results are collected (pass/fail, error messages)
111 |    - Output directory is cleaned up
112 | 4. All results are saved to timestamped JSON file
113 | 5. HTML report is generated with expandable sections for each test
114 | 
115 | ### Agent Tools
116 | 
117 | **ResultWrite** (`lib/tools/result-write.ts`):
118 | 
119 | - Called when agent completes component implementation
120 | - Signals the agent to stop (via `stopWhen` configuration)
121 | - Accepts `content` parameter with Svelte component code
122 | 
123 | **TestComponent** (`lib/tools/test-component.ts`):
124 | 
125 | - Optional tool for iterative development
126 | - Runs component against test suite before final submission
127 | - Returns pass/fail status and detailed error messages
128 | - Enabled/disabled via interactive CLI
129 | 
130 | ### Interactive CLI
131 | 
132 | The benchmark uses `@clack/prompts` for an interactive CLI that prompts for:
133 | 
134 | 1. **Model Selection**: Multi-select from Vercel AI Gateway available models
135 | 2. **MCP Integration**: Choose HTTP, StdIO, or no MCP
136 | 3. **TestComponent Tool**: Enable/disable iterative testing tool
137 | 4. **Pricing Confirmation**: Review and confirm cost calculation settings
138 | 
139 | ### Pricing System
140 | 
141 | The pricing module (`lib/pricing.ts`) handles cost calculation:
142 | 
143 | - Extracts pricing from Vercel AI Gateway model metadata
144 | - Calculates costs based on input/output/cached tokens
145 | - Supports cache read billing at reduced rates
146 | - Displays costs in reports with per-million-token rates
147 | 
148 | Key functions:
149 | 
150 | - `extractPricingFromGatewayModel()`: Parse gateway model pricing
151 | - `buildPricingMap()`: Build lookup map from gateway models
152 | - `calculateCost()`: Calculate total cost from token usage
153 | - `formatCost()` / `formatMTokCost()`: Format costs for display
154 | 
155 | ### Key Technologies
156 | 
157 | - **Vercel AI SDK v5**: Agent framework with tool calling
158 | - **Vercel AI Gateway**: Unified access to multiple AI providers
159 | - **@ai-sdk/mcp**: MCP client integration (with custom patch)
160 | - **@clack/prompts**: Interactive CLI prompts
161 | - **Bun Runtime**: JavaScript runtime (not Node.js)
162 | - **Vitest**: Test framework for component testing
163 | - **@testing-library/svelte**: Testing utilities for Svelte components
164 | 
165 | ### MCP Integration
166 | 
167 | The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`:
168 | 
169 | - Patch location: `patches/@ai-sdk+mcp+0.0.11.patch`
170 | - Fixes: Handles missing event types in HTTP SSE responses
171 | - Supports both HTTP and StdIO transports
172 | - Configuration via interactive CLI at runtime
173 | 
174 | ### Data Flow
175 | 
176 | 1. Interactive CLI collects configuration (models, MCP, tools)
177 | 2. Gateway provides available models and pricing
178 | 3. Test discovery scans `tests/` directory
179 | 4. For each model and test:
180 |    a. Agent receives prompt with access to tools (built-in + optional MCP)
181 |    b. Agent iterates through steps, calling tools as needed
182 |    c. Agent stops when `ResultWrite` tool is called
183 |    d. Component is written to `outputs/{test-name}/Component.svelte`
184 |    e. Vitest runs test file against the generated component
185 |    f. Test results are collected (pass/fail, error details)
186 |    g. Output directory is cleaned up
187 | 5. Results aggregated with pricing calculations
188 | 6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
189 | 7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
190 | 8. Report automatically opens in default browser
191 | 
192 | ### Output Files
193 | 
194 | All results are saved in the `results/` directory with timestamped filenames:
195 | 
196 | - **JSON files**: `result-2024-12-07-14-30-45.json` - Complete execution trace
197 | - **HTML files**: `result-2024-12-07-14-30-45.html` - Interactive visualization
198 | 
199 | **Multi-Test Result JSON Structure:**
200 | 
201 | ```json
202 | {
203 |   "tests": [
204 |     {
205 |       "testName": "counter",
206 |       "prompt": "# Counter Component Task...",
207 |       "steps": [...],
208 |       "resultWriteContent": "<script>...</script>...",
209 |       "verification": {
210 |         "testName": "counter",
211 |         "passed": true,
212 |         "numTests": 4,
213 |         "numPassed": 4,
214 |         "numFailed": 0,
215 |         "duration": 150,
216 |         "failedTests": []
217 |       }
218 |     }
219 |   ],
220 |   "metadata": {
221 |     "mcpEnabled": true,
222 |     "mcpServerUrl": "https://mcp.svelte.dev/mcp",
223 |     "mcpTransportType": "HTTP",
224 |     "timestamp": "2024-12-07T14:30:45.123Z",
225 |     "model": "anthropic/claude-sonnet-4",
226 |     "pricingKey": "anthropic/claude-sonnet-4",
227 |     "pricing": {
228 |       "inputCostPerMTok": 3,
229 |       "outputCostPerMTok": 15,
230 |       "cacheReadCostPerMTok": 0.3
231 |     },
232 |     "totalCost": {
233 |       "inputCost": 0.003,
234 |       "outputCost": 0.015,
235 |       "cacheReadCost": 0.0003,
236 |       "totalCost": 0.0183,
237 |       "inputTokens": 2000,
238 |       "outputTokens": 1000,
239 |       "cachedInputTokens": 1000
240 |     }
241 |   }
242 | }
243 | ```
244 | 
245 | ## Unit Tests
246 | 
247 | Unit tests for library modules live next to their sources as `*.test.ts` files under `lib/` (including `lib/tools/`):
248 | 
249 | - `lib/pricing.test.ts` - Pricing extraction, calculation, formatting
250 | - `lib/test-discovery.test.ts` - Test suite discovery and prompt building
251 | - `lib/output-test-runner.test.ts` - Output directory management
252 | - `lib/tools/result-write.test.ts` - ResultWrite tool behavior
253 | - `lib/tools/test-component.test.ts` - TestComponent tool behavior
254 | 
255 | Run unit tests with: `bun run test:self`
256 | 
257 | ## TypeScript Configuration
258 | 
259 | - **Runtime**: Bun (not Node.js)
260 | - **Module System**: ESNext with `module: "Preserve"` and `moduleResolution: "bundler"`
261 | - **Strict Mode**: Enabled with additional checks:
262 |   - `noUncheckedIndexedAccess: true` - array/index access always includes undefined
263 |   - `noImplicitOverride: true` - override keyword required
264 |   - `noFallthroughCasesInSwitch: true`
265 | - **Import Extensions**: `.ts` extensions allowed in imports
266 | - **No Emit**: TypeScript compilation not required for Bun runtime
267 | 
268 | ## Important Notes
269 | 
270 | - The MCP client import uses a direct path to the patched module: `./node_modules/@ai-sdk/mcp/dist/index.mjs`
271 | - Agent stops execution when the `ResultWrite` tool is called (configured via `stopWhen` option)
272 | - Agent also stops after 10 steps maximum (configured via `stepCountIs(10)`)
273 | - The `outputs/` directory is used temporarily for test verification and is cleaned up after each test
274 | - HTML reports include expandable sections for each test with full step details
275 | - Test verification results show pass/fail status and failed test details
276 | - Token usage includes cached token counts when available
277 | - All result files are saved with timestamps to preserve historical benchmarks
278 | - MCP integration can be configured via interactive CLI without code changes
279 | - MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
280 | - Exit code is 0 if all tests pass, 1 if any tests fail
281 | - Pricing is fetched from Vercel AI Gateway model metadata at runtime
282 | 


--------------------------------------------------------------------------------
/lib/pricing.test.ts:
--------------------------------------------------------------------------------
  1 | import { describe, it, expect } from "vitest";
  2 | import {
  3 |   extractPricingFromGatewayModel,
  4 |   buildPricingMap,
  5 |   lookupPricingFromMap,
  6 |   calculateCost,
  7 |   formatCost,
  8 |   formatMTokCost,
  9 |   getModelPricingDisplay,
 10 |   type ModelPricing,
 11 |   type GatewayModel,
 12 | } from "./pricing.ts";
 13 | 
 14 | describe("extractPricingFromGatewayModel", () => {
 15 |   it("should extract pricing from a gateway model with all fields", () => {
 16 |     const gatewayModel: GatewayModel = {
 17 |       id: "anthropic/claude-opus-4.5",
 18 |       name: "Claude Opus 4.5",
 19 |       pricing: {
 20 |         input: "0.000005",
 21 |         output: "0.000025",
 22 |         cachedInputTokens: "0.0000005",
 23 |         cacheCreationInputTokens: "0.00000625",
 24 |       },
 25 |       modelType: "language",
 26 |     };
 27 |     // String prices should come back as numbers under the per-token keys.
 28 |     const extracted = extractPricingFromGatewayModel(gatewayModel);
 29 | 
 30 |     expect(extracted).not.toBeNull();
 31 |     expect(extracted!.inputCostPerToken).toBe(0.000005);
 32 |     expect(extracted!.outputCostPerToken).toBe(0.000025);
 33 |     expect(extracted!.cacheReadInputTokenCost).toBe(0.0000005);
 34 |     expect(extracted!.cacheCreationInputTokenCost).toBe(0.00000625);
 35 |   });
 36 | 
 37 |   it("should extract pricing with only input and output", () => {
 38 |     const gatewayModel: GatewayModel = {
 39 |       id: "openai/gpt-4o",
 40 |       name: "GPT-4o",
 41 |       pricing: {
 42 |         input: "0.000003",
 43 |         output: "0.000015",
 44 |       },
 45 |       modelType: "language",
 46 |     };
 47 |     // No cache prices supplied, so the cache-read cost should stay undefined.
 48 |     const extracted = extractPricingFromGatewayModel(gatewayModel);
 49 | 
 50 |     expect(extracted).not.toBeNull();
 51 |     expect(extracted!.inputCostPerToken).toBe(0.000003);
 52 |     expect(extracted!.outputCostPerToken).toBe(0.000015);
 53 |     expect(extracted!.cacheReadInputTokenCost).toBeUndefined();
 54 |   });
 55 | 
 56 |   it("should return null for model without pricing", () => {
 57 |     const unpricedModel: GatewayModel = {
 58 |       id: "local/model",
 59 |       name: "Local Model",
 60 |       modelType: "language",
 61 |     };
 62 | 
 63 |     const extracted = extractPricingFromGatewayModel(unpricedModel);
 64 |     expect(extracted).toBeNull();
 65 |   });
 66 | 
 67 |   it("should throw error for model with empty pricing object", () => {
 68 |     const emptyPricingModel: GatewayModel = {
 69 |       id: "local/model",
 70 |       name: "Local Model",
 71 |       pricing: {},
 72 |       modelType: "language",
 73 |     };
 74 | 
 75 |     const attempt = () => extractPricingFromGatewayModel(emptyPricingModel);
 76 |     // An empty pricing object carries neither an input nor an output price.
 77 |     expect(attempt).toThrowError(/Invalid pricing/);
 78 |   });
 79 | 
 80 |   it("should throw error for invalid pricing values", () => {
 81 |     const badInputModel: GatewayModel = {
 82 |       id: "test/model",
 83 |       name: "Test Model",
 84 |       pricing: {
 85 |         input: "invalid",
 86 |         output: "0.000015",
 87 |       },
 88 |       modelType: "language",
 89 |     };
 90 | 
 91 |     const attempt = () => extractPricingFromGatewayModel(badInputModel);
 92 |     // "invalid" is not a numeric string.
 93 |     expect(attempt).toThrowError(/Invalid pricing/);
 94 |   });
 95 | });
 96 | 
 97 | describe("buildPricingMap", () => {
 98 |   it("should build a map from gateway models", () => {
 99 |     const models: GatewayModel[] = [
100 |       {
101 |         id: "anthropic/claude-sonnet-4",
102 |         name: "Claude Sonnet 4",
103 |         pricing: { input: "0.000003", output: "0.000015" },
104 |         modelType: "language",
105 |       },
106 |       {
107 |         id: "openai/gpt-4o",
108 |         name: "GPT-4o",
109 |         pricing: { input: "0.000005", output: "0.000015" },
110 |         modelType: "language",
111 |       },
112 |       {
113 |         id: "local/model",
114 |         name: "Local Model",
115 |         modelType: "language",
116 |       },
117 |     ];
118 | 
119 |     const map = buildPricingMap(models);
120 |     // toBeTruthy also rejects a missing key, which not.toBeNull() lets through.
121 |     expect(map.size).toBe(3);
122 |     expect(map.get("anthropic/claude-sonnet-4")).toBeTruthy();
123 |     expect(map.get("openai/gpt-4o")).toBeTruthy();
124 |     expect(map.get("local/model")).toBeNull();
125 |   });
126 | });
127 | 
128 | describe("lookupPricingFromMap", () => {
129 |   it("should return pricing lookup for existing model", () => {
130 |     const modelId = "anthropic/claude-sonnet-4";
131 |     const sonnet: GatewayModel = {
132 |       id: modelId,
133 |       name: "Claude Sonnet 4",
134 |       pricing: { input: "0.000003", output: "0.000015" },
135 |       modelType: "language",
136 |     };
137 |     const pricingMap = buildPricingMap([sonnet]);
138 | 
139 |     const found = lookupPricingFromMap(modelId, pricingMap);
140 | 
141 |     // The lookup should report the key it matched alongside the pricing.
142 |     expect(found).not.toBeNull();
143 |     expect(found!.matchedKey).toBe(modelId);
144 |     expect(found!.pricing.inputCostPerToken).toBe(0.000003);
145 |   });
146 | 
147 |   it("should return null for non-existent model", () => {
148 |     // A map built from no models has no entry to match.
149 |     const emptyMap = buildPricingMap([]);
150 |     expect(lookupPricingFromMap("non/existent", emptyMap)).toBeNull();
151 |   });
152 | });
153 | 
154 | describe("calculateCost", () => {
155 |   // Per-token rates: $3 per MTok for input, $15 per MTok for output.
156 |   const basePricing: ModelPricing = {
157 |     inputCostPerToken: 0.000003,
158 |     outputCostPerToken: 0.000015,
159 |   };
160 |   // Same rates plus a cache-read rate of $0.30 per MTok (10% of input).
161 |   const pricingWithCache: ModelPricing = {
162 |     ...basePricing,
163 |     cacheReadInputTokenCost: 0.0000003,
164 |   };
165 | 
166 |   describe("basic cost calculation", () => {
167 |     it("should calculate cost with no cached tokens", () => {
168 |       const cost = calculateCost(basePricing, 1000, 500, 0);
169 | 
170 |       expect(cost.inputTokens).toBe(1000);
171 |       expect(cost.outputTokens).toBe(500);
172 |       expect(cost.cachedInputTokens).toBe(0);
173 |       expect(cost.inputCost).toBe(0.003); // 1000 * $3/MTok
174 |       expect(cost.outputCost).toBeCloseTo(0.0075); // 500 * $15/MTok
175 |       expect(cost.cacheReadCost).toBe(0);
176 |       expect(cost.totalCost).toBe(0.0105);
177 |     });
178 | 
179 |     it("should default cachedInputTokens to 0", () => {
180 |       // Fourth argument deliberately omitted.
181 |       const cost = calculateCost(basePricing, 1000, 500);
182 |       expect(cost.cachedInputTokens).toBe(0);
183 |       expect(cost.inputCost).toBe(0.003);
184 |     });
185 |   });
186 | 
187 |   describe("cached token billing", () => {
188 |     it("should bill cached tokens at reduced rate", () => {
189 |       // 800 of the 1000 input tokens are cache reads.
190 |       const cost = calculateCost(pricingWithCache, 1000, 500, 800);
191 | 
192 |       expect(cost.inputTokens).toBe(1000);
193 |       expect(cost.cachedInputTokens).toBe(800);
194 |       // 200 uncached tokens * $3/MTok
195 |       expect(cost.inputCost).toBeCloseTo(0.0006);
196 |       // 800 cached tokens * $0.30/MTok
197 |       expect(cost.cacheReadCost).toBeCloseTo(0.00024);
198 |       // 500 output tokens * $15/MTok
199 |       expect(cost.outputCost).toBeCloseTo(0.0075);
200 |       // 0.0006 + 0.00024 + 0.0075
201 |       expect(cost.totalCost).toBeCloseTo(0.00834);
202 |     });
203 | 
204 |     it("should treat cached tokens as free when no cache rate specified", () => {
205 |       // basePricing carries no cacheReadInputTokenCost.
206 |       const cost = calculateCost(basePricing, 1000, 500, 800);
207 | 
208 |       // Input cost covers just the 200 uncached tokens.
209 |       expect(cost.inputCost).toBeCloseTo(0.0006);
210 |       expect(cost.cacheReadCost).toBe(0);
211 |     });
212 | 
213 |     it("should handle all tokens being cached", () => {
214 |       const cost = calculateCost(pricingWithCache, 1000, 500, 1000);
215 | 
216 |       expect(cost.inputCost).toBe(0);
217 |       expect(cost.cacheReadCost).toBe(0.0003); // 1000 * $0.30/MTok
218 |     });
219 |   });
220 | 
221 |   describe("edge cases", () => {
222 |     it("should handle zero tokens", () => {
223 |       const cost = calculateCost(basePricing, 0, 0, 0);
224 | 
225 |       const parts = ["inputCost", "outputCost", "cacheReadCost", "totalCost"] as const;
226 |       for (const part of parts) {
227 |         expect(cost[part]).toBe(0);
228 |       }
229 |     });
230 | 
231 |     it("should handle large token counts", () => {
232 |       const cost = calculateCost(basePricing, 1_000_000, 500_000, 0);
233 | 
234 |       expect(cost.inputCost).toBe(3); // 1M * $3/MTok
235 |       expect(cost.outputCost).toBe(7.5); // 500K * $15/MTok
236 |       expect(cost.totalCost).toBe(10.5);
237 |     });
238 | 
239 |     it("should handle pricing with zero costs", () => {
240 |       const freePricing: ModelPricing = {
241 |         inputCostPerToken: 0,
242 |         outputCostPerToken: 0,
243 |       };
244 |       expect(calculateCost(freePricing, 1000, 500, 0).totalCost).toBe(0);
245 |     });
246 |   });
247 | });
248 | 
249 | describe("formatCost", () => {
250 |   // Each row pairs a dollar amount with the text formatCost should produce.
251 |   const check = (rows: Array<[number, string]>) => {
252 |     for (const [cost, rendered] of rows) {
253 |       expect(formatCost(cost)).toBe(rendered);
254 |     }
255 |   };
256 | 
257 |   it('should format zero as "$0.00"', () => {
258 |     check([[0, "$0.00"]]);
259 |   });
260 | 
261 |   it("should format very small costs with 6 decimal places", () => {
262 |     check([[0.000123, "$0.000123"], [0.001, "$0.001000"], [0.0099, "$0.009900"]]);
263 |   });
264 | 
265 |   it("should format small costs with 4 decimal places", () => {
266 |     check([[0.01, "$0.0100"], [0.1234, "$0.1234"], [0.99, "$0.9900"]]);
267 |   });
268 | 
269 |   it("should format costs >= $1 with 2 decimal places", () => {
270 |     check([[1, "$1.00"], [1.234, "$1.23"], [10.5, "$10.50"], [100, "$100.00"]]);
271 |   });
272 | });
273 | 
274 | describe("formatMTokCost", () => {
275 |   // Input is a dollar amount per million tokens; output is its display label.
276 |   const shows = (perMTok: number, label: string) => expect(formatMTokCost(perMTok)).toBe(label);
277 |   it('should format zero as "$0"', () => shows(0, "$0"));
278 | 
279 |   it("should format very small per-MTok costs with 4 decimal places", () => {
280 |     shows(0.001, "$0.0010");
281 |     shows(0.0099, "$0.0099");
282 |   });
283 | 
284 |   it("should format per-MTok costs >= $0.01 with 2 decimal places", () => {
285 |     shows(0.01, "$0.01");
286 |     shows(0.3, "$0.30");
287 |     shows(3, "$3.00");
288 |     shows(15, "$15.00");
289 |   });
290 | });
291 | 
292 | describe("getModelPricingDisplay", () => {
293 |   // Shared per-token rates: $3 per MTok in, $15 per MTok out.
294 |   const rates = {
295 |     inputCostPerToken: 0.000003,
296 |     outputCostPerToken: 0.000015,
297 |   };
298 | 
299 |   it("should convert per-token costs to per-MTok", () => {
300 |     const pricing: ModelPricing = { ...rates };
301 | 
302 |     const { inputCostPerMTok, outputCostPerMTok, cacheReadCostPerMTok } =
303 |       getModelPricingDisplay(pricing);
304 | 
305 |     expect(inputCostPerMTok).toBe(3);
306 |     expect(outputCostPerMTok).toBe(15);
307 |     expect(cacheReadCostPerMTok).toBeUndefined();
308 |   });
309 | 
310 |   it("should include cache read cost when available", () => {
311 |     const pricing: ModelPricing = {
312 |       ...rates,
313 |       cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok
314 |     };
315 | 
316 |     const display = getModelPricingDisplay(pricing);
317 | 
318 |     expect(display.inputCostPerMTok).toBe(3);
319 |     expect(display.outputCostPerMTok).toBe(15);
320 |     expect(display.cacheReadCostPerMTok).toBe(0.3);
321 |   });
322 | 
323 |   it("should handle zero costs", () => {
324 |     const freePricing: ModelPricing = {
325 |       inputCostPerToken: 0,
326 |       outputCostPerToken: 0,
327 |     };
328 | 
329 |     const display = getModelPricingDisplay(freePricing);
330 | 
331 |     expect(display.inputCostPerMTok).toBe(0);
332 |     expect(display.outputCostPerMTok).toBe(0);
333 |   });
334 | 
335 |   it("should preserve explicit zero cost for cache read", () => {
336 |     // An explicit 0 must come back as 0, not undefined.
337 |     const pricing: ModelPricing = { ...rates, cacheReadInputTokenCost: 0 };
338 | 
339 |     const display = getModelPricingDisplay(pricing);
340 | 
341 |     expect(display.inputCostPerMTok).toBe(3);
342 |     expect(display.outputCostPerMTok).toBe(15);
343 |     expect(display.cacheReadCostPerMTok).toBe(0);
344 |   });
345 | });
346 | 


--------------------------------------------------------------------------------
/lib/report-styles.ts:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Returns the HTML report's stylesheet as a single CSS string: light-theme variables on `:root`, dark overrides on `[data-theme="dark"]`, then component rules.
  3 |  * NOTE(review): `.output` is matched by two rules (alone, and with `.input`), so both declaration sets apply to any `.output` element — confirm intended. */
  4 | export function getReportStyles(): string {
  5 |   return `
  6 |     :root {
  7 |       --bg: #f8f8f8;
  8 |       --surface: #ffffff;
  9 |       --text: #24292e;
 10 |       --text-muted: #6a737d;
 11 |       --border: #e1e4e8;
 12 |       --success: #238636;
 13 |       --error: #cf222e;
 14 |       --warning: #9a6700;
 15 |       --tool: #8250df;
 16 |       --mcp-enabled: #0969da;
 17 |       --mcp-disabled: #6a737d;
 18 |       --notice-bg: #ddf4ff;
 19 |       --notice-border: #54aeff;
 20 |       --passed-bg: #dafbe1;
 21 |       --passed-border: #238636;
 22 |       --failed-bg: #ffebe9;
 23 |       --failed-border: #cf222e;
 24 |       --skipped-bg: #fff8c5;
 25 |       --skipped-border: #9a6700;
 26 |     }
 27 | 
 28 |     [data-theme="dark"] {
 29 |       --bg: #0d1117;
 30 |       --surface: #161b22;
 31 |       --text: #e6edf3;
 32 |       --text-muted: #8b949e;
 33 |       --border: #30363d;
 34 |       --success: #3fb950;
 35 |       --error: #f85149;
 36 |       --warning: #d29922;
 37 |       --tool: #a371f7;
 38 |       --mcp-enabled: #58a6ff;
 39 |       --mcp-disabled: #8b949e;
 40 |       --notice-bg: #1c2d41;
 41 |       --notice-border: #388bfd;
 42 |       --passed-bg: #1a3d24;
 43 |       --passed-border: #3fb950;
 44 |       --failed-bg: #3d1a1a;
 45 |       --failed-border: #f85149;
 46 |       --skipped-bg: #3d3514;
 47 |       --skipped-border: #d29922;
 48 |     }
 49 | 
 50 |     * {
 51 |       margin: 0;
 52 |       padding: 0;
 53 |       box-sizing: border-box;
 54 |     }
 55 | 
 56 |     html {
 57 |       background: var(--bg);
 58 |       color: var(--text);
 59 |       font-family: 'JetBrains Mono', 'SF Mono', 'Monaco', 'Menlo', monospace;
 60 |       font-size: 13px;
 61 |       line-height: 1.4;
 62 |     }
 63 | 
 64 |     body {
 65 |       padding: 12px;
 66 |       max-width: 1200px;
 67 |       margin: 0 auto;
 68 |     }
 69 | 
 70 |     header {
 71 |       background: var(--surface);
 72 |       border: 1px solid var(--border);
 73 |       border-radius: 4px;
 74 |       padding: 12px;
 75 |       margin-bottom: 12px;
 76 |     }
 77 | 
 78 |     .header-top {
 79 |       display: flex;
 80 |       justify-content: space-between;
 81 |       align-items: center;
 82 |       margin-bottom: 8px;
 83 |     }
 84 | 
 85 |     h1 {
 86 |       font-size: 16px;
 87 |       font-weight: 600;
 88 |       display: flex;
 89 |       align-items: center;
 90 |       gap: 8px;
 91 |     }
 92 | 
 93 |     .meta {
 94 |       font-size: 12px;
 95 |       color: var(--text-muted);
 96 |     }
 97 | 
 98 |     .summary-bar {
 99 |       display: flex;
100 |       gap: 16px;
101 |       padding-top: 8px;
102 |       border-top: 1px solid var(--border);
103 |       margin-top: 8px;
104 |     }
105 | 
106 |     .summary-item {
107 |       display: flex;
108 |       align-items: center;
109 |       gap: 4px;
110 |       font-size: 12px;
111 |     }
112 | 
113 |     .summary-item.passed { color: var(--success); }
114 |     .summary-item.failed { color: var(--error); }
115 |     .summary-item.skipped { color: var(--warning); }
116 | 
117 |     .mcp-badge {
118 |       font-size: 11px;
119 |       padding: 2px 6px;
120 |       border-radius: 3px;
121 |       font-weight: 500;
122 |       white-space: nowrap;
123 |     }
124 | 
125 |     .mcp-badge.enabled {
126 |       background: var(--mcp-enabled);
127 |       color: white;
128 |     }
129 | 
130 |     .mcp-badge.disabled {
131 |       background: var(--bg);
132 |       border: 1px solid var(--border);
133 |       color: var(--text-muted);
134 |     }
135 | 
136 |     .mcp-notice {
137 |       background: var(--notice-bg);
138 |       border: 1px solid var(--notice-border);
139 |       border-radius: 4px;
140 |       padding: 10px 12px;
141 |       margin-bottom: 12px;
142 |       display: flex;
143 |       align-items: center;
144 |       gap: 8px;
145 |       font-size: 13px;
146 |     }
147 | 
148 |     .notice-icon {
149 |       font-size: 16px;
150 |       flex-shrink: 0;
151 |     }
152 | 
153 |     .notice-text {
154 |       color: var(--text);
155 |       line-height: 1.5;
156 |     }
157 | 
158 |     .theme-toggle {
159 |       background: none;
160 |       border: 1px solid var(--border);
161 |       border-radius: 3px;
162 |       color: var(--text);
163 |       cursor: pointer;
164 |       padding: 4px 8px;
165 |       font-size: 16px;
166 |     }
167 | 
168 |     .theme-toggle:hover {
169 |       background: var(--border);
170 |     }
171 | 
172 |     /* Test Section Styles */
173 |     .test-section {
174 |       background: var(--surface);
175 |       border: 1px solid var(--border);
176 |       border-radius: 4px;
177 |       margin-bottom: 12px;
178 |     }
179 | 
180 |     .test-section.passed {
181 |       border-left: 3px solid var(--success);
182 |     }
183 | 
184 |     .test-section.failed {
185 |       border-left: 3px solid var(--error);
186 |     }
187 | 
188 |     .test-section.skipped {
189 |       border-left: 3px solid var(--warning);
190 |     }
191 | 
192 |     .test-header {
193 |       display: flex;
194 |       align-items: center;
195 |       gap: 8px;
196 |       padding: 12px;
197 |       cursor: pointer;
198 |       user-select: none;
199 |       list-style: none;
200 |     }
201 | 
202 |     .test-header::-webkit-details-marker {
203 |       display: none;
204 |     }
205 | 
206 |     .test-header:hover {
207 |       background: var(--bg);
208 |     }
209 | 
210 |     .test-status {
211 |       font-size: 16px;
212 |       font-weight: bold;
213 |     }
214 | 
215 |     .test-status.passed { color: var(--success); }
216 |     .test-status.failed { color: var(--error); }
217 |     .test-status.skipped { color: var(--warning); }
218 | 
219 |     .test-name {
220 |       font-weight: 600;
221 |       font-size: 14px;
222 |     }
223 | 
224 |     .test-meta {
225 |       margin-left: auto;
226 |       color: var(--text-muted);
227 |       font-size: 12px;
228 |     }
229 | 
230 |     .test-content {
231 |       padding: 12px;
232 |       border-top: 1px solid var(--border);
233 |     }
234 | 
235 |     .test-content h4 {
236 |       font-size: 13px;
237 |       font-weight: 600;
238 |       margin-bottom: 8px;
239 |       color: var(--text-muted);
240 |     }
241 | 
242 |     .prompt-section {
243 |       margin-bottom: 16px;
244 |     }
245 | 
246 |     .prompt-section summary {
247 |       cursor: pointer;
248 |       padding: 8px;
249 |       background: var(--bg);
250 |       border: 1px solid var(--border);
251 |       border-radius: 3px;
252 |       font-weight: 600;
253 |     }
254 | 
255 |     .prompt-text {
256 |       padding: 12px;
257 |       background: var(--bg);
258 |       border: 1px solid var(--border);
259 |       border-top: none;
260 |       border-radius: 0 0 3px 3px;
261 |       white-space: pre-wrap;
262 |       font-size: 12px;
263 |       max-height: 300px;
264 |       overflow-y: auto;
265 |     }
266 | 
267 |     .steps-section {
268 |       margin-bottom: 16px;
269 |     }
270 | 
271 |     .output-section {
272 |       margin-bottom: 16px;
273 |     }
274 | 
275 |     .token-summary {
276 |       background: var(--bg);
277 |       border: 1px solid var(--border);
278 |       border-radius: 4px;
279 |       padding: 12px;
280 |       margin-bottom: 16px;
281 |       display: flex;
282 |       align-items: center;
283 |       justify-content: space-between;
284 |     }
285 | 
286 |     .token-summary h4 {
287 |       margin: 0;
288 |       font-size: 13px;
289 |       font-weight: 600;
290 |       color: var(--text-muted);
291 |     }
292 | 
293 |     .token-count {
294 |       font-size: 16px;
295 |       font-weight: 600;
296 |       color: var(--text);
297 |     }
298 | 
299 |     .component-preview {
300 |       position: relative;
301 |     }
302 | 
303 |     .expand-button {
304 |       display: block;
305 |       width: 100%;
306 |       padding: 8px;
307 |       margin-top: 8px;
308 |       background: var(--bg);
309 |       border: 1px solid var(--border);
310 |       border-radius: 3px;
311 |       color: var(--text);
312 |       font-size: 12px;
313 |       font-weight: 600;
314 |       cursor: pointer;
315 |       font-family: inherit;
316 |       transition: background 0.2s;
317 |     }
318 | 
319 |     .expand-button:hover {
320 |       background: var(--border);
321 |     }
322 | 
323 |     .verification-section {
324 |       margin-top: 16px;
325 |     }
326 | 
327 |     /* Step Styles */
328 |     .step {
329 |       background: var(--surface);
330 |       border: 1px solid var(--border);
331 |       border-radius: 4px;
332 |       margin-bottom: 8px;
333 |     }
334 | 
335 |     .step-header {
336 |       display: flex;
337 |       align-items: center;
338 |       gap: 8px;
339 |       padding: 8px 12px;
340 |       cursor: pointer;
341 |       user-select: none;
342 |       list-style: none;
343 |     }
344 | 
345 |     .step-header::-webkit-details-marker {
346 |       display: none;
347 |     }
348 | 
349 |     .step-header:hover {
350 |       background: var(--bg);
351 |     }
352 | 
353 |     .step-num {
354 |       font-weight: 600;
355 |     }
356 | 
357 |     .line {
358 |       flex: 1;
359 |       height: 1px;
360 |       background: var(--border);
361 |     }
362 | 
363 |     .tokens {
364 |       color: var(--text-muted);
365 |       cursor: help;
366 |       border-bottom: 1px dotted var(--text-muted);
367 |     }
368 | 
369 |     .output {
370 |       color: var(--text);
371 |       cursor: help;
372 |       border-bottom: 1px dotted var(--text-muted);
373 |     }
374 | 
375 |     .reason {
376 |       color: var(--text-muted);
377 |       font-size: 12px;
378 |     }
379 | 
380 |     .step-content {
381 |       padding: 12px;
382 |       border-top: 1px solid var(--border);
383 |     }
384 | 
385 |     .text {
386 |       white-space: pre-wrap;
387 |       margin-bottom: 8px;
388 |       padding-left: 8px;
389 |       border-left: 2px solid var(--border);
390 |     }
391 | 
392 |     .tool,
393 |     .result {
394 |       margin: 8px 0;
395 |       border: 1px solid var(--border);
396 |       border-radius: 3px;
397 |     }
398 | 
399 |     .tool summary,
400 |     .result summary {
401 |       padding: 4px 8px;
402 |       cursor: pointer;
403 |       user-select: none;
404 |       list-style: none;
405 |     }
406 | 
407 |     .tool summary::-webkit-details-marker,
408 |     .result summary::-webkit-details-marker {
409 |       display: none;
410 |     }
411 | 
412 |     .tool summary:hover,
413 |     .result summary:hover {
414 |       background: var(--bg);
415 |     }
416 | 
417 |     .arrow {
418 |       color: var(--tool);
419 |     }
420 | 
421 |     .tool-name {
422 |       font-weight: 600;
423 |     }
424 | 
425 |     .status {
426 |       font-weight: 600;
427 |     }
428 | 
429 |     .status.success {
430 |       color: var(--success);
431 |     }
432 | 
433 |     .status.error {
434 |       color: var(--error);
435 |     }
436 | 
437 |     .result.error {
438 |       border-color: var(--error);
439 |     }
440 | 
441 |     .input,
442 |     .output {
443 |       padding: 8px;
444 |       background: var(--bg);
445 |       border-top: 1px solid var(--border);
446 |       overflow-x: auto;
447 |       font-size: 12px;
448 |     }
449 | 
450 |     .code {
451 |       padding: 8px;
452 |       background: var(--bg);
453 |       border: 1px solid var(--border);
454 |       border-radius: 3px;
455 |       overflow-x: auto;
456 |       font-size: 12px;
457 |       white-space: pre-wrap;
458 |     }
459 | 
460 |     .code-preview,
461 |     .code-full {
462 |       max-height: none;
463 |       overflow-y: visible;
464 |     }
465 | 
466 |     /* Verification Styles */
467 |     .verification-result {
468 |       padding: 12px;
469 |       border-radius: 4px;
470 |       border: 1px solid var(--border);
471 |     }
472 | 
473 |     .verification-result.passed {
474 |       background: var(--passed-bg);
475 |       border-color: var(--passed-border);
476 |     }
477 | 
478 |     .verification-result.failed {
479 |       background: var(--failed-bg);
480 |       border-color: var(--failed-border);
481 |     }
482 | 
483 |     .verification-result.skipped {
484 |       background: var(--skipped-bg);
485 |       border-color: var(--skipped-border);
486 |     }
487 | 
488 |     .verification-header {
489 |       display: flex;
490 |       align-items: center;
491 |       gap: 8px;
492 |     }
493 | 
494 |     .verification-icon {
495 |       font-size: 18px;
496 |       font-weight: bold;
497 |     }
498 | 
499 |     .verification-result.passed .verification-icon { color: var(--success); }
500 |     .verification-result.failed .verification-icon { color: var(--error); }
501 |     .verification-result.skipped .verification-icon { color: var(--warning); }
502 | 
503 |     .verification-text {
504 |       font-weight: 600;
505 |     }
506 | 
507 |     .verification-stats {
508 |       margin-left: auto;
509 |       color: var(--text-muted);
510 |       font-size: 12px;
511 |     }
512 | 
513 |     .verification-error {
514 |       margin-top: 8px;
515 |       padding: 8px;
516 |       background: var(--bg);
517 |       border-radius: 3px;
518 |       font-size: 12px;
519 |       color: var(--error);
520 |     }
521 | 
522 |     .failed-tests-details {
523 |       margin-top: 12px;
524 |     }
525 | 
526 |     .failed-tests-details summary {
527 |       cursor: pointer;
528 |       font-weight: 600;
529 |       padding: 4px 0;
530 |     }
531 | 
532 |     .failed-tests-list {
533 |       list-style: none;
534 |       margin-top: 8px;
535 |     }
536 | 
537 |     .failed-test {
538 |       margin-bottom: 12px;
539 |       padding: 8px;
540 |       background: var(--bg);
541 |       border-radius: 3px;
542 |     }
543 | 
544 |     .failed-test-name {
545 |       font-weight: 600;
546 |       margin-bottom: 4px;
547 |       color: var(--error);
548 |     }
549 | 
550 |     .failed-test-error {
551 |       font-size: 11px;
552 |       white-space: pre-wrap;
553 |       max-height: 200px;
554 |       overflow-y: auto;
555 |       background: var(--surface);
556 |       padding: 8px;
557 |       border-radius: 3px;
558 |     }
559 | 
560 |     @media (max-width: 768px) {
561 |       body {
562 |         padding: 8px;
563 |       }
564 |     }
565 |   `;
566 | }
567 | 


--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
  1 | import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai";
  2 | import { experimental_createMCPClient as createMCPClient } from "./node_modules/@ai-sdk/mcp/dist/index.mjs";
  3 | import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs";
  4 | import { writeFileSync, mkdirSync, existsSync } from "node:fs";
  5 | import {
  6 |   generateReport,
  7 |   type SingleTestResult,
  8 | } from "./lib/report.ts";
  9 | import {
 10 |   getTimestampedFilename,
 11 |   isHttpUrl,
 12 |   extractResultWriteContent,
 13 |   calculateTotalCost,
 14 | } from "./lib/utils.ts";
 15 | import {
 16 |   discoverTests,
 17 |   buildAgentPrompt,
 18 |   type TestDefinition,
 19 | } from "./lib/test-discovery.ts";
 20 | import {
 21 |   setupOutputsDirectory,
 22 |   cleanupOutputsDirectory,
 23 |   cleanupTestEnvironment,
 24 |   runTestVerification,
 25 | } from "./lib/output-test-runner.ts";
 26 | import { resultWriteTool, testComponentTool } from "./lib/tools/index.ts";
 27 | import {
 28 |   buildPricingMap,
 29 |   lookupPricingFromMap,
 30 |   getModelPricingDisplay,
 31 |   formatCost,
 32 |   formatMTokCost,
 33 |   type ModelPricingLookup,
 34 |   type GatewayModel,
 35 | } from "./lib/pricing.ts";
 36 | import type { LanguageModel } from "ai";
 37 | import {
 38 |   intro,
 39 |   multiselect,
 40 |   isCancel,
 41 |   cancel,
 42 |   text,
 43 |   select,
 44 |   confirm,
 45 |   note,
 46 | } from "@clack/prompts";
 47 | import { gateway } from "ai";
 48 | 
 49 | /**
 50 |  * Reports which selected models have pricing data and asks how to proceed.
 51 |  *
 52 |  * - All models priced: lists input/output rates, then asks whether to enable
 53 |  *   cost calculation.
 54 |  * - Any model unpriced: lists both groups and asks whether to continue with
 55 |  *   cost calculation disabled.
 56 |  *
 57 |  * Cancelling a prompt (or declining to continue) exits the process with code 0.
 58 |  *
 59 |  * @returns `enabled` (whether to calculate costs) and the per-model `lookups`.
 60 |  */
 61 | async function validateAndConfirmPricing(
 62 |   models: string[],
 63 |   pricingMap: Map<string, ModelPricingLookup | null>,
 64 | ) {
 65 |   const lookups = new Map<string, ModelPricingLookup | null>();
 66 |   const priced: string[] = [];
 67 |   const unpriced: string[] = [];
 68 | 
 69 |   // Resolve every model once, sorting its id into the matching bucket.
 70 |   for (const id of models) {
 71 |     const found = lookupPricingFromMap(id, pricingMap);
 72 |     lookups.set(id, found);
 73 |     (found !== null ? priced : unpriced).push(id);
 74 |   }
 75 | 
 76 |   if (unpriced.length > 0) {
 77 |     const lines = ["No pricing found for:"];
 78 |     for (const id of unpriced) {
 79 |       lines.push(`  ✗ ${id}`);
 80 |     }
 81 |     if (priced.length > 0) {
 82 |       lines.push("", "Pricing available for:");
 83 |       for (const id of priced) {
 84 |         const { inputCostPerMTok } = getModelPricingDisplay(lookups.get(id)!.pricing);
 85 |         lines.push(`  ✓ ${id} (${formatMTokCost(inputCostPerMTok)}/MTok in)`);
 86 |       }
 87 |     }
 88 | 
 89 |     lines.push("", "Cost calculation will be disabled.");
 90 |     note(lines.join("\n"), "⚠️  Pricing Incomplete");
 91 | 
 92 |     const proceed = await confirm({
 93 |       message: "Continue without pricing?",
 94 |       initialValue: true,
 95 |     });
 96 |     if (isCancel(proceed) || !proceed) {
 97 |       cancel("Operation cancelled.");
 98 |       process.exit(0);
 99 |     }
100 |     return { enabled: false, lookups };
101 |   }
102 | 
103 |   // Every model is priced: show "<id> → <in>/MTok in, <out>/MTok out" per model.
104 |   const summary = models
105 |     .map((id) => {
106 |       const rates = getModelPricingDisplay(lookups.get(id)!.pricing);
107 |       return `${id}\n  → ${formatMTokCost(rates.inputCostPerMTok)}/MTok in, ${formatMTokCost(rates.outputCostPerMTok)}/MTok out`;
108 |     })
109 |     .join("\n\n");
110 |   note(summary, "💰 Pricing Found");
111 | 
112 |   const usePricing = await confirm({
113 |     message: "Enable cost calculation?",
114 |     initialValue: true,
115 |   });
116 |   if (isCancel(usePricing)) {
117 |     cancel("Operation cancelled.");
118 |     process.exit(0);
119 |   }
120 | 
121 |   return { enabled: usePricing, lookups };
122 | }
123 | 
124 | /**
125 |  * Interactive setup: asks which models to benchmark, confirms pricing, picks
126 |  * the MCP integration (none, HTTP or StdIO) and whether the model is given the
127 |  * testing tool. Cancelling any prompt exits the process with code 0.
128 |  *
129 |  * @returns The selected model ids, the MCP url/command (`undefined` when MCP
130 |  *   is off), the testing-tool flag and the pricing confirmation result.
131 |  */
132 | async function selectOptions() {
133 |   intro("🚀 Svelte AI Bench");
134 | 
135 |   const availableModels = await gateway.getAvailableModels();
136 |   const pricingMap = buildPricingMap(availableModels.models as GatewayModel[]);
137 | 
138 |   // "Custom" is offered first, followed by every language model on the gateway.
139 |   const languageModelOptions = availableModels.models
140 |     .filter((model) => model.modelType === "language")
141 |     .map((model) => ({ value: model.id, label: model.name }));
142 | 
143 |   const models = await multiselect({
144 |     message: "Select model(s) to benchmark",
145 |     options: [{ value: "custom", label: "Custom" }, ...languageModelOptions],
146 |   });
147 |   if (isCancel(models)) {
148 |     cancel("Operation cancelled.");
149 |     process.exit(0);
150 |   }
151 | 
152 |   if (models.includes("custom")) {
153 |     const customModel = await text({
154 |       message: "Enter custom model id",
155 |     });
156 |     if (isCancel(customModel)) {
157 |       cancel("Operation cancelled.");
158 |       process.exit(0);
159 |     }
160 |     models.push(customModel);
161 |   }
162 | 
163 |   // The "custom" entry is only a placeholder, never a real model id.
164 |   const selectedModels = models.filter((id) => id !== "custom");
165 | 
166 |   const pricing = await validateAndConfirmPricing(selectedModels, pricingMap);
167 | 
168 |   const mcpIntegration = await select({
169 |     message: "Which MCP integration to use?",
170 |     options: [
171 |       { value: "none", label: "No MCP Integration" },
172 |       { value: "http", label: "MCP over HTTP" },
173 |       { value: "stdio", label: "MCP over StdIO" },
174 |     ],
175 |   });
176 |   if (isCancel(mcpIntegration)) {
177 |     cancel("Operation cancelled.");
178 |     process.exit(0);
179 |   }
180 | 
181 |   // Stays undefined when no MCP integration was chosen.
182 |   let mcp: string | undefined;
183 | 
184 |   if (mcpIntegration !== "none") {
185 |     const custom = await confirm({
186 |       message: "Do you want to provide a custom MCP server/command?",
187 |       initialValue: false,
188 |     });
189 |     if (isCancel(custom)) {
190 |       cancel("Operation cancelled.");
191 |       process.exit(0);
192 |     }
193 | 
194 |     if (!custom) {
195 |       // Default endpoint (HTTP) or default launch command (StdIO).
196 |       mcp =
197 |         mcpIntegration === "http"
198 |           ? "https://mcp.svelte.dev/mcp"
199 |           : "npx -y @sveltejs/mcp";
200 |     } else {
201 |       const customMcp = await text({
202 |         message: "Insert custom url or command",
203 |       });
204 |       if (isCancel(customMcp)) {
205 |         cancel("Operation cancelled.");
206 |         process.exit(0);
207 |       }
208 | 
209 |       mcp = customMcp;
210 |     }
211 |   }
212 | 
213 |   const testingTool = await confirm({
214 |     message: "Do you want to provide the testing tool to the model?",
215 |     initialValue: true,
216 |   });
217 |   if (isCancel(testingTool)) {
218 |     cancel("Operation cancelled.");
219 |     process.exit(0);
220 |   }
221 | 
222 |   return {
223 |     models: selectedModels,
224 |     mcp,
225 |     testingTool,
226 |     pricing,
227 |   };
228 | }
229 | 
230 | /** Splits a command line into executable + args; quoted ("..." / '...') segments keep their spaces and lose the quotes. */
231 | function parseCommandString(commandString: string): {
232 |   command: string;
233 |   args: string[];
234 | } {
235 |   const tokens = commandString.match(/(?:[^\s"']+|"[^"]*"|'[^']*'|["'])+/g) ?? []; // unbalanced quotes stay literal
236 |   const [command = "", ...args] = tokens.map((t) => t.replace(/"([^"]*)"|'([^']*)'/g, "$1$2")); // strip balanced quotes
237 |   return { command, args };
238 | }
239 | 
/**
 * Runs one benchmark test end to end: builds the agent prompt, lets the agent
 * work until it calls `ResultWrite` or reaches 10 steps, then verifies the
 * written component against the test's suite.
 *
 * @param test - The test definition to run.
 * @param model - Language model handed to the agent.
 * @param mcpClient - Optional MCP client; when present its tools are merged
 *   into the agent's tool set.
 * @param testComponentEnabled - Whether to expose the `TestComponent` tool.
 * @param testIndex - Zero-based position of this test (used for the banner).
 * @param totalTests - Total number of tests (used for the banner).
 * @returns The recorded steps, the component written via `ResultWrite` (or
 *   `null`), and the verification outcome. `verification` is `null` when the
 *   agent never produced a `ResultWrite` output. Any thrown error is caught
 *   and reported as a failed verification carrying the error message.
 */
async function runSingleTest(
  test: TestDefinition,
  model: LanguageModel,
  mcpClient: Awaited<ReturnType<typeof createMCPClient>> | null,
  testComponentEnabled: boolean,
  testIndex: number,
  totalTests: number,
): Promise<SingleTestResult> {
  console.log(`\n[${testIndex + 1}/${totalTests}] Running test: ${test.name}`);
  console.log("─".repeat(50));

  const prompt = buildAgentPrompt(test);

  // Shortens long strings for the verbose per-step log.
  const clip = (value: string, limit: number) =>
    value.length > limit ? value.slice(0, limit) + "..." : value;

  try {
    const tools = {
      ResultWrite: resultWriteTool,
      ...(testComponentEnabled && { TestComponent: testComponentTool(test) }),
      ...(mcpClient ? await mcpClient.tools() : {}),
    };

    let stepCounter = 0;
    const agent = new Agent({
      model,
      stopWhen: [hasToolCall("ResultWrite"), stepCountIs(10)],
      tools,
      onStepFinish: (step) => {
        // Step tracing is opt-in via VERBOSE_LOGGING=true.
        const verbose = process.env.VERBOSE_LOGGING === "true";
        if (!verbose) {
          return;
        }

        stepCounter += 1;
        console.log(`  Step ${stepCounter}:`);

        if (step.text) {
          console.log(`💬 Text: ${clip(step.text, 100)}`);
        }

        for (const call of step.toolCalls ?? []) {
          if (call) {
            console.log(`🔧 Tool call: ${call.toolName}`);
          }
        }

        for (const toolResult of step.toolResults ?? []) {
          if (toolResult && "output" in toolResult) {
            const serialized = JSON.stringify(toolResult.output);
            console.log(`📤 Tool result: ${clip(serialized, 80)}`);
          }
        }
      },
    });

    console.log("  ⏳ Running agent...");
    if (testComponentEnabled) {
      console.log("  📋 TestComponent tool is available");
    }
    const agentRun = await agent.generate({ prompt });
    const recordedSteps =
      agentRun.steps as unknown as SingleTestResult["steps"];

    const resultWriteContent = extractResultWriteContent(agentRun.steps);

    // Without a ResultWrite payload there is nothing to verify.
    if (!resultWriteContent) {
      console.log("  ⚠️  No ResultWrite output found");
      return {
        testName: test.name,
        prompt: test.prompt,
        steps: recordedSteps,
        resultWriteContent: null,
        verification: null,
      };
    }

    console.log("  ✓ Component generated");
    console.log("  ⏳ Verifying against tests...");
    const verification = await runTestVerification(test, resultWriteContent);
    const { numTests, numPassed, numFailed } = verification;

    if (!verification.passed) {
      console.log(`✗ Tests failed (${numFailed}/${numTests} failed)`);
      for (const ft of verification.failedTests ?? []) {
        console.log(`- ${ft.fullName}`);
      }
    } else {
      console.log(`✓ All tests passed (${numPassed}/${numTests})`);
    }

    cleanupTestEnvironment(test.name);

    return {
      testName: test.name,
      prompt: test.prompt,
      steps: recordedSteps,
      resultWriteContent,
      verification,
    };
  } catch (error) {
    console.error(`✗ Error running test: ${error}`);
    const message = error instanceof Error ? error.message : String(error);
    // Report the crash as a failed verification so it counts in the summary.
    return {
      testName: test.name,
      prompt: test.prompt,
      steps: [],
      resultWriteContent: null,
      verification: {
        testName: test.name,
        passed: false,
        numTests: 0,
        numPassed: 0,
        numFailed: 0,
        duration: 0,
        error: message,
      },
    };
  }
}
367 | 
/**
 * CLI entry point for the benchmark.
 *
 * Reads the interactive selections (models, optional MCP server, testing tool,
 * pricing), discovers the tests, and runs every test against every selected
 * model. For each model it prints a pass/fail/skip summary, prints a cost
 * summary when pricing is available, writes a timestamped JSON result file
 * into `results/`, and generates the matching HTML report.
 *
 * Exits with status 1 when no tests are found or when any test failed
 * verification for any model, and with status 0 otherwise. The MCP client, if
 * one was created, is always closed before leaving, including when a run throws.
 */
async function main() {
  const { models, mcp, testingTool, pricing } = await selectOptions();

  const mcpServerUrl = mcp;
  const mcpEnabled = !!mcp;

  const testComponentEnabled = testingTool;

  // A non-HTTP value is treated as a command line for a stdio MCP server.
  const isHttpTransport = mcpServerUrl && isHttpUrl(mcpServerUrl);
  const mcpTransportType = isHttpTransport ? "HTTP" : "StdIO";

  console.log("\n╔════════════════════════════════════════════════════╗");
  console.log("║            SvelteBench 2.0 - Multi-Test            ║");
  console.log("╚════════════════════════════════════════════════════╝");

  console.log("\n📋 Models:");
  for (const modelId of models) {
    const lookup = pricing.lookups.get(modelId);
    if (pricing.enabled && lookup) {
      const display = getModelPricingDisplay(lookup.pricing);
      console.log(`   ${modelId}`);
      console.log(
        `      💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`,
      );
    } else {
      console.log(`   ${modelId}`);
    }
  }

  console.log(`\n💰 Pricing: ${pricing.enabled ? "Enabled" : "Disabled"}`);

  console.log(`🔌 MCP Integration: ${mcpEnabled ? "Enabled" : "Disabled"}`);
  if (mcpEnabled) {
    console.log(`   Transport: ${mcpTransportType}`);
    if (isHttpTransport) {
      console.log(`   URL: ${mcpServerUrl}`);
    } else {
      console.log(`   Command: ${mcpServerUrl}`);
    }
  }

  console.log(
    `🧪 TestComponent Tool: ${testComponentEnabled ? "Enabled" : "Disabled"}`,
  );

  console.log("\n📁 Discovering tests...");
  const tests = discoverTests();
  console.log(
    `   Found ${tests.length} test(s): ${tests.map((t) => t.name).join(", ")}`,
  );

  if (tests.length === 0) {
    console.error("No tests found in tests/ directory");
    process.exit(1);
  }

  setupOutputsDirectory();

  let mcpClient: Awaited<ReturnType<typeof createMCPClient>> | null = null;
  let totalFailed = 0;

  try {
    if (mcpEnabled) {
      if (isHttpTransport) {
        mcpClient = await createMCPClient({
          transport: {
            type: "http",
            url: mcpServerUrl,
          },
        });
      } else {
        const { command, args } = parseCommandString(mcpServerUrl!);
        mcpClient = await createMCPClient({
          transport: new StdioMCPTransport({
            command,
            args,
          }),
        });
      }
    }

    for (const modelId of models) {
      console.log("\n" + "═".repeat(50));
      console.log(`🤖 Running benchmark for model: ${modelId}`);
      console.log("═".repeat(50));

      const pricingLookup = pricing.enabled
        ? (pricing.lookups.get(modelId) ?? null)
        : null;

      if (pricingLookup) {
        const display = getModelPricingDisplay(pricingLookup.pricing);
        console.log(
          `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`,
        );
      }

      const model = gateway.languageModel(modelId);

      const testResults = [];
      const startTime = Date.now();

      // Tests run sequentially, one agent session at a time.
      for (let i = 0; i < tests.length; i++) {
        const test = tests[i];
        if (!test) continue;
        const result = await runSingleTest(
          test,
          model,
          mcpClient,
          testComponentEnabled,
          i,
          tests.length,
        );
        testResults.push(result);
      }

      const totalDuration = Date.now() - startTime;

      console.log("\n" + "═".repeat(50));
      console.log("📊 Test Summary");
      console.log("═".repeat(50));

      // A result without a verification object counts as skipped.
      const passed = testResults.filter((r) => r.verification?.passed).length;
      const failed = testResults.filter(
        (r) => r.verification && !r.verification.passed,
      ).length;
      totalFailed += failed;
      const skipped = testResults.filter((r) => !r.verification).length;

      for (const result of testResults) {
        const status = result.verification
          ? result.verification.passed
            ? "✓"
            : "✗"
          : "⊘";
        const statusText = result.verification
          ? result.verification.passed
            ? "PASSED"
            : "FAILED"
          : "SKIPPED";
        console.log(`${status} ${result.testName}: ${statusText}`);
      }

      console.log("─".repeat(50));
      console.log(
        `Total: ${passed} passed, ${failed} failed, ${skipped} skipped (${(totalDuration / 1000).toFixed(1)}s)`,
      );

      let totalCost = null;
      let pricingInfo = null;

      if (pricingLookup) {
        totalCost = calculateTotalCost(testResults, pricingLookup.pricing);
        const pricingDisplay = getModelPricingDisplay(pricingLookup.pricing);
        pricingInfo = {
          inputCostPerMTok: pricingDisplay.inputCostPerMTok,
          outputCostPerMTok: pricingDisplay.outputCostPerMTok,
          cacheReadCostPerMTok: pricingDisplay.cacheReadCostPerMTok,
        };

        console.log("\n💰 Cost Summary");
        console.log("─".repeat(50));
        console.log(
          `Input tokens: ${totalCost.inputTokens.toLocaleString()} (${formatCost(totalCost.inputCost)})`,
        );
        console.log(
          `Output tokens: ${totalCost.outputTokens.toLocaleString()} (${formatCost(totalCost.outputCost)})`,
        );
        if (totalCost.cachedInputTokens > 0) {
          console.log(
            `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()} (${formatCost(totalCost.cacheReadCost)})`,
          );
        }
        console.log(`Total cost: ${formatCost(totalCost.totalCost)}`);
      }

      const resultsDir = "results";
      if (!existsSync(resultsDir)) {
        mkdirSync(resultsDir, { recursive: true });
      }

      const jsonFilename = getTimestampedFilename("result", "json", modelId);
      const htmlFilename = getTimestampedFilename("result", "html", modelId);
      const jsonPath = `${resultsDir}/${jsonFilename}`;
      const htmlPath = `${resultsDir}/${htmlFilename}`;

      const resultData = {
        tests: testResults,
        metadata: {
          mcpEnabled,
          mcpServerUrl: mcpEnabled ? mcpServerUrl! : null,
          mcpTransportType: mcpEnabled ? mcpTransportType : null,
          timestamp: new Date().toISOString(),
          model: modelId,
          pricingKey: pricingLookup?.matchedKey ?? null,
          pricing: pricingInfo,
          totalCost,
        },
      };

      writeFileSync(jsonPath, JSON.stringify(resultData, null, 2));
      console.log(`\n✓ Results saved to ${jsonPath}`);

      await generateReport(jsonPath, htmlPath);
    }
  } finally {
    // Release the MCP transport (stdio child process or HTTP session) even
    // when a model run throws. process.exit() would skip this block, so the
    // exit call stays outside the try/finally.
    if (mcpClient) {
      try {
        await mcpClient.close();
      } catch (closeError) {
        console.error("Failed to close MCP client:", closeError);
      }
    }
  }

  cleanupOutputsDirectory();

  process.exit(totalFailed > 0 ? 1 : 0);
}
577 | 
// Script entry point: start the benchmark. Any rejection that escapes main()
// is logged and turned into a non-zero exit status.
main().catch((fatal) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});
582 | 


--------------------------------------------------------------------------------
/lib/report-template.ts:
--------------------------------------------------------------------------------
  1 | import type { TestVerificationResult } from "./output-test-runner.ts";
  2 | import type { MultiTestResultData, SingleTestResult } from "./report.ts";
  3 | import { getReportStyles } from "./report-styles.ts";
  4 | import { formatCost, formatMTokCost } from "./pricing.ts";
  5 | 
  6 | interface TextBlock { // Content block carrying plain assistant text.
  7 |   type: "text"; // Discriminant used by renderContentBlock to pick the text branch.
  8 |   text: string; // Raw text; HTML-escaped before it is written into the report.
  9 | }
 10 | 
 11 | interface ToolCallBlock { // Content block describing a tool invocation made by the agent.
 12 |   type: "tool-call"; // Discriminant for the tool-call branch of ContentBlock.
 13 |   toolCallId: string; // Not read in this file; presumably pairs the call with its result (verify).
 14 |   toolName: string; // Shown (escaped) as the summary label of the rendered <details>.
 15 |   input: Record<string, unknown>; // Tool arguments; rendered as pretty-printed JSON.
 16 |   dynamic?: boolean; // Not read in this file; meaning comes from the producer of the data (verify).
 17 | }
 18 | 
 19 | interface ToolResultBlock { // Content block holding the output returned by a tool.
 20 |   type: "tool-result"; // Discriminant for the tool-result branch of ContentBlock.
 21 |   toolCallId: string; // Not read in this file; presumably matches the originating call (verify).
 22 |   toolName: string; // Not read by the renderer in this file.
 23 |   input: Record<string, unknown>; // Not read by the renderer in this file.
 24 |   output: { // Rendered in the report as pretty-printed JSON.
 25 |     content: Array<{ type: string; text?: string; [key: string]: unknown }>; // Result parts; only `type` is required, extra keys are allowed.
 26 |     isError?: boolean; // Optional failure flag carried on the output object.
 27 |   };
 28 |   dynamic?: boolean; // Not read in this file; meaning comes from the producer of the data (verify).
 29 | }
 30 | 
 31 | type ContentBlock = TextBlock | ToolCallBlock | ToolResultBlock; // Union discriminated by `type`; renderContentBlock returns "" for anything else.
 32 | 
 33 | interface Step { // One agent step as rendered by renderSteps.
 34 |   content: ContentBlock[]; // Blocks rendered in order; an empty render falls back to "No response".
 35 |   finishReason: string; // Written verbatim into the step header.
 36 |   usage: { // Token counters shown in the step header and its tooltips.
 37 |     inputTokens: number;
 38 |     outputTokens: number;
 39 |     totalTokens: number;
 40 |     cachedInputTokens: number; // Subtracted from inputTokens to display the "new" input count.
 41 |   };
 42 |   [key: string]: unknown; // Allows additional, unmodelled fields on the step object.
 43 | }
 44 | 
 /**
  * Replaces the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`)
  * with their entity equivalents so arbitrary text can be embedded in markup.
  * Every other character is returned unchanged.
  */
 function escapeHtml(text: string) {
   const entities: Record<string, string> = {
     "&": "&amp;",
     "<": "&lt;",
     ">": "&gt;",
     '"': "&quot;",
     "'": "&#039;",
   };
   // The character class matches exactly the keys of `entities`.
   return text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
 }
 59 | 
 /**
  * Formats a timestamp string for display using the `en-US` locale with a
  * short month name and two-digit hour, minute and second. No time zone is
  * passed, so the runtime's local zone is used.
  */
 function formatTimestamp(timestamp: string) {
   const displayOptions: Intl.DateTimeFormatOptions = {
     year: "numeric",
     month: "short",
     day: "numeric",
     hour: "2-digit",
     minute: "2-digit",
     second: "2-digit",
   };
   return new Date(timestamp).toLocaleString("en-US", displayOptions);
 }
 71 | 
 /**
  * Returns the leading `numLines` lines of `code`, split and re-joined on
  * "\n". Input with fewer lines is returned whole.
  */
 function getFirstLines(code: string, numLines: number) {
   return code.split("\n").slice(0, numLines).join("\n");
 }
 76 | 
 /**
  * Renders one content block of an agent step as an HTML fragment.
  *
  * - `text` blocks become an escaped `<div class="text">`.
  * - `tool-call` blocks become a collapsible element showing the tool name
  *   and its JSON-formatted input.
  * - `tool-result` blocks become a collapsible element showing the
  *   JSON-formatted output. The result is marked as an error when the output
  *   object has an `error` key or carries `isError: true`.
  *
  * @returns The HTML fragment, or an empty string for an unrecognised type.
  */
 function renderContentBlock(block: ContentBlock) {
   switch (block.type) {
     case "text":
       return `<div class="text">${escapeHtml(block.text)}</div>`;

     case "tool-call": {
       // JSON.stringify yields undefined for a missing value; fall back to ""
       // so escaping never receives a non-string.
       const inputJson = JSON.stringify(block.input, null, 2) ?? "";
       return `<details class="tool">
  <summary><span class="arrow">→</span> <span class="tool-name">${escapeHtml(block.toolName)}</span></summary>
  <pre class="input">${escapeHtml(inputJson)}</pre>
</details>`;
     }

     case "tool-result": {
       // Stored reports are plain JSON, so inspect the output defensively.
       const output: unknown = block.output;
       const outputText = JSON.stringify(output, null, 2) ?? "";
       const isError =
         typeof output === "object" &&
         output !== null &&
         ("error" in output ||
           ("isError" in output && output.isError === true));
       const statusIcon = isError ? "✗" : "✓";
       return `<details class="result ${isError ? "error" : ""}">
  <summary><span class="status ${isError ? "error" : "success"}">${statusIcon}</span> Output</summary>
  <pre class="output">${escapeHtml(outputText)}</pre>
</details>`;
     }

     default:
       return "";
   }
 }
100 | 
/**
 * Renders the verification panel for one test.
 *
 * @param verification - Outcome of running the test suite, or `null` when
 *   verification did not run.
 * @returns HTML for a "skipped" panel when there is no verification.
 *   Otherwise a passed/failed panel with the pass count, the duration, an
 *   optional error line, and a collapsible list of failed tests.
 */
function renderVerificationResult(
  verification: TestVerificationResult | null,
) {
  if (!verification) {
    return `<div class="verification-result skipped">
      <span class="verification-icon">⊘</span>
      <span class="verification-text">Test verification not run</span>
    </div>`;
  }

  const [statusClass, statusIcon, statusText] = verification.passed
    ? ["passed", "✓", "All tests passed"]
    : ["failed", "✗", "Tests failed"];

  // Collapsible list of failures; omitted entirely when nothing failed.
  const failures = verification.failedTests;
  const failedTestsHtml =
    failures && failures.length > 0
      ? `<details class="failed-tests-details">
      <summary>Failed Tests (${failures.length})</summary>
      <ul class="failed-tests-list">${failures
        .map(
          (ft) => `<li class="failed-test">
          <div class="failed-test-name">${escapeHtml(ft.fullName)}</div>
          <pre class="failed-test-error">${escapeHtml(ft.errorMessage)}</pre>
        </li>`,
        )
        .join("")}</ul>
    </details>`
      : "";

  const errorHtml = verification.error
    ? `<div class="verification-error">Error: ${escapeHtml(verification.error)}</div>`
    : "";

  return `<div class="verification-result ${statusClass}">
    <div class="verification-header">
      <span class="verification-icon">${statusIcon}</span>
      <span class="verification-text">${statusText}</span>
      <span class="verification-stats">${verification.numPassed}/${verification.numTests} tests (${verification.duration}ms)</span>
    </div>
    ${errorHtml}
    ${failedTestsHtml}
  </div>`;
}
146 | 
/**
 * Renders every agent step as a collapsible `<details>` element.
 *
 * The header shows the step number, total tokens, output tokens (plus cached
 * input tokens when there are any) and the finish reason. The body holds the
 * rendered content blocks, or "No response" when nothing was rendered.
 *
 * Token counters missing from the recorded usage are treated as 0, so the
 * tooltips never show "NaN" or "undefined".
 *
 * @param steps - Steps in execution order.
 * @returns The step elements joined with newlines.
 */
function renderSteps(steps: Step[]) {
  const fmt = (count: number) => count.toLocaleString();

  return steps
    .map((step, index) => {
      const assistantContentHtml =
        step.content.map((block) => renderContentBlock(block)).join("") ||
        '<div class="text">No response</div>';

      // Providers may omit individual counters; default them to 0.
      const inputTokens = step.usage.inputTokens ?? 0;
      const outputTokens = step.usage.outputTokens ?? 0;
      const totalTokens = step.usage.totalTokens ?? 0;
      const cachedTokens = step.usage.cachedInputTokens ?? 0;
      const uncachedInputTokens = inputTokens - cachedTokens;

      const cachedInfo = cachedTokens > 0 ? `, ${fmt(cachedTokens)}⚡` : "";
      const cachedTitle =
        cachedTokens > 0
          ? `Cached input tokens (⚡): ${fmt(cachedTokens)} (not billed)`
          : "No cached tokens";

      return `
    <details class="step">
      <summary class="step-header">
        <span class="step-num">Step ${index + 1}</span>
        <span class="line"></span>
        <span class="tokens" title="Total tokens: ${fmt(totalTokens)}&#10;Input: ${fmt(inputTokens)} (${fmt(uncachedInputTokens)} new + ${fmt(cachedTokens)} cached)&#10;Output: ${fmt(outputTokens)}">${fmt(totalTokens)} tok</span>
        <span class="output" title="Output tokens generated: ${fmt(outputTokens)}&#10;${cachedTitle}">(${fmt(outputTokens)}↑${cachedInfo})</span>
        <span class="reason">${step.finishReason}</span>
      </summary>
      <div class="step-content">
        ${assistantContentHtml}
      </div>
    </details>`;
    })
    .join("\n");
}
179 | 
/**
 * Renders the collapsible report section for a single test: header with
 * status, name, step count and token total, followed by the prompt, the agent
 * steps, the generated component (if any) and the verification panel.
 *
 * @param test - Result of one benchmark test.
 * @param index - Position of the test in the report. It is part of the DOM
 *   ids so that two tests whose names sanitise to the same string still get
 *   distinct preview/full-code elements.
 * @returns The HTML for the section.
 */
function renderTestSection(test: SingleTestResult, index: number) {
  const totalTokens = test.steps.reduce(
    (sum, step) => sum + step.usage.totalTokens,
    0,
  );
  const stepCount = test.steps.length;

  // No verification object means the test was skipped.
  let verificationStatus = "skipped";
  let verificationIcon = "⊘";
  if (test.verification) {
    verificationStatus = test.verification.passed ? "passed" : "failed";
    verificationIcon = test.verification.passed ? "✓" : "✗";
  }

  const stepsHtml = renderSteps(test.steps);
  const verificationHtml = renderVerificationResult(test.verification);

  // Ids contain only [a-zA-Z0-9-], so they are safe inside the onclick
  // attribute's quoted string. The index keeps them unique.
  const componentId = `component-${index}-${test.testName.replace(/[^a-zA-Z0-9]/g, "-")}`;

  const resultWriteHtml = test.resultWriteContent
    ? `<div class="output-section">
        <div class="token-summary">
          <h4>Total Tokens Used</h4>
          <div class="token-count">${totalTokens.toLocaleString()} tokens</div>
        </div>
        <h4>Generated Component</h4>
        <div class="component-preview">
          <pre class="code code-preview" id="${componentId}-preview">${escapeHtml(getFirstLines(test.resultWriteContent, 5))}</pre>
          <pre class="code code-full" id="${componentId}-full" style="display: none;">${escapeHtml(test.resultWriteContent)}</pre>
          <button class="expand-button" onclick="toggleComponentCode('${componentId}')">
            <span class="expand-text">Show full code</span>
            <span class="collapse-text" style="display: none;">Show less</span>
          </button>
        </div>
      </div>`
    : "";

  return `
  <details class="test-section ${verificationStatus}" open>
    <summary class="test-header">
      <span class="test-status ${verificationStatus}">${verificationIcon}</span>
      <span class="test-name">${escapeHtml(test.testName)}</span>
      <span class="test-meta">${stepCount} steps · ${totalTokens.toLocaleString()} tokens</span>
    </summary>
    <div class="test-content">
      <details class="prompt-section">
        <summary>Prompt</summary>
        <pre class="prompt-text">${escapeHtml(test.prompt)}</pre>
      </details>

      <div class="steps-section">
        <h4>Agent Steps</h4>
        ${stepsHtml}
      </div>

      ${resultWriteHtml}

      <div class="verification-section">
        <h4>Test Verification</h4>
        ${verificationHtml}
      </div>
    </div>
  </details>`;
}
247 | 
/**
 * Renders the "Cost Summary" panel from the report metadata.
 *
 * @param data - Full report data; only `metadata.pricing`,
 *   `metadata.totalCost` and `metadata.pricingKey` are read.
 * @returns An empty string when neither pricing rates nor a cost total were
 *   recorded. Otherwise the panel HTML with the per-MTok rates (if present)
 *   and the cost breakdown (if present).
 */
function renderPricingSection(data: MultiTestResultData) {
  const { pricing, totalCost, pricingKey } = data.metadata;

  if (!(pricing || totalCost)) {
    return "";
  }

  // Per-million-token rates, with the matched pricing key when known.
  let ratesHtml = "";
  if (pricing) {
    const keyBadge = pricingKey
      ? `<span class="pricing-key" title="Key matched in model-pricing.json">${escapeHtml(pricingKey)}</span>`
      : "";
    const cachedRate =
      pricing.cacheReadCostPerMTok !== undefined
        ? `<span class="rate-separator">·</span><span class="rate-value">${formatMTokCost(pricing.cacheReadCostPerMTok)}/MTok cached</span>`
        : "";

    ratesHtml = `
      <div class="pricing-rates">
        <span class="rate-label">Model Pricing:</span>
        ${keyBadge}
        <span class="rate-value">${formatMTokCost(pricing.inputCostPerMTok)}/MTok in</span>
        <span class="rate-separator">·</span>
        <span class="rate-value">${formatMTokCost(pricing.outputCostPerMTok)}/MTok out</span>
        ${cachedRate}
      </div>
    `;
  }

  // Token and cost rows; the cached row only appears when tokens were cached.
  let breakdownHtml = "";
  if (totalCost) {
    const freshInputTokens = totalCost.inputTokens - totalCost.cachedInputTokens;
    const cachedRow =
      totalCost.cachedInputTokens > 0
        ? `
        <div class="cost-row cached">
          <span class="cost-label">Cached tokens:</span>
          <span class="cost-tokens">${totalCost.cachedInputTokens.toLocaleString()} ⚡</span>
          <span class="cost-value">${formatCost(totalCost.cacheReadCost)}</span>
        </div>
        `
        : "";

    breakdownHtml = `
      <div class="cost-breakdown">
        <div class="cost-row">
          <span class="cost-label">Input tokens:</span>
          <span class="cost-tokens">${freshInputTokens.toLocaleString()}</span>
          <span class="cost-value">${formatCost(totalCost.inputCost)}</span>
        </div>
        <div class="cost-row">
          <span class="cost-label">Output tokens:</span>
          <span class="cost-tokens">${totalCost.outputTokens.toLocaleString()}</span>
          <span class="cost-value">${formatCost(totalCost.outputCost)}</span>
        </div>
        ${cachedRow}
        <div class="cost-row total">
          <span class="cost-label">Total Cost:</span>
          <span class="cost-tokens"></span>
          <span class="cost-value">${formatCost(totalCost.totalCost)}</span>
        </div>
      </div>
    `;
  }

  return `
    <div class="pricing-section">
      <div class="pricing-header">
        <span class="pricing-icon">💰</span>
        <span class="pricing-title">Cost Summary</span>
      </div>
      ${ratesHtml}
      ${breakdownHtml}
    </div>
  `;
}
322 | 
/**
 * Returns the CSS for the pricing / cost-summary panel as a string that is
 * concatenated into the report's `<style>` element.
 */
function getPricingStyles() {
  // [selector, declarations] pairs, in output order.
  const rules: Array<[string, string[]]> = [
    [
      ".pricing-section",
      [
        "background: var(--surface)",
        "border: 1px solid var(--border)",
        "border-radius: 4px",
        "padding: 12px",
        "margin-bottom: 12px",
      ],
    ],
    [
      ".pricing-header",
      [
        "display: flex",
        "align-items: center",
        "gap: 8px",
        "margin-bottom: 12px",
        "font-weight: 600",
      ],
    ],
    [".pricing-icon", ["font-size: 16px"]],
    [".pricing-title", ["font-size: 14px"]],
    [
      ".pricing-rates",
      [
        "display: flex",
        "align-items: center",
        "gap: 8px",
        "font-size: 12px",
        "color: var(--text-muted)",
        "margin-bottom: 12px",
        "padding-bottom: 12px",
        "border-bottom: 1px solid var(--border)",
        "flex-wrap: wrap",
      ],
    ],
    [".rate-label", ["font-weight: 500"]],
    [
      ".pricing-key",
      [
        "font-family: 'JetBrains Mono', monospace",
        "background: var(--bg)",
        "padding: 2px 6px",
        "border-radius: 3px",
        "border: 1px solid var(--border)",
        "color: var(--text)",
        "font-size: 11px",
      ],
    ],
    [".rate-value", ["font-family: 'JetBrains Mono', monospace"]],
    [".rate-separator", ["color: var(--border)"]],
    [
      ".cost-breakdown",
      ["display: flex", "flex-direction: column", "gap: 6px"],
    ],
    [
      ".cost-row",
      [
        "display: grid",
        "grid-template-columns: 120px 1fr auto",
        "gap: 8px",
        "align-items: center",
        "font-size: 13px",
      ],
    ],
    [".cost-row.cached", ["color: var(--text-muted)"]],
    [
      ".cost-row.total",
      [
        "margin-top: 8px",
        "padding-top: 8px",
        "border-top: 1px solid var(--border)",
        "font-weight: 600",
      ],
    ],
    [".cost-label", ["color: var(--text-muted)"]],
    [".cost-row.total .cost-label", ["color: var(--text)"]],
    [
      ".cost-tokens",
      ["font-family: 'JetBrains Mono', monospace", "text-align: right"],
    ],
    [
      ".cost-value",
      [
        "font-family: 'JetBrains Mono', monospace",
        "font-weight: 500",
        "text-align: right",
        "min-width: 80px",
      ],
    ],
    [
      ".cost-row.total .cost-value",
      ["color: var(--success)", "font-size: 15px"],
    ],
  ];

  // Selectors are indented 4 spaces and declarations 6, with one blank line
  // between rules and a trailing "\n  " after the last one.
  const blocks = rules.map(([selector, declarations]) =>
    [
      `    ${selector} {`,
      ...declarations.map((declaration) => `      ${declaration};`),
      "    }",
    ].join("\n"),
  );

  return "\n" + blocks.join("\n\n") + "\n  ";
}
434 | 
/**
 * Builds the complete standalone HTML report for one benchmark run.
 *
 * The page contains a header (model, test count, token total, timestamp, MCP
 * badge and cost badge), a pass/fail/skip summary bar, an optional notice when
 * MCP was not used, the pricing panel, and one collapsible section per test.
 * Styles and the small theme/code-toggle script are inlined.
 *
 * @param data - Test results plus run metadata.
 * @returns The HTML document as a string.
 */
export function generateMultiTestHtml(data: MultiTestResultData) {
  const metadata = data.metadata;
  const totalTests = data.tests.length;

  // A test without a verification object counts as skipped.
  const passedTests = data.tests.filter((t) => t.verification?.passed).length;
  const failedTests = data.tests.filter(
    (t) => t.verification && !t.verification.passed,
  ).length;
  const skippedTests = data.tests.filter((t) => !t.verification).length;

  const totalTokens = data.tests.reduce(
    (sum, test) =>
      sum + test.steps.reduce((s, step) => s + step.usage.totalTokens, 0),
    0,
  );

  const mcpBadge = metadata.mcpEnabled
    ? metadata.mcpTransportType === "StdIO"
      ? `<span class="mcp-badge enabled">MCP ✓ (StdIO: ${escapeHtml(metadata.mcpServerUrl ?? "")})</span>`
      : `<span class="mcp-badge enabled">MCP ✓ (${escapeHtml(metadata.mcpServerUrl ?? "")})</span>`
    : `<span class="mcp-badge disabled">MCP ✗</span>`;

  const mcpNotice = !metadata.mcpEnabled
    ? `
  <div class="mcp-notice">
    <span class="notice-icon">ℹ️</span>
    <span class="notice-text">MCP integration was not used in this benchmark. The agent ran with built-in tools only.</span>
  </div>`
    : "";

  const costDisplay = metadata.totalCost
    ? `<span class="cost-badge">${formatCost(metadata.totalCost.totalCost)}</span>`
    : "";

  const testsHtml = data.tests
    .map((test, index) => renderTestSection(test, index))
    .join("\n");

  const pricingHtml = renderPricingSection(data);

  // Shared report styles, the pricing panel styles, and the header cost badge.
  const styles =
    getReportStyles() +
    getPricingStyles() +
    `
    .cost-badge {
      background: var(--success);
      color: white;
      font-size: 11px;
      padding: 2px 6px;
      border-radius: 3px;
      font-weight: 500;
      font-family: 'JetBrains Mono', monospace;
    }
  `;

  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>SvelteBench 2.0 - Multi-Test Report</title>
  <style>${styles}</style>
</head>
<body>
  <header>
    <div class="header-top">
      <div>
        <h1>SvelteBench 2.0 ${mcpBadge} ${costDisplay}</h1>
        <div class="meta">${escapeHtml(metadata.model)} · ${totalTests} tests · ${totalTokens.toLocaleString()} tokens · ${formatTimestamp(metadata.timestamp)}</div>
      </div>
      <button class="theme-toggle" onclick="toggleTheme()">◐</button>
    </div>
    <div class="summary-bar">
      <div class="summary-item passed">✓ ${passedTests} passed</div>
      <div class="summary-item failed">✗ ${failedTests} failed</div>
      ${skippedTests > 0 ? `<div class="summary-item skipped">⊘ ${skippedTests} skipped</div>` : ""}
    </div>
  </header>

  ${mcpNotice}

  ${pricingHtml}

  ${testsHtml}

  <script>
    function toggleTheme() {
      const html = document.documentElement;
      const current = html.dataset.theme || 'dark';
      const next = current === 'light' ? 'dark' : 'light';
      html.dataset.theme = next;
      localStorage.setItem('theme', next);
    }

    function toggleComponentCode(id) {
      const preview = document.getElementById(id + '-preview');
      const full = document.getElementById(id + '-full');
      const button = event.target.closest('button');
      const expandText = button.querySelector('.expand-text');
      const collapseText = button.querySelector('.collapse-text');
      
      if (preview.style.display === 'none') {
        preview.style.display = 'block';
        full.style.display = 'none';
        expandText.style.display = 'inline';
        collapseText.style.display = 'none';
      } else {
        preview.style.display = 'none';
        full.style.display = 'block';
        expandText.style.display = 'none';
        collapseText.style.display = 'inline';
      }
    }

    document.documentElement.dataset.theme = localStorage.getItem('theme') || 'dark';
  </script>
</body>
</html>`;
}
560 | 


--------------------------------------------------------------------------------
/results/result-2025-12-07-16-48-32.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "steps": [
  3 |     {
  4 |       "content": [
  5 |         {
  6 |           "type": "reasoning",
  7 |           "text": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
  8 |           "providerMetadata": {
  9 |             "openrouter": {
 10 |               "reasoning_details": [
 11 |                 {
 12 |                   "type": "reasoning.summary",
 13 |                   "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
 14 |                   "format": "xai-responses-v1",
 15 |                   "index": 0
 16 |                 }
 17 |               ]
 18 |             }
 19 |           }
 20 |         },
 21 |         {
 22 |           "type": "reasoning",
 23 |           "text": "[REDACTED]",
 24 |           "providerMetadata": {
 25 |             "openrouter": {
 26 |               "reasoning_details": [
 27 |                 {
 28 |                   "type": "reasoning.encrypted",
 29 |                   "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro
1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id",
 30 |                   "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8",
 31 |                   "format": "xai-responses-v1",
 32 |                   "index": 0
 33 |                 }
 34 |               ]
 35 |             }
 36 |           }
 37 |         },
 38 |         {
 39 |           "type": "tool-call",
 40 |           "toolCallId": "call_22619064",
 41 |           "toolName": "ResultWrite",
 42 |           "input": {
 43 |             "content": "<script>\n\tlet count = 0;\n\n\tfunction increment() {\n\t\tcount++;\n\t}\n\n\tfunction decrement() {\n\t\tcount--;\n\t}\n</script>\n\n<button on:click={decrement}>-</button>\n<span>{count}</span>\n<button on:click={increment}>+</button>"
 44 |           },
 45 |           "providerMetadata": {
 46 |             "openrouter": {
 47 |               "reasoning_details": [
 48 |                 {
 49 |                   "type": "reasoning.summary",
 50 |                   "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
 51 |                   "format": "xai-responses-v1",
 52 |                   "index": 0
 53 |                 },
 54 |                 {
 55 |                   "type": "reasoning.encrypted",
 56 |                   "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro
1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id",
 57 |                   "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8",
 58 |                   "format": "xai-responses-v1",
 59 |                   "index": 0
 60 |                 }
 61 |               ]
 62 |             }
 63 |           }
 64 |         },
 65 |         {
 66 |           "type": "tool-result",
 67 |           "toolCallId": "call_22619064",
 68 |           "toolName": "ResultWrite",
 69 |           "input": {
 70 |             "content": "<script>\n\tlet count = 0;\n\n\tfunction increment() {\n\t\tcount++;\n\t}\n\n\tfunction decrement() {\n\t\tcount--;\n\t}\n</script>\n\n<button on:click={decrement}>-</button>\n<span>{count}</span>\n<button on:click={increment}>+</button>"
 71 |           },
 72 |           "output": {
 73 |             "success": true
 74 |           },
 75 |           "dynamic": false
 76 |         }
 77 |       ],
 78 |       "finishReason": "tool-calls",
 79 |       "usage": {
 80 |         "inputTokens": 431,
 81 |         "outputTokens": 540,
 82 |         "totalTokens": 971,
 83 |         "reasoningTokens": 463,
 84 |         "cachedInputTokens": 320
 85 |       },
 86 |       "warnings": [],
 87 |       "request": {
 88 |         "body": {
 89 |           "model": "x-ai/grok-code-fast-1",
 90 |           "messages": [
 91 |             {
 92 |               "role": "user",
 93 |               "content": "Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done."
 94 |             }
 95 |           ],
 96 |           "tools": [
 97 |             {
 98 |               "type": "function",
 99 |               "function": {
100 |                 "name": "ResultWrite",
101 |                 "description": "Write content to a result file",
102 |                 "parameters": {
103 |                   "$schema": "http://json-schema.org/draft-07/schema#",
104 |                   "type": "object",
105 |                   "properties": {
106 |                     "content": {
107 |                       "description": "The content to write to the result file",
108 |                       "type": "string"
109 |                     }
110 |                   },
111 |                   "required": [
112 |                     "content"
113 |                   ],
114 |                   "additionalProperties": false
115 |                 }
116 |               }
117 |             }
118 |           ],
119 |           "tool_choice": "auto"
120 |         }
121 |       },
122 |       "response": {
123 |         "id": "gen-1765122501-2azq1uAhq0dB8bfw8LqK",
124 |         "timestamp": "2025-12-07T15:48:32.009Z",
125 |         "modelId": "x-ai/grok-code-fast-1",
126 |         "headers": {
127 |           "access-control-allow-origin": "*",
128 |           "cf-ray": "9aa52ab04e8ef9aa-ARN",
129 |           "connection": "keep-alive",
130 |           "content-encoding": "gzip",
131 |           "content-type": "application/json",
132 |           "date": "Sun, 07 Dec 2025 15:48:21 GMT",
133 |           "permissions-policy": "payment=(self \"https://checkout.stripe.com\" \"https://connect-js.stripe.com\" \"https://js.stripe.com\" \"https://*.js.stripe.com\" \"https://hooks.stripe.com\")",
134 |           "referrer-policy": "no-referrer, strict-origin-when-cross-origin",
135 |           "server": "cloudflare",
136 |           "transfer-encoding": "chunked",
137 |           "vary": "Accept-Encoding",
138 |           "x-content-type-options": "nosniff"
139 |         },
140 |         "messages": [
141 |           {
142 |             "role": "assistant",
143 |             "content": [
144 |               {
145 |                 "type": "reasoning",
146 |                 "text": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
147 |                 "providerOptions": {
148 |                   "openrouter": {
149 |                     "reasoning_details": [
150 |                       {
151 |                         "type": "reasoning.summary",
152 |                         "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
153 |                         "format": "xai-responses-v1",
154 |                         "index": 0
155 |                       }
156 |                     ]
157 |                   }
158 |                 }
159 |               },
160 |               {
161 |                 "type": "reasoning",
162 |                 "text": "[REDACTED]",
163 |                 "providerOptions": {
164 |                   "openrouter": {
165 |                     "reasoning_details": [
166 |                       {
167 |                         "type": "reasoning.encrypted",
168 |                         "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6Yd
vSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id",
169 |                         "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8",
170 |                         "format": "xai-responses-v1",
171 |                         "index": 0
172 |                       }
173 |                     ]
174 |                   }
175 |                 }
176 |               },
177 |               {
178 |                 "type": "tool-call",
179 |                 "toolCallId": "call_22619064",
180 |                 "toolName": "ResultWrite",
181 |                 "input": {
182 |                   "content": "<script>\n\tlet count = 0;\n\n\tfunction increment() {\n\t\tcount++;\n\t}\n\n\tfunction decrement() {\n\t\tcount--;\n\t}\n</script>\n\n<button on:click={decrement}>-</button>\n<span>{count}</span>\n<button on:click={increment}>+</button>"
183 |                 },
184 |                 "providerOptions": {
185 |                   "openrouter": {
186 |                     "reasoning_details": [
187 |                       {
188 |                         "type": "reasoning.summary",
189 |                         "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
190 |                         "format": "xai-responses-v1",
191 |                         "index": 0
192 |                       },
193 |                       {
194 |                         "type": "reasoning.encrypted",
195 |                         "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6Yd
vSqvro1WyMjTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id",
196 |                         "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8",
197 |                         "format": "xai-responses-v1",
198 |                         "index": 0
199 |                       }
200 |                     ]
201 |                   }
202 |                 }
203 |               }
204 |             ]
205 |           },
206 |           {
207 |             "role": "tool",
208 |             "content": [
209 |               {
210 |                 "type": "tool-result",
211 |                 "toolCallId": "call_22619064",
212 |                 "toolName": "ResultWrite",
213 |                 "output": {
214 |                   "type": "json",
215 |                   "value": {
216 |                     "success": true
217 |                   }
218 |                 }
219 |               }
220 |             ]
221 |           }
222 |         ]
223 |       },
224 |       "providerMetadata": {
225 |         "openrouter": {
226 |           "provider": "xAI",
227 |           "reasoning_details": [
228 |             {
229 |               "type": "reasoning.summary",
230 |               "summary": "The user asked: \"Can you build a counter component in svelte? Use the ResultWrite tool to write the result to a file when you are done.\"\n\n\n## Understanding User Request  \n- The user asked for a counter component to be built in Svelte.  \n- They also requested the result be written to a file using a tool called ResultWrite.\n\n## Designing Counter Component\n- A basic Svelte counter needs a script section with a count variable, set initially to 0.\n- Include functions to increment and decrement the count, triggered by button clicks.",
231 |               "format": "xai-responses-v1",
232 |               "index": 0
233 |             },
234 |             {
235 |               "type": "reasoning.encrypted",
236 |               "data": "f1L8uRV32UhXYq98EQOfEw5K//gAe24eugguZxGbaBcmTCP5AytwHZmS2SDZrPh/KioJzm5xm58CIKptdj5fZTuc1HjzE+Ceb+QozdlITIKmCooLg3yJTZHW9px6FTErx+qp8DA+Xlo2X3mioWmefs+LsFMuwBJSl4FyvWfgRTjUBsPWlm9NY64sI3mNzi1zMrID9wIjqeIlrapssIHEw8noJSAlvjGyY+H5wnXo1A1zp487B0apGXxb+nFsLTZySLLPi8FiD0eFPsFj/MjWeMLdYrEYUZNId3xx9Pvrxuskpz0aiSznVxi3QpT9m61EO4rH10CzTI7WB2+WOfH3M/aArmsRsS79AsXh7p11VR+LqnTSrbdJk8b/j0EcNN91J/KyAna3kOjC4Uy0XRXHorBYvFuXYsSagUCMar9LE98XJikyKcBB26KvjAY4uqAjwQBtn9NVBA9b8veK6qoaTJ8Yuz7j52xzKDlt+uD99UpvzHLzxGKAFwz2HX53G5ghbSuJ4Hke/E++OULzZhkeHjiXKKoZjogjhSUA4W50rLN81itnB/Z/z3CM5m6tyOl1sDcApovTnnUtE+d23GQkx3i9WRIprRJF64JcuMc2up3WyLIMefrcIV1RbKzLG3WxuPB0uNysYRVnhVDDGBJN94g+zZc97jg8C0jSV8TkkhzELUNgz2j9Fip3DqgVnrU8HDbC+cEgqTRjfSTZi7LFdIwog3EOGI87m3KL2B4DYK+V6AXkrmv1GQ0LGzJhKwLYtFhVxvswCU0uqvm/+s88LTJpAEeOYbUpQj+y43QrP7q1Erjx0KxemnFYiMLha/467eiJyNVLFLdUNI1hgKswfGhiMRZkMHAA77n3s+0Ru5gC6sHK3frQ8Usaa09pIErIBC75gn2w9hSRGa/irpG5PyyJph8IgpvVi9YV1GZqKRj5Wo3GJJw+JcfyFIJe+rA0Ltetmm9BsbNNlItkTVk+xKJ3HB0V1a3m4KbipqApP+aF7+I7lR7z1FeYU8AvFDVp5JZbFf9uChkDzk3K/Ey3QpXOkwg+4RnkM82QNstqEDIm4QzU7dKShM9dgPFGGxcgJx+nGQElUTRDuZ2jSewrGy81N6KDTUfUubFXs9/nwjmJpuNlLh1JMkAWiIvRYd11zAAnlKTrc2FWj6ezKiiWwAsQ8kJ/+K06Jwu11aLyVFkmSOlov2bNmNzH3XlWIti8LhmWP9fCv74XibX1lfaDx0VwB1KeQUJm3fxpPpRPwLDzhCMMn4UqXraLkeq57zxyJxyVnL/iq5honAB2eHQJnITDUEPwPCLBLQhKp5UuNITdjBXHuG+u/M8YRNEAnTqiWFnAd7nnzdCz30FfnceJClnYjJfcXgDLlYDZfW/BL6xQUiauXuJn4g+eiyIPANlG0FOsyU6ds48zu1w29EnpCv+Vzsc5u2cqXDS6Ez3L7NfwIdpnRlA894D61K8+191M4fSAbjDm7rBEymGNKabkHjVAj6uZOtKfn2VjEfZYnjkSh6NOH9Wjq2gYWTsKEPOMszyO/oUBD7DIGptfuoKR7dQQwQ8TyOHIO9JBJX6/Iuc6xduvjemaBwQdssaP5SO0Yyz/pTaAd+35v9KikhMR2Dlb0ljgUjH87P4AQK7tWTRZwFszTrBT6z8AMhmtftp/4YYqJ5aOoqeOCJeuiYBW6dHB8KndPtcNQ9nC2eogATiUUJm5Nv5W6AnN6GUB1a9P1SKNsbFNzT6+J0AlTeFfxoES6eIBw5YHLqPOZsZ2NmpjctkDUjJqAuPpId7KwnXWO67yLagiMpuUcya/pkWQkiAV7ds595+ekWdZMiNVg7lJyy5imGEAJWopUtNvMBo6cjdYipaZvadfWOsPj3fHHdoA3vIf3gjW5L5V/LxVZEEFEzLDL7UC0KlG1D8y/b1WmJ6bDAHVXZPuMb1AEYiEtUsed73lywJO13Drgo6YdvSqvro1WyM
jTwDLN8iaWCIEf7uWnTknIzzW6BnL5FVT/FEIfZk7LGzDW1Op45EzZLrXlNTi4x8ZYfangKB2AIHcUng6qJMHWJBCmJGK2rRBod2IlQXewUqO36EklVW8EO5vank5Xn+T5QwRCngWC9vFJpiHyBQ58+I5UyariQfA2Bx/KW+gIdtZA7FrMTEaYjNzzFvCxF0hzdagQ6ZC+mQYuOmgbyhBZOepVR+yIurTBMopQJDaiII5tUEuU5UOJ8ZW3lonh0Zb6qxDgc9C9/46wtVINufPGsO3K9Le7uIamhKX2QVxEzReDyzJwrDZfCI3RQBWsnYzZ1vZsNzkZrabWaXcT6GthIgKRIGESL3PrtTPdhFQcMOYWz4ERUQfJFsJbgHzQ1srJqp95c6VjI3lKjHIZ3nxDCvQyL+mJtAeX4hzqwcam8fok8uOp2hqq1SHa8alqZ3C2RbT+8D/hezQuOpiGsPyjx8HC5pBhqJPx5RhOHvX2EQhst1xl25SICv4FEFypnDmCCt0dhph/ZKqZd/jTgUQpsE0/O5yurLXT0iG9D6bpZ0pAy/IjyUgQ32/mJu0nv1wfg5oql8VhvhWWTZa7Q6lGU8uk7eD9M3YSTjkc9dmd9GetC10gyXSLAGOVve4SyuU8+cU/jf6x8MzapoweSHgUiUWQXa+kxv7XmRsQ/39eCy+qcyPcgrN7PYWz2qGpyv/mSHiSvH+6kxnCUks0i6gbAm5a9rISE4DjGuQJSWBINRb+J83kQuAZxO+Qf12HNRlBQzVIZqs5K4U9vtVyAIWGiwAmR4HUbrqnkjLJA8yiDSg8hXw1nJ1A52eYqmuLT9EqgTKEbKeIGrYUf0UsnPGvbXu4cGKYRYK0/Okazl8/EDKc+id",
237 |               "id": "rs_37c358c8-1875-26be-886e-4dfe1ce9f4f8",
238 |               "format": "xai-responses-v1",
239 |               "index": 0
240 |             }
241 |           ],
242 |           "usage": {
243 |             "promptTokens": 431,
244 |             "promptTokensDetails": {
245 |               "cachedTokens": 320
246 |             },
247 |             "completionTokens": 540,
248 |             "completionTokensDetails": {
249 |               "reasoningTokens": 463
250 |             },
251 |             "totalTokens": 971,
252 |             "cost": 0.000830214
253 |           }
254 |         }
255 |       }
256 |     }
257 |   ],
258 |   "resultWriteContent": "<script>\n\tlet count = 0;\n\n\tfunction increment() {\n\t\tcount++;\n\t}\n\n\tfunction decrement() {\n\t\tcount--;\n\t}\n</script>\n\n<button on:click={decrement}>-</button>\n<span>{count}</span>\n<button on:click={increment}>+</button>",
259 |   "metadata": {
260 |     "mcpEnabled": false,
261 |     "mcpServerUrl": null,
262 |     "timestamp": "2025-12-07T15:48:32.016Z",
263 |     "model": "openrouter/x-ai/grok-code-fast-1"
264 |   }
265 | }


--------------------------------------------------------------------------------