├── .nvmrc ├── favicon.png ├── vitest-setup.js ├── .aidigestignore ├── src ├── tests │ ├── hello-world │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── derived │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── each │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── effect │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── snippets │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── counter │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ ├── inspect │ │ ├── prompt.md │ │ ├── Reference.svelte │ │ └── test.ts │ ├── props │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts │ └── derived-by │ │ ├── Reference.svelte │ │ ├── prompt.md │ │ └── test.ts ├── utils │ ├── prompt.ts │ ├── code-cleaner.ts │ ├── ensure-dirs.ts │ ├── retry-wrapper.ts │ ├── humaneval.ts │ ├── model-validator.ts │ ├── code-cleaner.test.ts │ ├── retry-wrapper.test.ts │ ├── humaneval.spec.ts │ ├── test-runner.ts │ ├── file.ts │ └── test-manager.ts └── llms │ ├── google.ts │ ├── anthropic.ts │ ├── ollama.ts │ ├── openai.ts │ ├── index.ts │ ├── zai.ts │ ├── moonshot.ts │ └── openrouter.ts ├── .prettierrc ├── .prettierignore ├── vite.config.js ├── .gitignore ├── tsconfig.json ├── .github └── workflows │ ├── test-build.yml │ └── build-static.yml ├── package.json ├── .env.example ├── CLAUDE.md ├── merge-v1.ts ├── verify.ts ├── merge.ts ├── README.md ├── benchmarks └── v1 │ └── v1-benchmark-results-2025-05-25T20-01-22.048Z.json └── index.ts /.nvmrc: -------------------------------------------------------------------------------- 1 | 24 -------------------------------------------------------------------------------- /favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khromov/svelte-bench/HEAD/favicon.png -------------------------------------------------------------------------------- /vitest-setup.js: -------------------------------------------------------------------------------- 1 | // console.log("Ran vitest-setup.js"); 2 | import "@testing-library/jest-dom/vitest"; 3 | -------------------------------------------------------------------------------- /.aidigestignore: -------------------------------------------------------------------------------- 1 | benchmarks 2 | .github 3 | tmp 4 | context 5 | src/tests/derived* 6 | src/tests/each 7 | src/tests/effect 8 | src/tests/hello-world 9 | src/tests/inspect 10 | src/tests/snippets -------------------------------------------------------------------------------- /src/tests/hello-world/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 |
<div class="greeting" data-testid="greeting">Hello, World!</div>
4 | 5 | 12 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "singleQuote": false, 4 | "tabWidth": 2, 5 | "trailingComma": "all", 6 | "printWidth": 120, 7 | "plugins": ["prettier-plugin-svelte"], 8 | "overrides": [ 9 | { 10 | "files": "*.svelte", 11 | "options": { 12 | "parser": "svelte" 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Build output 5 | tmp/ 6 | coverage/ 7 | 8 | # Generated files 9 | benchmarks/ 10 | 11 | # Test prompts (should not be formatted) 12 | **/prompt.md 13 | context/ 14 | 15 | # Environment files 16 | .env* 17 | 18 | # Lock files 19 | pnpm-lock.yaml 20 | 21 | # Minified files 22 | *.min.js 23 | -------------------------------------------------------------------------------- /vite.config.js: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | import { svelte } from "@sveltejs/vite-plugin-svelte"; 3 | import { svelteTesting } from "@testing-library/svelte/vite"; 4 | 5 | export default defineConfig({ 6 | plugins: [svelte(), svelteTesting()], 7 | test: { 8 | environment: "jsdom", 9 | setupFiles: ["./vitest-setup.js"], 10 | }, 11 | }); 12 | -------------------------------------------------------------------------------- /src/tests/derived/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 12 |
<div>
  <p data-testid="number-value">Number: {number}</p>
  <p data-testid="doubled-value">Doubled: {doubled}</p>
  <button data-testid="increment-button" onclick={increment}>Increment</button>
</div>
17 | -------------------------------------------------------------------------------- /src/tests/each/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | 11 |
<div>
  <ul data-testid="characters-list">
    {#each characters as character}
      <li data-testid="character">{character}</li>
    {/each}
  </ul>

  <button data-testid="add-george-button" onclick={addGeorge}>Add George</button>
</div>
19 | -------------------------------------------------------------------------------- /src/tests/effect/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | 16 |
<div>
  <p data-testid="number-value">Number: {number}</p>
  <p data-testid="doubled-value">Doubled: {doubled}</p>
  <button data-testid="increment-button" onclick={increment}>Increment</button>
</div>
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies (npm install) 2 | node_modules 3 | 4 | # output 5 | out 6 | dist 7 | *.tgz 8 | 9 | 10 | benchmarks/**/*.html 11 | 12 | # code coverage 13 | coverage 14 | *.lcov 15 | 16 | # logs 17 | logs 18 | _.log 19 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 20 | 21 | # dotenv environment variable files 22 | .env 23 | .env.development.local 24 | .env.test.local 25 | .env.production.local 26 | .env.local 27 | 28 | # caches 29 | .eslintcache 30 | .cache 31 | *.tsbuildinfo 32 | 33 | # IntelliJ based IDEs 34 | .idea 35 | 36 | # Finder (MacOS) folder config 37 | .DS_Store 38 | 39 | codebase.md 40 | tmp/ 41 | benchmarks/benchmark-results-merged.json 42 | benchmarks/v1/v1-benchmark-results-merged.json 43 | -------------------------------------------------------------------------------- /src/utils/prompt.ts: -------------------------------------------------------------------------------- 1 | export const DEFAULT_SYSTEM_PROMPT = 2 | "You are an expert Svelte developer. Generate only the Svelte component code requested. Return just the code with no explanation, comments, or markdown. Runes starting with $ like $state and $effect are never imported, they are built-in."; 3 | 4 | // New system prompt variant for use with context 5 | export const DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT = 6 | "You are an expert Svelte developer. You are provided with the Svelte documentation, use it when implementing your solution. Generate only the Svelte component code requested. Return just the code with no explanation, comments, or markdown. Runes starting with $ like $state and $effect are never imported, they are built-in."; 7 | -------------------------------------------------------------------------------- /src/tests/snippets/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | {#snippet title(bookTitle)} 8 | {bookTitle} 9 | {/snippet} 10 | 11 |
<ul>
  {#each bookTitles as bookTitle}
    <li data-testid="book-item">
      {@render title(bookTitle)}
    </li>
  {/each}
</ul>
18 | 19 | 35 | -------------------------------------------------------------------------------- /src/tests/hello-world/prompt.md: -------------------------------------------------------------------------------- 1 | # Hello World Component Task 2 | 3 | Create a simple Svelte 5 component that displays "Hello, World!" with some styling. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a Svelte component called HelloWorld 8 | 2. The component should display the text "Hello, World!" in a div with data-testid="greeting" 9 | 3. Add a CSS class "greeting" to style the text 10 | 4. Make the text color blue 11 | 5. Center the text on the page 12 | 6. Add a small margin around the text 13 | 14 | Example structure (you can modify it as needed): 15 | 16 | ```html 17 |
<div class="greeting" data-testid="greeting">Hello, World!</div>
18 | ``` 19 | 20 | Please implement this component using Svelte 5 syntax. Make sure you only return one component. 21 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | // Enable latest features 4 | "lib": ["ESNext", "DOM"], 5 | "target": "ESNext", 6 | "module": "ESNext", 7 | "moduleDetection": "force", 8 | "jsx": "react-jsx", 9 | "allowJs": true, 10 | 11 | // Bundler mode 12 | "moduleResolution": "bundler", 13 | "allowImportingTsExtensions": true, 14 | "verbatimModuleSyntax": true, 15 | "noEmit": true, 16 | 17 | // Best practices 18 | "strict": true, 19 | "skipLibCheck": true, 20 | "noFallthroughCasesInSwitch": true, 21 | 22 | // Some stricter flags (disabled by default) 23 | "noUnusedLocals": false, 24 | "noUnusedParameters": false, 25 | "noPropertyAccessFromIndexSignature": false 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/tests/hello-world/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import HelloWorld from "./Component.svelte"; // Path to the generated component 4 | 5 | describe("HelloWorld component", () => { 6 | test("renders with Hello, World! text", () => { 7 | render(HelloWorld); 8 | 9 | // Get the greeting element 10 | const greetingElement = screen.getByTestId("greeting"); 11 | 12 | // Check that it contains the correct text 13 | expect(greetingElement).toHaveTextContent("Hello, World!"); 14 | }); 15 | 16 | test("has the correct styling class", () => { 17 | render(HelloWorld); 18 | 19 | // Get the greeting element 20 | const greetingElement = screen.getByTestId("greeting"); 21 | 22 | // Check that it has the greeting class 23 | expect(greetingElement).toHaveClass("greeting"); 24 | }); 25 | }); 26 | -------------------------------------------------------------------------------- /src/tests/effect/prompt.md: -------------------------------------------------------------------------------- 1 | # $effect Rune Component Task 2 | 3 | Create a simple Svelte 5 component that demonstrates the `$effect` rune. 4 | 5 | ## Requirements: 6 | 7 | 1. Use `$state` for a number input starting at 5 8 | 2. Use `$effect` to calculate the doubled value of the number 9 | 3. Display both the number and the doubled value 10 | 4. Include a button to increment the number by 1 11 | 12 | Elements should have these data-testid attributes: 13 | 14 | - "number-value" for displaying the number 15 | - "doubled-value" for displaying the doubled result 16 | - "increment-button" for the increment button 17 | 18 | Example structure: 19 | 20 | ```html 21 |
<div>
  <p data-testid="number-value">Number: {number}</p>
  <p data-testid="doubled-value">Doubled: {doubled}</p>
  <button data-testid="increment-button">Increment</button>
</div>
```

Please implement this component using Svelte 5 runes.

--------------------------------------------------------------------------------
/src/tests/derived/prompt.md:
--------------------------------------------------------------------------------

# $derived Rune Component Task

Create a simple Svelte 5 component that demonstrates the `$derived` rune.

## Requirements:

1. Use `$state` for a number input starting at 5
2. Use `$derived` to calculate the doubled value of the number
3. Display both the number and the doubled value
4. Include a button to increment the number by 1

Elements should have these data-testid attributes:

- "number-value" for displaying the number
- "doubled-value" for displaying the doubled result
- "increment-button" for the increment button

Example structure:

```html
<div>
  <p data-testid="number-value">Number: {number}</p>
  <p data-testid="doubled-value">Doubled: {doubled}</p>
  <button data-testid="increment-button">Increment</button>
</div>
```

Please implement this component using Svelte 5 runes.

--------------------------------------------------------------------------------
/src/tests/snippets/prompt.md:
--------------------------------------------------------------------------------

# Snippet Component Task

Create a simple Svelte 5 component that demonstrates the basic use of snippets.

## Requirements:

1. Create a component with a hardcoded array of 3 book titles (strings) - "The Lord of the Rings", "To Kill a Mockingbird", and "1984"
2. Create a snippet called `title` that takes a book title string as a parameter
3. The snippet should display the book title in a `<span>` element with `data-testid="book-title"`
4. Use the `{@render ...}` syntax to render the snippet for each book title in a list
5. Each rendered title should be wrapped in a `<li>` element with `data-testid="book-item"`

## Example HTML structure:

```html
<ul>
  <li data-testid="book-item"><span data-testid="book-title">The Lord of the Rings</span></li>
  <li data-testid="book-item"><span data-testid="book-title">To Kill a Mockingbird</span></li>
  <li data-testid="book-item"><span data-testid="book-title">1984</span></li>
</ul>
    21 | ``` 22 | 23 | Please implement this component using Svelte 5 runes. 24 | -------------------------------------------------------------------------------- /src/tests/each/prompt.md: -------------------------------------------------------------------------------- 1 | # Each Block Component Task 2 | 3 | Create a simple Svelte 5 component that demonstrates the `{#each}` block. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a component with a hardcoded array of 3 Seinfeld characters: "Jerry", "Elaine", "Kramer" 8 | 2. Use the `{#each}` block to render all characters in a list 9 | 3. Add a button that adds another character "George" to the array when clicked 10 | 4. Each item should display just the character name 11 | 12 | Elements should have these data-testid attributes: 13 | 14 | - "characters-list" for the list container 15 | - "character" for each character item 16 | - "add-george-button" for the button to add George 17 | 18 | Example structure: 19 | 20 | ```html 21 |
<div>
  <ul data-testid="characters-list">
    <li data-testid="character">Jerry</li>
    ...
  </ul>

  <button data-testid="add-george-button">Add George</button>
</div>
```

Please implement this component using Svelte 5 runes.

--------------------------------------------------------------------------------
/src/tests/counter/Reference.svelte:
--------------------------------------------------------------------------------

<script>
  let count = $state(0);

  function increment() {
    count += 1;
  }

  function decrement() {
    count -= 1;
  }
</script>

<div class="counter">
  <button data-testid="decrement-button" onclick={decrement}>-</button>
  <span data-testid="count-value">{count}</span>
  <button data-testid="increment-button" onclick={increment}>+</button>
</div>
    20 | 21 | 49 | -------------------------------------------------------------------------------- /src/tests/counter/prompt.md: -------------------------------------------------------------------------------- 1 | # Counter Component Task 2 | 3 | Create a Svelte 5 component called Counter that implements a simple counter with increment and decrement functionality. 4 | 5 | ## Requirements: 6 | 7 | 1. Use Svelte 5's `$state` for reactivity 8 | 2. The counter should start at 0 9 | 3. Include a decrement button with the `data-testid="decrement-button"` attribute 10 | 4. Include an increment button with the `data-testid="increment-button"` attribute 11 | 5. Display the current count with the `data-testid="count-value"` attribute 12 | 6. Clicking increment should increase the count by 1 13 | 7. Clicking decrement should decrease the count by 1 14 | 8. Style the counter with a CSS class "counter" 15 | 16 | Example structure: 17 | 18 | ```html 19 |
<div class="counter">
  <button data-testid="decrement-button">-</button>
  <span data-testid="count-value">0</span>
  <button data-testid="increment-button">+</button>
</div>
    24 | ``` 25 | 26 | Please implement this component using Svelte 5 syntax. Make sure you only return one component. 27 | -------------------------------------------------------------------------------- /.github/workflows/test-build.yml: -------------------------------------------------------------------------------- 1 | name: Test Build 2 | 3 | on: 4 | pull_request: 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | test-build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | fetch-depth: 1 18 | persist-credentials: false 19 | 20 | - name: Setup pnpm 21 | uses: pnpm/action-setup@v4 22 | with: 23 | version: latest 24 | 25 | - name: Setup Node.js 26 | uses: actions/setup-node@v4 27 | with: 28 | node-version: 24 29 | cache: pnpm 30 | cache-dependency-path: pnpm-lock.yaml 31 | 32 | # Silence/enable required native builds (seen as "Ignored build scripts: esbuild") 33 | - name: Approve pnpm builds 34 | run: pnpm approve-builds esbuild 35 | 36 | - name: Install deps 37 | run: pnpm install --frozen-lockfile --prefer-offline 38 | 39 | - name: Test build 40 | env: 41 | NODE_ENV: production 42 | run: pnpm run build 43 | -------------------------------------------------------------------------------- /src/tests/inspect/prompt.md: -------------------------------------------------------------------------------- 1 | # $inspect Rune Component Task 2 | 3 | Create a Svelte 5 component that demonstrates the `$inspect` rune functionality using a single input field. 4 | 5 | ## Requirements: 6 | 7 | 1. Use Svelte 5's `$state` for a text input starting with "Hello world" 8 | 2. Use basic `$inspect` to log the input value 9 | 3. Implement `$inspect(...).with` to track updates to the input with a custom callback 10 | 4. Implement `$inspect.trace()` inside an effect that runs when the input changes 11 | 5. Display the character count of the input text (to demonstrate a derived value that depends on the input) 12 | 6. Include an input field with `data-testid="text-input"` 13 | 7. Display the input value with `data-testid="text-value"` 14 | 8. Display the character count with `data-testid="char-count"` 15 | 16 | Example structure: 17 | 18 | ```html 19 |
<div>
  <input data-testid="text-input" type="text" />
  <p data-testid="text-value">Current text: "{text}"</p>
  <p data-testid="char-count">Character count: {text.length}</p>
</div>
    24 | ``` 25 | 26 | Please implement this component using Svelte 5 runes. 27 | -------------------------------------------------------------------------------- /src/tests/inspect/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 26 | 27 |
<div>
  <div>
    <input data-testid="text-input" type="text" bind:value={text} />
  </div>

  <p data-testid="text-value">Current text: "{text}"</p>
  <p data-testid="char-count">Character count: {charCount}</p>
</div>
    36 | -------------------------------------------------------------------------------- /src/tests/props/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 12 |
<div>
  <h1 data-testid="name-display">Hello, {name}!</h1>
  <p data-testid="count-display">Count: {countValue}</p>
  <button data-testid="increment-button" onclick={increment}>Increment</button>

  {#if showDetails}
    <div data-testid="details">
      <p>Name is {name}</p>
      <p>Count is {countValue}</p>
      <p>ShowDetails is {showDetails}</p>
    </div>
  {/if}
</div>
    25 | 26 | 49 | -------------------------------------------------------------------------------- /src/tests/derived/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import NumberDoubler from "./Component.svelte"; 5 | 6 | describe("NumberDoubler component", () => { 7 | test("renders with initial state", () => { 8 | render(NumberDoubler); 9 | 10 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 5"); 11 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 10"); 12 | }); 13 | 14 | test("updates doubled value when number increments", async () => { 15 | const user = userEvent.setup(); 16 | render(NumberDoubler); 17 | 18 | await user.click(screen.getByTestId("increment-button")); 19 | 20 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 6"); 21 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 12"); 22 | 23 | // Click again 24 | await user.click(screen.getByTestId("increment-button")); 25 | 26 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 7"); 27 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 14"); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/tests/effect/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import NumberDoubler from "./Component.svelte"; 5 | 6 | describe("NumberDoubler component", () => { 7 | test("renders with initial state", () => { 8 | render(NumberDoubler); 9 | 10 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 5"); 11 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 10"); 12 | }); 13 | 14 | test("updates doubled value when number increments", async () => { 15 | const user = userEvent.setup(); 16 | render(NumberDoubler); 17 | 18 | await user.click(screen.getByTestId("increment-button")); 19 | 20 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 6"); 21 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 12"); 22 | 23 | // Click again 24 | await user.click(screen.getByTestId("increment-button")); 25 | 26 | expect(screen.getByTestId("number-value")).toHaveTextContent("Number: 7"); 27 | expect(screen.getByTestId("doubled-value")).toHaveTextContent("Doubled: 14"); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/tests/derived-by/Reference.svelte: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 |
<div>
  <input data-testid="text-input" type="text" bind:value={text} />
  <button data-testid="clear-button" onclick={clearText}>Clear</button>

  <div>
    <p data-testid="word-count">Words: {textStats.wordCount}</p>
    <p data-testid="char-count">Characters: {textStats.charCount}</p>
    <p data-testid="length-indicator">
      Status: {textStats.isLongText ? "Long text" : "Short text"}
    </p>
  </div>
</div>
    35 | 36 | 51 | -------------------------------------------------------------------------------- /src/utils/code-cleaner.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Cleans markdown code block formatting from LLM-generated code 3 | * 4 | * This function handles various types of code block formatting that might be present 5 | * in the LLM output, including: 6 | * - Triple backticks with language identifiers (```svelte, ```html, ```js, etc.) 7 | * - Triple backticks without language identifiers 8 | * - Nested code blocks 9 | * - Improperly formatted code blocks 10 | * 11 | * @param code The code to clean 12 | * @returns The cleaned code with all markdown code block formatting removed 13 | */ 14 | export function cleanCodeMarkdown(code: string): string { 15 | // First, remove any opening code block markers with language identifiers 16 | // This handles patterns like ```svelte, ```html, ```js, etc. 17 | let cleanedCode = code.replace(/```[a-zA-Z]*\s*/g, ""); 18 | 19 | // Remove any standalone triple backticks 20 | cleanedCode = cleanedCode.replace(/```/g, ""); 21 | 22 | // Note: We do NOT remove single or double backticks as they are used in: 23 | // - JavaScript template literals (e.g., `string ${var}`) 24 | // - Inline code in markdown (e.g., `code`) 25 | // Only the triple backtick code fences should be removed 26 | 27 | // Trim whitespace from the beginning and end 28 | return cleanedCode.trim(); 29 | } 30 | -------------------------------------------------------------------------------- /src/tests/snippets/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import BookList from "./Component.svelte"; 4 | 5 | describe("Simple Snippet component", () => { 6 | test("renders the correct number of book titles", () => { 7 | render(BookList); 8 | 9 | const bookItems = screen.getAllByTestId("book-item"); 10 | expect(bookItems.length).toBe(3); 11 | 12 | const bookTitles = screen.getAllByTestId("book-title"); 13 | expect(bookTitles.length).toBe(3); 14 | }); 15 | 16 | test("displays correct book titles", () => { 17 | render(BookList); 18 | 19 | const bookTitles = screen.getAllByTestId("book-title"); 20 | 21 | expect(bookTitles[0]).toHaveTextContent("The Lord of the Rings"); 22 | expect(bookTitles[1]).toHaveTextContent("To Kill a Mockingbird"); 23 | expect(bookTitles[2]).toHaveTextContent("1984"); 24 | }); 25 | 26 | test("has the correct structure for each book item", () => { 27 | render(BookList); 28 | 29 | const bookItems = screen.getAllByTestId("book-item"); 30 | 31 | bookItems.forEach((item) => { 32 | expect(item.tagName).toBe("LI"); 33 | 34 | const title = item.querySelector('[data-testid="book-title"]'); 35 | expect(title).toBeInTheDocument(); 36 | expect(title?.tagName).toBe("SPAN"); 37 | }); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /src/tests/derived-by/prompt.md: -------------------------------------------------------------------------------- 1 | # $derived.by Component Task 2 | 3 | Create a Svelte 5 component that demonstrates the `$derived.by` rune for complex derivations. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a text input field that allows the user to enter text 8 | 2. Use `$state` to store the current text value, starting with an empty string 9 | 3. 
Use `$derived.by` to calculate: 10 | - The number of words in the text 11 | - The number of characters in the text 12 | - Whether the text is considered "long" (more than 15 words) 13 | 4. Display all these derived values below the input field 14 | 5. Include a "Clear" button that resets the text to an empty string 15 | 16 | Elements should have these data-testid attributes: 17 | 18 | - "text-input" for the input field 19 | - "word-count" for displaying the word count 20 | - "char-count" for displaying the character count 21 | - "length-indicator" for displaying whether the text is long 22 | - "clear-button" for the clear button 23 | 24 | Example structure: 25 | 26 | ```html 27 |
<div>
  <input data-testid="text-input" type="text" />
  <button data-testid="clear-button">Clear</button>

  <div>
    <p data-testid="word-count">Words: 0</p>
    <p data-testid="char-count">Characters: 0</p>
    <p data-testid="length-indicator">Status: Short text</p>
  </div>
</div>
    36 | ``` 37 | 38 | Please implement this component using Svelte 5 runes. 39 | -------------------------------------------------------------------------------- /src/tests/props/prompt.md: -------------------------------------------------------------------------------- 1 | # $props Rune Component Task 2 | 3 | Create a Svelte 5 component that demonstrates the `$props` rune for accepting and using component properties. 4 | 5 | ## Requirements: 6 | 7 | 1. Create a component called PropsDemo that uses the `$props` rune to accept the following properties: 8 | 9 | - `name` (string) with a default value of "World" 10 | - `count` (number) with a default value of 0 11 | - `showDetails` (boolean) with a default value of false 12 | 13 | 2. Use `$state` to create a reactive variable for the count that can be updated 14 | 3. Display the name in a heading with `data-testid="name-display"` 15 | 4. Display the count value in a paragraph with `data-testid="count-display"` 16 | 5. Include a button with `data-testid="increment-button"` that increments the count by 1 17 | 6. If `showDetails` is true, display a div with `data-testid="details"` containing additional information about the props 18 | 7. If `showDetails` is false, this div should not be rendered 19 | 20 | Example HTML structure: 21 | 22 | ```html 23 |
    24 |

    Hello, World!

    25 |

    Count: 0

    26 | 27 | 28 |
    29 |

    Name is World

    30 |

    Count is 0

    31 |

    ShowDetails is true

    32 |
    33 |
    34 | ``` 35 | 36 | Note: The details div should only be shown when showDetails is true. 37 | 38 | Please implement this component using Svelte 5 syntax with the `$props` rune. Make sure you only return one component. 39 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "svelte-bench", 3 | "type": "module", 4 | "private": true, 5 | "devDependencies": { 6 | "@google/genai": "^1.33.0", 7 | "@sveltejs/vite-plugin-svelte": "^6.2.1", 8 | "@testing-library/svelte": "^5.2.9", 9 | "@testing-library/user-event": "^14.6.1", 10 | "@types/ejs": "^3.1.5", 11 | "@types/express": "^5.0.6", 12 | "jsdom": "^27.3.0", 13 | "openai": "^6.10.0", 14 | "prettier": "^3.7.4", 15 | "prettier-plugin-svelte": "^3.4.0", 16 | "vitest": "^3.2.4" 17 | }, 18 | "scripts": { 19 | "start": "pnpm run-tests && pnpm build", 20 | "run-tests": "tsx ./index.ts", 21 | "check": "tsc --noEmit", 22 | "test": "vitest run", 23 | "test:watch": "vitest", 24 | "build": "pnpm merge && tsx ./build-static.ts", 25 | "build-v1": "tsx ./merge-v1.ts && tsx ./build-static.ts", 26 | "verify": "tsx ./verify.ts", 27 | "merge": "tsx ./merge.ts", 28 | "merge-v1": "tsx ./merge-v1.ts", 29 | "open": "node -e \"require('child_process').spawn('open', ['benchmarks/benchmark-results-merged.html'], { stdio: 'inherit' })\"", 30 | "format": "prettier --write .", 31 | "format:check": "prettier --check ." 32 | }, 33 | "peerDependencies": { 34 | "typescript": "^5" 35 | }, 36 | "dependencies": { 37 | "@anthropic-ai/sdk": "^0.71.2", 38 | "@testing-library/jest-dom": "^6.9.1", 39 | "dotenv": "^17.2.3", 40 | "ejs": "^3.1.10", 41 | "express": "^5.2.1", 42 | "ollama": "^0.6.3", 43 | "rimraf": "^6.1.2", 44 | "svelte": "^5.45.9", 45 | "tsx": "^4.21.0", 46 | "undici": "^7.16.0", 47 | "vite": "^7.2.7" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/tests/each/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import CharacterList from "./Component.svelte"; 5 | 6 | describe("CharacterList component", () => { 7 | test("renders all characters initially", () => { 8 | render(CharacterList); 9 | 10 | const characterElements = screen.getAllByTestId("character"); 11 | expect(characterElements.length).toBe(3); 12 | expect(characterElements[0]).toHaveTextContent("Jerry"); 13 | expect(characterElements[1]).toHaveTextContent("Elaine"); 14 | expect(characterElements[2]).toHaveTextContent("Kramer"); 15 | }); 16 | 17 | test("adds George to the list when button clicked", async () => { 18 | const user = userEvent.setup(); 19 | render(CharacterList); 20 | 21 | // Initial check 22 | let characterElements = screen.getAllByTestId("character"); 23 | expect(characterElements.length).toBe(3); 24 | 25 | // Click the button to add George 26 | await user.click(screen.getByTestId("add-george-button")); 27 | 28 | // Get updated elements 29 | characterElements = screen.getAllByTestId("character"); 30 | 31 | // Check that George was added 32 | expect(characterElements.length).toBe(4); 33 | expect(characterElements[3]).toHaveTextContent("George"); 34 | 35 | // Verify original characters are still there 36 | expect(characterElements[0]).toHaveTextContent("Jerry"); 37 | 
expect(characterElements[1]).toHaveTextContent("Elaine"); 38 | expect(characterElements[2]).toHaveTextContent("Kramer"); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=your-openai-api-key 2 | ANTHROPIC_API_KEY=your-anthropic-api-key 3 | GEMINI_API_KEY=your-gemini-api-key 4 | OPENROUTER_API_KEY=your-openrouter-api-key 5 | Z_AI_API_KEY=your-zai-api-key 6 | MOONSHOT_API_KEY=your-moonshot-api-key 7 | 8 | # Ollama configuration (optional - defaults to http://127.0.0.1:11434) 9 | # OLLAMA_HOST=http://127.0.0.1:11434 10 | 11 | # Debug Mode Settings 12 | # Set to "true" to enable debug mode (runs only one test with one model) 13 | # DEBUG_MODE=false 14 | # Optionally specify which test to run (defaults to first test if not specified) 15 | # DEBUG_TEST=counter 16 | # Optionally specify number of samples to generate in debug mode (defaults to 1) 17 | # DEBUG_SAMPLES=5 18 | # Optionally specify which provider to use (defaults to first provider if not specified) 19 | # DEBUG_PROVIDER=openai 20 | # Optionally specify which model to use (defaults to first model of the provider if not specified) 21 | # DEBUG_MODEL=gpt-4o-2024-11-20 22 | 23 | # OpenRouter Provider Selection (optional) 24 | # Specify preferred provider routing strategy for OpenRouter requests 25 | # See: https://openrouter.ai/docs/features/provider-routing 26 | # OPENROUTER_PROVIDER=auto 27 | 28 | # EXPERIMENTAL 29 | # Enable parallel testing (disabled by default, consumes tokens very quickly!) 30 | # PARALLEL_EXECUTION=true 31 | 32 | # Retry Configuration for LLM API Calls 33 | # Maximum number of retry attempts for failed LLM requests (default: 100) 34 | # RETRY_MAX_ATTEMPTS=100 35 | # Initial delay in milliseconds before first retry (default: 1000) 36 | # RETRY_INITIAL_DELAY_MS=1000 37 | # Maximum delay in milliseconds between retries (default: 30000) 38 | # RETRY_MAX_DELAY_MS=30000 39 | # Backoff factor for exponential delay increase (default: 2) 40 | # RETRY_BACKOFF_FACTOR=2 41 | -------------------------------------------------------------------------------- /.github/workflows/build-static.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Benchmark Results 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: pages 16 | cancel-in-progress: true 17 | 18 | jobs: 19 | build-and-deploy: 20 | runs-on: ubuntu-latest 21 | environment: 22 | name: github-pages 23 | url: ${{ steps.deployment.outputs.page_url }} 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | with: 29 | fetch-depth: 1 30 | persist-credentials: false 31 | 32 | - name: Setup pnpm 33 | uses: pnpm/action-setup@v4 34 | with: 35 | version: latest 36 | 37 | - name: Setup Node.js 38 | uses: actions/setup-node@v4 39 | with: 40 | node-version: 24 41 | cache: pnpm 42 | cache-dependency-path: pnpm-lock.yaml 43 | 44 | # Silence/enable required native builds (seen as "Ignored build scripts: esbuild") 45 | - name: Approve pnpm builds 46 | run: pnpm approve-builds esbuild 47 | 48 | - name: Install deps 49 | run: pnpm install --frozen-lockfile --prefer-offline 50 | 51 | - name: Build static files 52 | env: 53 | NODE_ENV: production 54 | run: pnpm run build 55 | 56 | - name: Setup Pages 57 | uses: 
actions/configure-pages@v5 58 | 59 | - name: Upload pages artifact 60 | uses: actions/upload-pages-artifact@v3 61 | with: 62 | path: benchmarks 63 | name: github-pages 64 | retention-days: 1 65 | 66 | - name: Deploy to GitHub Pages 67 | id: deployment 68 | uses: actions/deploy-pages@v4 69 | -------------------------------------------------------------------------------- /src/utils/ensure-dirs.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility to ensure all required directories exist 3 | */ 4 | 5 | import fs from "fs/promises"; 6 | import path from "path"; 7 | import { getAllLLMProviders } from "../llms"; 8 | 9 | /** 10 | * Ensure that required directories exist 11 | */ 12 | export async function ensureRequiredDirectories(): Promise { 13 | // Base directories 14 | const baseDirectories = [ 15 | path.resolve(process.cwd(), "tmp"), 16 | path.resolve(process.cwd(), "tmp", "samples"), 17 | path.resolve(process.cwd(), "tmp", "checkpoint"), 18 | path.resolve(process.cwd(), "benchmarks"), 19 | ]; 20 | 21 | for (const dir of baseDirectories) { 22 | try { 23 | await fs.mkdir(dir, { recursive: true }); 24 | } catch (error) { 25 | console.error(`Error creating directory ${dir}:`, error); 26 | throw error; 27 | } 28 | } 29 | 30 | // Create provider-specific directories 31 | try { 32 | const providerModels = await getAllLLMProviders(); 33 | 34 | // Get unique provider names 35 | const providerNames = [...new Set(providerModels.map((pm) => pm.name))]; 36 | 37 | // Create sample and checkpoint directories for each provider 38 | for (const provider of providerNames) { 39 | const sampleDir = path.resolve(process.cwd(), "tmp", "samples", provider.toLowerCase()); 40 | const checkpointDir = path.resolve(process.cwd(), "tmp", "checkpoint", provider.toLowerCase()); 41 | await fs.mkdir(sampleDir, { recursive: true }); 42 | await fs.mkdir(checkpointDir, { recursive: true }); 43 | } 44 | } catch (error) { 45 | console.error("Error creating provider-specific directories:", error); 46 | // Don't throw here, as missing provider-specific directories will be created on demand 47 | console.warn("Provider-specific directories will be created on demand"); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/utils/retry-wrapper.ts: -------------------------------------------------------------------------------- 1 | export interface RetryOptions { 2 | maxAttempts?: number; 3 | initialDelayMs?: number; 4 | maxDelayMs?: number; 5 | backoffFactor?: number; 6 | onRetry?: (error: Error, attempt: number) => void; 7 | } 8 | 9 | const DEFAULT_OPTIONS: Required = { 10 | maxAttempts: parseInt(process.env.RETRY_MAX_ATTEMPTS || "5", 10), 11 | initialDelayMs: parseInt(process.env.RETRY_INITIAL_DELAY_MS || "1000", 10), 12 | maxDelayMs: parseInt(process.env.RETRY_MAX_DELAY_MS || "30000", 10), 13 | backoffFactor: parseFloat(process.env.RETRY_BACKOFF_FACTOR || "2"), 14 | onRetry: (error, attempt) => { 15 | console.warn(`⚠️ Retry attempt ${attempt} after error: ${error.message}`); 16 | }, 17 | }; 18 | 19 | export async function withRetry(fn: () => Promise, options?: RetryOptions): Promise { 20 | const opts = { ...DEFAULT_OPTIONS, ...options }; 21 | let lastError: Error; 22 | 23 | for (let attempt = 1; attempt <= opts.maxAttempts; attempt++) { 24 | try { 25 | return await fn(); 26 | } catch (error) { 27 | lastError = error instanceof Error ? 
error : new Error(String(error)); 28 | 29 | if (attempt === opts.maxAttempts) { 30 | console.error(`❌ Failed after ${opts.maxAttempts} attempts: ${lastError.message}`); 31 | throw lastError; 32 | } 33 | 34 | opts.onRetry(lastError, attempt); 35 | 36 | const baseDelayMs = Math.min(opts.initialDelayMs * Math.pow(opts.backoffFactor, attempt - 1), opts.maxDelayMs); 37 | 38 | // Add random jitter between 10-250ms to prevent thundering herd 39 | const jitterMs = Math.floor(Math.random() * 241) + 10; // 10-250ms 40 | const totalDelayMs = baseDelayMs + jitterMs; 41 | 42 | console.log(`⏳ Waiting ${totalDelayMs}ms before retry...`); 43 | await new Promise((resolve) => setTimeout(resolve, totalDelayMs)); 44 | } 45 | } 46 | 47 | throw lastError!; 48 | } 49 | -------------------------------------------------------------------------------- /src/utils/humaneval.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Implementation of the HumanEval methodology from the paper 3 | * "Evaluating Large Language Models Trained on Code" 4 | * 5 | * This implements the pass@k metric calculation as described in the paper. 6 | */ 7 | 8 | /** 9 | * Calculate pass@k using the unbiased estimator formula from the HumanEval paper 10 | * 11 | * The pass@k metric measures the probability that at least one of k randomly 12 | * selected samples would pass all unit tests. 13 | * 14 | * Formula: pass@k = 1 - (n-c choose k) / (n choose k) 15 | * 16 | * Numerically stable implementation using product form: 17 | * pass@k = 1 - prod(1 - k/j) for j from n-c+1 to n 18 | * 19 | * @param n Total number of samples 20 | * @param c Number of correct samples (samples that pass all tests) 21 | * @param k K in pass@k (number of samples to select) 22 | * @returns Unbiased estimate of pass@k 23 | */ 24 | export function calculatePassAtK(n: number, c: number, k: number): number { 25 | // If we have more correct samples than k, or exactly k samples remain 26 | // after removing all correct ones, we're guaranteed to get at least 27 | // one correct sample in our selection of k 28 | if (n - c < k) return 1.0; 29 | 30 | // Calculate 1 - prod(1 - k/j) for j from n-c+1 to n 31 | let result = 1.0; 32 | for (let j = n - c + 1; j <= n; j++) { 33 | result *= 1.0 - k / j; 34 | } 35 | 36 | return 1.0 - result; 37 | } 38 | 39 | /** 40 | * Interface for storing HumanEval results 41 | */ 42 | export interface HumanEvalResult { 43 | testName: string; 44 | provider: string; 45 | modelId: string; 46 | numSamples: number; 47 | numCorrect: number; 48 | pass1: number; 49 | pass10: number; 50 | context?: { 51 | used: boolean; 52 | filename?: string; 53 | content?: string; 54 | }; 55 | samples: { 56 | index: number; 57 | code: string; 58 | success: boolean; 59 | errors: string[]; 60 | temperature?: number; // Added temperature tracking 61 | }[]; 62 | } 63 | -------------------------------------------------------------------------------- /src/utils/model-validator.ts: -------------------------------------------------------------------------------- 1 | import { getLLMProvider } from "../llms"; 2 | 3 | /** 4 | * Validates if a model exists for a given provider by making a minimal API call 5 | * @param provider The LLM provider name 6 | * @param model The model identifier to validate 7 | * @returns true if model is valid, false otherwise 8 | */ 9 | export async function validateModel(provider: string, model: string): Promise { 10 | const simplePrompt = "Return the word 'test'"; 11 | 12 | try { 13 | // Get the provider 
instance with the specified model 14 | const llmProvider = await getLLMProvider(provider, model); 15 | 16 | // Make a minimal API call to validate model 17 | await llmProvider.generateCode(simplePrompt, 0.1); 18 | return true; 19 | } catch (error: any) { 20 | // Check for model not found errors 21 | if ( 22 | error.message?.includes("does not exist") || 23 | error.message?.includes("not found") || 24 | error.message?.includes("model must be") || 25 | error.status === 404 || 26 | error.status === 400 27 | ) { 28 | console.error(`Invalid model '${model}' for provider ${provider}: ${error.message}`); 29 | return false; 30 | } 31 | 32 | // For other errors (network, auth), throw them up 33 | console.error(`Validation error for ${provider}/${model}:`, error.message); 34 | throw error; 35 | } 36 | } 37 | 38 | /** 39 | * Validates multiple models for a provider 40 | * @param provider The LLM provider name 41 | * @param models Array of model identifiers to validate 42 | * @returns Array of valid model names 43 | */ 44 | export async function validateModels(provider: string, models: string[]): Promise { 45 | const validModels: string[] = []; 46 | 47 | for (const model of models) { 48 | try { 49 | const isValid = await validateModel(provider, model); 50 | if (isValid) { 51 | validModels.push(model); 52 | console.log(`✓ Model ${model} is valid for ${provider}`); 53 | } else { 54 | console.log(`✗ Model ${model} is not available for ${provider}`); 55 | } 56 | } catch (error) { 57 | console.error(`Failed to validate ${model} for ${provider}:`, error); 58 | } 59 | } 60 | 61 | return validModels; 62 | } 63 | -------------------------------------------------------------------------------- /src/tests/derived-by/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import TextAnalyzer from "./Component.svelte"; 5 | 6 | describe("TextAnalyzer component", () => { 7 | test("renders with initial state", () => { 8 | render(TextAnalyzer); 9 | 10 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 0"); 11 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 0"); 12 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text"); 13 | }); 14 | 15 | test("updates counts when text is entered", async () => { 16 | const user = userEvent.setup(); 17 | render(TextAnalyzer); 18 | 19 | const input = screen.getByTestId("text-input"); 20 | 21 | // Enter a short text 22 | await user.type(input, "Hello world"); 23 | 24 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 2"); 25 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 11"); 26 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text"); 27 | 28 | // Clear and enter a longer text 29 | await user.clear(input); 30 | await user.type( 31 | input, 32 | "This is a much longer text that should have more than fifteen words so that we can test the long text indicator functionality properly", 33 | ); 34 | 35 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 24"); 36 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 134"); 37 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Long text"); 38 | }); 39 | 40 | test("clear button resets the text", async () => { 41 | const user = 
userEvent.setup(); 42 | render(TextAnalyzer); 43 | 44 | const input = screen.getByTestId("text-input"); 45 | const clearButton = screen.getByTestId("clear-button"); 46 | 47 | // Enter some text 48 | await user.type(input, "Hello world"); 49 | 50 | // Verify counts 51 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 2"); 52 | 53 | // Click the clear button 54 | await user.click(clearButton); 55 | 56 | // Verify everything is reset 57 | expect(screen.getByTestId("word-count")).toHaveTextContent("Words: 0"); 58 | expect(screen.getByTestId("char-count")).toHaveTextContent("Characters: 0"); 59 | expect(screen.getByTestId("length-indicator")).toHaveTextContent("Status: Short text"); 60 | expect(input).toHaveValue(""); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /src/tests/counter/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import Counter from "./Component.svelte"; // Path to the generated component 5 | 6 | describe("Counter component", () => { 7 | test("renders with initial count of 0", () => { 8 | render(Counter); 9 | 10 | // Use data-testid to get elements 11 | const countElement = screen.getByTestId("count-value"); 12 | const decrementButton = screen.getByTestId("decrement-button"); 13 | const incrementButton = screen.getByTestId("increment-button"); 14 | 15 | // Check initial state 16 | expect(countElement).toHaveTextContent("0"); 17 | expect(decrementButton).toBeInTheDocument(); 18 | expect(incrementButton).toBeInTheDocument(); 19 | }); 20 | 21 | test("increments the count when + button is clicked", async () => { 22 | const user = userEvent.setup(); 23 | render(Counter); 24 | 25 | const incrementButton = screen.getByTestId("increment-button"); 26 | const countElement = screen.getByTestId("count-value"); 27 | 28 | // Initial count should be 0 29 | expect(countElement).toHaveTextContent("0"); 30 | 31 | // Click the increment button 32 | await user.click(incrementButton); 33 | 34 | // Count should now be 1 35 | expect(countElement).toHaveTextContent("1"); 36 | }); 37 | 38 | test("decrements the count when - button is clicked", async () => { 39 | const user = userEvent.setup(); 40 | render(Counter); 41 | 42 | const decrementButton = screen.getByTestId("decrement-button"); 43 | const countElement = screen.getByTestId("count-value"); 44 | 45 | // Initial count should be 0 46 | expect(countElement).toHaveTextContent("0"); 47 | 48 | // Click the decrement button 49 | await user.click(decrementButton); 50 | 51 | // Count should now be -1 52 | expect(countElement).toHaveTextContent("-1"); 53 | }); 54 | 55 | test("handles multiple clicks correctly", async () => { 56 | const user = userEvent.setup(); 57 | render(Counter); 58 | 59 | const decrementButton = screen.getByTestId("decrement-button"); 60 | const incrementButton = screen.getByTestId("increment-button"); 61 | const countElement = screen.getByTestId("count-value"); 62 | 63 | // Increment twice 64 | await user.click(incrementButton); 65 | await user.click(incrementButton); 66 | expect(countElement).toHaveTextContent("2"); 67 | 68 | // Decrement once 69 | await user.click(decrementButton); 70 | expect(countElement).toHaveTextContent("1"); 71 | }); 72 | }); 73 | -------------------------------------------------------------------------------- 
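A quick worked example of the pass@k estimator defined in src/utils/humaneval.ts above. The snippet below is illustrative only; the import path assumes a scratch script at the repository root, which is not one of the files listed here.

```ts
import { calculatePassAtK } from "./src/utils/humaneval";

// Suppose one test produced n = 10 samples and c = 3 of them passed all unit tests.
const pass1 = calculatePassAtK(10, 3, 1); // 1 - (7/8)(8/9)(9/10) = 0.3
const pass10 = calculatePassAtK(10, 3, 10); // n - c = 7 < k = 10, so the estimate is 1.0

console.log({ pass1, pass10 });
```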
/src/llms/google.ts: -------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import { GoogleGenAI } from "@google/genai"; 4 | 5 | export class GoogleGenAIProvider implements LLMProvider { 6 | private client: GoogleGenAI; 7 | private modelId: string; 8 | name = "Google"; 9 | 10 | constructor(modelId?: string) { 11 | const apiKey = process.env.GEMINI_API_KEY; 12 | if (!apiKey) { 13 | throw new Error("GEMINI_API_KEY environment variable is required"); 14 | } 15 | this.client = new GoogleGenAI({ apiKey }); 16 | // Default to gemini-2.0-flash-exp if no model specified 17 | this.modelId = modelId || "gemini-2.0-flash-exp"; 18 | } 19 | 20 | /** 21 | * Generate code from a prompt using Google Gemini 22 | * @param prompt The prompt to send to the LLM 23 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 24 | * @param contextContent Optional context content to include in prompts 25 | * @returns The generated code 26 | */ 27 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 28 | try { 29 | console.log( 30 | `🤖 Generating code with Google Gemini using model: ${this.modelId} (temp: ${temperature ?? "default"})...`, 31 | ); 32 | 33 | const systemPrompt = contextContent ? DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 34 | 35 | const promptWithContext = contextContent 36 | ? `${systemPrompt}\n\n${contextContent}\n\n${prompt}` 37 | : `${systemPrompt}\n\n${prompt}`; 38 | 39 | const requestOptions: any = { 40 | model: this.modelId, 41 | contents: promptWithContext, 42 | }; 43 | 44 | // Only add temperature config if it's defined 45 | if (temperature !== undefined) { 46 | requestOptions.config = { 47 | temperature: temperature, 48 | }; 49 | } 50 | 51 | const response = await this.client.models.generateContent(requestOptions); 52 | 53 | return response.text || ""; 54 | } catch (error) { 55 | console.error("Error generating code with Google Gemini:", error); 56 | throw new Error(`Failed to generate code: ${error instanceof Error ? 
error.message : String(error)}`); 57 | } 58 | } 59 | 60 | /** 61 | * Get all available models for this provider 62 | * @returns Array of model identifiers 63 | */ 64 | getModels(): string[] { 65 | // Return empty array since models are now dynamically validated 66 | return []; 67 | } 68 | 69 | /** 70 | * Get the model identifier that was used for generation 71 | * @returns The model identifier string 72 | */ 73 | getModelIdentifier(): string { 74 | return this.modelId; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/tests/props/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import PropsDemo from "./Component.svelte"; // Path to the generated component 5 | 6 | describe("PropsDemo component", () => { 7 | test("renders with default props", () => { 8 | render(PropsDemo); 9 | 10 | // Check default values 11 | const nameDisplay = screen.getByTestId("name-display"); 12 | const countDisplay = screen.getByTestId("count-display"); 13 | 14 | expect(nameDisplay).toHaveTextContent("World"); 15 | // Just check for the number 0 without requiring "Count: " prefix 16 | expect(countDisplay.textContent).toMatch(/0/); 17 | 18 | // Details should not be visible by default 19 | expect(screen.queryByTestId("details")).not.toBeInTheDocument(); 20 | }); 21 | 22 | test("renders with custom props", () => { 23 | render(PropsDemo, { 24 | props: { name: "Svelte", count: 5, showDetails: true }, 25 | }); 26 | 27 | // Check custom values 28 | const nameDisplay = screen.getByTestId("name-display"); 29 | const countDisplay = screen.getByTestId("count-display"); 30 | const details = screen.getByTestId("details"); 31 | 32 | expect(nameDisplay).toHaveTextContent("Svelte"); 33 | // Just check for the number 5 without requiring specific text before it 34 | expect(countDisplay.textContent).toMatch(/5/); 35 | 36 | expect(details).toBeInTheDocument(); 37 | // Check for values without requiring specific text format 38 | expect(details.textContent).toMatch(/Svelte/); 39 | expect(details.textContent).toMatch(/5/); 40 | expect(details.textContent).toMatch(/true/i); // case insensitive match for "true" 41 | }); 42 | 43 | test("increment button increases count", async () => { 44 | const user = userEvent.setup(); 45 | render(PropsDemo, { props: { count: 10 } }); 46 | 47 | const incrementButton = screen.getByTestId("increment-button"); 48 | const countDisplay = screen.getByTestId("count-display"); 49 | 50 | // Initial count should be 10, without requiring "Count: " prefix 51 | expect(countDisplay.textContent).toMatch(/10/); 52 | 53 | // Click the increment button 54 | await user.click(incrementButton); 55 | 56 | // Count should now be 11, without requiring "Count: " prefix 57 | expect(countDisplay.textContent).toMatch(/11/); 58 | }); 59 | 60 | test("conditional rendering works correctly", () => { 61 | // First render without details 62 | const { unmount } = render(PropsDemo, { props: { showDetails: false } }); 63 | expect(screen.queryByTestId("details")).not.toBeInTheDocument(); 64 | 65 | // Unmount and render again with details 66 | unmount(); 67 | render(PropsDemo, { props: { showDetails: true } }); 68 | expect(screen.getByTestId("details")).toBeInTheDocument(); 69 | }); 70 | }); 71 | -------------------------------------------------------------------------------- 
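For orientation, here is a minimal sketch of how the provider classes in src/llms/ are typically combined with the utilities shown earlier. The wiring is illustrative only: the real orchestration lives in index.ts and the test-manager/test-runner utilities, which are not reproduced in this section, and the provider and model names in the usage comment are just examples.

```ts
import { getLLMProvider } from "./src/llms";
import { cleanCodeMarkdown } from "./src/utils/code-cleaner";
import { withRetry } from "./src/utils/retry-wrapper";

// Hypothetical helper: generate one cleaned component sample for a prompt.
async function generateSample(providerName: string, modelId: string, prompt: string): Promise<string> {
  // Same entry point that model-validator.ts uses to resolve a provider + model pair
  const provider = await getLLMProvider(providerName, modelId);

  // Wrap the API call in the shared exponential-backoff retry helper
  const raw = await withRetry(() => provider.generateCode(prompt, 0.7));

  // Strip any markdown code fences before the code is written out as Component.svelte
  return cleanCodeMarkdown(raw);
}

// Usage (names are illustrative): await generateSample("anthropic", "claude-3-5-sonnet-20241022", promptText);
```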
/src/llms/anthropic.ts: -------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import { Anthropic } from "@anthropic-ai/sdk"; 4 | 5 | export class AnthropicProvider implements LLMProvider { 6 | private client: Anthropic; 7 | private modelId: string; 8 | name = "Anthropic"; 9 | 10 | constructor(modelId?: string) { 11 | const apiKey = process.env.ANTHROPIC_API_KEY; 12 | if (!apiKey) { 13 | throw new Error("ANTHROPIC_API_KEY environment variable is required"); 14 | } 15 | this.client = new Anthropic({ apiKey }); 16 | // Default to claude-3-5-sonnet if no model specified 17 | this.modelId = modelId || "claude-3-5-sonnet-20241022"; 18 | } 19 | 20 | /** 21 | * Generate code from a prompt using Anthropic Claude 22 | * @param prompt The prompt to send to the LLM 23 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 24 | * @param contextContent Optional context content to include in prompts 25 | * @returns The generated code 26 | */ 27 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 28 | try { 29 | console.log( 30 | `🤖 Generating code with Anthropic using model: ${this.modelId} (temp: ${temperature ?? "default"})...`, 31 | ); 32 | 33 | const systemPrompt = contextContent ? DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 34 | 35 | const promptWithContext = contextContent 36 | ? `${systemPrompt}\n\n${contextContent}\n\n${prompt}` 37 | : `${systemPrompt}\n\n${prompt}`; 38 | 39 | const requestOptions: any = { 40 | model: this.modelId, 41 | max_tokens: 4000, 42 | messages: [ 43 | { 44 | role: "user", 45 | content: [ 46 | { 47 | type: "text", 48 | text: promptWithContext, 49 | }, 50 | ], 51 | }, 52 | ], 53 | }; 54 | 55 | // Only add temperature if it's defined 56 | if (temperature !== undefined) { 57 | requestOptions.temperature = temperature; 58 | } 59 | 60 | const completion = await this.client.messages.create(requestOptions); 61 | 62 | return completion.content[0]?.type === "text" ? completion.content[0].text : ""; 63 | } catch (error) { 64 | console.error("Error generating code with Anthropic:", error); 65 | throw new Error(`Failed to generate code: ${error instanceof Error ? error.message : String(error)}`); 66 | } 67 | } 68 | 69 | /** 70 | * Get all available models for this provider 71 | * @returns Array of model identifiers 72 | */ 73 | getModels(): string[] { 74 | // Return empty array since models are now dynamically validated 75 | return []; 76 | } 77 | 78 | /** 79 | * Get the model identifier that was used for generation 80 | * @returns The model identifier string 81 | */ 82 | getModelIdentifier(): string { 83 | return this.modelId; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/utils/code-cleaner.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect } from "vitest"; 2 | import { cleanCodeMarkdown } from "./code-cleaner"; 3 | 4 | describe("cleanCodeMarkdown", () => { 5 | it("should remove triple backticks with language identifier", () => { 6 | const input = "```svelte\n
<div>\n    Hello\n</div>\n```"; 7 |     const expected = "<div>\n    Hello\n</div>"; 8 |     expect(cleanCodeMarkdown(input)).toBe(expected); 9 |   }); 10 |  11 |   it("should remove triple backticks without language identifier", () => { 12 |     const input = "```\n<div>\n    Hello\n</div>\n```"; 13 |     const expected = "<div>\n    Hello\n</div>
    "; 14 | expect(cleanCodeMarkdown(input)).toBe(expected); 15 | }); 16 | 17 | it("should preserve backticks in JavaScript template literals", () => { 18 | const input = 'console.log(`Text changed from "${oldValue}" to "${newValue}"`)'; 19 | const expected = 'console.log(`Text changed from "${oldValue}" to "${newValue}"`)'; 20 | expect(cleanCodeMarkdown(input)).toBe(expected); 21 | }); 22 | 23 | it("should preserve backticks in template literals within code blocks", () => { 24 | const input = `\`\`\`js 25 | $inspect(text).with((newValue, oldValue) => { 26 | console.log(\`Text changed from "\${oldValue}" to "\${newValue}"\`) 27 | }) 28 | \`\`\``; 29 | const expected = `$inspect(text).with((newValue, oldValue) => { 30 | console.log(\`Text changed from "\${oldValue}" to "\${newValue}"\`) 31 | })`; 32 | expect(cleanCodeMarkdown(input)).toBe(expected); 33 | }); 34 | 35 | it("should handle multiple code blocks", () => { 36 | const input = `\`\`\`svelte 37 | 40 | \`\`\` 41 | 42 | \`\`\`js 43 | console.log(\`value: \${value}\`); 44 | \`\`\``; 45 | const expected = ` 48 | console.log(\`value: \${value}\`);`; 49 | expect(cleanCodeMarkdown(input)).toBe(expected); 50 | }); 51 | 52 | it("should preserve single backticks in inline code", () => { 53 | const input = "The variable `count` is used here"; 54 | const expected = "The variable `count` is used here"; 55 | expect(cleanCodeMarkdown(input)).toBe(expected); 56 | }); 57 | 58 | it("should handle edge case with backticks at start and end", () => { 59 | const input = "`code`"; 60 | const expected = "`code`"; 61 | expect(cleanCodeMarkdown(input)).toBe(expected); 62 | }); 63 | 64 | it("should remove only markdown code fences, not content backticks", () => { 65 | const input = `\`\`\`javascript 66 | const greeting = \`Hello, \${name}!\`; 67 | const farewell = \`Goodbye, \${name}!\`; 68 | \`\`\``; 69 | const expected = `const greeting = \`Hello, \${name}!\`; 70 | const farewell = \`Goodbye, \${name}!\`;`; 71 | expect(cleanCodeMarkdown(input)).toBe(expected); 72 | }); 73 | 74 | it("should handle the real-world inspect example from the issue", () => { 75 | const input = `\`\`\`svelte 76 | $inspect(text).with((newValue, oldValue) => { 77 | console.log(\`Text changed from "\${oldValue}" to "\${newValue}"\`) 78 | }) 79 | \`\`\``; 80 | const expected = `$inspect(text).with((newValue, oldValue) => { 81 | console.log(\`Text changed from "\${oldValue}" to "\${newValue}"\`) 82 | })`; 83 | expect(cleanCodeMarkdown(input)).toBe(expected); 84 | }); 85 | }); 86 | -------------------------------------------------------------------------------- /src/llms/ollama.ts: -------------------------------------------------------------------------------- 1 | import { Agent } from "undici"; 2 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 3 | import type { LLMProvider } from "./index"; 4 | import { Ollama, type ChatRequest } from "ollama"; 5 | 6 | // https://github.com/ollama/ollama-js/issues/103 7 | const noTimeoutFetch = (input: string | URL | globalThis.Request, init?: RequestInit) => { 8 | const someInit = init || {}; 9 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 10 | return fetch(input, { 11 | ...someInit, 12 | dispatcher: new Agent({ headersTimeout: 2700000 }), 13 | } as any); 14 | }; 15 | 16 | export class OllamaProvider implements LLMProvider { 17 | private client: Ollama; 18 | private modelId: string; 19 | name = "Ollama"; 20 | private readonly availableModels = 
["hf.co/bartowski/open-thoughts_OpenThinker3-7B-GGUF:Q8_0"]; 21 | 22 | constructor(modelId?: string) { 23 | // Get Ollama host from environment variable or use default 24 | const host = process.env.OLLAMA_HOST || "http://127.0.0.1:11434"; 25 | 26 | this.client = new Ollama({ host, fetch: noTimeoutFetch }); 27 | this.modelId = modelId || this.availableModels[0]; 28 | } 29 | 30 | /** 31 | * Generate code from a prompt using Ollama 32 | * @param prompt The prompt to send to the LLM 33 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 34 | * @param contextContent Optional context content to include in prompts 35 | * @returns The generated code 36 | */ 37 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 38 | try { 39 | console.log(`🤖 Generating code with Ollama using model: ${this.modelId} (temp: ${temperature ?? "default"})...`); 40 | 41 | const systemPrompt = contextContent ? DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 42 | 43 | const messages: Array<{ 44 | role: "system" | "user" | "assistant"; 45 | content: string; 46 | }> = [ 47 | { 48 | role: "system", 49 | content: systemPrompt, 50 | }, 51 | ]; 52 | 53 | // Add context message if available 54 | if (contextContent) { 55 | messages.push({ 56 | role: "user", 57 | content: contextContent, 58 | }); 59 | } 60 | 61 | // Add the main prompt 62 | messages.push({ 63 | role: "user", 64 | content: prompt, 65 | }); 66 | 67 | const requestOptions: ChatRequest & { stream: false } = { 68 | model: this.modelId, 69 | messages: messages, 70 | stream: false, 71 | }; 72 | 73 | // Add temperature if it's defined 74 | if (temperature !== undefined) { 75 | requestOptions.options = { 76 | temperature: temperature, 77 | }; 78 | } 79 | 80 | const response = (await this.client.chat(requestOptions)) as any; 81 | 82 | return response.message?.content || ""; 83 | } catch (error) { 84 | console.error("Error generating code with Ollama:", error); 85 | throw new Error(`Failed to generate code: ${error instanceof Error ? 
error.message : String(error)}`); 86 | } 87 | } 88 | 89 | /** 90 | * Get all available models for this provider 91 | * @returns Array of model identifiers 92 | */ 93 | getModels(): string[] { 94 | return [...this.availableModels]; 95 | } 96 | 97 | /** 98 | * Get the model identifier that was used for generation 99 | * @returns The model identifier string 100 | */ 101 | getModelIdentifier(): string { 102 | return this.modelId; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/utils/retry-wrapper.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, it, expect, vi } from "vitest"; 2 | import { withRetry } from "./retry-wrapper"; 3 | 4 | describe("retry-wrapper", () => { 5 | it("should succeed on first attempt", async () => { 6 | const mockFn = vi.fn().mockResolvedValue("success"); 7 | const result = await withRetry(mockFn); 8 | 9 | expect(result).toBe("success"); 10 | expect(mockFn).toHaveBeenCalledTimes(1); 11 | }); 12 | 13 | it("should retry on failure and eventually succeed", async () => { 14 | const mockFn = vi 15 | .fn() 16 | .mockRejectedValueOnce(new Error("First failure")) 17 | .mockRejectedValueOnce(new Error("Second failure")) 18 | .mockResolvedValueOnce("success"); 19 | 20 | const result = await withRetry(mockFn, { 21 | maxAttempts: 3, 22 | initialDelayMs: 10, 23 | maxDelayMs: 100, 24 | }); 25 | 26 | expect(result).toBe("success"); 27 | expect(mockFn).toHaveBeenCalledTimes(3); 28 | }); 29 | 30 | it("should throw after max attempts", async () => { 31 | const mockFn = vi.fn().mockRejectedValue(new Error("Always fails")); 32 | 33 | await expect( 34 | withRetry(mockFn, { 35 | maxAttempts: 2, 36 | initialDelayMs: 10, 37 | maxDelayMs: 100, 38 | }), 39 | ).rejects.toThrow("Always fails"); 40 | 41 | expect(mockFn).toHaveBeenCalledTimes(2); 42 | }); 43 | 44 | it("should use exponential backoff", async () => { 45 | const delays: number[] = []; 46 | const startTime = Date.now(); 47 | 48 | const mockFn = vi.fn().mockImplementation(() => { 49 | delays.push(Date.now() - startTime); 50 | throw new Error("Test error"); 51 | }); 52 | 53 | await withRetry(mockFn, { 54 | maxAttempts: 3, 55 | initialDelayMs: 100, 56 | backoffFactor: 2, 57 | maxDelayMs: 1000, 58 | }).catch(() => {}); // Ignore the error 59 | 60 | expect(mockFn).toHaveBeenCalledTimes(3); 61 | 62 | // Check delays are approximately correct (with jitter tolerance) 63 | // Base delays: 100ms, 200ms + jitter (10-250ms each) 64 | // delays[1] = time from start to 2nd attempt (after first delay) 65 | // delays[2] = time from start to 3rd attempt (after first + second delays) 66 | expect(delays[1]).toBeGreaterThanOrEqual(110); // ~100ms + min jitter 67 | expect(delays[1]).toBeLessThanOrEqual(350); // ~100ms + max jitter 68 | expect(delays[2]).toBeGreaterThanOrEqual(320); // ~(100+200)ms + min jitter 69 | expect(delays[2]).toBeLessThanOrEqual(600); // ~(100+200)ms + max jitter 70 | }); 71 | 72 | it("should call onRetry callback", async () => { 73 | const onRetry = vi.fn(); 74 | const mockFn = vi.fn().mockRejectedValueOnce(new Error("First failure")).mockResolvedValueOnce("success"); 75 | 76 | await withRetry(mockFn, { 77 | maxAttempts: 2, 78 | initialDelayMs: 10, 79 | onRetry, 80 | }); 81 | 82 | expect(onRetry).toHaveBeenCalledTimes(1); 83 | expect(onRetry).toHaveBeenCalledWith(expect.objectContaining({ message: "First failure" }), 1); 84 | }); 85 | 86 | it("should add jitter between 10-250ms to delay", async () => { 87 | const 
delays: number[] = []; 88 | const startTimes: number[] = []; 89 | let attemptCount = 0; 90 | 91 | const mockFn = vi.fn().mockImplementation(() => { 92 | if (attemptCount === 0) { 93 | startTimes.push(Date.now()); 94 | attemptCount++; 95 | throw new Error("Test error"); 96 | } else { 97 | delays.push(Date.now() - startTimes[0]); 98 | throw new Error("Test error"); 99 | } 100 | }); 101 | 102 | await withRetry(mockFn, { 103 | maxAttempts: 2, 104 | initialDelayMs: 100, // Base delay 105 | backoffFactor: 1, // No exponential backoff for easier testing 106 | maxDelayMs: 1000, 107 | }).catch(() => {}); // Ignore the error 108 | 109 | expect(mockFn).toHaveBeenCalledTimes(2); 110 | 111 | // The actual delay should be base (100ms) + jitter (10-250ms) 112 | // So total should be between 110ms and 350ms 113 | expect(delays[0]).toBeGreaterThanOrEqual(110); 114 | expect(delays[0]).toBeLessThanOrEqual(350); 115 | }); 116 | }); 117 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | 5 | ## Project Overview 6 | 7 | SvelteBench is an LLM benchmark tool for Svelte 5 components based on the HumanEval methodology. It evaluates LLM-generated Svelte components by testing them against predefined test suites and calculates pass@k metrics. 8 | 9 | **Core Architecture:** 10 | 11 | - `index.ts` - Main benchmark orchestrator that manages the full test cycle 12 | - `src/llms/` - Provider abstraction layer supporting OpenAI, Anthropic, Google, and OpenRouter 13 | - `src/tests/` - Test definitions with `prompt.md` and `test.ts` pairs 14 | - `src/utils/test-manager.ts` - Sequential HumanEval test execution logic (default) 15 | - `src/utils/parallel-test-manager.ts` - Parallel HumanEval test execution logic (optional) 16 | - `src/utils/test-runner.ts` - Vitest integration for component testing 17 | - `tmp/` - Runtime directory for generated components (unique subdirs per test/sample) 18 | 19 | ## Execution Modes 20 | 21 | SvelteBench supports two execution modes: 22 | 23 | - **Sequential (default)**: Tests run one at a time, with samples generated sequentially. Full sample-level checkpointing and resumption support. Provides detailed progress output and is more reliable for long-running benchmarks. 24 | - **Parallel**: Tests run one at a time, but samples within each test are generated in parallel for faster execution. Full sample-level checkpointing and resumption support with optimized output formatting. Set `PARALLEL_EXECUTION=true` to enable. 
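The mode switch itself boils down to a single environment check at startup. Below is a minimal sketch of how it could be wired; the manager function names (`runSequentialTests`, `runParallelTests`) are assumptions for illustration only, not the real exports in `index.ts`.

```ts
// Illustrative sketch only — the actual entry point is index.ts, and the
// exported manager function names below are assumptions, not the real API.
import "dotenv/config";

async function runBenchmark(): Promise<void> {
  const parallel = process.env.PARALLEL_EXECUTION === "true";

  if (parallel) {
    // Samples within each test generated concurrently (hypothetical export name)
    const { runParallelTests } = await import("./src/utils/parallel-test-manager");
    await runParallelTests();
  } else {
    // Default: one sample at a time, with detailed progress output (hypothetical export name)
    const { runSequentialTests } = await import("./src/utils/test-manager");
    await runSequentialTests();
  }
}

runBenchmark().catch((error) => {
  console.error("Benchmark failed:", error);
  process.exit(1);
});
```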
25 | 26 | ## Common Commands 27 | 28 | ```bash 29 | # Run full benchmark (sequential execution) 30 | pnpm start 31 | 32 | # Run with parallel execution (faster but more verbose) 33 | PARALLEL_EXECUTION=true pnpm start 34 | 35 | # Run only tests (without building visualization) 36 | pnpm run run-tests 37 | 38 | # Run with context file (Svelte docs) 39 | pnpm run run-tests -- --context ./context/svelte.dev/llms-small.txt 40 | 41 | # Run with both parallel execution and context 42 | PARALLEL_EXECUTION=true pnpm run run-tests -- --context ./context/svelte.dev/llms-small.txt 43 | 44 | # Run specific test with vitest 45 | pnpm test 46 | 47 | # Build visualization from results 48 | pnpm run build 49 | 50 | # Verify benchmark results 51 | pnpm run verify 52 | ``` 53 | 54 | ## Environment Variables 55 | 56 | Set environment variables to control execution behavior: 57 | 58 | ```bash 59 | # Debug mode for faster development testing 60 | DEBUG_MODE=true 61 | DEBUG_PROVIDER=openrouter 62 | DEBUG_MODEL=openai/gpt-oss-20b:free 63 | 64 | # Enable parallel execution for faster benchmark runs 65 | PARALLEL_EXECUTION=true 66 | ``` 67 | 68 | Multiple models can be specified: `DEBUG_MODEL=model1,model2,model3` 69 | 70 | ## Test Structure 71 | 72 | Each test in `src/tests/` requires: 73 | 74 | - `prompt.md` - Instructions for the LLM to generate a Svelte component 75 | - `test.ts` - Vitest tests that validate the generated component functionality 76 | - `Reference.svelte` - Reference implementation for validation 77 | 78 | The benchmark generates components in `tmp/{provider}/` directories and runs tests using the integrated Vitest setup. 79 | 80 | ## Versioning System 81 | 82 | **Current Results:** Results generated with fixed test prompts and improved error handling. All new benchmark runs produce results with: 83 | 84 | - Fixed quotation mark issues in test prompts that were causing model confusion 85 | - Corrected Svelte binding syntax examples (e.g., `bind:value={text}` instead of `bind:value="{text}"`) 86 | - Improved test reliability and accuracy 87 | - Clean filenames without version suffixes (e.g., `benchmark-results-2025-08-27T12-34-56.789Z.json`) 88 | 89 | **Legacy Results:** Historical results from original test suite in the `benchmarks/v1/` directory. These may contain inconsistencies due to prompt formatting issues. 
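The timestamp embedded in these filenames is what the merge scripts key on when picking the latest run per provider/model. A small sketch of the round-trip, using the same regex and substitution as `extractTimestamp` in `merge.ts` (helper name here is illustrative):

```ts
// Filenames embed an ISO timestamp with ":" replaced by "-"; parsing reverses
// that substitution (mirrors extractTimestamp in merge.ts).
function timestampFromFilename(filename: string): Date | null {
  const match = filename.match(/(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.\d{3}Z)/);
  if (!match) return null;
  const iso = match[1].replace(/T(\d{2})-(\d{2})-(\d{2})/, "T$1:$2:$3");
  return new Date(iso);
}

// Example: "benchmark-results-2025-08-27T12-34-56.789Z.json"
// → Date for 2025-08-27T12:34:56.789Z
timestampFromFilename("benchmark-results-2025-08-27T12-34-56.789Z.json");
```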
90 | 91 | ## Environment Setup 92 | 93 | Copy `.env.example` to `.env` and configure API keys for desired providers: 94 | 95 | - `OPENAI_API_KEY` - For GPT models 96 | - `ANTHROPIC_API_KEY` - For Claude models 97 | - `GEMINI_API_KEY` - For Gemini models 98 | - `OPENROUTER_API_KEY` - For OpenRouter access 99 | 100 | ## Testing and Validation 101 | 102 | - Tests use Vitest with @testing-library/svelte for component testing 103 | - Each test runs with a 120-second timeout 104 | - Pass@k metrics are calculated using HumanEval methodology (10 samples per test by default, 1 for expensive models) 105 | - Results are saved to timestamped JSON files in `benchmarks/` 106 | -------------------------------------------------------------------------------- /merge-v1.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs/promises"; 2 | import path from "path"; 3 | import type { HumanEvalResult } from "./src/utils/humaneval"; 4 | import { ensureBenchmarksDir } from "./src/utils/test-manager"; 5 | 6 | /** 7 | * Interface to track the latest file for each provider/model combination 8 | */ 9 | interface LatestFileInfo { 10 | filePath: string; 11 | timestamp: Date; 12 | results: HumanEvalResult[]; 13 | } 14 | 15 | /** 16 | * Extract timestamp from benchmark filename 17 | */ 18 | function extractTimestamp(filename: string): Date | null { 19 | // Match timestamp in format like benchmark-results-2023-10-15T12-34-56.123Z.json 20 | const match = filename.match(/(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.\d{3}Z)/); 21 | if (match && match[1]) { 22 | // Replace dashes with colons in the time part to make a valid ISO string 23 | const isoTimestamp = match[1].replace(/T(\d{2})-(\d{2})-(\d{2})/, "T$1:$2:$3"); 24 | return new Date(isoTimestamp); 25 | } 26 | return null; 27 | } 28 | 29 | /** 30 | * Get all benchmark JSON files from the benchmarks directory 31 | */ 32 | async function getBenchmarkFiles(): Promise { 33 | const benchmarksDir = path.resolve(process.cwd(), "benchmarks"); 34 | const files = await fs.readdir(benchmarksDir); 35 | 36 | // Filter for JSON files only, exclude the merged file itself, 37 | // and importantly, exclude files with "with-context" in the name 38 | return files 39 | .filter( 40 | (file) => 41 | file.endsWith(".json") && 42 | file.includes("benchmark-results") && 43 | !file.includes("with-context") && 44 | file !== "benchmark-results-merged.json", 45 | ) 46 | .map((file) => path.join(benchmarksDir, file)); 47 | } 48 | 49 | /** 50 | * Read and parse a benchmark file 51 | */ 52 | async function readBenchmarkFile(filePath: string): Promise { 53 | try { 54 | const content = await fs.readFile(filePath, "utf-8"); 55 | return JSON.parse(content); 56 | } catch (error) { 57 | console.error(`Error reading benchmark file ${filePath}:`, error); 58 | return []; 59 | } 60 | } 61 | 62 | /** 63 | * Find the latest file for each provider/model combination 64 | */ 65 | async function findLatestResultsForEachModel(): Promise> { 66 | const benchmarkFiles = await getBenchmarkFiles(); 67 | const latestFiles = new Map(); 68 | 69 | console.log(`🔍 Found ${benchmarkFiles.length} eligible benchmark files (excluding with-context files)`); 70 | 71 | for (const filePath of benchmarkFiles) { 72 | const filename = path.basename(filePath); 73 | const timestamp = extractTimestamp(filename); 74 | 75 | // Skip files where we can't extract a timestamp 76 | if (!timestamp) { 77 | console.warn(`⚠️ Skipping file with unparseable timestamp: ${filename}`); 78 | continue; 79 | } 80 | 
81 | // Read the results from this file 82 | const results = await readBenchmarkFile(filePath); 83 | 84 | // Group by provider/model combinations 85 | for (const result of results) { 86 | const key = `${result.provider}-${result.modelId}`; 87 | 88 | if (!latestFiles.has(key) || timestamp > latestFiles.get(key)!.timestamp) { 89 | latestFiles.set(key, { 90 | filePath, 91 | timestamp, 92 | results: results.filter((r) => r.provider === result.provider && r.modelId === result.modelId), 93 | }); 94 | } 95 | } 96 | } 97 | 98 | return latestFiles; 99 | } 100 | 101 | /** 102 | * Merge the latest results and save to a new file 103 | */ 104 | async function mergeAndSaveResults(): Promise { 105 | console.log("🔄 Merging benchmark results..."); 106 | 107 | // Get the latest results for each provider/model 108 | const latestResultsMap = await findLatestResultsForEachModel(); 109 | 110 | // Merge all results 111 | const mergedResults: HumanEvalResult[] = []; 112 | const includedFiles = new Set(); 113 | 114 | for (const [key, info] of latestResultsMap.entries()) { 115 | console.log(`📊 Including results for ${key} from ${path.basename(info.filePath)}`); 116 | mergedResults.push(...info.results); 117 | includedFiles.add(info.filePath); 118 | } 119 | 120 | // Save merged results 121 | await ensureBenchmarksDir(); 122 | const outputPath = path.resolve(process.cwd(), "benchmarks", "benchmark-results-merged.json"); 123 | 124 | await fs.writeFile(outputPath, JSON.stringify(mergedResults, null, 2)); 125 | 126 | console.log(`\n✅ Successfully merged results from ${includedFiles.size} files`); 127 | console.log(`✅ Total provider/model combinations: ${latestResultsMap.size}`); 128 | console.log(`✅ Total result entries: ${mergedResults.length}`); 129 | console.log(`✅ Merged results saved to: ${outputPath}`); 130 | } 131 | 132 | // Run the merge process 133 | mergeAndSaveResults().catch((error) => { 134 | console.error("Error merging benchmark results:", error); 135 | process.exit(1); 136 | }); 137 | -------------------------------------------------------------------------------- /src/tests/inspect/test.ts: -------------------------------------------------------------------------------- 1 | import { render, screen } from "@testing-library/svelte"; 2 | import { expect, test, describe, vi } from "vitest"; 3 | import userEvent from "@testing-library/user-event"; 4 | import InspectDemo from "./Component.svelte"; 5 | 6 | // Helper function to check text content with or without quotes 7 | const expectCurrentTextToBe = (element: HTMLElement, expectedText: string) => { 8 | const textContent = element.textContent || ""; 9 | const withQuotes = `Current text: "${expectedText}"`; 10 | const withoutQuotes = `Current text: ${expectedText}`; 11 | 12 | const hasWithQuotes = textContent.includes(withQuotes); 13 | const hasWithoutQuotes = textContent.includes(withoutQuotes); 14 | 15 | expect(hasWithQuotes || hasWithoutQuotes).toBe(true); 16 | 17 | if (!hasWithQuotes && !hasWithoutQuotes) { 18 | throw new Error( 19 | `Expected element to contain either "${withQuotes}" or "${withoutQuotes}", but got "${textContent}"`, 20 | ); 21 | } 22 | }; 23 | 24 | // Helper function to get all console output as a single string 25 | const getAllConsoleOutput = (consoleSpy: any) => { 26 | return consoleSpy.mock.calls.map((call: any[]) => call.join(" ")).join("\n"); 27 | }; 28 | 29 | describe("InspectDemo component", () => { 30 | test("renders with initial state", () => { 31 | render(InspectDemo); 32 | 33 | // Check initial text value and character count 
34 | expectCurrentTextToBe(screen.getByTestId("text-value"), "Hello world"); 35 | expect(screen.getByTestId("char-count")).toHaveTextContent("Character count: 11"); 36 | }); 37 | 38 | test("updates text value and character count when input changes", async () => { 39 | const user = userEvent.setup(); 40 | 41 | // Mock console.log to verify $inspect functionality 42 | const consoleSpy = vi.spyOn(console, "log"); 43 | 44 | render(InspectDemo); 45 | 46 | // Update the input field 47 | const input = screen.getByTestId("text-input"); 48 | await user.clear(input); 49 | await user.type(input, "Testing $inspect"); 50 | 51 | // Check if displayed text updated 52 | expectCurrentTextToBe(screen.getByTestId("text-value"), "Testing $inspect"); 53 | 54 | // Check if character count updated 55 | expect(screen.getByTestId("char-count")).toHaveTextContent("Character count: 16"); 56 | 57 | // Verify $inspect features are being used (console.log was called) 58 | // This proves $inspect, $inspect.with, and $inspect.trace are working 59 | expect(consoleSpy).toHaveBeenCalled(); 60 | 61 | // Verify standard $inspect output is present 62 | const output = getAllConsoleOutput(consoleSpy); 63 | expect(output).toContain("init"); // Basic $inspect always logs init event 64 | expect(output).toContain("update"); // Should have update events from typing 65 | 66 | // Restore original console.log 67 | consoleSpy.mockRestore(); 68 | }); 69 | 70 | test("handles special characters correctly", async () => { 71 | const user = userEvent.setup(); 72 | const consoleSpy = vi.spyOn(console, "log"); 73 | 74 | render(InspectDemo); 75 | 76 | // Update with special characters 77 | const input = screen.getByTestId("text-input"); 78 | await user.clear(input); 79 | await user.type(input, "!@#$%^&*()"); 80 | 81 | // Check if displayed text updated 82 | expectCurrentTextToBe(screen.getByTestId("text-value"), "!@#$%^&*()"); 83 | 84 | // Check if character count is correct 85 | expect(screen.getByTestId("char-count")).toHaveTextContent("Character count: 10"); 86 | 87 | // Verify $inspect features are working 88 | expect(consoleSpy).toHaveBeenCalled(); 89 | 90 | // Verify standard $inspect output is present 91 | const output = getAllConsoleOutput(consoleSpy); 92 | expect(output).toContain("init"); // Basic $inspect always logs init event 93 | expect(output).toContain("update"); // Should have update events from typing 94 | 95 | consoleSpy.mockRestore(); 96 | }); 97 | 98 | test("handles empty input correctly", async () => { 99 | const user = userEvent.setup(); 100 | const consoleSpy = vi.spyOn(console, "log"); 101 | 102 | render(InspectDemo); 103 | 104 | // Clear the input 105 | const input = screen.getByTestId("text-input"); 106 | await user.clear(input); 107 | 108 | // Check if displayed text is empty 109 | expectCurrentTextToBe(screen.getByTestId("text-value"), ""); 110 | 111 | // Check if character count is zero 112 | expect(screen.getByTestId("char-count")).toHaveTextContent("Character count: 0"); 113 | 114 | // Verify $inspect features are working 115 | expect(consoleSpy).toHaveBeenCalled(); 116 | 117 | // Verify standard $inspect output is present 118 | const output = getAllConsoleOutput(consoleSpy); 119 | expect(output).toContain("init"); // Basic $inspect always logs init event 120 | expect(output).toContain("update"); // Should have update events from clearing input 121 | 122 | consoleSpy.mockRestore(); 123 | }); 124 | }); 125 | -------------------------------------------------------------------------------- /src/llms/openai.ts: 
-------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import OpenAI from "openai"; 4 | import type { EasyInputMessage, ResponseCreateParamsNonStreaming } from "openai/resources/responses/responses"; 5 | import type { ReasoningEffort } from "openai/resources/shared"; 6 | 7 | export class OpenAIProvider implements LLMProvider { 8 | private client: OpenAI; 9 | private modelId: string; 10 | name = "OpenAI"; 11 | 12 | constructor(modelId?: string) { 13 | const apiKey = process.env.OPENAI_API_KEY; 14 | if (!apiKey) { 15 | throw new Error("OPENAI_API_KEY environment variable is required"); 16 | } 17 | this.client = new OpenAI({ apiKey }); 18 | // Default to gpt-4o if no model specified 19 | this.modelId = modelId || "gpt-4o-2024-08-06"; 20 | } 21 | 22 | /** 23 | * Extract reasoning effort from model name if present 24 | * @param modelName The model name that may contain reasoning effort suffix 25 | * @returns Object with clean model name and optional reasoning effort 26 | */ 27 | private extractReasoningEffort(modelName: string): { 28 | model: string; 29 | reasoningEffort?: Exclude; 30 | } { 31 | const reasoningPattern = /-reasoning-(minimal|low|medium|high)$/; 32 | const match = modelName.match(reasoningPattern); 33 | 34 | if (match) { 35 | return { 36 | model: modelName.replace(reasoningPattern, ""), 37 | reasoningEffort: match[1] as Exclude, 38 | }; 39 | } 40 | 41 | return { model: modelName }; 42 | } 43 | 44 | /** 45 | * Generate code from a prompt using OpenAI 46 | * @param prompt The prompt to send to the LLM 47 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 48 | * @param contextContent Optional context content to include in prompts 49 | * @returns The generated code 50 | */ 51 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 52 | try { 53 | // Extract reasoning effort from model name if present 54 | const { model: cleanModelId, reasoningEffort } = this.extractReasoningEffort(this.modelId); 55 | 56 | // Check if the model supports temperature 57 | const supportsTemperature = 58 | !cleanModelId.startsWith("o4") && !cleanModelId.startsWith("o3") && !cleanModelId.startsWith("gpt-5"); 59 | 60 | // Build the log message 61 | let logMessage = `🤖 Generating code with OpenAI using model: ${cleanModelId}`; 62 | if (reasoningEffort) { 63 | logMessage += ` (reasoning: ${reasoningEffort})`; 64 | } 65 | if (supportsTemperature && temperature !== undefined) { 66 | logMessage += ` (temp: ${temperature})`; 67 | } else if (supportsTemperature) { 68 | logMessage += ` (temp: default)`; 69 | } 70 | logMessage += `...`; 71 | 72 | console.log(logMessage); 73 | 74 | const systemPrompt = contextContent ? 
DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 75 | 76 | // Standard chat completions 77 | const inputMessages: EasyInputMessage[] = [ 78 | { 79 | role: "system", 80 | content: systemPrompt, 81 | }, 82 | ]; 83 | 84 | // Add context message if available 85 | if (contextContent) { 86 | inputMessages.push({ 87 | role: "user", 88 | content: contextContent, 89 | }); 90 | } 91 | 92 | // Add the main prompt 93 | inputMessages.push({ 94 | role: "user", 95 | content: prompt, 96 | }); 97 | 98 | const requestOptions: ResponseCreateParamsNonStreaming = { 99 | model: cleanModelId, 100 | input: inputMessages, 101 | }; 102 | 103 | // Only add temperature if it's defined and the model supports it 104 | if (temperature !== undefined && supportsTemperature) { 105 | requestOptions.temperature = temperature; 106 | } 107 | 108 | // Add reasoning effort if specified (for models that support it) 109 | if (reasoningEffort) { 110 | requestOptions.reasoning = { 111 | effort: reasoningEffort, 112 | }; 113 | } 114 | 115 | const response = await this.client.responses.create(requestOptions); 116 | 117 | return response.output_text; 118 | } catch (error) { 119 | console.error("Error generating code with OpenAI:", error); 120 | throw new Error(`Failed to generate code: ${error instanceof Error ? error.message : String(error)}`); 121 | } 122 | } 123 | 124 | /** 125 | * Get all available models for this provider 126 | * @returns Array of model identifiers 127 | */ 128 | getModels(): string[] { 129 | // Return empty array since models are now dynamically validated 130 | return []; 131 | } 132 | 133 | /** 134 | * Get the model identifier that was used for generation 135 | * @returns The model identifier string 136 | */ 137 | getModelIdentifier(): string { 138 | return this.modelId; 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /verify.ts: -------------------------------------------------------------------------------- 1 | // Load environment variables from .env file 2 | import "dotenv/config"; 3 | 4 | import fs from "fs/promises"; 5 | import path from "path"; 6 | import { loadTestDefinitions } from "./src/utils/test-manager"; 7 | import { cleanTmpDir, writeToTmpFile, readFile } from "./src/utils/file"; 8 | import { runTest } from "./src/utils/test-runner"; 9 | import { ensureRequiredDirectories } from "./src/utils/ensure-dirs"; 10 | 11 | interface VerificationResult { 12 | testName: string; 13 | success: boolean; 14 | totalTests: number; 15 | failedTests: number; 16 | errors: string[]; 17 | } 18 | 19 | /** 20 | * Main function to verify reference implementations 21 | */ 22 | async function verifyReferenceImplementations(): Promise { 23 | console.log("🔍 Verifying reference implementations..."); 24 | 25 | try { 26 | // Ensure required directories exist 27 | await ensureRequiredDirectories(); 28 | 29 | // Clean the tmp directory 30 | await cleanTmpDir(); 31 | 32 | // Load all test definitions 33 | const tests = await loadTestDefinitions(); 34 | console.log(`📋 Found ${tests.length} tests to verify`); 35 | 36 | // Results array 37 | const results: VerificationResult[] = []; 38 | 39 | // For each test 40 | for (const test of tests) { 41 | console.log(`\n🧪 Verifying reference implementation for: ${test.name}`); 42 | 43 | // Clean the tmp directory before each test 44 | await cleanTmpDir(); 45 | 46 | // Check if the test has a reference implementation 47 | const referenceFilePath = path.join(path.dirname(test.promptPath), "Reference.svelte"); 48 | 49 | try { 50 | 
// Check if the reference file exists 51 | await fs.access(referenceFilePath); 52 | 53 | // Read the reference file 54 | const referenceContent = await readFile(referenceFilePath); 55 | 56 | // Write the reference implementation to the tmp directory as Component.svelte 57 | await writeToTmpFile("Component.svelte", referenceContent); 58 | 59 | // Copy the test file to the tmp directory 60 | const testContent = await readFile(test.testPath); 61 | await writeToTmpFile(`${test.name}.test.ts`, testContent); 62 | 63 | // Run the test 64 | const testResult = await runTest(test.name); 65 | 66 | // Store the result 67 | results.push({ 68 | testName: test.name, 69 | success: testResult.success, 70 | totalTests: testResult.totalTests, 71 | failedTests: testResult.failedTests, 72 | errors: testResult.errors, 73 | }); 74 | 75 | // Print the result 76 | console.log(`📊 Reference implementation for ${test.name}:`); 77 | console.log(` Success: ${testResult.success ? "Yes ✅" : "No ❌"}`); 78 | console.log(` Total Tests: ${testResult.totalTests}`); 79 | console.log(` Failed Tests: ${testResult.failedTests}`); 80 | 81 | if (testResult.errors.length > 0) { 82 | console.log(` Errors: ${testResult.errors.length}`); 83 | } 84 | } catch (error) { 85 | if (error instanceof Error && "code" in error && error.code === "ENOENT") { 86 | console.log(`⚠️ No reference implementation found for ${test.name}`); 87 | } else { 88 | console.error(`Error verifying ${test.name}:`, error); 89 | } 90 | } 91 | } 92 | 93 | // Print summary 94 | console.log("\n📊 Verification Summary:"); 95 | console.log("==========================================="); 96 | 97 | const totalTests = results.length; 98 | const successfulTests = results.filter((r) => r.success).length; 99 | 100 | console.log(`Total Tests with References: ${totalTests}/${tests.length}`); 101 | console.log(`Passed: ${successfulTests}`); 102 | console.log(`Failed: ${totalTests - successfulTests}`); 103 | 104 | // Print detailed results 105 | if (results.length > 0) { 106 | console.log("\nDetailed Results:"); 107 | console.log("==========================================="); 108 | 109 | for (const result of results) { 110 | console.log(`Test: ${result.testName}`); 111 | console.log(` Status: ${result.success ? "✅ PASS" : "❌ FAIL"}`); 112 | console.log(` Tests: ${result.totalTests - result.failedTests}/${result.totalTests}`); 113 | 114 | if (result.errors.length > 0) { 115 | console.log(" Errors:"); 116 | for (const error of result.errors) { 117 | console.log(` - ${error}`); 118 | } 119 | } 120 | 121 | console.log("-------------------------------------------"); 122 | } 123 | } 124 | 125 | // Clean up 126 | await cleanTmpDir(); 127 | 128 | // Exit with appropriate code 129 | const exitCode = successfulTests === totalTests ? 
0 : 1; 130 | process.exit(exitCode); 131 | } catch (error) { 132 | console.error("Error verifying reference implementations:", error); 133 | process.exit(1); 134 | } 135 | } 136 | 137 | // Run the verification 138 | verifyReferenceImplementations().catch((error) => { 139 | console.error("Unhandled error:", error); 140 | process.exit(1); 141 | }); 142 | -------------------------------------------------------------------------------- /merge.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs/promises"; 2 | import path from "path"; 3 | import type { HumanEvalResult } from "./src/utils/humaneval"; 4 | import { ensureBenchmarksDir } from "./src/utils/test-manager"; 5 | 6 | /** 7 | * Interface to track the latest file for each provider/model combination 8 | */ 9 | interface LatestFileInfo { 10 | filePath: string; 11 | timestamp: Date; 12 | results: HumanEvalResult[]; 13 | } 14 | 15 | /** 16 | * Extract timestamp from benchmark filename 17 | */ 18 | function extractTimestamp(filename: string): Date | null { 19 | // Match timestamp in format like benchmark-results-2023-10-15T12-34-56.123Z.json 20 | const match = filename.match(/(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.\d{3}Z)/); 21 | if (match && match[1]) { 22 | // Replace dashes with colons in the time part to make a valid ISO string 23 | const isoTimestamp = match[1].replace(/T(\d{2})-(\d{2})-(\d{2})/, "T$1:$2:$3"); 24 | return new Date(isoTimestamp); 25 | } 26 | return null; 27 | } 28 | 29 | /** 30 | * Get all benchmark JSON files from the benchmarks directory 31 | */ 32 | async function getBenchmarkFiles(): Promise { 33 | const benchmarksDir = path.resolve(process.cwd(), "benchmarks"); 34 | const files = await fs.readdir(benchmarksDir); 35 | 36 | // Filter for JSON files only, exclude the merged file itself, 37 | // and importantly, exclude files with "with-context" in the name 38 | return files 39 | .filter( 40 | (file) => 41 | file.endsWith(".json") && 42 | file.includes("benchmark-results") && 43 | file.includes("-2025-") && // Include current timestamped files 44 | !file.includes("with-context") && 45 | file !== "benchmark-results-merged.json", 46 | ) 47 | .map((file) => path.join(benchmarksDir, file)); 48 | } 49 | 50 | /** 51 | * Read and parse a benchmark file 52 | */ 53 | async function readBenchmarkFile(filePath: string): Promise { 54 | try { 55 | const content = await fs.readFile(filePath, "utf-8"); 56 | const results = JSON.parse(content); 57 | 58 | // Return results (excluding v1 results if they have version field) 59 | return results.filter((result: any) => !result.version || result.version !== "v1"); 60 | } catch (error) { 61 | console.error(`Error reading benchmark file ${filePath}:`, error); 62 | return []; 63 | } 64 | } 65 | 66 | /** 67 | * Find the latest file for each provider/model combination 68 | */ 69 | async function findLatestResultsForEachModel(): Promise> { 70 | const benchmarkFiles = await getBenchmarkFiles(); 71 | const latestFiles = new Map(); 72 | 73 | console.log(`🔍 Found ${benchmarkFiles.length} eligible benchmark files (excluding with-context files)`); 74 | 75 | for (const filePath of benchmarkFiles) { 76 | const filename = path.basename(filePath); 77 | const timestamp = extractTimestamp(filename); 78 | 79 | // Skip files where we can't extract a timestamp 80 | if (!timestamp) { 81 | console.warn(`⚠️ Skipping file with unparseable timestamp: ${filename}`); 82 | continue; 83 | } 84 | 85 | // Read the results from this file 86 | const results = await 
readBenchmarkFile(filePath); 87 | 88 | // Group by provider/model combinations 89 | for (const result of results) { 90 | const key = `${result.provider}-${result.modelId}`; 91 | 92 | if (!latestFiles.has(key) || timestamp > latestFiles.get(key)!.timestamp) { 93 | latestFiles.set(key, { 94 | filePath, 95 | timestamp, 96 | results: results.filter((r) => r.provider === result.provider && r.modelId === result.modelId), 97 | }); 98 | } 99 | } 100 | } 101 | 102 | return latestFiles; 103 | } 104 | 105 | /** 106 | * Merge the latest results and save to a new file 107 | */ 108 | async function mergeAndSaveResults(): Promise { 109 | console.log("🔄 Merging benchmark results..."); 110 | 111 | // Get the latest results for each provider/model 112 | const latestResultsMap = await findLatestResultsForEachModel(); 113 | 114 | // Merge all results 115 | const mergedResults: HumanEvalResult[] = []; 116 | const includedFiles = new Set(); 117 | 118 | for (const [key, info] of latestResultsMap.entries()) { 119 | console.log(`📊 Including results for ${key} from ${path.basename(info.filePath)}`); 120 | mergedResults.push(...info.results); 121 | includedFiles.add(info.filePath); 122 | } 123 | 124 | // Save merged results 125 | await ensureBenchmarksDir(); 126 | const outputPath = path.resolve(process.cwd(), "benchmarks", "benchmark-results-merged.json"); 127 | 128 | await fs.writeFile(outputPath, JSON.stringify(mergedResults, null, 2)); 129 | 130 | console.log(`\n✅ Successfully merged results from ${includedFiles.size} files`); 131 | console.log(`✅ Total provider/model combinations: ${latestResultsMap.size}`); 132 | console.log(`✅ Total result entries: ${mergedResults.length}`); 133 | console.log(`✅ Merged results saved to: ${outputPath}`); 134 | } 135 | 136 | // Run the merge process 137 | mergeAndSaveResults().catch((error) => { 138 | console.error("Error merging benchmark results:", error); 139 | process.exit(1); 140 | }); 141 | -------------------------------------------------------------------------------- /src/llms/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Interface for LLM providers 3 | * Defines the common functionality that all LLM providers must implement 4 | */ 5 | export interface LLMProvider { 6 | /** 7 | * Name of the LLM provider 8 | */ 9 | name: string; 10 | 11 | /** 12 | * Generate code from a prompt 13 | * @param prompt The prompt to send to the LLM 14 | * @param temperature Optional temperature parameter for controlling randomness 15 | * @param contextContent Optional context content to include in the prompt 16 | * @returns The generated code 17 | */ 18 | generateCode(prompt: string, temperature?: number, contextContent?: string): Promise; 19 | 20 | /** 21 | * Get all available models for this provider 22 | * @returns Array of model identifiers 23 | */ 24 | getModels(): string[]; 25 | 26 | /** 27 | * Get the model identifier that was used for generation 28 | * @returns The model identifier string 29 | */ 30 | getModelIdentifier(): string; 31 | } 32 | 33 | /** 34 | * Provider with model information 35 | * Extends LLMProvider with additional information about the model 36 | */ 37 | export interface ProviderWithModel { 38 | provider: LLMProvider; 39 | name: string; 40 | modelId: string; 41 | } 42 | 43 | /** 44 | * Factory function to get an LLM provider by name 45 | * @param providerName The name of the provider to get 46 | * @returns The LLM provider 47 | */ 48 | export async function getLLMProvider(providerName: string, modelId?: 
string): Promise { 49 | switch (providerName.toLowerCase()) { 50 | case "openai": 51 | const { OpenAIProvider } = await import("./openai"); 52 | return new OpenAIProvider(modelId); 53 | case "anthropic": 54 | const { AnthropicProvider } = await import("./anthropic"); 55 | return new AnthropicProvider(modelId); 56 | case "google": 57 | const { GoogleGenAIProvider } = await import("./google"); 58 | return new GoogleGenAIProvider(modelId); 59 | case "openrouter": 60 | const { OpenRouterProvider } = await import("./openrouter"); 61 | return new OpenRouterProvider(modelId); 62 | case "ollama": 63 | const { OllamaProvider } = await import("./ollama"); 64 | return new OllamaProvider(modelId); 65 | case "zai": 66 | const { ZAIProvider } = await import("./zai"); 67 | return new ZAIProvider(modelId); 68 | case "moonshot": 69 | const { MoonshotProvider } = await import("./moonshot"); 70 | return new MoonshotProvider(modelId); 71 | default: 72 | throw new Error(`Unknown LLM provider: ${providerName}`); 73 | } 74 | } 75 | 76 | /** 77 | * Function to get all available LLM providers 78 | * @returns Array of available LLM providers with their models 79 | */ 80 | export async function getAllLLMProviders(): Promise { 81 | const providers: ProviderWithModel[] = []; 82 | 83 | // OpenAI provider 84 | const openaiProvider = await getLLMProvider("openai"); 85 | for (const modelId of openaiProvider.getModels()) { 86 | const provider = await getLLMProvider("openai", modelId); 87 | providers.push({ 88 | provider, 89 | name: "OpenAI", 90 | modelId, 91 | }); 92 | } 93 | 94 | // Anthropic provider 95 | const anthropicProvider = await getLLMProvider("anthropic"); 96 | for (const modelId of anthropicProvider.getModels()) { 97 | const provider = await getLLMProvider("anthropic", modelId); 98 | providers.push({ 99 | provider, 100 | name: "Anthropic", 101 | modelId, 102 | }); 103 | } 104 | 105 | // Google provider 106 | const googleProvider = await getLLMProvider("google"); 107 | for (const modelId of googleProvider.getModels()) { 108 | const provider = await getLLMProvider("google", modelId); 109 | providers.push({ 110 | provider, 111 | name: "Google", 112 | modelId, 113 | }); 114 | } 115 | 116 | // OpenRouter provider 117 | const openrouterProvider = await getLLMProvider("openrouter"); 118 | for (const modelId of openrouterProvider.getModels()) { 119 | const provider = await getLLMProvider("openrouter", modelId); 120 | providers.push({ 121 | provider, 122 | name: "OpenRouter", 123 | modelId, 124 | }); 125 | } 126 | 127 | // Ollama provider 128 | const ollamaProvider = await getLLMProvider("ollama"); 129 | for (const modelId of ollamaProvider.getModels()) { 130 | const provider = await getLLMProvider("ollama", modelId); 131 | providers.push({ 132 | provider, 133 | name: "Ollama", 134 | modelId, 135 | }); 136 | } 137 | 138 | // Z.ai provider 139 | const zaiProvider = await getLLMProvider("zai"); 140 | for (const modelId of zaiProvider.getModels()) { 141 | const provider = await getLLMProvider("zai", modelId); 142 | providers.push({ 143 | provider, 144 | name: "Z.ai", 145 | modelId, 146 | }); 147 | } 148 | 149 | // Moonshot provider 150 | const moonshotProvider = await getLLMProvider("moonshot"); 151 | for (const modelId of moonshotProvider.getModels()) { 152 | const provider = await getLLMProvider("moonshot", modelId); 153 | providers.push({ 154 | provider, 155 | name: "Moonshot AI", 156 | modelId, 157 | }); 158 | } 159 | 160 | return providers; 161 | } 162 | 
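The spec that follows exercises the unbiased pass@k estimator from the HumanEval paper, pass@k = 1 − C(n−c, k)/C(n, k), evaluated in product form for numerical stability. Here is a small self-contained sketch of that formula, with the worked cases used in the spec; it mirrors `calculatePassAtK` in `src/utils/humaneval.ts`, whose actual implementation may differ in details.

```ts
// Unbiased pass@k estimator in product form (sketch; the real implementation
// is calculatePassAtK in src/utils/humaneval.ts).
function passAtK(n: number, c: number, k: number): number {
  // If fewer than k samples are incorrect, any k-subset must contain a correct one.
  if (n - c < k) return 1.0;

  // 1 - prod_{j = n-c+1}^{n} (1 - k/j); empty product (c = 0) yields 0.
  let prod = 1.0;
  for (let j = n - c + 1; j <= n; j++) {
    prod *= 1.0 - k / j;
  }
  return 1.0 - prod;
}

// Worked examples matching the spec below:
console.log(passAtK(5, 2, 1));  // 0.4  → 1 - (3 choose 1)/(5 choose 1)
console.log(passAtK(10, 10, 5)); // 1.0  → all samples correct
console.log(passAtK(10, 0, 5));  // 0.0  → no samples correct
```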
-------------------------------------------------------------------------------- /src/utils/humaneval.spec.ts: -------------------------------------------------------------------------------- 1 | import { expect, test, describe } from "vitest"; 2 | import { calculatePassAtK } from "./humaneval"; 3 | 4 | describe("HumanEval pass@k calculation", () => { 5 | // Test basic functionality 6 | test("calculatePassAtK returns 1.0 when all samples are correct", () => { 7 | // When all samples are correct (n = c), the pass@k should be 1.0 8 | expect(calculatePassAtK(10, 10, 5)).toBe(1.0); 9 | }); 10 | 11 | test("calculatePassAtK returns 0.0 when no samples are correct", () => { 12 | // When no samples are correct (c = 0) and k <= n, the pass@k should be 0.0 13 | expect(calculatePassAtK(10, 0, 5)).toBe(0.0); 14 | }); 15 | 16 | test("calculatePassAtK returns 1.0 when we need to select more samples than incorrect ones", () => { 17 | // If n - c < k, then pass@k should be 1.0 because we're guaranteed to select at least one correct sample 18 | expect(calculatePassAtK(10, 8, 3)).toBe(1.0); 19 | }); 20 | 21 | // Test edge cases 22 | test("calculatePassAtK handles edge case of k = 1", () => { 23 | // For k = 1, pass@k should equal the ratio of correct samples to total samples 24 | // Using toBeCloseTo instead of toBe to handle floating point precision 25 | expect(calculatePassAtK(100, 20, 1)).toBeCloseTo(0.2, 10); 26 | }); 27 | 28 | test("calculatePassAtK handles edge case of k = n", () => { 29 | // For k = n, if there's at least one correct sample, pass@k should be 1.0 30 | expect(calculatePassAtK(10, 1, 10)).toBe(1.0); 31 | // If there are no correct samples, pass@k should be 0.0 32 | expect(calculatePassAtK(10, 0, 10)).toBe(0.0); 33 | }); 34 | 35 | // Test the formula against directly calculated values 36 | test("calculatePassAtK matches manually calculated values", () => { 37 | // For n=5, c=2, k=1: 38 | // 1 - ((5-2) choose 1) / (5 choose 1) = 1 - (3/5) = 0.4 39 | expect(calculatePassAtK(5, 2, 1)).toBeCloseTo(0.4, 10); 40 | 41 | // For n=10, c=3, k=2: 42 | // 1 - ((10-3) choose 2) / (10 choose 2) = 1 - (7*6/2) / (10*9/2) = 1 - 21/45 ≈ 0.5333 43 | expect(calculatePassAtK(10, 3, 2)).toBeCloseTo(0.5333, 4); 44 | 45 | // For n=20, c=10, k=5: 46 | // This is more complex to calculate by hand, but we can verify with our function 47 | const result = calculatePassAtK(20, 10, 5); 48 | // Using the product form: 1 - prod(1 - k/j) for j from n-c+1 to n 49 | let expected = 1.0; 50 | for (let j = 20 - 10 + 1; j <= 20; j++) { 51 | expected *= 1.0 - 5.0 / j; 52 | } 53 | expected = 1.0 - expected; 54 | 55 | expect(result).toBeCloseTo(expected, 10); 56 | }); 57 | 58 | // Test compatibility with the Python implementation shown in the paper 59 | test("calculatePassAtK is compatible with the Python implementation from the paper", () => { 60 | // The paper provides this Python function: 61 | /* 62 | def pass_at_k(n, c, k): 63 | """ 64 | :param n: total number of samples 65 | :param c: number of correct samples 66 | :param k: k in pass@$k$ 67 | """ 68 | if n - c < k: return 1.0 69 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 70 | */ 71 | 72 | // Test some values to ensure our TypeScript implementation matches 73 | // what the Python implementation would produce 74 | 75 | // Example 1: n=50, c=10, k=5 76 | let expected1 = 0.0; 77 | if (50 - 10 < 5) { 78 | expected1 = 1.0; 79 | } else { 80 | let prod = 1.0; 81 | for (let j = 50 - 10 + 1; j <= 50; j++) { 82 | prod *= 1.0 - 5.0 / j; 83 | } 84 | expected1 = 1.0 - 
prod; 85 | } 86 | expect(calculatePassAtK(50, 10, 5)).toBeCloseTo(expected1, 10); 87 | 88 | // Example 2: n=200, c=50, k=10 89 | let expected2 = 0.0; 90 | if (200 - 50 < 10) { 91 | expected2 = 1.0; 92 | } else { 93 | let prod = 1.0; 94 | for (let j = 200 - 50 + 1; j <= 200; j++) { 95 | prod *= 1.0 - 10.0 / j; 96 | } 97 | expected2 = 1.0 - prod; 98 | } 99 | expect(calculatePassAtK(200, 50, 10)).toBeCloseTo(expected2, 10); 100 | }); 101 | 102 | // Test the bias of the naïve estimator as mentioned in the paper 103 | test("demonstrates the bias in naive estimator versus unbiased estimator", () => { 104 | // The paper mentions that estimating pass@k with 1-(1-p)^k where p is the 105 | // empirical pass@1 can result in a biased estimate 106 | 107 | const n = 100; // Total number of samples 108 | const c = 20; // Correct samples 109 | const k = 10; // k in pass@k 110 | 111 | // Unbiased estimator from the paper 112 | const unbiasedEstimate = calculatePassAtK(n, c, k); 113 | 114 | // Naive estimator: 1-(1-p)^k where p = c/n 115 | const naiveEstimate = 1 - Math.pow(1 - c / n, k); 116 | 117 | // The naive estimate should be lower than the unbiased estimate 118 | // as mentioned in the paper 119 | expect(naiveEstimate).toBeLessThan(unbiasedEstimate); 120 | 121 | // We can also check that as n increases, the bias decreases 122 | const largerN = 1000; 123 | const largerC = 200; // Same proportion as before 124 | 125 | const unbiasedEstimateLargerN = calculatePassAtK(largerN, largerC, k); 126 | const naiveEstimateLargerN = 1 - Math.pow(1 - largerC / largerN, k); 127 | 128 | // The difference between the two estimates should be smaller 129 | const smallNDifference = Math.abs(unbiasedEstimate - naiveEstimate); 130 | const largeNDifference = Math.abs(unbiasedEstimateLargerN - naiveEstimateLargerN); 131 | 132 | expect(largeNDifference).toBeLessThan(smallNDifference); 133 | }); 134 | }); 135 | -------------------------------------------------------------------------------- /src/llms/zai.ts: -------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import { withRetry } from "../utils/retry-wrapper"; 4 | 5 | export class ZAIProvider implements LLMProvider { 6 | private apiKey: string; 7 | private modelId: string; 8 | name = "Z.ai"; 9 | private readonly availableModels = [ 10 | "glm-4.5", // x 11 | "glm-4.5-air", // x 12 | "glm-4.5-x", 13 | "glm-4.5-airx", 14 | "glm-4.5-flash", 15 | "glm-4-32b-0414-128k", 16 | "glm-4.6", 17 | ]; 18 | 19 | constructor(modelId?: string) { 20 | const apiKey = process.env.Z_AI_API_KEY; 21 | if (!apiKey) { 22 | throw new Error("Z_AI_API_KEY environment variable is required"); 23 | } 24 | this.apiKey = apiKey; 25 | this.modelId = modelId || this.availableModels[0]; 26 | } 27 | 28 | /** 29 | * Generate code from a prompt using Z.ai 30 | * @param prompt The prompt to send to the LLM 31 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 32 | * @param contextContent Optional context content to include in prompts 33 | * @returns The generated code 34 | */ 35 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 36 | console.log(`🤖 Generating code with Z.ai using model: ${this.modelId} (temp: ${temperature ?? "default"})...`); 37 | 38 | const systemPrompt = contextContent ? 
DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 39 | 40 | const messages: Array<{ 41 | role: "system" | "user" | "assistant"; 42 | content: string; 43 | }> = [ 44 | { 45 | role: "system", 46 | content: systemPrompt, 47 | }, 48 | ]; 49 | 50 | if (contextContent) { 51 | messages.push({ 52 | role: "user", 53 | content: contextContent, 54 | }); 55 | } 56 | 57 | messages.push({ 58 | role: "user", 59 | content: prompt, 60 | }); 61 | 62 | const requestBody: any = { 63 | model: this.modelId, 64 | messages: messages, 65 | }; 66 | 67 | if (temperature !== undefined) { 68 | requestBody.temperature = temperature; 69 | } 70 | 71 | // Wrap the API call in retry logic with custom settings for z.ai 72 | return await withRetry( 73 | async () => { 74 | // Create AbortController for timeout (2 minutes for z.ai models) 75 | const controller = new AbortController(); 76 | const timeoutId = setTimeout( 77 | () => { 78 | controller.abort(); 79 | }, 80 | 2 * 60 * 1000, 81 | ); // 2 minutes 82 | 83 | try { 84 | const response = await fetch("https://open.bigmodel.cn/api/paas/v4/chat/completions", { 85 | method: "POST", 86 | headers: { 87 | "Content-Type": "application/json", 88 | Authorization: `Bearer ${this.apiKey}`, 89 | }, 90 | body: JSON.stringify(requestBody), 91 | signal: controller.signal, 92 | }); 93 | 94 | clearTimeout(timeoutId); 95 | 96 | if (!response.ok) { 97 | // Check for rate limiting or temporary errors 98 | if (response.status === 429 || response.status >= 500) { 99 | throw new Error(`Z.ai API temporary error: ${response.status} ${response.statusText}`); 100 | } 101 | // Non-retryable error 102 | throw new Error(`Z.ai API request failed: ${response.status} ${response.statusText}`); 103 | } 104 | 105 | const data = await response.json(); 106 | const content = data.choices?.[0]?.message?.content; 107 | 108 | if (!content) { 109 | throw new Error("Z.ai returned empty response"); 110 | } 111 | 112 | return content; 113 | } catch (error) { 114 | clearTimeout(timeoutId); 115 | 116 | // Check if it's an abort error (timeout) 117 | if (error instanceof Error && error.name === "AbortError") { 118 | console.error(`Z.ai request timed out after 2 minutes for model: ${this.modelId}`); 119 | throw new Error(`Request timed out after 2 minutes: ${this.modelId}`); 120 | } 121 | 122 | throw error; 123 | } 124 | }, 125 | { 126 | maxAttempts: 10, 127 | initialDelayMs: 2000, // Start with 2 seconds for z.ai 128 | maxDelayMs: 60000, // Max 1 minute between retries 129 | backoffFactor: 2, 130 | onRetry: (error, attempt) => { 131 | console.warn(`⚠️ Z.ai retry attempt ${attempt}/10 for model ${this.modelId} after error: ${error.message}`); 132 | 133 | // On final retry attempt, provide helpful message before failing 134 | if (attempt === 10) { 135 | console.error(`\n❌ Z.ai model ${this.modelId} failed after 10 retry attempts.`); 136 | console.error(`📝 The benchmark will resume from where it left off when you restart.`); 137 | console.error(`⏳ This appears to be a rate limit issue. Please wait before retrying.`); 138 | console.error(`💾 Progress has been saved to the checkpoint file.\n`); 139 | } 140 | }, 141 | }, 142 | ).catch((error) => { 143 | // If all retries failed, exit the process with error 144 | console.error(`\n🛑 Stopping benchmark due to persistent Z.ai API failures.`); 145 | console.error(`ℹ️ To resume, run the same command again. 
The benchmark will continue from the last checkpoint.`); 146 | process.exit(1); 147 | }); 148 | } 149 | 150 | /** 151 | * Get all available models for this provider 152 | * @returns Array of model identifiers 153 | */ 154 | getModels(): string[] { 155 | return [...this.availableModels]; 156 | } 157 | 158 | /** 159 | * Get the model identifier that was used for generation 160 | * @returns The model identifier string 161 | */ 162 | getModelIdentifier(): string { 163 | return this.modelId; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/utils/test-runner.ts: -------------------------------------------------------------------------------- 1 | import { startVitest } from "vitest/node"; 2 | import path from "path"; 3 | import { getTmpDir } from "./file"; 4 | import fs from "fs/promises"; 5 | 6 | export interface TestResult { 7 | testName: string; 8 | success: boolean; 9 | testFiles: number; 10 | totalTests: number; 11 | failedTests: number; 12 | errors: string[]; // All errors that occurred during test execution 13 | } 14 | 15 | /** 16 | * Run tests for a specific component 17 | * @param testName The name of the test 18 | * @param provider The provider name (optional) 19 | * @param testDir Optional specific directory for test files (for parallel execution) 20 | * @returns Test results 21 | */ 22 | export async function runTest(testName: string, provider?: string, testDir?: string): Promise { 23 | // Create timeout error message 24 | const timeoutMessage = `Test timeout: ${testName} (${ 25 | provider || "unknown" 26 | }) exceeded the maximum execution time of 120 seconds`; 27 | 28 | // Create an AbortController for the timeout 29 | const abortController = new AbortController(); 30 | const signal = abortController.signal; 31 | 32 | // Use a timeout promise to avoid forcing the process to exit 33 | const timeoutPromise = new Promise((_, reject) => { 34 | const timeoutId = setTimeout(() => { 35 | console.error(`⚠️ ${timeoutMessage}`); 36 | // Instead of process.exit, we'll reject with a clear error 37 | abortController.abort(); // Signal abortion to potentially listening handlers 38 | reject(new Error(timeoutMessage)); 39 | }, 120000); // Increase to 120 second timeout for parallel execution 40 | 41 | // Make sure the timeout is cleared if the promise is rejected/resolved elsewhere 42 | signal.addEventListener("abort", () => { 43 | clearTimeout(timeoutId); 44 | }); 45 | }); 46 | 47 | try { 48 | console.log(`🧪 Running tests for ${testName}${provider ? ` (${provider})` : ""}...`); 49 | 50 | const tmpDir = testDir || getTmpDir(provider); 51 | const testFilePath = path.resolve(tmpDir, `${testName}.test.ts`); 52 | 53 | // Verify the test file exists before running the test 54 | try { 55 | await fs.access(testFilePath); 56 | } catch (error) { 57 | abortController.abort(); // Clean up the timeout 58 | throw new Error(`Test file not found: ${testFilePath}`); 59 | } 60 | 61 | // Race between the test execution and the timeout 62 | const testPromise = async (): Promise => { 63 | try { 64 | const vitest = await startVitest("test", [testFilePath], { 65 | watch: false, 66 | reporters: ["verbose"], 67 | }); 68 | 69 | await vitest.close(); 70 | const testModules = vitest.state.getTestModules(); 71 | 72 | // Collect all errors 73 | const allErrors: string[] = []; 74 | 75 | // Get unhandled errors 76 | const unhandledErrors = vitest.state.getUnhandledErrors(); 77 | for (const error of unhandledErrors) { 78 | const errorMessage = error instanceof Error ? 
error.message : String(error); 79 | allErrors.push(errorMessage); 80 | } 81 | 82 | // Calculate success/failure 83 | let success = true; 84 | let totalTests = 0; 85 | let failedTests = 0; 86 | 87 | if (!testModules || testModules.length === 0) { 88 | return { 89 | testName, 90 | success: false, 91 | testFiles: 0, 92 | totalTests: 0, 93 | failedTests: 0, 94 | errors: allErrors, 95 | }; 96 | } 97 | 98 | for (const module of testModules) { 99 | if (!module.ok()) { 100 | success = false; 101 | } 102 | 103 | // Add module errors 104 | const moduleErrors = module.errors(); 105 | for (const error of moduleErrors) { 106 | if (error.message) { 107 | allErrors.push(error.message); 108 | } 109 | } 110 | 111 | if (!module.children) { 112 | continue; 113 | } 114 | 115 | try { 116 | const tests = Array.from(module.children.allTests()); 117 | totalTests += tests.length; 118 | 119 | const moduleFailedTests = tests.filter((t) => { 120 | const result = t.result(); 121 | 122 | // Collect test errors 123 | if (result.state === "failed" && result.errors) { 124 | for (const testError of result.errors) { 125 | if (testError.message) { 126 | allErrors.push(testError.message); 127 | } 128 | } 129 | } 130 | 131 | return result.state === "failed"; 132 | }); 133 | 134 | failedTests += moduleFailedTests.length; 135 | } catch (err) { 136 | console.error(`Error processing module tests for ${testName}${provider ? ` (${provider})` : ""}:`, err); 137 | const errorMessage = err instanceof Error ? err.message : String(err); 138 | allErrors.push(errorMessage); 139 | success = false; 140 | } 141 | } 142 | 143 | const result: TestResult = { 144 | testName, 145 | success, 146 | testFiles: testModules.length, 147 | totalTests, 148 | failedTests, 149 | errors: allErrors, 150 | }; 151 | 152 | console.log(`📊 Test results for ${testName}${provider ? ` (${provider})` : ""}:`); 153 | console.log(` Success: ${result.success ? "Yes ✅" : "No ❌"}`); 154 | console.log(` Total Tests: ${result.totalTests}`); 155 | console.log(` Failed Tests: ${result.failedTests}`); 156 | console.log(` Errors: ${result.errors.length}`); 157 | 158 | return result; 159 | } finally { 160 | // Always abort the controller to clean up the timeout 161 | abortController.abort(); 162 | } 163 | }; 164 | 165 | // Race the test execution against the timeout 166 | return await Promise.race([testPromise(), timeoutPromise]); 167 | } catch (error) { 168 | // Make sure to abort the controller to clean up the timeout 169 | abortController.abort(); 170 | 171 | const errorMessage = error instanceof Error ? error.message : String(error); 172 | console.error(`Error running tests for ${testName}${provider ? 
` (${provider})` : ""}:`, errorMessage); 173 | 174 | return { 175 | testName, 176 | success: false, 177 | testFiles: 0, 178 | totalTests: 0, 179 | failedTests: 0, 180 | errors: [errorMessage], 181 | }; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/llms/moonshot.ts: -------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import { withRetry } from "../utils/retry-wrapper"; 4 | 5 | interface MoonshotMessage { 6 | role: "system" | "user" | "assistant"; 7 | content: string; 8 | } 9 | 10 | interface MoonshotRequest { 11 | model: string; 12 | messages: MoonshotMessage[]; 13 | temperature?: number; 14 | max_tokens?: number; 15 | stream?: boolean; 16 | } 17 | 18 | interface MoonshotChoice { 19 | index: number; 20 | message: { 21 | role: string; 22 | content: string; 23 | }; 24 | finish_reason: string; 25 | } 26 | 27 | interface MoonshotResponse { 28 | id: string; 29 | object: string; 30 | created: number; 31 | model: string; 32 | choices: MoonshotChoice[]; 33 | usage: { 34 | prompt_tokens: number; 35 | completion_tokens: number; 36 | total_tokens: number; 37 | }; 38 | } 39 | 40 | export class MoonshotProvider implements LLMProvider { 41 | private apiKey: string; 42 | private baseUrl: string; 43 | private modelId: string; 44 | name = "Moonshot"; 45 | 46 | constructor(modelId?: string) { 47 | const apiKey = process.env.MOONSHOT_API_KEY; 48 | if (!apiKey) { 49 | throw new Error("MOONSHOT_API_KEY environment variable is required"); 50 | } 51 | this.apiKey = apiKey; 52 | this.baseUrl = "https://api.moonshot.ai/v1"; 53 | // Default to moonshot-v1-8k if no model specified 54 | this.modelId = modelId || "moonshot-v1-8k"; 55 | } 56 | 57 | /** 58 | * Generate code from a prompt using Moonshot AI 59 | * @param prompt The prompt to send to the LLM 60 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 61 | * @param contextContent Optional context content to include in prompts 62 | * @returns The generated code 63 | */ 64 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 65 | // Ensure temperature is within valid range [0, 1] 66 | const validTemperature = temperature !== undefined ? Math.max(0, Math.min(1, temperature)) : 0.7; 67 | 68 | console.log(`🤖 Generating code with Moonshot using model: ${this.modelId} (temp: ${validTemperature})...`); 69 | 70 | const systemPrompt = contextContent ? 
DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 71 | 72 | const messages: MoonshotMessage[] = [ 73 | { 74 | role: "system", 75 | content: systemPrompt, 76 | }, 77 | ]; 78 | 79 | // Add context message if available 80 | if (contextContent) { 81 | messages.push({ 82 | role: "user", 83 | content: contextContent, 84 | }); 85 | } 86 | 87 | // Add the main prompt 88 | messages.push({ 89 | role: "user", 90 | content: prompt, 91 | }); 92 | 93 | const requestBody: MoonshotRequest = { 94 | model: this.modelId, 95 | messages, 96 | temperature: validTemperature, 97 | max_tokens: 4000, 98 | stream: false, 99 | }; 100 | 101 | // Wrap the API call in retry logic with custom settings for Moonshot 102 | return await withRetry( 103 | async () => { 104 | // Create AbortController for timeout (2 minutes for Moonshot models) 105 | const controller = new AbortController(); 106 | const timeoutId = setTimeout( 107 | () => { 108 | controller.abort(); 109 | }, 110 | 2 * 60 * 1000, 111 | ); // 2 minutes 112 | 113 | try { 114 | const response = await fetch(`${this.baseUrl}/chat/completions`, { 115 | method: "POST", 116 | headers: { 117 | "Content-Type": "application/json", 118 | Authorization: `Bearer ${this.apiKey}`, 119 | }, 120 | body: JSON.stringify(requestBody), 121 | signal: controller.signal, 122 | }); 123 | 124 | clearTimeout(timeoutId); 125 | 126 | if (!response.ok) { 127 | const errorText = await response.text(); 128 | 129 | // Check for rate limiting or temporary errors 130 | if (response.status === 429 || response.status >= 500) { 131 | throw new Error(`Moonshot API temporary error: ${response.status} ${response.statusText} - ${errorText}`); 132 | } 133 | // Non-retryable error 134 | throw new Error(`Moonshot API request failed: ${response.status} ${response.statusText} - ${errorText}`); 135 | } 136 | 137 | const data: MoonshotResponse = await response.json(); 138 | 139 | if (!data.choices || data.choices.length === 0) { 140 | throw new Error("Moonshot returned empty response"); 141 | } 142 | 143 | return data.choices[0].message.content; 144 | } catch (error) { 145 | clearTimeout(timeoutId); 146 | 147 | // Check if it's an abort error (timeout) 148 | if (error instanceof Error && error.name === "AbortError") { 149 | console.error(`Moonshot request timed out after 2 minutes for model: ${this.modelId}`); 150 | throw new Error(`Request timed out after 2 minutes: ${this.modelId}`); 151 | } 152 | 153 | throw error; 154 | } 155 | }, 156 | { 157 | maxAttempts: 10, 158 | initialDelayMs: 2000, // Start with 2 seconds for Moonshot 159 | maxDelayMs: 60000, // Max 1 minute between retries 160 | backoffFactor: 2, 161 | onRetry: (error, attempt) => { 162 | console.warn( 163 | `⚠️ Moonshot retry attempt ${attempt}/10 for model ${this.modelId} after error: ${error.message}`, 164 | ); 165 | 166 | // On final retry attempt, provide helpful message before failing 167 | if (attempt === 10) { 168 | console.error(`\n❌ Moonshot model ${this.modelId} failed after 10 retry attempts.`); 169 | console.error(`📝 The benchmark will resume from where it left off when you restart.`); 170 | console.error(`⏳ This appears to be a rate limit issue. Please wait before retrying.`); 171 | console.error(`💾 Progress has been saved to the checkpoint file.\n`); 172 | } 173 | }, 174 | }, 175 | ).catch((error) => { 176 | // If all retries failed, exit the process with error 177 | console.error(`\n🛑 Stopping benchmark due to persistent Moonshot API failures.`); 178 | console.error(`ℹ️ To resume, run the same command again. 
The benchmark will continue from the last checkpoint.`); 179 | process.exit(1); 180 | }); 181 | } 182 | 183 | /** 184 | * Get all available models for this provider 185 | * @returns Array of model identifiers 186 | */ 187 | getModels(): string[] { 188 | return ["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"]; 189 | } 190 | 191 | /** 192 | * Get the model identifier that was used for generation 193 | * @returns The model identifier string 194 | */ 195 | getModelIdentifier(): string { 196 | return this.modelId; 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/llms/openrouter.ts: -------------------------------------------------------------------------------- 1 | import { DEFAULT_SYSTEM_PROMPT, DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT } from "../utils/prompt"; 2 | import type { LLMProvider } from "./index"; 3 | import OpenAI from "openai"; 4 | import type { ChatCompletionMessageParam } from "openai/resources/chat/completions"; 5 | 6 | export class OpenRouterProvider implements LLMProvider { 7 | private client: OpenAI; 8 | private modelId: string; 9 | name = "OpenRouter"; 10 | 11 | constructor(modelId?: string) { 12 | const apiKey = process.env.OPENROUTER_API_KEY; 13 | if (!apiKey) { 14 | throw new Error("OPENROUTER_API_KEY environment variable is required"); 15 | } 16 | 17 | this.client = new OpenAI({ 18 | baseURL: "https://openrouter.ai/api/v1", 19 | apiKey: apiKey, 20 | defaultHeaders: { 21 | /* 22 | "HTTP-Referer": 23 | process.env.OPENROUTER_SITE_URL || 24 | "https://github.com/khromov/svelte-bench", 25 | "X-Title": process.env.OPENROUTER_SITE_NAME || "SvelteBench", 26 | */ 27 | }, 28 | }); 29 | 30 | // Default to a commonly available model if no model specified 31 | this.modelId = modelId || "openai/gpt-4o"; 32 | } 33 | 34 | /** 35 | * Generate code from a prompt using OpenRouter 36 | * @param prompt The prompt to send to the LLM 37 | * @param temperature Optional temperature parameter for controlling randomness (default: 0.7) 38 | * @param contextContent Optional context content to include in prompts 39 | * @returns The generated code 40 | */ 41 | async generateCode(prompt: string, temperature?: number, contextContent?: string): Promise { 42 | // Create AbortController with 5-minute timeout 43 | const abortController = new AbortController(); 44 | const timeoutId = setTimeout( 45 | () => { 46 | abortController.abort(); 47 | }, 48 | 5 * 60 * 1000, 49 | ); // 5 minutes 50 | 51 | try { 52 | console.log( 53 | `🤖 Generating code with OpenRouter using model: ${this.modelId} (temp: ${temperature ?? "default"})...`, 54 | ); 55 | 56 | const systemPrompt = contextContent ? 
DEFAULT_SYSTEM_PROMPT_WITH_CONTEXT : DEFAULT_SYSTEM_PROMPT; 57 | 58 | // Standard chat completions for OpenRouter models 59 | const messages: ChatCompletionMessageParam[] = [ 60 | { 61 | role: "system", 62 | content: systemPrompt, 63 | }, 64 | ]; 65 | 66 | // Add context message if available 67 | if (contextContent) { 68 | messages.push({ 69 | role: "user", 70 | content: contextContent, 71 | }); 72 | } 73 | 74 | // Add the main prompt 75 | messages.push({ 76 | role: "user", 77 | content: prompt, 78 | }); 79 | 80 | const requestOptions: any = { 81 | model: this.modelId, 82 | messages: messages, 83 | }; 84 | 85 | // Add provider routing preferences if configured 86 | const openrouterProvider = process.env.OPENROUTER_PROVIDER; 87 | if (openrouterProvider && openrouterProvider.toLowerCase() !== "auto") { 88 | requestOptions.provider = { only: [openrouterProvider] }; 89 | } else { 90 | // Apply quantization filtering for precision requirements 91 | requestOptions.provider = this.buildProviderConfig(); 92 | } 93 | 94 | // Only add temperature if it's defined 95 | if (temperature !== undefined) { 96 | requestOptions.temperature = temperature; 97 | } 98 | 99 | let completion; 100 | try { 101 | completion = await this.client.chat.completions.create(requestOptions, { 102 | signal: abortController.signal, // Add abort signal 103 | }); 104 | } catch (quantizationError) { 105 | // If no providers match the quantization requirements, fall back to default 106 | if (this.isQuantizationError(quantizationError)) { 107 | console.warn( 108 | "⚠️ WARNING: NO MODELS FOUND WITH REQUIRED PRECISION (bf16+). FALLING BACK TO DEFAULT MODEL WITHOUT QUANTIZATION FILTERING.", 109 | ); 110 | 111 | // Retry without quantization filtering 112 | const fallbackOptions = { ...requestOptions }; 113 | if (openrouterProvider && openrouterProvider.toLowerCase() !== "auto") { 114 | fallbackOptions.provider = { only: [openrouterProvider] }; 115 | } else { 116 | delete fallbackOptions.provider; 117 | } 118 | 119 | completion = await this.client.chat.completions.create(fallbackOptions, { 120 | signal: abortController.signal, 121 | }); 122 | } else { 123 | throw quantizationError; 124 | } 125 | } 126 | 127 | // Clear timeout on successful completion 128 | clearTimeout(timeoutId); 129 | 130 | return completion.choices[0]?.message.content || ""; 131 | } catch (error) { 132 | // Clear timeout on error 133 | clearTimeout(timeoutId); 134 | 135 | // Check if the error is due to abort (timeout) 136 | if (error instanceof Error && error.name === "AbortError") { 137 | console.error(`OpenRouter API call timed out after 5 minutes for model: ${this.modelId}`); 138 | throw new Error(`Request timed out after 5 minutes: ${this.modelId}`); 139 | } 140 | 141 | console.error("Error generating code with OpenRouter:", error); 142 | throw new Error(`Failed to generate code: ${error instanceof Error ? 
error.message : String(error)}`); 143 | } 144 | } 145 | 146 | /** 147 | * Get all available models for this provider 148 | * @returns Array of model identifiers 149 | */ 150 | getModels(): string[] { 151 | // Return empty array since models are now dynamically validated 152 | return []; 153 | } 154 | 155 | /** 156 | * Get the model identifier that was used for generation 157 | * @returns The model identifier string 158 | */ 159 | getModelIdentifier(): string { 160 | return this.modelId; 161 | } 162 | 163 | /** 164 | * Build provider configuration with quantization filtering 165 | * @returns Provider configuration object 166 | */ 167 | private buildProviderConfig(): any { 168 | // Disallow low precision quantizations, allow unknown for flexibility 169 | const disallowedQuantizations = ["fp4", "fp6", "fp8", "int4", "int8"]; 170 | 171 | return { 172 | disallow_quantizations: disallowedQuantizations, 173 | // Allow unknown quantization to handle cases where precision is not specified 174 | allow_fallbacks: true, 175 | }; 176 | } 177 | 178 | /** 179 | * Check if an error is related to quantization/provider filtering 180 | * @param error The error to check 181 | * @returns True if the error is related to quantization filtering 182 | */ 183 | private isQuantizationError(error: any): boolean { 184 | if (!(error instanceof Error)) return false; 185 | 186 | const errorMessage = error.message.toLowerCase(); 187 | return ( 188 | errorMessage.includes("no providers") || 189 | errorMessage.includes("quantization") || 190 | errorMessage.includes("provider") || 191 | errorMessage.includes("precision") || 192 | errorMessage.includes("not available") || 193 | errorMessage.includes("no models found") 194 | ); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SvelteBench 2 | 3 | An LLM benchmark for Svelte 5 based on the HumanEval methodology from OpenAI's paper "Evaluating Large Language Models Trained on Code". This benchmark evaluates LLMs' ability to generate functional Svelte 5 components with proper use of runes and modern Svelte features. 4 | 5 | ## Overview 6 | 7 | SvelteBench evaluates LLM-generated Svelte components by testing them against predefined test suites. It works by sending prompts to LLMs, generating Svelte components, and verifying their functionality through automated tests. The benchmark calculates pass@k metrics (typically pass@1 and pass@10) to measure model performance. 8 | 9 | ## Supported Providers 10 | 11 | SvelteBench supports multiple LLM providers: 12 | 13 | - **OpenAI** - GPT-4, GPT-4o, o1, o3, o4 models 14 | - **Anthropic** - Claude 3.5, Claude 4 models 15 | - **Google** - Gemini 2.5 models 16 | - **OpenRouter** - Access to 100+ models through a unified API 17 | - **Ollama** - Run models locally (Llama, Mistral, etc.) 
18 | - **Z.ai** - GLM-4 and other models 19 | 20 | ## Setup 21 | 22 | ```bash 23 | nvm use 24 | pnpm install 25 | 26 | # Create .env file from example 27 | cp .env.example .env 28 | ``` 29 | 30 | Then edit the `.env` file and add your API keys: 31 | 32 | ```bash 33 | # OpenAI (optional) 34 | OPENAI_API_KEY=your_openai_api_key_here 35 | 36 | # Anthropic (optional) 37 | ANTHROPIC_API_KEY=your_anthropic_api_key_here 38 | 39 | # Google Gemini (optional) 40 | GEMINI_API_KEY=your_gemini_api_key_here 41 | 42 | # OpenRouter (optional) 43 | OPENROUTER_API_KEY=your_openrouter_api_key_here 44 | OPENROUTER_SITE_URL=https://github.com/khromov/svelte-bench # Optional 45 | OPENROUTER_SITE_NAME=SvelteBench # Optional 46 | OPENROUTER_PROVIDER=deepseek # Optional - preferred provider routing 47 | 48 | # Ollama (optional - defaults to http://127.0.0.1:11434) 49 | OLLAMA_HOST=http://127.0.0.1:11434 50 | 51 | # Z.ai (optional) 52 | Z_AI_API_KEY=your_z_ai_api_key_here 53 | ``` 54 | 55 | You only need to configure the providers you want to test with. 56 | 57 | ## Running the Benchmark 58 | 59 | ### Standard Execution 60 | 61 | ```bash 62 | # Run the full benchmark (sequential execution) 63 | pnpm start 64 | 65 | # Run with parallel sample generation (faster) 66 | PARALLEL_EXECUTION=true pnpm start 67 | 68 | # Run tests only (without building visualization) 69 | pnpm run run-tests 70 | ``` 71 | 72 | **NOTE: This will run all providers and models that are available!** 73 | 74 | ### Execution Modes 75 | 76 | SvelteBench supports two execution modes: 77 | 78 | - **Sequential (default)**: Tests and samples run one at a time. More reliable with detailed progress output. 79 | - **Parallel**: Tests run sequentially, but samples within each test are generated in parallel. Faster execution with `PARALLEL_EXECUTION=true`. 80 | 81 | ### Debug Mode 82 | 83 | For faster development, or to run just one provider/model, you can enable debug mode in your `.env` file: 84 | 85 | ``` 86 | DEBUG_MODE=true 87 | DEBUG_PROVIDER=anthropic 88 | DEBUG_MODEL=claude-3-7-sonnet-20250219 89 | DEBUG_TEST=counter 90 | ``` 91 | 92 | Debug mode runs only one provider/model combination, making it much faster for testing during development. 93 | 94 | #### Running Multiple Models in Debug Mode 95 | 96 | You can now specify multiple models to test in debug mode by providing a comma-separated list: 97 | 98 | ``` 99 | DEBUG_MODE=true 100 | DEBUG_PROVIDER=anthropic 101 | DEBUG_MODEL=claude-3-7-sonnet-20250219,claude-opus-4-20250514,claude-sonnet-4-20250514 102 | ``` 103 | 104 | This will run tests with all three models sequentially while still staying within the same provider. 105 | 106 | ### Running with Context 107 | 108 | You can provide a context file (like Svelte documentation) to help the LLM generate better components: 109 | 110 | ```bash 111 | # Run with a context file 112 | pnpm run run-tests -- --context ./context/svelte.dev/llms-small.txt && pnpm run build 113 | ``` 114 | 115 | The context file will be included in the prompt to the LLM, providing additional information for generating components. 116 | 117 | ## Visualizing Results 118 | 119 | After running the benchmark, you can visualize the results using the built-in visualization tool: 120 | 121 | ```bash 122 | pnpm run build 123 | ``` 124 | 125 | You can now find the visualization in the `dist` directory. 126 | 127 | ## Adding New Tests 128 | 129 | To add a new test: 130 | 131 | 1. Create a new directory in `src/tests/` with the name of your test 132 | 2. 
Add a `prompt.md` file with instructions for the LLM 133 | 3. Add a `test.ts` file with Vitest tests for the generated component 134 | 4. Add a `Reference.svelte` file with a reference implementation for validation 135 | 136 | Example structure: 137 | 138 | ``` 139 | src/tests/your-test/ 140 | ├── prompt.md # Instructions for the LLM 141 | ├── test.ts # Tests for the generated component 142 | └── Reference.svelte # Reference implementation 143 | ``` 144 | 145 | ## Benchmark Results 146 | 147 | ### Output Files 148 | 149 | After running the benchmark, results are saved in multiple formats: 150 | 151 | - **JSON Results**: `benchmarks/benchmark-results-{timestamp}.json` - Machine-readable results with pass@k metrics 152 | - **HTML Visualization**: `benchmarks/benchmark-results-{timestamp}.html` - Interactive visualization of results 153 | - **Individual Model Results**: `benchmarks/benchmark-results-{provider}-{model}-{timestamp}.json` - Per-model results 154 | 155 | When running with a context file, the results filename will include "with-context" in the name. 156 | 157 | ### Versioning System 158 | 159 | **Current Results**: All new benchmark runs produce current results with: 160 | 161 | - Fixed test prompts and improved error handling 162 | - Corrected Svelte syntax examples 163 | - Standard naming without version suffixes 164 | 165 | **Legacy Results (v1)**: Historical results from the original test suite with known issues in the "inspect" test prompt (stored in `benchmarks/v1/`). 166 | 167 | ### Merging Results 168 | 169 | You can merge multiple benchmark results into a single file: 170 | 171 | ```bash 172 | # Merge current results (recommended) 173 | pnpm run merge 174 | 175 | # Merge legacy results (if needed) 176 | pnpm run merge-v1 177 | 178 | # Build visualization from current results 179 | pnpm run build 180 | 181 | # Build visualization from legacy results 182 | pnpm run build-v1 183 | ``` 184 | 185 | This creates merged JSON and HTML files: 186 | 187 | - `pnpm run merge` → `benchmarks/benchmark-results-merged.{json,html}` (current results) 188 | - `pnpm run merge-v1` → `benchmarks/v1/benchmark-results-merged.{json,html}` (legacy results) 189 | 190 | The standard build process uses current results by default. 191 | 192 | ## Advanced Features 193 | 194 | ### Checkpoint & Resume 195 | 196 | SvelteBench automatically saves checkpoints at the sample level, allowing you to resume interrupted benchmark runs: 197 | 198 | - Checkpoints are saved in `tmp/checkpoint/` after each sample completion 199 | - If a run is interrupted, it will automatically resume from the last checkpoint 200 | - Checkpoints are cleaned up after successful completion 201 | 202 | ### Retry Mechanism 203 | 204 | API calls have configurable retry logic with exponential backoff. Configure in `.env`: 205 | 206 | ```bash 207 | RETRY_MAX_ATTEMPTS=3 # Maximum retry attempts (default: 3) 208 | RETRY_INITIAL_DELAY_MS=1000 # Initial delay before retry (default: 1000ms) 209 | RETRY_MAX_DELAY_MS=30000 # Maximum delay between retries (default: 30s) 210 | RETRY_BACKOFF_FACTOR=2 # Exponential backoff factor (default: 2) 211 | ``` 212 | 213 | ### Model Validation 214 | 215 | Before running benchmarks, models are automatically validated to ensure they're available and properly configured. Invalid models are skipped with appropriate warnings. 
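The retry settings shown above under "Retry Mechanism" imply an exponential backoff schedule. The snippet below is an illustrative sketch only, not the actual implementation in `src/utils/retry-wrapper.ts`; it assumes the common formula `delay = min(initialDelayMs * backoffFactor^(attempt - 1), maxDelayMs)`, and `delayForAttempt` is a hypothetical helper introduced here for illustration:

```ts
// Illustrative sketch of the backoff schedule implied by the env vars above.
// The real retry logic lives in src/utils/retry-wrapper.ts and may differ.
const maxAttempts = Number(process.env.RETRY_MAX_ATTEMPTS ?? 3);
const initialDelayMs = Number(process.env.RETRY_INITIAL_DELAY_MS ?? 1000);
const maxDelayMs = Number(process.env.RETRY_MAX_DELAY_MS ?? 30000);
const backoffFactor = Number(process.env.RETRY_BACKOFF_FACTOR ?? 2);

// Hypothetical helper: delay after the Nth failed attempt (1-based), capped at maxDelayMs.
function delayForAttempt(attempt: number): number {
  return Math.min(initialDelayMs * Math.pow(backoffFactor, attempt - 1), maxDelayMs);
}

for (let attempt = 1; attempt < maxAttempts; attempt++) {
  console.log(`After failed attempt ${attempt}, wait ${delayForAttempt(attempt)}ms before attempt ${attempt + 1}`);
}
```

With the defaults this waits 1 s before the second attempt and 2 s before the third. Individual providers override these values in code; for example, the Z.ai and Moonshot providers use `maxAttempts: 10`, a 2 s initial delay, and a 60 s cap between retries.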
216 | 217 | ### HumanEval Metrics 218 | 219 | The benchmark calculates pass@k metrics based on the HumanEval methodology: 220 | 221 | - **pass@1**: Probability that a single sample passes all tests 222 | - **pass@10**: Probability that at least one of 10 samples passes all tests 223 | - Default: 10 samples per test (1 sample for expensive models) 224 | 225 | ### Test Verification 226 | 227 | Verify that all tests have proper structure: 228 | 229 | ```bash 230 | pnpm run verify 231 | ``` 232 | 233 | This checks that each test has required files (prompt.md, test.ts, Reference.svelte). 234 | 235 | ## Current Test Suite 236 | 237 | The benchmark includes tests for core Svelte 5 features: 238 | 239 | - **hello-world**: Basic component rendering 240 | - **counter**: State management with `$state` rune 241 | - **derived**: Computed values with `$derived` rune 242 | - **derived-by**: Advanced derived state with `$derived.by` 243 | - **effect**: Side effects with `$effect` rune 244 | - **props**: Component props with `$props` rune 245 | - **each**: List rendering with `{#each}` blocks 246 | - **snippets**: Reusable template snippets 247 | - **inspect**: Debug utilities with `$inspect` rune 248 | 249 | ## Troubleshooting 250 | 251 | ### Common Issues 252 | 253 | 1. **Models not found**: Ensure API keys are correctly set in `.env` 254 | 2. **Tests failing**: Check that you're using Node.js 20+ and have run `pnpm install` 255 | 3. **Parallel execution errors**: Try sequential mode (remove `PARALLEL_EXECUTION=true`) 256 | 4. **Memory issues**: Reduce the number of samples or run in debug mode with fewer models 257 | 258 | ### Debugging 259 | 260 | Enable detailed logging by examining the generated components in `tmp/samples/` directories and test outputs in the console. 261 | 262 | ## Contributing 263 | 264 | Contributions are welcome! Please ensure: 265 | 266 | 1. New tests include all required files (prompt.md, test.ts, Reference.svelte) 267 | 2. Tests follow the existing structure and naming conventions 268 | 3. Reference implementations are correct and pass all tests 269 | 4. Documentation is updated for new features 270 | 271 | ## License 272 | 273 | MIT 274 | -------------------------------------------------------------------------------- /benchmarks/v1/v1-benchmark-results-2025-05-25T20-01-22.048Z.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "testName": "counter", 4 | "provider": "OpenAI", 5 | "modelId": "o1-pro-2025-03-19", 6 | "numSamples": 1, 7 | "numCorrect": 0, 8 | "pass1": 0, 9 | "pass10": 0, 10 | "context": { 11 | "used": false, 12 | "content": "" 13 | }, 14 | "samples": [ 15 | { 16 | "index": 0, 17 | "code": "\n\n\n\n
    \n \n {$state.count}\n \n
    \n\n", 18 | "success": false, 19 | "errors": [ 20 | "tmp/openai/Component.svelte:4:2 Cannot use rune without parentheses\nhttps://svelte.dev/e/rune_missing_parentheses" 21 | ], 22 | "temperature": 0.2 23 | } 24 | ] 25 | }, 26 | { 27 | "testName": "derived", 28 | "provider": "OpenAI", 29 | "modelId": "o1-pro-2025-03-19", 30 | "numSamples": 1, 31 | "numCorrect": 0, 32 | "pass1": 0, 33 | "pass10": 0, 34 | "context": { 35 | "used": false, 36 | "content": "" 37 | }, 38 | "samples": [ 39 | { 40 | "index": 0, 41 | "code": "\n\n\n\n
    \n

    Number: {number()}

    \n

    Doubled: {doubled()}

    \n \n
    ", 42 | "success": false, 43 | "errors": [ 44 | "tmp/openai/Component.svelte:5:16 `$derived` must be called with exactly one argument\nhttps://svelte.dev/e/rune_invalid_arguments_length" 45 | ], 46 | "temperature": 0.2 47 | } 48 | ] 49 | }, 50 | { 51 | "testName": "derived-by", 52 | "provider": "OpenAI", 53 | "modelId": "o1-pro-2025-03-19", 54 | "numSamples": 1, 55 | "numCorrect": 0, 56 | "pass1": 0, 57 | "pass10": 0, 58 | "context": { 59 | "used": false, 60 | "content": "" 61 | }, 62 | "samples": [ 63 | { 64 | "index": 0, 65 | "code": "\n\n\n\n
    \n\t (text = e.target.value)}\n\t/>\n\t\n\t
    \n\t\t

    Words: {wordCount}

    \n\t\t

    Characters: {charCount}

    \n\t\t

    \n\t\t\tStatus: {isLong ? 'Long text' : 'Short text'}\n\t\t

    \n\t
    \n
    ", 66 | "success": false, 67 | "errors": [ 68 | "Cannot read properties of undefined (reading 'text')\n\n\tin {expression}\n\tin Component.svelte\n", 69 | "Cannot read properties of undefined (reading 'text')\n\n\tin {expression}\n\tin Component.svelte\n", 70 | "Cannot read properties of undefined (reading 'text')\n\n\tin {expression}\n\tin Component.svelte\n" 71 | ], 72 | "temperature": 0.2 73 | } 74 | ] 75 | }, 76 | { 77 | "testName": "each", 78 | "provider": "OpenAI", 79 | "modelId": "o1-pro-2025-03-19", 80 | "numSamples": 1, 81 | "numCorrect": 0, 82 | "pass1": 0, 83 | "pass10": 0, 84 | "context": { 85 | "used": false, 86 | "content": "" 87 | }, 88 | "samples": [ 89 | { 90 | "index": 0, 91 | "code": "\n\n\n\n
    \n
      \n {#each characters as character}\n
    • {character}
    • \n {/each}\n
    \n \n
    ", 92 | "success": false, 93 | "errors": [ 94 | "tmp/openai/Component.svelte:4:9 Unexpected token\nhttps://svelte.dev/e/js_parse_error" 95 | ], 96 | "temperature": 0.2 97 | } 98 | ] 99 | }, 100 | { 101 | "testName": "effect", 102 | "provider": "OpenAI", 103 | "modelId": "o1-pro-2025-03-19", 104 | "numSamples": 1, 105 | "numCorrect": 0, 106 | "pass1": 0, 107 | "pass10": 0, 108 | "context": { 109 | "used": false, 110 | "content": "" 111 | }, 112 | "samples": [ 113 | { 114 | "index": 0, 115 | "code": "\n\n\n\n
    \n

    Number: {number}

    \n

    Doubled: {doubled}

    \n \n
    ", 116 | "success": false, 117 | "errors": [ 118 | "tmp/openai/Component.svelte:4:9 Unexpected token\nhttps://svelte.dev/e/js_parse_error" 119 | ], 120 | "temperature": 0.2 121 | } 122 | ] 123 | }, 124 | { 125 | "testName": "hello-world", 126 | "provider": "OpenAI", 127 | "modelId": "o1-pro-2025-03-19", 128 | "numSamples": 1, 129 | "numCorrect": 1, 130 | "pass1": 1, 131 | "pass10": 1, 132 | "context": { 133 | "used": false, 134 | "content": "" 135 | }, 136 | "samples": [ 137 | { 138 | "index": 0, 139 | "code": "\n\n\n
    Hello, World!
    \n", 140 | "success": true, 141 | "errors": [], 142 | "temperature": 0.2 143 | } 144 | ] 145 | }, 146 | { 147 | "testName": "inspect", 148 | "provider": "OpenAI", 149 | "modelId": "o1-pro-2025-03-19", 150 | "numSamples": 1, 151 | "numCorrect": 0, 152 | "pass1": 0, 153 | "pass10": 0, 154 | "context": { 155 | "used": false, 156 | "content": "" 157 | }, 158 | "samples": [ 159 | { 160 | "index": 0, 161 | "code": "\n\n\n\n
    \n \n

    Current text: {text}

    \n

    Character count: {text.length}

    \n
    ", 162 | "success": false, 163 | "errors": [ 164 | "tmp/openai/Component.svelte:11:2 `$effect` must be called with exactly one argument\nhttps://svelte.dev/e/rune_invalid_arguments_length" 165 | ], 166 | "temperature": 0.2 167 | } 168 | ] 169 | }, 170 | { 171 | "testName": "props", 172 | "provider": "OpenAI", 173 | "modelId": "o1-pro-2025-03-19", 174 | "numSamples": 1, 175 | "numCorrect": 0, 176 | "pass1": 0, 177 | "pass10": 0, 178 | "context": { 179 | "used": false, 180 | "content": "" 181 | }, 182 | "samples": [ 183 | { 184 | "index": 0, 185 | "code": "\n\n\n\n
    \n\t

    Hello, {name}!

    \n\t

    Count: {updatedCount}

    \n\t\n\n\t{#if showDetails}\n\t\t
    \n\t\t\t

    Name is {name}

    \n\t\t\t

    Count is {updatedCount}

    \n\t\t\t

    ShowDetails is {showDetails}

    \n\t\t
    \n\t{/if}\n
    ", 186 | "success": false, 187 | "errors": [ 188 | "tmp/openai/Component.svelte:4:36 `$props` cannot be called with arguments\nhttps://svelte.dev/e/rune_invalid_arguments" 189 | ], 190 | "temperature": 0.2 191 | } 192 | ] 193 | }, 194 | { 195 | "testName": "snippets", 196 | "provider": "OpenAI", 197 | "modelId": "o1-pro-2025-03-19", 198 | "numSamples": 1, 199 | "numCorrect": 0, 200 | "pass1": 0, 201 | "pass10": 0, 202 | "context": { 203 | "used": false, 204 | "content": "" 205 | }, 206 | "samples": [ 207 | { 208 | "index": 0, 209 | "code": "\n\n\n\n
      \n {#each books as book}\n
    • \n {@render title(book)}\n
    • \n {/each}\n
    \n\n{@snippet title(title)}\n{title}\n{/snippet}", 210 | "success": false, 211 | "errors": [ 212 | "tmp/openai/Component.svelte:17:1 Unexpected block closing tag\nhttps://svelte.dev/e/block_unexpected_close" 213 | ], 214 | "temperature": 0.2 215 | } 216 | ] 217 | } 218 | ] -------------------------------------------------------------------------------- /src/utils/file.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs/promises"; 2 | import path from "path"; 3 | import { rimraf } from "rimraf"; 4 | 5 | // Maximum retry attempts for file operations 6 | const MAX_RETRIES = 3; 7 | const RETRY_DELAY = 500; // milliseconds 8 | 9 | /** 10 | * Get the directory for temporary sample files for a specific provider 11 | * @param provider The provider name (optional) 12 | * @returns The path to the temporary samples directory 13 | */ 14 | export function getTmpDir(provider?: string): string { 15 | const baseDir = path.resolve(process.cwd(), "tmp"); 16 | if (provider) { 17 | return path.join(baseDir, "samples", provider.toLowerCase()); 18 | } 19 | return baseDir; 20 | } 21 | 22 | /** 23 | * Get the directory for checkpoint files for a specific provider 24 | * @param provider The provider name 25 | * @returns The path to the checkpoint directory 26 | */ 27 | export function getCheckpointDir(provider: string): string { 28 | const baseDir = path.resolve(process.cwd(), "tmp"); 29 | return path.join(baseDir, "checkpoint", provider.toLowerCase()); 30 | } 31 | 32 | /** 33 | * Ensure the temporary directory exists for a specific provider 34 | * @param provider The provider name (optional) 35 | */ 36 | export async function ensureTmpDir(provider?: string): Promise { 37 | try { 38 | const tmpDir = getTmpDir(provider); 39 | await fs.mkdir(tmpDir, { recursive: true }); 40 | } catch (error) { 41 | console.error(`Error creating tmp directory for ${provider || "base"}:`, error); 42 | throw error; 43 | } 44 | } 45 | 46 | /** 47 | * Ensure the checkpoint directory exists for a specific provider 48 | * @param provider The provider name 49 | */ 50 | export async function ensureCheckpointDir(provider: string): Promise { 51 | try { 52 | const checkpointDir = getCheckpointDir(provider); 53 | await fs.mkdir(checkpointDir, { recursive: true }); 54 | } catch (error) { 55 | console.error(`Error creating checkpoint directory for ${provider}:`, error); 56 | throw error; 57 | } 58 | } 59 | 60 | /** 61 | * Helper function to add delay between retries 62 | * @param ms milliseconds to delay 63 | */ 64 | const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); 65 | 66 | /** 67 | * Clean the checkpoint directory for a specific provider with retry logic 68 | * This is used when starting a new run to clear previous checkpoints 69 | * @param provider The provider name 70 | */ 71 | export async function cleanCheckpointDir(provider: string): Promise { 72 | let retries = 0; 73 | const checkpointDir = getCheckpointDir(provider); 74 | 75 | while (retries < MAX_RETRIES) { 76 | try { 77 | // Use rimraf to recursively remove directory contents 78 | await rimraf(checkpointDir); 79 | 80 | // Re-create the empty directory 81 | await ensureCheckpointDir(provider); 82 | 83 | console.log(`✨ Cleaned checkpoint directory for ${provider}`); 84 | return; 85 | } catch (error) { 86 | retries++; 87 | console.warn( 88 | `Warning: Failed to clean checkpoint directory for ${provider} (attempt ${retries}/${MAX_RETRIES}):`, 89 | error, 90 | ); 91 | 92 | if (retries < MAX_RETRIES) { 
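        // With RETRY_DELAY = 500 ms this backs off linearly: 500 ms after the first failed attempt, 1000 ms after the second.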
93 | // Wait a bit before retrying to allow any file locks to clear 94 | await delay(RETRY_DELAY * retries); 95 | } else { 96 | console.error(`Failed to clean checkpoint directory for ${provider} after ${MAX_RETRIES} attempts`); 97 | // Don't throw the error, just log it and continue 98 | } 99 | } 100 | } 101 | } 102 | 103 | /** 104 | * Clean the samples directory for a specific provider with retry logic 105 | * This is used during test execution to clear old sample files 106 | * @param provider The provider name (optional) 107 | */ 108 | export async function cleanTmpDir(provider?: string): Promise { 109 | let retries = 0; 110 | const tmpDir = getTmpDir(provider); 111 | 112 | while (retries < MAX_RETRIES) { 113 | try { 114 | // Use rimraf to recursively remove directory contents 115 | // This properly handles subdirectories and permission issues better than fs.unlink 116 | await rimraf(tmpDir); 117 | 118 | // Re-create the empty directory 119 | await ensureTmpDir(provider); 120 | 121 | console.log(`✨ Cleaned samples directory for ${provider || "base"}`); 122 | return; 123 | } catch (error) { 124 | retries++; 125 | console.warn( 126 | `Warning: Failed to clean samples directory for ${provider || "base"} (attempt ${retries}/${MAX_RETRIES}):`, 127 | error, 128 | ); 129 | 130 | if (retries < MAX_RETRIES) { 131 | // Wait a bit before retrying to allow any file locks to clear 132 | await delay(RETRY_DELAY * retries); 133 | } else { 134 | console.error(`Failed to clean samples directory for ${provider || "base"} after ${MAX_RETRIES} attempts`); 135 | // Don't throw the error, just log it and continue 136 | } 137 | } 138 | } 139 | } 140 | 141 | /** 142 | * Write content to a file in the temporary directory for a specific provider with retry logic 143 | * @param filename The name of the file 144 | * @param content The content to write 145 | * @param provider The provider name (optional) 146 | */ 147 | export async function writeToTmpFile(filename: string, content: string, provider?: string): Promise { 148 | let retries = 0; 149 | 150 | while (retries < MAX_RETRIES) { 151 | try { 152 | await ensureTmpDir(provider); 153 | const tmpDir = getTmpDir(provider); 154 | const filePath = path.join(tmpDir, filename); 155 | await fs.writeFile(filePath, content); 156 | console.log(`📝 Wrote to ${filePath}`); 157 | return filePath; 158 | } catch (error) { 159 | retries++; 160 | console.warn( 161 | `Warning: Failed to write to ${filename} for ${provider || "base"} (attempt ${retries}/${MAX_RETRIES}):`, 162 | error, 163 | ); 164 | 165 | if (retries < MAX_RETRIES) { 166 | await delay(RETRY_DELAY * retries); 167 | } else { 168 | console.error(`Error writing to ${filename} for ${provider || "base"} after ${MAX_RETRIES} attempts:`, error); 169 | throw error; 170 | } 171 | } 172 | } 173 | 174 | throw new Error(`Failed to write to ${filename} after ${MAX_RETRIES} attempts`); 175 | } 176 | 177 | /** 178 | * Copy a file to the temporary directory for a specific provider with retry logic 179 | * @param sourcePath The path to the source file 180 | * @param destFilename The name of the destination file 181 | * @param provider The provider name (optional) 182 | */ 183 | export async function copyToTmpDir(sourcePath: string, destFilename: string, provider?: string): Promise { 184 | let retries = 0; 185 | 186 | while (retries < MAX_RETRIES) { 187 | try { 188 | await ensureTmpDir(provider); 189 | const tmpDir = getTmpDir(provider); 190 | const destPath = path.join(tmpDir, destFilename); 191 | await fs.copyFile(sourcePath, 
destPath); 192 | console.log(`📋 Copied ${sourcePath} to ${destPath}`); 193 | return destPath; 194 | } catch (error) { 195 | retries++; 196 | console.warn( 197 | `Warning: Failed to copy ${sourcePath} for ${provider || "base"} (attempt ${retries}/${MAX_RETRIES}):`, 198 | error, 199 | ); 200 | 201 | if (retries < MAX_RETRIES) { 202 | await delay(RETRY_DELAY * retries); 203 | } else { 204 | console.error(`Error copying ${sourcePath} for ${provider || "base"} after ${MAX_RETRIES} attempts:`, error); 205 | throw error; 206 | } 207 | } 208 | } 209 | 210 | throw new Error(`Failed to copy to ${destFilename} after ${MAX_RETRIES} attempts`); 211 | } 212 | 213 | /** 214 | * Read a file from the specified path with retry logic 215 | * @param filePath The path to the file 216 | * @returns The content of the file 217 | */ 218 | export async function readFile(filePath: string): Promise { 219 | let retries = 0; 220 | 221 | while (retries < MAX_RETRIES) { 222 | try { 223 | return await fs.readFile(filePath, "utf-8"); 224 | } catch (error) { 225 | retries++; 226 | console.warn(`Warning: Failed to read ${filePath} (attempt ${retries}/${MAX_RETRIES}):`, error); 227 | 228 | if (retries < MAX_RETRIES) { 229 | await delay(RETRY_DELAY * retries); 230 | } else { 231 | console.error(`Error reading ${filePath} after ${MAX_RETRIES} attempts:`, error); 232 | throw error; 233 | } 234 | } 235 | } 236 | 237 | throw new Error(`Failed to read ${filePath} after ${MAX_RETRIES} attempts`); 238 | } 239 | 240 | /** 241 | * Load context from a file 242 | * @param filePath The path to the context file 243 | * @returns The context content as a string 244 | */ 245 | export async function loadContextFile(filePath: string): Promise { 246 | try { 247 | if (!filePath) return ""; 248 | 249 | // Check if the file exists 250 | await fs.access(filePath); 251 | 252 | // Read the file 253 | const contextContent = await fs.readFile(filePath, "utf-8"); 254 | console.log(`📄 Loaded context file from ${filePath}`); 255 | return contextContent; 256 | } catch (error) { 257 | console.error(`Error loading context file ${filePath}:`, error); 258 | throw new Error(`Failed to load context file: ${error instanceof Error ? 
error.message : String(error)}`); 259 | } 260 | } 261 | 262 | /** 263 | * Get the checkpoint file path for a specific provider and model 264 | * @param provider The provider name 265 | * @param modelId The model identifier 266 | * @returns The checkpoint file path 267 | */ 268 | export function getCheckpointPath(provider: string, modelId: string): string { 269 | const checkpointDir = getCheckpointDir(provider); 270 | const safeModelId = modelId.replace(/[^a-zA-Z0-9\-_]/g, "-"); 271 | return path.join(checkpointDir, `checkpoint-${safeModelId}.json`); 272 | } 273 | 274 | /** 275 | * Save checkpoint data to file 276 | * @param provider The provider name 277 | * @param modelId The model identifier 278 | * @param checkpointData The checkpoint data to save 279 | */ 280 | export async function saveCheckpoint(provider: string, modelId: string, checkpointData: any): Promise { 281 | try { 282 | await ensureCheckpointDir(provider); 283 | const checkpointPath = getCheckpointPath(provider, modelId); 284 | await fs.writeFile(checkpointPath, JSON.stringify(checkpointData, null, 2)); 285 | console.log(`💾 Saved checkpoint for ${provider}/${modelId}`); 286 | } catch (error) { 287 | console.error(`Error saving checkpoint for ${provider}/${modelId}:`, error); 288 | // Don't throw - checkpoint saving should not fail the test run 289 | } 290 | } 291 | 292 | /** 293 | * Load checkpoint data from file 294 | * @param provider The provider name 295 | * @param modelId The model identifier 296 | * @returns The checkpoint data or null if not found 297 | */ 298 | export async function loadCheckpoint(provider: string, modelId: string): Promise { 299 | try { 300 | const checkpointPath = getCheckpointPath(provider, modelId); 301 | await fs.access(checkpointPath); 302 | const data = await fs.readFile(checkpointPath, "utf-8"); 303 | const checkpoint = JSON.parse(data); 304 | console.log(`🔄 Loaded checkpoint for ${provider}/${modelId}`); 305 | return checkpoint; 306 | } catch (error) { 307 | // File doesn't exist or can't be read - this is expected for new runs 308 | return null; 309 | } 310 | } 311 | 312 | /** 313 | * Remove checkpoint file for a specific provider and model 314 | * @param provider The provider name 315 | * @param modelId The model identifier 316 | */ 317 | export async function removeCheckpoint(provider: string, modelId: string): Promise { 318 | try { 319 | const checkpointPath = getCheckpointPath(provider, modelId); 320 | await fs.unlink(checkpointPath); 321 | console.log(`🗑️ Removed checkpoint for ${provider}/${modelId}`); 322 | } catch (error) { 323 | // File might not exist - this is fine 324 | } 325 | } 326 | -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | // Load environment variables from .env file 2 | import "dotenv/config"; 3 | 4 | import { getAllLLMProviders, getLLMProvider } from "./src/llms"; 5 | import { cleanTmpDir, loadContextFile } from "./src/utils/file"; 6 | import { 7 | runAllTestsHumanEval as runAllTestsHumanEvalParallel, 8 | saveBenchmarkResults, 9 | loadTestDefinitions, 10 | } from "./src/utils/parallel-test-manager"; 11 | import { runAllTestsHumanEval as runAllTestsHumanEvalSequential } from "./src/utils/test-manager"; 12 | import type { HumanEvalResult } from "./src/utils/humaneval"; 13 | import { ensureRequiredDirectories } from "./src/utils/ensure-dirs"; 14 | import { validateModels } from "./src/utils/model-validator"; 15 | import path from "path"; 16 | 
17 | /** 18 | * Parse command line arguments 19 | * @returns Parsed command line arguments 20 | */ 21 | function parseCommandLineArgs(): { 22 | contextFile?: string; 23 | } { 24 | const args = process.argv.slice(2); 25 | let contextFile: string | undefined; 26 | 27 | // Parse arguments 28 | for (let i = 0; i < args.length; i++) { 29 | if (args[i] === "--context" && i + 1 < args.length) { 30 | contextFile = args[i + 1]; 31 | i++; // Skip the next argument as it's the value for --context 32 | } 33 | } 34 | 35 | return { 36 | contextFile, 37 | }; 38 | } 39 | 40 | /** 41 | * Main function to run the benchmark 42 | */ 43 | async function runBenchmark() { 44 | try { 45 | // Parse command line arguments 46 | const { contextFile } = parseCommandLineArgs(); 47 | 48 | // Check for parallel execution environment variable 49 | const parallel = process.env.PARALLEL_EXECUTION === "true"; 50 | 51 | const executionMode = parallel ? "PARALLEL EXECUTION" : "SEQUENTIAL EXECUTION"; 52 | console.log(`🚀 Starting SvelteBench with HumanEval methodology (${executionMode})...`); 53 | 54 | // Load context file if specified 55 | let contextContent = ""; 56 | if (contextFile) { 57 | try { 58 | // Resolve path relative to the current working directory 59 | const contextFilePath = path.resolve(process.cwd(), contextFile); 60 | contextContent = await loadContextFile(contextFilePath); 61 | console.log(`👉 Using context file: ${contextFilePath}`); 62 | } catch (error) { 63 | console.error(`Error loading context file: ${error}`); 64 | process.exit(1); 65 | } 66 | } 67 | 68 | // Ensure required directories exist 69 | await ensureRequiredDirectories(); 70 | 71 | // Note: We don't clean sample directories at startup anymore - only checkpoints are cleared 72 | 73 | // Check if we're in debug mode 74 | const isDebugMode = process.env.DEBUG_MODE === "true"; 75 | 76 | // Initialize provider models array 77 | let selectedProviderModels: any[] = []; 78 | 79 | if (isDebugMode) { 80 | console.log("🐛 Running in DEBUG_MODE"); 81 | 82 | // Get debug settings 83 | const debugProvider = process.env.DEBUG_PROVIDER; 84 | const debugModel = process.env.DEBUG_MODEL; 85 | 86 | if (!debugProvider) { 87 | throw new Error("DEBUG_PROVIDER must be specified in debug mode"); 88 | } 89 | 90 | if (!debugModel) { 91 | throw new Error(`No model specified for provider "${debugProvider}". Use DEBUG_MODEL to specify models.`); 92 | } 93 | 94 | // Parse comma-separated list of models 95 | const requestedModels = debugModel 96 | .split(",") 97 | .map((m) => m.trim()) 98 | .filter((m) => m.length > 0); 99 | 100 | if (requestedModels.length === 0) { 101 | throw new Error("DEBUG_MODEL must contain at least one model"); 102 | } 103 | 104 | // Validate models 105 | console.log(`👉 Validating models for provider ${debugProvider}...`); 106 | const validModels = await validateModels(debugProvider, requestedModels); 107 | 108 | if (validModels.length === 0) { 109 | throw new Error( 110 | `None of the requested models are valid for provider "${debugProvider}". 
Models tested: ${requestedModels.join(", ")}`, 111 | ); 112 | } 113 | 114 | // Create provider instances for valid models 115 | for (const modelId of validModels) { 116 | const provider = await getLLMProvider(debugProvider, modelId); 117 | selectedProviderModels.push({ 118 | provider, 119 | name: debugProvider.charAt(0).toUpperCase() + debugProvider.slice(1), 120 | modelId, 121 | }); 122 | } 123 | 124 | console.log( 125 | `👉 Selected provider: ${selectedProviderModels[0].name} (${ 126 | selectedProviderModels.length === 1 127 | ? selectedProviderModels[0].modelId 128 | : `${selectedProviderModels.length} models` 129 | })`, 130 | ); 131 | } else { 132 | // Non-debug mode: Get all available LLM providers and models 133 | console.log("👉 Discovering available LLM providers and models..."); 134 | const providerModels = await getAllLLMProviders(); 135 | 136 | if (providerModels.length === 0) { 137 | console.warn("⚠️ No pre-configured models found. Please use DEBUG_MODE with specific models."); 138 | throw new Error("No LLM provider/model combinations found. Use DEBUG_MODE to specify models."); 139 | } 140 | 141 | console.log(`👉 Found ${providerModels.length} provider/model combinations`); 142 | 143 | selectedProviderModels = providerModels; 144 | } 145 | 146 | const debugTest = process.env.DEBUG_TEST; 147 | 148 | // Load test definitions based on debug mode 149 | let testDefinitions = undefined; 150 | if (isDebugMode) { 151 | const allTests = await loadTestDefinitions(); 152 | 153 | if (allTests.length === 0) { 154 | throw new Error("No tests found"); 155 | } 156 | 157 | if (debugTest) { 158 | const matchingTest = allTests.find((test) => test.name === debugTest); 159 | if (matchingTest) { 160 | testDefinitions = [matchingTest]; 161 | console.log(`👉 Selected test: ${matchingTest.name}`); 162 | } else { 163 | console.warn(`⚠️ Test "${debugTest}" not found, using all tests`); 164 | testDefinitions = undefined; // Use all tests 165 | } 166 | } else { 167 | // No test specified, use all tests 168 | testDefinitions = undefined; 169 | console.log(`👉 Using all available tests`); 170 | } 171 | } 172 | 173 | // Set number of samples (use 10 samples by default unless a specific test was requested) 174 | let numSamples: number; 175 | if (isDebugMode && process.env.DEBUG_SAMPLES) { 176 | // Use DEBUG_SAMPLES value in debug mode if specified 177 | const debugSamples = parseInt(process.env.DEBUG_SAMPLES, 10); 178 | if (isNaN(debugSamples) || debugSamples <= 0) { 179 | throw new Error(`DEBUG_SAMPLES must be a positive integer, got: ${process.env.DEBUG_SAMPLES}`); 180 | } 181 | numSamples = debugSamples; 182 | } else { 183 | // Use default logic: 1 for specific debug tests, 10 for full runs 184 | numSamples = debugTest ? 
1 : 10; 185 | } 186 | 187 | console.log(`👉 Running with ${numSamples} samples per test (for pass@k metrics)`); 188 | 189 | const allResults: HumanEvalResult[] = []; 190 | 191 | if (parallel) { 192 | // Run all provider/model combinations in parallel 193 | console.log( 194 | `\n👉 Running tests with ${selectedProviderModels.length} provider/model combinations in parallel...`, 195 | ); 196 | 197 | // Create a promise for each provider/model combination 198 | const providerPromises = selectedProviderModels.map(async (providerWithModel) => { 199 | try { 200 | console.log(`\n👉 Starting tests with ${providerWithModel.name} (${providerWithModel.modelId})...`); 201 | 202 | // Determine number of samples for this model 203 | // Use only 1 sample for expensive o1-pro models 204 | const modelNumSamples = providerWithModel.modelId.startsWith("o1-pro") ? 1 : numSamples; 205 | 206 | if (modelNumSamples !== numSamples) { 207 | console.log(` ⚠️ Using ${modelNumSamples} sample${modelNumSamples > 1 ? "s" : ""} for expensive model`); 208 | } 209 | 210 | // Run tests with this provider model using parallel HumanEval methodology 211 | const results = await runAllTestsHumanEvalParallel( 212 | providerWithModel.provider, 213 | modelNumSamples, 214 | testDefinitions, // Pass specific tests if in debug mode 215 | contextContent, // Pass context content if available 216 | ); 217 | 218 | // Save individual model results immediately to prevent loss if later models fail 219 | if (results.length > 0) { 220 | try { 221 | await saveBenchmarkResults(results, contextFile, contextContent, undefined); 222 | console.log(`💾 Saved individual results for ${providerWithModel.modelId}`); 223 | } catch (saveError) { 224 | console.error(`⚠️ Failed to save individual results for ${providerWithModel.modelId}:`, saveError); 225 | // Don't fail the entire run, just log and continue 226 | } 227 | } 228 | 229 | return results; 230 | } catch (error) { 231 | console.error(`Error running tests with ${providerWithModel.name} (${providerWithModel.modelId}):`, error); 232 | // Return empty results rather than throwing 233 | return []; 234 | } 235 | }); 236 | 237 | // Wait for all provider promises to complete 238 | const resultsArrays = await Promise.all(providerPromises); 239 | 240 | // Combine all results 241 | for (const results of resultsArrays) { 242 | allResults.push(...results); 243 | } 244 | } else { 245 | // Run provider/model combinations sequentially 246 | console.log( 247 | `\n👉 Running tests with ${selectedProviderModels.length} provider/model combinations sequentially...`, 248 | ); 249 | 250 | for (const providerWithModel of selectedProviderModels) { 251 | try { 252 | console.log(`\n👉 Starting tests with ${providerWithModel.name} (${providerWithModel.modelId})...`); 253 | 254 | // Determine number of samples for this model 255 | // Use only 1 sample for expensive o1-pro models 256 | const modelNumSamples = providerWithModel.modelId.startsWith("o1-pro") ? 1 : numSamples; 257 | 258 | if (modelNumSamples !== numSamples) { 259 | console.log(` ⚠️ Using ${modelNumSamples} sample${modelNumSamples > 1 ? 
"s" : ""} for expensive model`); 260 | } 261 | 262 | // Run tests with this provider model using sequential HumanEval methodology 263 | const results = await runAllTestsHumanEvalSequential( 264 | providerWithModel.provider, 265 | modelNumSamples, 266 | testDefinitions, // Pass specific tests if in debug mode 267 | contextContent, // Pass context content if available 268 | ); 269 | 270 | // Add results to combined array 271 | allResults.push(...results); 272 | 273 | // Save individual model results immediately to prevent loss if later models fail 274 | if (results.length > 0) { 275 | try { 276 | await saveBenchmarkResults(results, contextFile, contextContent, undefined); 277 | console.log(`💾 Saved individual results for ${providerWithModel.modelId}`); 278 | } catch (saveError) { 279 | console.error(`⚠️ Failed to save individual results for ${providerWithModel.modelId}:`, saveError); 280 | // Don't fail the entire run, just log and continue 281 | } 282 | } 283 | } catch (error) { 284 | console.error(`Error running tests with ${providerWithModel.name} (${providerWithModel.modelId}):`, error); 285 | // Continue with next provider instead of failing completely 286 | } 287 | } 288 | } 289 | 290 | // Print summary 291 | console.log(`\n📊 ${isDebugMode ? "Debug" : "Benchmark"} Summary:`); 292 | console.log("==========================================="); 293 | 294 | // Group results by test name 295 | const resultsByTest: Record = {}; 296 | for (const result of allResults) { 297 | if (!resultsByTest[result.testName]) { 298 | resultsByTest[result.testName] = []; 299 | } 300 | resultsByTest[result.testName].push(result); 301 | } 302 | 303 | let totalSuccess = 0; 304 | let totalSamples = 0; 305 | 306 | // Print results by test and provider 307 | for (const [testName, results] of Object.entries(resultsByTest)) { 308 | console.log(`\nTest: ${testName}`); 309 | 310 | for (const result of results) { 311 | console.log(` ${result.provider} (${result.modelId}):`); 312 | console.log( 313 | ` pass@1: ${result.pass1.toFixed(4)}${ 314 | result.numSamples > 1 ? `, pass@10: ${result.pass10.toFixed(4)}` : "" 315 | }`, 316 | ); 317 | console.log(` Samples: ${result.numSamples}, Correct: ${result.numCorrect}`); 318 | 319 | totalSuccess += result.numCorrect; 320 | totalSamples += result.numSamples; 321 | } 322 | } 323 | 324 | console.log("\n==========================================="); 325 | console.log(`Total Samples: ${totalSamples}, Passed: ${totalSuccess}, Failed: ${totalSamples - totalSuccess}`); 326 | 327 | // Note: We no longer clean sample directories at the end - they're preserved for inspection 328 | 329 | // Exit with appropriate code 330 | const exitCode = totalSuccess > 0 ? 
0 : 1; 331 | process.exit(exitCode); 332 | } catch (error) { 333 | console.error("Error running benchmark:", error); 334 | process.exit(1); 335 | } 336 | } 337 | 338 | // Run the benchmark 339 | runBenchmark().catch((error) => { 340 | console.error("Unhandled error:", error); 341 | process.exit(1); 342 | }); 343 | -------------------------------------------------------------------------------- /src/utils/test-manager.ts: -------------------------------------------------------------------------------- 1 | import path from "path"; 2 | import fs from "fs/promises"; 3 | import type { LLMProvider } from "../llms"; 4 | import { 5 | cleanTmpDir, 6 | cleanCheckpointDir, 7 | writeToTmpFile, 8 | readFile, 9 | saveCheckpoint, 10 | loadCheckpoint, 11 | removeCheckpoint, 12 | } from "./file"; 13 | import { runTest } from "./test-runner"; 14 | import type { TestResult } from "./test-runner"; 15 | import { calculatePassAtK, type HumanEvalResult } from "./humaneval"; 16 | import { cleanCodeMarkdown } from "./code-cleaner"; 17 | import { withRetry } from "./retry-wrapper"; 18 | 19 | export interface TestDefinition { 20 | name: string; 21 | promptPath: string; 22 | testPath: string; 23 | } 24 | 25 | export interface CheckpointData { 26 | modelId: string; 27 | provider: string; 28 | completedResults: HumanEvalResult[]; 29 | currentTestIndex: number; 30 | currentSampleIndex: number; 31 | currentTestSamples: BenchmarkResult[]; 32 | contextContent?: string; 33 | numSamples: number; 34 | timestamp: string; 35 | } 36 | 37 | // We still need BenchmarkResult for the runSingleTest function 38 | // which is used by runHumanEvalTest 39 | export interface BenchmarkResult { 40 | testName: string; 41 | llmProvider: string; 42 | modelIdentifier: string; 43 | generatedCode: string; 44 | testResult: TestResult; 45 | promptPath: string; 46 | contextContent?: string; 47 | timestamp: string; 48 | sampleIndex?: number; 49 | temperature?: number; 50 | } 51 | 52 | /** 53 | * Load all test definitions from the src/tests directory 54 | */ 55 | export async function loadTestDefinitions(): Promise { 56 | const testsDir = path.resolve(process.cwd(), "src/tests"); 57 | const testDirs = await fs.readdir(testsDir); 58 | 59 | const tests: TestDefinition[] = []; 60 | 61 | for (const dir of testDirs) { 62 | const testDir = path.join(testsDir, dir); 63 | const stats = await fs.stat(testDir); 64 | 65 | if (stats.isDirectory()) { 66 | const promptPath = path.join(testDir, "prompt.md"); 67 | const testPath = path.join(testDir, "test.ts"); 68 | 69 | // Check if both files exist 70 | try { 71 | await Promise.all([fs.access(promptPath), fs.access(testPath)]); 72 | 73 | tests.push({ 74 | name: dir, 75 | promptPath, 76 | testPath, 77 | }); 78 | } catch (error) { 79 | console.warn(`Skipping ${dir}: missing prompt.md or test.ts`); 80 | } 81 | } 82 | } 83 | 84 | return tests; 85 | } 86 | 87 | /** 88 | * Run a single test with the given LLM provider, sample index, and temperature 89 | */ 90 | export async function runSingleTest( 91 | test: TestDefinition, 92 | llmProvider: LLMProvider, 93 | sampleIndex: number = 0, 94 | temperature?: number, 95 | contextContent?: string, 96 | ): Promise { 97 | try { 98 | const providerName = llmProvider.name; 99 | 100 | // Read the prompt 101 | const prompt = await readFile(test.promptPath); 102 | 103 | // Generate code with the LLM 104 | console.log( 105 | `🔄 Generating ${test.name} component with ${providerName} (sample ${ 106 | sampleIndex + 1 107 | }, temp: ${temperature ?? 
"default"})...`, 108 | ); 109 | let generatedCode = await withRetry( 110 | async () => { 111 | const rawCode = await llmProvider.generateCode(prompt, temperature, contextContent); 112 | 113 | // Apply cleaning to remove markdown code blocks 114 | const cleanedCode = cleanCodeMarkdown(rawCode); 115 | 116 | // Check if the cleaned code is empty or only whitespace 117 | if (!cleanedCode.trim()) { 118 | console.warn( 119 | `⚠️ Generated code is empty after cleaning for ${test.name} with ${providerName}. Raw code was:`, 120 | rawCode, 121 | ); 122 | throw new Error( 123 | "Generated code is empty after cleaning. This indicates an empty response from the LLM provider.", 124 | ); 125 | } 126 | 127 | return cleanedCode; 128 | }, 129 | { 130 | onRetry: (error, attempt) => { 131 | console.warn( 132 | `⚠️ Retry attempt ${attempt} for ${test.name} with ${providerName} after error: ${error.message}`, 133 | ); 134 | }, 135 | }, 136 | ); 137 | 138 | // Check if the generated code already includes 139 | if (!generatedCode.includes("")) { 140 | // Prepend it to the generated code 141 | generatedCode = "\n\n" + generatedCode; 142 | } 143 | 144 | // Use standard Component.svelte name 145 | const componentFilename = "Component.svelte"; 146 | await writeToTmpFile(componentFilename, generatedCode, providerName); 147 | 148 | // Copy the test file 149 | const testContent = await readFile(test.testPath); 150 | const testFilename = `${test.name}.test.ts`; 151 | await writeToTmpFile(testFilename, testContent, providerName); 152 | 153 | // Make sure the files are fully written before proceeding 154 | const tmpDir = path.resolve(process.cwd(), "tmp", "samples", providerName.toLowerCase()); 155 | await fs.access(path.join(tmpDir, componentFilename)); 156 | await fs.access(path.join(tmpDir, testFilename)); 157 | 158 | // Run the test with the standard test name 159 | const testResult = await runTest(test.name, providerName); 160 | 161 | return { 162 | testName: test.name, 163 | llmProvider: providerName, 164 | modelIdentifier: llmProvider.getModelIdentifier(), 165 | generatedCode, 166 | testResult, 167 | promptPath: test.promptPath, 168 | contextContent, 169 | timestamp: new Date().toISOString(), 170 | sampleIndex, 171 | temperature, 172 | }; 173 | } catch (error) { 174 | const errorMessage = error instanceof Error ? error.message : String(error); 175 | console.error(`Error running test ${test.name} with ${llmProvider.name}:`, errorMessage); 176 | 177 | return { 178 | testName: test.name, 179 | llmProvider: llmProvider.name, 180 | modelIdentifier: llmProvider.getModelIdentifier(), 181 | generatedCode: "", 182 | testResult: { 183 | testName: test.name, 184 | success: false, 185 | testFiles: 0, 186 | totalTests: 0, 187 | failedTests: 0, 188 | errors: [errorMessage], 189 | }, 190 | promptPath: test.promptPath, 191 | contextContent, 192 | timestamp: new Date().toISOString(), 193 | sampleIndex, 194 | temperature, 195 | }; 196 | } 197 | } 198 | 199 | /** 200 | * HumanEval implementation: Generate multiple samples for a single test 201 | * with sample-level checkpointing and resumption support 202 | * 203 | * This follows the HumanEval methodology by generating n samples with 204 | * appropriate temperature settings for each sample. 
205 | */ 206 | export async function runHumanEvalTest( 207 | test: TestDefinition, 208 | llmProvider: LLMProvider, 209 | numSamples: number = 10, 210 | contextContent?: string, 211 | providerName?: string, 212 | modelId?: string, 213 | testIndex?: number, 214 | completedResults?: HumanEvalResult[], 215 | existingSamples: BenchmarkResult[] = [], 216 | startSampleIndex: number = 0, 217 | ): Promise<HumanEvalResult> { 218 | try { 219 | const actualProviderName = providerName || llmProvider.name; 220 | const actualModelId = modelId || llmProvider.getModelIdentifier(); 221 | const samples: BenchmarkResult[] = [...existingSamples]; 222 | 223 | // Run samples starting from startSampleIndex with checkpointing after each API call 224 | for (let i = startSampleIndex; i < numSamples; i++) { 225 | try { 226 | // Clean the tmp directory before each sample 227 | await cleanTmpDir(actualProviderName); 228 | 229 | // Determine temperature: 0 for first sample, undefined for others 230 | const temperature = i === 0 ? 0 : undefined; 231 | 232 | console.log(`🔄 Running sample ${i + 1}/${numSamples} for ${test.name} with ${actualProviderName}...`); 233 | 234 | // Run the test with the current sample index and appropriate temperature 235 | const result = await runSingleTest(test, llmProvider, i, temperature, contextContent); 236 | 237 | // Only add to samples if the API call was successful (has generated code) 238 | if (result.generatedCode.trim() !== "") { 239 | samples.push(result); 240 | console.log(`✅ Completed sample ${i + 1}/${numSamples} for ${test.name}`); 241 | } else { 242 | console.log(`⚠️ API failure for sample ${i + 1}/${numSamples} for ${test.name} - not adding to results`); 243 | } 244 | 245 | // Save checkpoint after each API call (successful or not) 246 | if (testIndex !== undefined && completedResults !== undefined) { 247 | const checkpointData: CheckpointData = { 248 | modelId: actualModelId, 249 | provider: actualProviderName, 250 | completedResults, 251 | currentTestIndex: testIndex, 252 | currentSampleIndex: i, 253 | currentTestSamples: samples, 254 | contextContent, 255 | numSamples, 256 | timestamp: new Date().toISOString(), 257 | }; 258 | await saveCheckpoint(actualProviderName, actualModelId, checkpointData); 259 | console.log(`💾 Saved checkpoint after sample ${i + 1}/${numSamples}`); 260 | } 261 | } catch (error) { 262 | console.error(`Error running sample ${i + 1} for ${test.name} with ${actualProviderName}:`, error); 263 | 264 | // Save checkpoint even for failed samples to track progress 265 | if (testIndex !== undefined && completedResults !== undefined) { 266 | const checkpointData: CheckpointData = { 267 | modelId: actualModelId, 268 | provider: actualProviderName, 269 | completedResults, 270 | currentTestIndex: testIndex, 271 | currentSampleIndex: i, 272 | currentTestSamples: samples, 273 | contextContent, 274 | numSamples, 275 | timestamp: new Date().toISOString(), 276 | }; 277 | await saveCheckpoint(actualProviderName, actualModelId, checkpointData); 278 | console.log(`💾 Saved checkpoint after failed sample ${i + 1}/${numSamples}`); 279 | } 280 | 281 | // If this was due to retry exhaustion, abort the entire run 282 | const errorMessage = error instanceof Error ? 
error.message : String(error); 283 | if (errorMessage.includes("Failed after")) { 284 | console.error(`❌ Aborting run after exhausting retries for ${test.name}`); 285 | throw error; 286 | } 287 | 288 | // Continue with other samples for other types of errors 289 | } 290 | } 291 | 292 | // Calculate pass@k metrics - only count samples that were successfully run 293 | const validSamples = samples.filter((s) => s !== null && s !== undefined); 294 | const numValidSamples = validSamples.length; 295 | const numCorrect = validSamples.filter((s) => s.testResult.success).length; 296 | 297 | // If we have no valid samples, return default values 298 | if (numValidSamples === 0) { 299 | return { 300 | testName: test.name, 301 | provider: actualProviderName, 302 | modelId: actualModelId, 303 | numSamples: 0, 304 | numCorrect: 0, 305 | pass1: 0, 306 | pass10: 0, 307 | context: { 308 | used: !!contextContent, 309 | content: contextContent, 310 | }, 311 | samples: [], 312 | }; 313 | } 314 | 315 | const pass1 = calculatePassAtK(numValidSamples, numCorrect, 1); 316 | const pass10 = calculatePassAtK(numValidSamples, numCorrect, Math.min(10, numValidSamples)); 317 | 318 | // Format the results 319 | return { 320 | testName: test.name, 321 | provider: actualProviderName, 322 | modelId: actualModelId, 323 | numSamples: numValidSamples, 324 | numCorrect, 325 | pass1, 326 | pass10, 327 | context: { 328 | used: !!contextContent, 329 | content: contextContent, 330 | }, 331 | samples: validSamples.map((s) => ({ 332 | index: s.sampleIndex || 0, 333 | code: s.generatedCode, 334 | success: s.testResult.success, 335 | errors: s.testResult.errors || [], 336 | temperature: s.temperature, 337 | })), 338 | }; 339 | } catch (error) { 340 | const errorMessage = error instanceof Error ? 
error.message : String(error); 341 | console.error(`Error running HumanEval test ${test.name} with ${llmProvider.name}:`, errorMessage); 342 | 343 | // Return a failed result 344 | return { 345 | testName: test.name, 346 | provider: providerName || llmProvider.name, 347 | modelId: modelId || llmProvider.getModelIdentifier(), 348 | numSamples: 0, 349 | numCorrect: 0, 350 | pass1: 0, 351 | pass10: 0, 352 | context: { 353 | used: !!contextContent, 354 | content: contextContent, 355 | }, 356 | samples: [], 357 | }; 358 | } 359 | } 360 | 361 | /** 362 | * Run all tests with the given LLM provider using HumanEval methodology 363 | * Supports automatic resuming from checkpoints 364 | * @param llmProvider The LLM provider to use 365 | * @param numSamples Number of samples to generate for each test (default: 10) 366 | * @param specificTests Optional array of test definitions to run (default: all tests) 367 | * @param contextContent Optional context content to include in prompts 368 | */ 369 | export async function runAllTestsHumanEval( 370 | llmProvider: LLMProvider, 371 | numSamples: number = 10, 372 | specificTests?: TestDefinition[], 373 | contextContent?: string, 374 | ): Promise<HumanEvalResult[]> { 375 | try { 376 | const providerName = llmProvider.name; 377 | const modelId = llmProvider.getModelIdentifier(); 378 | 379 | // Load test definitions 380 | let tests: TestDefinition[]; 381 | if (specificTests && specificTests.length > 0) { 382 | tests = specificTests; 383 | console.log(`📋 Running ${tests.length} specific tests for ${providerName}`); 384 | } else { 385 | tests = await loadTestDefinitions(); 386 | console.log(`📋 Found ${tests.length} tests to run for ${providerName}`); 387 | } 388 | 389 | // Check for existing checkpoint 390 | const checkpoint = await loadCheckpoint(providerName, modelId); 391 | let results: HumanEvalResult[] = []; 392 | let startTestIndex = 0; 393 | let startSampleIndex = 0; 394 | let currentTestSamples: BenchmarkResult[] = []; 395 | 396 | if (checkpoint) { 397 | console.log(`🔄 Found checkpoint for ${providerName}/${modelId}`); 398 | console.log( 399 | `🔄 Resuming from checkpoint at test ${checkpoint.currentTestIndex + 1}/${tests.length}, sample ${checkpoint.currentSampleIndex + 1}`, 400 | ); 401 | results = checkpoint.completedResults || []; 402 | startTestIndex = checkpoint.currentTestIndex; 403 | startSampleIndex = checkpoint.currentSampleIndex + 1; // Resume from next sample 404 | currentTestSamples = checkpoint.currentTestSamples || []; 405 | 406 | // If we finished all samples for the current test, move to next test 407 | if (startSampleIndex >= numSamples) { 408 | startTestIndex = checkpoint.currentTestIndex + 1; 409 | startSampleIndex = 0; 410 | currentTestSamples = []; 411 | } 412 | 413 | // Verify checkpoint context matches current run 414 | if (checkpoint.contextContent !== contextContent || checkpoint.numSamples !== numSamples) { 415 | console.warn(`⚠️ Checkpoint context/samples mismatch - starting fresh`); 416 | results = []; 417 | startTestIndex = 0; 418 | startSampleIndex = 0; 419 | currentTestSamples = []; 420 | // Clear checkpoints for fresh start 421 | await cleanCheckpointDir(providerName); 422 | } 423 | // No cleaning when resuming from valid checkpoint 424 | } else { 425 | // Clear checkpoints at the beginning for new runs (but leave samples intact) 426 | await cleanCheckpointDir(providerName); 427 | } 428 | 429 | // Run remaining tests from checkpoint or start 430 | for (let i = startTestIndex; i < tests.length; i++) { 431 | const test = tests[i]; 432 | 433 | try 
{ 434 | console.log(`\n🧪 Running test: ${test.name} with ${providerName} (${i + 1}/${tests.length})`); 435 | 436 | // Determine starting sample index (0 for new tests, checkpoint value for resumed tests) 437 | const sampleStartIndex = i === startTestIndex ? startSampleIndex : 0; 438 | const existingSamples = i === startTestIndex ? currentTestSamples : []; 439 | 440 | // Run the test with sample-level checkpointing 441 | const result = await runHumanEvalTest( 442 | test, 443 | llmProvider, 444 | numSamples, 445 | contextContent, 446 | providerName, 447 | modelId, 448 | i, 449 | results, 450 | existingSamples, 451 | sampleStartIndex, 452 | ); 453 | 454 | // Only add result if it has valid samples (not just API failures) 455 | if (result.numSamples > 0) { 456 | results.push(result); 457 | 458 | // Log the pass@k metrics 459 | console.log( 460 | `📊 ${test.name} (${providerName}) - pass@1: ${result.pass1.toFixed( 461 | 4, 462 | )}, pass@10: ${result.pass10.toFixed(4)}`, 463 | ); 464 | console.log(` Samples: ${result.numSamples}, Correct: ${result.numCorrect}`); 465 | } else { 466 | console.log(`⚠️ Skipping ${test.name} - no successful API calls, not adding to final results`); 467 | } 468 | 469 | // Save checkpoint after each test completion (reset sample tracking) 470 | const checkpointData: CheckpointData = { 471 | modelId, 472 | provider: providerName, 473 | completedResults: results, 474 | currentTestIndex: i, 475 | currentSampleIndex: numSamples, // Mark all samples as completed 476 | currentTestSamples: [], 477 | contextContent, 478 | numSamples, 479 | timestamp: new Date().toISOString(), 480 | }; 481 | await saveCheckpoint(providerName, modelId, checkpointData); 482 | } catch (error) { 483 | console.error(`Error running test ${test.name} with ${providerName}:`, error); 484 | 485 | // If this was due to retry exhaustion, abort the entire run 486 | const errorMessage = error instanceof Error ? 
error.message : String(error); 487 | if (errorMessage.includes("Failed after")) { 488 | console.error(`❌ Aborting entire run due to repeated API failures`); 489 | 490 | // Save final checkpoint before aborting 491 | const checkpointData: CheckpointData = { 492 | modelId, 493 | provider: providerName, 494 | completedResults: results, 495 | currentTestIndex: i, 496 | currentSampleIndex: 0, 497 | currentTestSamples: [], 498 | contextContent, 499 | numSamples, 500 | timestamp: new Date().toISOString(), 501 | }; 502 | await saveCheckpoint(providerName, modelId, checkpointData); 503 | 504 | // Don't continue with other tests, abort 505 | throw error; 506 | } 507 | 508 | // Save checkpoint for non-fatal errors and continue 509 | const checkpointData: CheckpointData = { 510 | modelId, 511 | provider: providerName, 512 | completedResults: results, 513 | currentTestIndex: i, 514 | currentSampleIndex: numSamples, // Mark test as completed (even if failed) 515 | currentTestSamples: [], 516 | contextContent, 517 | numSamples, 518 | timestamp: new Date().toISOString(), 519 | }; 520 | await saveCheckpoint(providerName, modelId, checkpointData); 521 | 522 | // Continue with other tests rather than failing completely 523 | } 524 | } 525 | 526 | // Clean up checkpoint after successful completion 527 | await removeCheckpoint(providerName, modelId); 528 | 529 | return results; 530 | } catch (error) { 531 | console.error(`Error running all tests for ${llmProvider.name}:`, error); 532 | // Return an empty array rather than throwing an error 533 | return []; 534 | } 535 | } 536 | 537 | /** 538 | * Ensure the benchmarks directory exists 539 | */ 540 | export async function ensureBenchmarksDir(): Promise<void> { 541 | const benchmarksDir = path.resolve(process.cwd(), "benchmarks"); 542 | try { 543 | await fs.mkdir(benchmarksDir, { recursive: true }); 544 | } catch (error) { 545 | console.error("Error creating benchmarks directory:", error); 546 | throw error; 547 | } 548 | } 549 | 550 | /** 551 | * Save benchmark results to a file 552 | */ 553 | export async function saveBenchmarkResults( 554 | results: HumanEvalResult[], 555 | contextFile?: string, 556 | contextContent?: string, 557 | customFilenamePrefix?: string, 558 | ): Promise<string> { 559 | try { 560 | // Ensure the benchmarks directory exists 561 | await ensureBenchmarksDir(); 562 | 563 | const timestamp = new Date().toISOString().replace(/:/g, "-"); 564 | let filenamePrefix: string; 565 | 566 | if (customFilenamePrefix) { 567 | // Clean the custom filename prefix to be filesystem-safe 568 | const cleanPrefix = customFilenamePrefix.replace(/[^a-zA-Z0-9\-_]/g, "-"); 569 | filenamePrefix = contextFile 570 | ? `benchmark-results-with-context-${cleanPrefix}-` 571 | : `benchmark-results-${cleanPrefix}-`; 572 | } else { 573 | filenamePrefix = contextFile ? 
`benchmark-results-with-context-` : `benchmark-results-`; 574 | } 575 | 576 | const filename = `${filenamePrefix}${timestamp}.json`; 577 | const filePath = path.resolve(process.cwd(), "benchmarks", filename); 578 | 579 | // Add context information to the results if it's not already there 580 | const resultsWithContext = results.map((result) => { 581 | if (!result.context) { 582 | result.context = { 583 | used: !!contextContent, 584 | filename: contextFile, 585 | content: contextContent, 586 | }; 587 | } 588 | return { 589 | ...result, 590 | timestamp: new Date().toISOString(), 591 | }; 592 | }); 593 | 594 | await fs.writeFile(filePath, JSON.stringify(resultsWithContext, null, 2)); 595 | console.log(`📊 Saved benchmark results to ${filePath}`); 596 | 597 | return filePath; 598 | } catch (error) { 599 | console.error("Error saving benchmark results:", error); 600 | throw error; 601 | } 602 | } 603 | --------------------------------------------------------------------------------
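
The pass@1 and pass@10 values reported by test-manager.ts come from calculatePassAtK in src/utils/humaneval.ts, whose contents are not part of this excerpt. The sketch below assumes that function implements the standard HumanEval estimator, pass@k = 1 - C(n - c, k) / C(n, k), and traces the arithmetic for a 10-sample run with 3 correct samples.

// Minimal sketch of the unbiased pass@k estimator (assumption: calculatePassAtK
// follows this standard HumanEval definition; parameter order matches the calls
// in test-manager.ts: n samples, c correct samples, k).
function passAtKSketch(n: number, c: number, k: number): number {
  if (n - c < k) return 1.0; // fewer than k incorrect samples: every k-subset contains a correct one
  let survival = 1.0;
  for (let i = n - c + 1; i <= n; i++) {
    survival *= 1.0 - k / i; // after the loop this product equals C(n - c, k) / C(n, k)
  }
  return 1.0 - survival;
}

// Worked example: 10 samples, 3 correct.
console.log(passAtKSketch(10, 3, 1).toFixed(4));  // 0.3000  (1 - 7/10)
console.log(passAtKSketch(10, 3, 10).toFixed(4)); // 1.0000  (only 7 incorrect samples exist)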
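
A minimal sketch of how the two main exports of test-manager.ts compose for a single provider. The benchmarkOneModel wrapper, the import paths, and the provider argument are illustrative assumptions; the real driver is index.ts (shown earlier), which goes through runAllTestsHumanEvalSequential and additional bookkeeping.

import type { LLMProvider } from "./src/llms";
import { runAllTestsHumanEval, saveBenchmarkResults } from "./src/utils/test-manager";

// Hypothetical driver for one provider/model pair; how `provider` is constructed
// (src/llms) is outside this excerpt.
async function benchmarkOneModel(provider: LLMProvider): Promise<string | undefined> {
  // Resumes automatically from a saved checkpoint if an earlier run was interrupted.
  const results = await runAllTestsHumanEval(provider, 10);

  if (results.length === 0) {
    console.warn(`No results produced for ${provider.name}`);
    return undefined;
  }

  // Writes benchmarks/benchmark-results-<timestamp>.json and returns the path.
  return saveBenchmarkResults(results);
}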
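
For reference, the filenames produced by the three branches in saveBenchmarkResults, traced by hand; the timestamp and the "gpt 4o!" prefix are arbitrary example inputs, not outputs from a real run.

// Same transformations saveBenchmarkResults applies above.
const timestamp = "2025-05-25T20-01-22.048Z";                    // toISOString() with ":" replaced by "-"
const cleanPrefix = "gpt 4o!".replace(/[^a-zA-Z0-9\-_]/g, "-");  // "gpt-4o-"

console.log(`benchmark-results-${timestamp}.json`);                  // no context file, no custom prefix
console.log(`benchmark-results-with-context-${timestamp}.json`);     // context file supplied, no custom prefix
console.log(`benchmark-results-${cleanPrefix}-${timestamp}.json`);   // custom prefix, no context file
// -> benchmark-results-gpt-4o--2025-05-25T20-01-22.048Z.json (trailing "-" from the sanitized "!" doubles up)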