├── .env.example ├── .envrc ├── .eslintrc.js ├── .flake8 ├── .github └── workflows │ ├── eval.yaml │ ├── js.yaml │ ├── lint.yaml │ └── python.yaml ├── .gitignore ├── .isort.cfg ├── .npmignore ├── .pre-commit-config.yaml ├── .prettierrc ├── .tool-versions ├── .vscode └── settings.json ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── env.sh ├── evals ├── .eslintrc.js ├── .prettierrc ├── datasets │ ├── coqa-closed-qa.json │ ├── coqa-context-relevancy.json │ └── coqa-factuality.json ├── package.json ├── src │ ├── autoevals.eval.ts │ ├── datasets.ts │ ├── duckdb.ts │ └── sync_datasets.ts └── tsconfig.json ├── js ├── embeddings.test.ts ├── index.ts ├── json.test.ts ├── json.ts ├── list.ts ├── llm.fixtures.ts ├── llm.test.ts ├── llm.ts ├── manifest.ts ├── moderation.test.ts ├── moderation.ts ├── number.ts ├── oai.test.ts ├── oai.ts ├── partial.test.ts ├── partial.ts ├── ragas.test.ts ├── ragas.ts ├── render-messages.test.ts ├── render-messages.ts ├── string.ts ├── templates.ts ├── value.test.ts ├── value.ts └── yaml.d.ts ├── package.json ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── py └── autoevals │ ├── __init__.py │ ├── json.py │ ├── list.py │ ├── llm.py │ ├── moderation.py │ ├── number.py │ ├── oai.py │ ├── partial.py │ ├── ragas.py │ ├── string.py │ ├── templates │ ├── test_embeddings.py │ ├── test_json.py │ ├── test_llm.py │ ├── test_moderation.py │ ├── test_oai.py │ ├── test_partial.py │ ├── test_ragas.py │ ├── test_values.py │ ├── value.py │ └── version.py ├── pyproject.toml ├── pyrightconfig.json ├── scripts └── prepare_readme.py ├── setup.py ├── templates ├── battle.yaml ├── closed_q_a.yaml ├── factuality.yaml ├── humor.yaml ├── possible.yaml ├── security.yaml ├── sql.yaml ├── summary.yaml └── translation.yaml ├── tsconfig.json ├── tsup.config.js ├── turbo.json └── vitest.config.ts /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=your_openai_api_key 2 | BRAINTRUST_API_KEY=your_braintrust_api_key 3 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | source_up 2 | dotenv 3 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | module.exports = { 4 | extends: ["plugin:@typescript-eslint/recommended", "prettier"], 5 | plugins: ["@typescript-eslint"], 6 | rules: { 7 | "@typescript-eslint/no-unused-vars": [ 8 | "error", 9 | { 10 | vars: "all", 11 | args: "none", 12 | ignoreRestSiblings: false, 13 | argsIgnorePattern: "^_", 14 | varsIgnorePattern: "^_", 15 | }, 16 | ], 17 | "prefer-const": "error", 18 | "@typescript-eslint/no-explicit-any": "off", 19 | "@typescript-eslint/ban-types": "off", 20 | "@typescript-eslint/ban-ts-comment": "off", 21 | "@typescript-eslint/no-var-requires": "off", 22 | }, 23 | }; 24 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | ignore = E402, E203, E501, W503 4 | -------------------------------------------------------------------------------- /.github/workflows/eval.yaml: -------------------------------------------------------------------------------- 1 | name: Run pnpm evals 2 | 3 | on: 4 | push: 5 | # Uncomment to run only 
when files in the 'evals' directory change 6 | # - paths: 7 | # - "evals/**" 8 | 9 | permissions: 10 | pull-requests: write 11 | contents: read 12 | 13 | jobs: 14 | eval: 15 | name: Run evals 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Checkout 20 | id: checkout 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Node.js 26 | id: setup-node 27 | uses: actions/setup-node@v4 28 | with: 29 | node-version: 22 30 | 31 | - uses: pnpm/action-setup@v4 32 | 33 | - name: Install Dependencies 34 | id: install 35 | run: pnpm install 36 | 37 | - name: Build packages 38 | id: build 39 | run: pnpm build 40 | 41 | - name: Run Evals 42 | uses: braintrustdata/eval-action@v1 43 | with: 44 | api_key: ${{ secrets.BRAINTRUST_API_KEY }} 45 | runtime: node 46 | root: evals 47 | -------------------------------------------------------------------------------- /.github/workflows/js.yaml: -------------------------------------------------------------------------------- 1 | name: js 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | matrix: 14 | # duckdb has an incredibly slow install with 24.x 15 | node-version: [20.x, 22.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Cache node_modules 20 | uses: actions/cache@v4 21 | with: 22 | path: | 23 | node_modules 24 | !node_modules/.cache/turbo 25 | key: ${{ matrix.runner }}-${{ matrix.node_version }}-node-${{ env.nodeModulesCacheHash }} 26 | restore-keys: | 27 | ${{ matrix.runner }}-${{ matrix.node_version }}-node- 28 | - name: Use Node.js ${{ matrix.node-version }} 29 | uses: actions/setup-node@v3 30 | with: 31 | node-version: ${{ matrix.node-version }} 32 | - uses: pnpm/action-setup@v4 33 | - run: pnpm install 34 | - run: pnpm run test 35 | env: 36 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 37 | OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }} 38 | BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} 39 | - run: pnpm run build 40 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/marketplace/actions/pre-commit 2 | name: lint 3 | 4 | on: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: actions/setup-python@v3 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /.github/workflows/python.yaml: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/actions/starter-workflows/blob/main/ci/python-app.yml 2 | name: python 3 | 4 | on: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip setuptools build twine openai 25 | python -m pip install -e .[all] 26 | - name: Test with pytest 27 | run: | 28 | pytest 29 | env: 30 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 31 | OPENAI_BASE_URL: ${{ 
secrets.OPENAI_BASE_URL }} 32 | BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.swo 4 | data 5 | venv 6 | .env 7 | .direnv 8 | .DS_STORE 9 | node_modules 10 | py/*.egg-info/ 11 | pydist 12 | jsdist 13 | dist 14 | autoevals-*.tar.gz 15 | autoevals-*.tgz 16 | typedoc.json 17 | build 18 | .turbo 19 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=119 3 | multi_line_output=3 4 | use_parentheses=true 5 | lines_after_imports=2 6 | include_trailing_comma=True 7 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | tsconfig.json 2 | MANIFEST.in 3 | Makefile 4 | README.md 5 | js 6 | py 7 | pyproject.toml 8 | setup.py 9 | venv 10 | pydist 11 | autoevals-*.tgz 12 | .testcache 13 | .flake8 14 | .isort.cfg 15 | .pre-commit-config.yaml 16 | .pytest_cache 17 | .testcache 18 | env.sh 19 | scripts 20 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: "https://github.com/pre-commit/pre-commit-hooks" 3 | rev: v4.4.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: "https://github.com/psf/black" 8 | rev: 22.6.0 9 | hooks: 10 | - id: black 11 | files: ./ 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | # Ruff version. 14 | rev: v0.0.282 15 | hooks: 16 | - id: ruff 17 | args: [--fix, --exit-non-zero-on-fix] 18 | - repo: https://github.com/codespell-project/codespell 19 | rev: v2.2.1 20 | hooks: 21 | - id: codespell 22 | exclude: > 23 | (?x)^( 24 | .*\.(json|prisma|yaml) 25 | )$ 26 | args: 27 | - "-L" 28 | - "rouge,afterall" 29 | 30 | - repo: https://github.com/rbubley/mirrors-prettier 31 | rev: v3.3.2 32 | hooks: 33 | - id: prettier 34 | exclude: ^(extension/|.*\.json$) 35 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": false 3 | } 4 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | python 3.9.21 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSave": true, 4 | "editor.defaultFormatter": "ms-python.black-formatter" 5 | }, 6 | "black-formatter.path": ["${workspaceFolder}/venv/bin/black"] 7 | } 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | Release notes can be found [here](https://www.braintrust.dev/docs/reference/release-notes). 
4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BrainTrust Data 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include py/autoevals/templates * 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 3 | VENV_PRE_COMMIT := ${ROOT_DIR}/venv/.pre_commit 4 | 5 | .PHONY: all 6 | all: ${VENV_PRE_COMMIT} 7 | 8 | .PHONY: py 9 | py: ${VENV_PYTHON_PACKAGES} 10 | bash -c 'source venv/bin/activate' 11 | 12 | VENV_INITIALIZED := venv/.initialized 13 | 14 | ${VENV_INITIALIZED}: 15 | rm -rf venv && python -m venv venv 16 | @touch ${VENV_INITIALIZED} 17 | 18 | VENV_PYTHON_PACKAGES := venv/.python_packages 19 | 20 | ${VENV_PYTHON_PACKAGES}: ${VENV_INITIALIZED} 21 | bash -c 'source venv/bin/activate && python -m pip install --upgrade pip setuptools build twine openai' 22 | bash -c 'source venv/bin/activate && python -m pip install -e ".[dev]"' 23 | bash -c 'source venv/bin/activate && python -m pip install -e ".[scipy]"' # for local tests 24 | @touch $@ 25 | 26 | ${VENV_PRE_COMMIT}: ${VENV_PYTHON_PACKAGES} 27 | bash -c 'source venv/bin/activate && pre-commit install' 28 | @touch $@ 29 | 30 | develop: ${VENV_PRE_COMMIT} 31 | @echo "--\nRun "source env.sh" to enter development mode!" 
32 | 33 | fixup: 34 | pre-commit run --all-files 35 | 36 | .PHONY: test test-py test-js 37 | 38 | test: test-py test-js 39 | 40 | test-py: 41 | source env.sh && python3 -m pytest 42 | 43 | test-js: 44 | pnpm install && pnpm run test 45 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | SRC_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 2 | 3 | ps1_old="$PS1" 4 | source $SRC_ROOT/venv/bin/activate 5 | export PS1="(autoevals) $ps1_old" 6 | -------------------------------------------------------------------------------- /evals/.eslintrc.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | module.exports = { 4 | extends: ["plugin:@typescript-eslint/recommended", "prettier"], 5 | plugins: ["@typescript-eslint"], 6 | rules: { 7 | "@typescript-eslint/no-unused-vars": [ 8 | "error", 9 | { 10 | vars: "all", 11 | args: "none", 12 | ignoreRestSiblings: false, 13 | argsIgnorePattern: "^_", 14 | varsIgnorePattern: "^_", 15 | }, 16 | ], 17 | "prefer-const": "error", 18 | "@typescript-eslint/no-explicit-any": "off", 19 | "@typescript-eslint/ban-types": "off", 20 | "@typescript-eslint/ban-ts-comment": "off", 21 | "@typescript-eslint/no-var-requires": "off", 22 | }, 23 | }; 24 | -------------------------------------------------------------------------------- /evals/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": false 3 | } 4 | -------------------------------------------------------------------------------- /evals/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@braintrust/autoevals-evals", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "sync": "tsx src/sync_datasets.ts", 9 | "eval": "braintrust eval" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "autoevals": "workspace:*", 16 | "braintrust": "^0.0.140", 17 | "zod": "^3.22.4" 18 | }, 19 | "devDependencies": { 20 | "@types/node": "^20.10.5", 21 | "duckdb": "^1.0.0", 22 | "tsx": "^3.14.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /evals/src/autoevals.eval.ts: -------------------------------------------------------------------------------- 1 | import { Eval, EvalCase, wrapTraced } from "braintrust"; 2 | import path from "path"; 3 | import fs from "fs"; 4 | import { 5 | closedQACaseSchema, 6 | contextRelevancyCaseSchema, 7 | coqaCaseSchema, 8 | dataDir, 9 | } from "./datasets"; 10 | import { z } from "zod"; 11 | import { 12 | AnswerCorrectness, 13 | ClosedQA, 14 | ContextRelevancy, 15 | DEFAULT_MODEL, 16 | Factuality, 17 | NumericDiff, 18 | Score, 19 | } from "autoevals"; 20 | 21 | const experimentNamePrefix = process.env.EXPERIMENT_NAME; 22 | 23 | const datasets = [ 24 | { 25 | name: "Factuality", 26 | path: path.join(dataDir, "coqa-factuality.json"), 27 | parser: coqaCaseSchema, 28 | }, 29 | { 30 | name: "ClosedQA", 31 | path: path.join(dataDir, "coqa-closed-qa.json"), 32 | parser: closedQACaseSchema, 33 | }, 34 | { 35 | name: "AnswerCorrectness", 36 | path: path.join(dataDir, "coqa-factuality.json"), 37 | parser: coqaCaseSchema, 38 | tags: ["ragas"], 39 | }, 40 | { 41 | name: "ContextRelevancy", 42 | path: 
path.join(dataDir, "coqa-context-relevancy.json"), 43 | parser: contextRelevancyCaseSchema, 44 | tags: ["ragas"], 45 | }, 46 | ]; 47 | 48 | const runScorerT = wrapTraced(async function runScorer( 49 | scorer: string, 50 | input: any, 51 | ) { 52 | switch (scorer) { 53 | case "Factuality": 54 | return Factuality(input); 55 | case "ClosedQA": 56 | return ClosedQA(input); 57 | case "AnswerCorrectness": 58 | return AnswerCorrectness(input); 59 | case "ContextRelevancy": 60 | return ContextRelevancy(input); 61 | default: 62 | throw new Error(`Unknown scorer: ${scorer}`); 63 | } 64 | }); 65 | 66 | Eval("Autoevals", { 67 | data: () => 68 | datasets.flatMap(({ name, path, parser, tags }) => { 69 | const data = fs.readFileSync(path, "utf-8"); 70 | return z 71 | .array(parser) 72 | .parse(JSON.parse(data)) 73 | .map((d: EvalCase) => ({ 74 | ...d, 75 | input: { ...d.input, scorer: name }, 76 | metadata: { ...d.metadata, scorer: name }, 77 | tags: [...(tags ?? []), name], 78 | })); 79 | }), 80 | task: async (input, hooks) => { 81 | const { scorer, ...rest } = input; 82 | let result: Score | null = null; 83 | try { 84 | result = await runScorerT(scorer, rest); 85 | } catch (e) { 86 | hooks.meta({ error: `${e}` }); 87 | } 88 | return result?.score ?? -1; 89 | }, 90 | scores: [NumericDiff], 91 | experimentName: experimentNamePrefix ?? undefined, 92 | metadata: { 93 | model: DEFAULT_MODEL, 94 | }, 95 | }); 96 | -------------------------------------------------------------------------------- /evals/src/datasets.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | import path from "path"; 4 | 5 | export const dataDir = path.join(__dirname, "../datasets"); 6 | 7 | export const coqaSchema = z.object({ 8 | source: z.string(), 9 | story: z.string(), 10 | questions: z.array(z.string()), 11 | answers: z.object({ 12 | input_text: z.array(z.string()), 13 | answer_start: z.array(z.number()), 14 | answer_end: z.array(z.number()), 15 | }), 16 | }); 17 | 18 | export const coqaCaseSchema = z.object({ 19 | input: z.object({ 20 | input: z.string(), 21 | output: z.string(), 22 | expected: z.string(), 23 | }), 24 | expected: z.number(), 25 | metadata: coqaSchema, 26 | }); 27 | 28 | export type FactualityCase = z.infer<typeof coqaCaseSchema>; 29 | 30 | export const contextRelevancyCaseSchema = z.object({ 31 | input: z.object({ 32 | input: z.string(), 33 | context: z.string(), 34 | }), 35 | expected: z.number(), 36 | metadata: coqaSchema, 37 | }); 38 | export type ContextRelevancyCase = z.infer<typeof contextRelevancyCaseSchema>; 39 | 40 | export const closedQACaseSchema = z.object({ 41 | input: z.object({ 42 | input: z.string(), 43 | output: z.string(), 44 | criteria: z.string(), 45 | }), 46 | expected: z.number(), 47 | metadata: coqaSchema, 48 | }); 49 | 50 | export type ClosedQACase = z.infer<typeof closedQACaseSchema>; 51 | -------------------------------------------------------------------------------- /evals/src/duckdb.ts: -------------------------------------------------------------------------------- 1 | import type { TableData, Connection } from "duckdb"; 2 | import * as duckdb from "duckdb"; 3 | 4 | let _duckdb_db: duckdb.Database | null = null; 5 | export function getDuckDBConn() { 6 | if (!_duckdb_db) { 7 | _duckdb_db = new duckdb.Database(":memory:"); 8 | } 9 | return _duckdb_db.connect(); 10 | } 11 | 12 | export async function duckq(con: Connection, sql: string): Promise<TableData> { 13 | return new Promise((resolve, reject) => { 14 | con.all(sql, (err, rows) => { 15 | if (err) { 16 | reject(err); 17 | } else { 18 | resolve(rows); 19 | }
20 | }); 21 | }); 22 | } 23 | -------------------------------------------------------------------------------- /evals/src/sync_datasets.ts: -------------------------------------------------------------------------------- 1 | import { duckq, getDuckDBConn } from "./duckdb"; 2 | 3 | import { z } from "zod"; 4 | import { 5 | coqaSchema, 6 | dataDir, 7 | FactualityCase, 8 | ContextRelevancyCase, 9 | ClosedQACase, 10 | } from "./datasets"; 11 | import path from "path"; 12 | import fs from "fs"; 13 | 14 | async function getCoqa(): Promise[]> { 15 | const conn = getDuckDBConn(); 16 | return z.array(coqaSchema).parse( 17 | await duckq( 18 | conn, 19 | `SELECT * FROM 'hf://datasets/stanfordnlp/coqa/data/validation-00000-of-00001.parquet' 20 | LIMIT 20`, 21 | ), 22 | ); 23 | } 24 | 25 | async function coqaFactuality(): Promise { 26 | const df = await getCoqa(); 27 | 28 | // For each question, capture the correct answer, make a superset by concatenating answers 29 | // together, and pick a different answer as a completely wrong one 30 | const cases: FactualityCase[] = []; 31 | for (let document = 0; document < df.length; document++) { 32 | const metadata = df[document]; 33 | const { questions, answers } = metadata; 34 | 35 | cases.push({ 36 | input: { 37 | input: questions[0], 38 | output: answers.input_text[0], 39 | expected: answers.input_text[0], 40 | }, 41 | expected: 1, 42 | metadata, 43 | }); 44 | 45 | cases.push({ 46 | input: { 47 | input: questions[0], 48 | output: answers.input_text[1], 49 | expected: answers.input_text[0], 50 | }, 51 | expected: 0, 52 | metadata, 53 | }); 54 | 55 | cases.push({ 56 | input: { 57 | input: questions[0], 58 | output: `${answers.input_text[1]} ${answers.input_text[0]} ${answers.input_text[2]}`, 59 | expected: answers.input_text[0], 60 | }, 61 | expected: 0.6, 62 | metadata, 63 | }); 64 | } 65 | 66 | return cases; 67 | } 68 | 69 | async function coqaContextRelevancy(): Promise { 70 | const df = await getCoqa(); 71 | 72 | const cases: ContextRelevancyCase[] = []; 73 | for (const metadata of df) { 74 | const { story, questions, answers } = metadata; 75 | 76 | const input = questions[0]; 77 | const contexts = answers.answer_start.map((answer_start, i) => 78 | story.substring(answer_start, answers.answer_end[i]), 79 | ); 80 | 81 | cases.push({ 82 | input: { 83 | input, 84 | context: contexts[0], 85 | }, 86 | expected: 1, 87 | metadata, 88 | }); 89 | 90 | cases.push({ 91 | input: { 92 | input, 93 | context: contexts[1], 94 | }, 95 | expected: 0, 96 | metadata, 97 | }); 98 | 99 | const concat = `${contexts[0]} ${contexts[1]}`; 100 | cases.push({ 101 | input: { 102 | input, 103 | context: concat, 104 | }, 105 | expected: contexts[0].length / concat.length, 106 | metadata, 107 | }); 108 | } 109 | 110 | return cases; 111 | } 112 | 113 | async function coqaClosedQA(): Promise { 114 | const df = await getCoqa(); 115 | 116 | const cases: ClosedQACase[] = []; 117 | for (const metadata of df) { 118 | const { questions, answers, story } = metadata; 119 | 120 | const input = `Given the following context: ${story}, \n\n Answer the question: ${questions[0]}`; 121 | const criteria = "Is the answer correct?"; 122 | cases.push({ 123 | input: { input, output: answers.input_text[0], criteria }, 124 | expected: 1, 125 | metadata, 126 | }); 127 | cases.push({ 128 | input: { input, output: answers.input_text[1], criteria }, 129 | expected: 0, 130 | metadata, 131 | }); 132 | } 133 | return cases; 134 | } 135 | 136 | function saveFile(cases: unknown[], fname: string) { 137 | 
fs.writeFileSync(path.join(dataDir, fname), JSON.stringify(cases, null, 2)); 138 | } 139 | 140 | async function main() { 141 | if (!fs.existsSync(dataDir)) { 142 | fs.mkdirSync(dataDir, { recursive: true }); 143 | } 144 | 145 | saveFile(await coqaFactuality(), "coqa-factuality.json"); 146 | saveFile(await coqaContextRelevancy(), "coqa-context-relevancy.json"); 147 | saveFile(await coqaClosedQA(), "coqa-closed-qa.json"); 148 | } 149 | 150 | main(); 151 | -------------------------------------------------------------------------------- /evals/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "declaration": true, 4 | "outDir": "./jsdist", 5 | "lib": ["es2015", "dom"], 6 | "target": "ES2018", 7 | "moduleResolution": "node", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true 11 | }, 12 | "include": ["js"], 13 | "exclude": ["node_modules/**"] 14 | } 15 | -------------------------------------------------------------------------------- /js/embeddings.test.ts: -------------------------------------------------------------------------------- 1 | import { EmbeddingSimilarity } from "./string"; 2 | 3 | const SYNONYMS = [ 4 | { 5 | word: "water", 6 | synonyms: ["water", "H2O", "agua"], 7 | }, 8 | { 9 | word: "fire", 10 | synonyms: ["fire", "flame"], 11 | }, 12 | { 13 | word: "earth", 14 | synonyms: ["earth", "Planet Earth"], 15 | }, 16 | ]; 17 | 18 | const UNRELATED = [ 19 | "water", 20 | "The quick brown fox jumps over the lazy dog", 21 | "I like to eat apples", 22 | ]; 23 | 24 | import { test, expect } from "vitest"; 25 | 26 | test("Embeddings Test", async () => { 27 | const prefix = "resource type: "; 28 | for (const { word, synonyms } of SYNONYMS) { 29 | for (const synonym of synonyms) { 30 | const result = await EmbeddingSimilarity({ 31 | prefix, 32 | output: word, 33 | expected: synonym, 34 | }); 35 | expect(result.score).toBeGreaterThan(0.6); 36 | } 37 | } 38 | 39 | for (let i = 0; i < UNRELATED.length; i++) { 40 | for (let j = 0; j < UNRELATED.length; j++) { 41 | if (i == j) { 42 | continue; 43 | } 44 | 45 | const word1 = UNRELATED[i]; 46 | const word2 = UNRELATED[j]; 47 | const result = await EmbeddingSimilarity({ 48 | prefix, 49 | output: word1, 50 | expected: word2, 51 | }); 52 | expect(result.score).toBeLessThan(0.5); 53 | } 54 | } 55 | }, 600000); 56 | -------------------------------------------------------------------------------- /js/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * AutoEvals is a tool to quickly and easily evaluate AI model outputs. 3 | * 4 | * ### Quickstart 5 | * ```bash 6 | * npm install autoevals 7 | * ``` 8 | * 9 | * ### Example 10 | * 11 | * Use AutoEvals to model-grade an example LLM completion using the [factuality prompt](templates/factuality.yaml). 
12 | * 13 | * ```javascript 14 | * import { Factuality } from "autoevals"; 15 | * 16 | * (async () => { 17 | * const input = "Which country has the highest population?"; 18 | * const output = "People's Republic of China"; 19 | * const expected = "China"; 20 | * 21 | * const result = await Factuality({ output, expected, input }); 22 | * console.log(`Factuality score: ${result.score}`); 23 | * console.log(`Factuality metadata: ${result.metadata?.rationale}`); 24 | * })(); 25 | * ``` 26 | * 27 | * @module autoevals 28 | */ 29 | 30 | export type { Score, ScorerArgs, Scorer } from "@braintrust/core"; 31 | export * from "./llm"; 32 | export { init } from "./oai"; 33 | export * from "./string"; 34 | export * from "./list"; 35 | export * from "./moderation"; 36 | export * from "./number"; 37 | export * from "./json"; 38 | export * from "./templates"; 39 | export * from "./ragas"; 40 | export * from "./value"; 41 | export { Evaluators } from "./manifest"; 42 | export { makePartial, ScorerWithPartial } from "./partial"; 43 | -------------------------------------------------------------------------------- /js/json.test.ts: -------------------------------------------------------------------------------- 1 | import { JSONDiff, ValidJSON } from "./json"; 2 | import { NumericDiff } from "./number"; 3 | import { ExactMatch } from "./value"; 4 | 5 | import { test, expect } from "vitest"; 6 | 7 | test("JSON String Test", async () => { 8 | const cases = [ 9 | { a: "", b: "", expected: 1 }, 10 | { a: "", b: "a", expected: 0 }, 11 | { a: "a", b: "", expected: 0 }, 12 | { a: "a", b: "a", expected: 1 }, 13 | { a: "a", b: "b", expected: 0 }, 14 | { a: "ab", b: "ac", expected: 0.5 }, 15 | { a: "ac", b: "bc", expected: 0.5 }, 16 | { a: "abc", b: "axc", expected: 0.66667 }, 17 | { a: "xabxcdxxefxgx", b: "1ab2cd34ef5g6", expected: 0.53846 }, 18 | ]; 19 | 20 | for (const { a, b, expected } of cases) { 21 | const score = (await JSONDiff({ output: a, expected: b })).score; 22 | expect(score).toBeCloseTo(expected); 23 | } 24 | }); 25 | 26 | test("JSON Object Test", async () => { 27 | const cases = [ 28 | { a: null, b: null, expected: 1 }, 29 | { a: undefined, b: null, expected: 1 }, 30 | { a: "", b: null, expected: 0 }, 31 | { a: [], b: {}, expected: 0 }, 32 | { a: [], b: [], expected: 1 }, 33 | { a: {}, b: {}, expected: 1 }, 34 | { a: { a: 1 }, b: { a: 1 }, expected: 1 }, 35 | { a: { a: 1 }, b: { a: 2 }, expected: 0.66667 }, 36 | { a: { a: 1 }, b: ["a", 1], expected: 0.5714285714285714 }, 37 | { a: { a: 1 }, b: { b: { a: 1 } }, expected: 0 }, 38 | { a: { a: 1 }, b: { a: null }, expected: 0 }, 39 | { 40 | a: { mapping: { a: "foo", b: "bar" } }, 41 | b: { mapping: { a: "Foo", b: "Bar" }, Extra: 5 }, 42 | expected: 0.33333333333333337, 43 | }, 44 | ]; 45 | 46 | for (const { a, b, expected } of cases) { 47 | const score = (await JSONDiff({ output: a, expected: b })).score; 48 | expect(score).toBeCloseTo(expected); 49 | } 50 | }); 51 | 52 | test("Valid JSON Test", async () => { 53 | const cases = [ 54 | { output: "1", expected: 0 }, 55 | { output: '{ "a": 1, "b": "hello" }', expected: 1 }, 56 | { output: '[{ "a": 1 }]', expected: 1 }, 57 | { output: '[{ "a": 1 }', expected: 0 }, 58 | { 59 | output: '{ "mapping": { "a": "foo", "b": "bar" }, "extra": 4 }', 60 | expected: 1, 61 | }, 62 | { 63 | output: '{ mapping: { "a": "foo", "b": "bar" }, "extra": 4 }', 64 | expected: 0, 65 | }, 66 | { 67 | output: '{"a":"1"}', 68 | expected: 1, 69 | schema: { 70 | type: "object", 71 | properties: { 72 | a: { type: "string" }, 73 | }, 74 | 
required: ["a"], 75 | }, 76 | }, 77 | { 78 | output: '{ "a": "1", "b": "1" }', 79 | expected: 0, 80 | schema: { 81 | type: "object", 82 | properties: { 83 | a: { type: "string" }, 84 | b: { type: "number" }, 85 | }, 86 | required: ["a", "b"], 87 | }, 88 | }, 89 | { 90 | output: '[{ "a": "1" }, { "a": "1", "b": 22 }]', 91 | expected: 1, 92 | schema: { 93 | type: "array", 94 | items: { 95 | type: "object", 96 | properties: { 97 | a: { type: "string" }, 98 | b: { type: "number" }, 99 | }, 100 | required: ["a"], 101 | }, 102 | uniqueItems: true, 103 | }, 104 | }, 105 | { 106 | output: { a: "1", b: "1" }, 107 | expected: 1, 108 | }, 109 | { 110 | output: [{ a: "1" }, { a: "1", b: 22 }], 111 | expected: 1, 112 | }, 113 | { 114 | output: 100, 115 | expected: 0, 116 | }, 117 | { 118 | // This is technically ambiguous, because it _could_ be the valid parsed JSON value 119 | // or an unparsed, invalid JSON value. However, since structured outputs _only_ return 120 | // JSON values, we can safely assume that any strings are unparsed values. 121 | output: "100", 122 | expected: 0, 123 | }, 124 | ]; 125 | 126 | for (const { output, expected, schema } of cases) { 127 | const score = (await ValidJSON({ output, schema })).score; 128 | expect(score).toEqual(expected); 129 | } 130 | }); 131 | 132 | test("Semantic JSON Test", async () => { 133 | const cases = [ 134 | { a: '{"x": 1, "y": 2}', b: '{"y": 2, "x": 1}', expected: 1 }, 135 | { 136 | a: '{"zs": ["a", "b"], "x": 1, "y": 2}', 137 | b: '{"y": 2, "zs": ["a", "b"], "x": 1}', 138 | expected: 1, 139 | }, 140 | { 141 | a: '{"o1": {"x": 1, "y": 2}}', 142 | b: '{"o1": {"y": 2, "x": 1}}', 143 | expected: 1, 144 | }, 145 | { 146 | a: '{"xs": [{"o1": {"x": 1, "y": [2]}}]}', 147 | b: '{"xs": [{"o1": {"y": [2], "x": 1}}]}', 148 | expected: 1, 149 | }, 150 | { 151 | a: '{"o1": {"x": 2, "y": 2}}', 152 | b: '{"o1": {"y": 2, "x": 1}}', 153 | expected: 0.83333, 154 | }, 155 | { 156 | a: { o1: { x: 2, y: 2 } }, 157 | b: '{"o1": {"y": 2, "x": 1}}', 158 | expected: 0.83333, 159 | }, 160 | { a: '{"x": 1, "y": 2}', b: '{"x": 1, "z": 2}', expected: 0.3333 }, 161 | { a: "[1, 2]", b: "[1, 2]", expected: 1 }, 162 | { a: "[1, 2]", b: "[2, 1]", expected: 0.66667 }, 163 | ]; 164 | 165 | for (const { a, b, expected } of cases) { 166 | for (const exactNumber of [true, false]) { 167 | const score = ( 168 | await JSONDiff({ 169 | output: a, 170 | expected: b, 171 | numberScorer: exactNumber ? ExactMatch : NumericDiff, 172 | }) 173 | ).score; 174 | if (!exactNumber) { 175 | expect(score).toBeCloseTo(expected); 176 | } else { 177 | expect(Math.round((score ?? 0) * 100)).toBeLessThanOrEqual( 178 | Math.round(expected * 100), 179 | ); 180 | } 181 | } 182 | } 183 | }); 184 | -------------------------------------------------------------------------------- /js/json.ts: -------------------------------------------------------------------------------- 1 | import { Scorer } from "@braintrust/core"; 2 | import { NumericDiff } from "./number"; 3 | import { LevenshteinScorer } from "./string"; 4 | import Ajv, { JSONSchemaType, Schema } from "ajv"; 5 | import { makePartial, ScorerWithPartial } from "./partial"; 6 | 7 | /** 8 | * A simple scorer that compares JSON objects, using a customizable comparison method for strings 9 | * (defaults to Levenshtein) and numbers (defaults to NumericDiff). 
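 *
 * For example, a minimal usage sketch (the values here are made up for illustration; with the
 * default Levenshtein and NumericDiff comparisons the result is partial credit rather than all-or-nothing):
 *
 * ```javascript
 * const result = await JSONDiff({
 *   output: { name: "Acme", employees: 100 },
 *   expected: { name: "Acme Inc", employees: 105 },
 * });
 * console.log(result.score); // a fraction between 0 and 1, not a strict 0 or 1
 * ```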
10 | */ 11 | export const JSONDiff: ScorerWithPartial< 12 | any, 13 | { 14 | stringScorer?: Scorer<string, object>; 15 | numberScorer?: Scorer<number, object>; 16 | preserveStrings?: boolean; 17 | } 18 | > = makePartial( 19 | async ({ 20 | output, 21 | expected, 22 | stringScorer = LevenshteinScorer, 23 | numberScorer = NumericDiff, 24 | preserveStrings = false, 25 | }) => { 26 | return { 27 | name: "JSONDiff", 28 | score: await jsonDiff( 29 | output, 30 | expected, 31 | stringScorer, 32 | numberScorer, 33 | preserveStrings, 34 | ), 35 | }; 36 | }, 37 | "JSONDiff", 38 | ); 39 | 40 | /** 41 | * A binary scorer that evaluates the validity of JSON output, optionally validating against a 42 | * JSON Schema definition (see https://json-schema.org/learn/getting-started-step-by-step#create). 43 | */ 44 | export const ValidJSON: ScorerWithPartial<any, { schema?: any }> = makePartial( 45 | async ({ output, schema }) => { 46 | return { 47 | name: "ValidJSON", 48 | score: validJSON(output, schema), 49 | metadata: { schema }, 50 | }; 51 | }, 52 | "ValidJSON", 53 | ); 54 | 55 | async function jsonDiff( 56 | o1: any, 57 | o2: any, 58 | stringScorer: Scorer<string, object>, 59 | numberScorer: Scorer<number, object>, 60 | preserveStrings: boolean, 61 | ): Promise<number | null> { 62 | if (!preserveStrings) { 63 | if (typeof o1 === "string" && validJSON(o1) === 1) { 64 | o1 = JSON.parse(o1); 65 | } 66 | if (typeof o2 === "string" && validJSON(o2) === 1) { 67 | o2 = JSON.parse(o2); 68 | } 69 | } 70 | 71 | if (isObject(o1) && isObject(o2)) { 72 | if (Object.keys(o1).length == 0 && Object.keys(o2).length == 0) { 73 | return 1; 74 | } 75 | 76 | const allKeys = Object.keys( 77 | Object.fromEntries( 78 | Object.keys(o1) 79 | .concat(Object.keys(o2)) 80 | .map((k) => [k, true]), 81 | ), 82 | ); 83 | 84 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 85 | const baseScores = ( 86 | await Promise.all( 87 | allKeys.map((k) => 88 | jsonDiff(o1[k], o2[k], stringScorer, numberScorer, preserveStrings), 89 | ), 90 | ) 91 | ).filter((s) => s !== null) as number[]; 92 | return baseScores.reduce((acc, s) => acc + s, 0) / baseScores.length; 93 | } else if (isArray(o1) && isArray(o2)) { 94 | if (o1.length === 0 && o2.length === 0) { 95 | return 1; 96 | } 97 | 98 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 99 | const baseScores = ( 100 | await Promise.all( 101 | Array.from({ 102 | length: Math.min(o1.length, o2.length), 103 | }).map((_, i) => 104 | jsonDiff(o1[i], o2[i], stringScorer, numberScorer, preserveStrings), 105 | ), 106 | ) 107 | ).filter((s) => s !== null) as number[]; 108 | return ( 109 | baseScores.reduce((acc, s) => acc + s, 0) / Math.max(o1.length, o2.length) 110 | ); 111 | } else if (typeof o1 === "string" && typeof o2 === "string") { 112 | return (await stringScorer({ output: o1, expected: o2 })).score; 113 | } else if (typeof o1 === "number" && typeof o2 === "number") { 114 | return (await numberScorer({ output: o1, expected: o2 })).score; 115 | } else if ( 116 | (o1 === null || o1 === undefined) && 117 | (o2 === null || o2 === undefined) 118 | ) { 119 | return 1; 120 | } else if ( 121 | o1 === null || 122 | o1 === undefined || 123 | o2 === null || 124 | o2 === undefined 125 | ) { 126 | return 0; 127 | } else { 128 | return ( 129 | await stringScorer({ 130 | output: JSON.stringify(o1, replacer), 131 | expected: JSON.stringify(o2, replacer), 132 | }) 133 | ).score; 134 | } 135 | } 136 | 137 | function isObject(value: any): value is { [key: string]: any } { 138 | return value instanceof Object && !(value instanceof Array); 139 | } 140 | 141 | function
isArray(value: any): value is Array<unknown> { 142 | return value instanceof Array; 143 | } 144 | 145 | // https://gist.github.com/davidfurlong/463a83a33b70a3b6618e97ec9679e490 146 | const replacer = (key: string, value: any) => 147 | isObject(value) 148 | ? Object.keys(value) 149 | .sort() 150 | .reduce((sorted: { [key: string]: any }, key) => { 151 | sorted[key] = value[key]; 152 | return sorted; 153 | }, {}) 154 | : value; 155 | 156 | function validJSON(output: any, schema?: Schema | JSONSchemaType<any>) { 157 | try { 158 | const parsed = typeof output === "string" ? JSON.parse(output) : output; 159 | 160 | if (schema) { 161 | return validateSchema(parsed, schema); 162 | } 163 | if (isObject(parsed) || isArray(parsed)) { 164 | return 1; 165 | } 166 | } catch { 167 | // Ignore errors 168 | } 169 | 170 | return 0; 171 | } 172 | 173 | function validateSchema(data: any, schema: any) { 174 | const ajv = new Ajv(); 175 | const validate = ajv.compile(schema); 176 | const valid = validate(data); 177 | return valid ? 1 : 0; 178 | } 179 | -------------------------------------------------------------------------------- /js/list.ts: -------------------------------------------------------------------------------- 1 | import { Scorer } from "@braintrust/core"; 2 | import { Levenshtein } from "./string"; 3 | import { linearSumAssignment } from "linear-sum-assignment"; 4 | import { makePartial, ScorerWithPartial } from "./partial"; 5 | 6 | /** 7 | * A scorer that semantically evaluates the overlap between two lists of strings. It works by 8 | * computing the pairwise similarity between each element of the output and the expected value, 9 | * and then using Linear Sum Assignment to find the best matching pairs. 10 | */ 11 | export const ListContains: ScorerWithPartial< 12 | string[], 13 | { 14 | pairwiseScorer?: Scorer<string, object>; 15 | allowExtraEntities?: boolean; 16 | } 17 | > = makePartial(async (args) => { 18 | const { output, expected, allowExtraEntities } = args; 19 | if (expected === undefined) { 20 | throw new Error("ListContains requires an expected value"); 21 | } 22 | 23 | if (output.length == 0 && expected.length == 0) { 24 | return { 25 | name: "ListContains", 26 | score: 1, 27 | }; 28 | } else if (output.length == 0 || expected.length == 0) { 29 | return { 30 | name: "ListContains", 31 | score: 0, 32 | }; 33 | } 34 | 35 | const pairwiseScorer = args.pairwiseScorer || Levenshtein; 36 | 37 | const similarities = await Promise.all( 38 | args.output.map(async (output_item) => 39 | Promise.all( 40 | expected.map( 41 | async (expected_item) => 42 | ( 43 | await pairwiseScorer({ 44 | output: output_item, 45 | expected: expected_item, 46 | }) 47 | ).score ?? 0, 48 | ), 49 | ), 50 | ), 51 | ); 52 | 53 | if (similarities.length === 1 && similarities[0].length === 1) { 54 | // There appears to be a bug in the linearSumAssignment library when there is only one element 55 | return { 56 | name: "ListContains", 57 | score: similarities[0][0], 58 | }; 59 | } 60 | 61 | const result = linearSumAssignment(similarities, { maximaze: true }); 62 | 63 | const pairs = Array.from(result.rowAssignments) 64 | .map((c, r) => 65 | c >= 0 66 | ? { 67 | output: output[r], 68 | expected: expected[c], 69 | score: similarities[r][c], 70 | } 71 | : null, 72 | ) 73 | .filter((pair) => pair !== null) as Array<{ 74 | output: string; 75 | expected: string; 76 | score: number; 77 | }>; 78 | 79 | const denominator = allowExtraEntities ?
expected.length 81 | : Math.max(output.length, expected.length); 82 | 83 | const avgScore = 84 | pairs.reduce((acc, pair) => acc + pair.score, 0) / denominator; 85 | 86 | return { 87 | name: "ListContains", 88 | score: Math.min(Math.max(avgScore, 0), 1), 89 | metadata: { 90 | pairs, 91 | }, 92 | }; 93 | }, "ListContains"); 94 | -------------------------------------------------------------------------------- /js/llm.fixtures.ts: -------------------------------------------------------------------------------- 1 | export const openaiClassifierShouldEvaluateTitles = [ 2 | { 3 | id: "chatcmpl-B7WxpqqPbHYiAOPDl3ViYNalDFbce", 4 | object: "chat.completion", 5 | created: 1741134709, 6 | model: "gpt-3.5-turbo-0125", 7 | choices: [ 8 | { 9 | index: 0, 10 | message: { 11 | role: "assistant", 12 | content: null, 13 | tool_calls: [ 14 | { 15 | id: "call_OlUJAex0cWI84acfE0XydrHz", 16 | type: "function", 17 | function: { 18 | name: "select_choice", 19 | arguments: 20 | '{"reasons":"Title 1: Pros - Clearly states the goal of standardizing error responses for better developer experience. Cons - Might be too specific and not catchy. Title 2: Pros - Short and simple. Cons - Lacks information about the issue.","choice":"1"}', 21 | }, 22 | }, 23 | ], 24 | refusal: null, 25 | }, 26 | logprobs: null, 27 | finish_reason: "stop", 28 | }, 29 | ], 30 | usage: { 31 | prompt_tokens: 354, 32 | completion_tokens: 58, 33 | total_tokens: 412, 34 | prompt_tokens_details: { 35 | cached_tokens: 0, 36 | audio_tokens: 0, 37 | }, 38 | completion_tokens_details: { 39 | reasoning_tokens: 0, 40 | audio_tokens: 0, 41 | accepted_prediction_tokens: 0, 42 | rejected_prediction_tokens: 0, 43 | }, 44 | }, 45 | service_tier: "default", 46 | system_fingerprint: null, 47 | }, 48 | ]; 49 | 50 | export const openaiClassifierShouldEvaluateTitlesWithCoT = [ 51 | { 52 | id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0", 53 | object: "chat.completion", 54 | created: 1741135832, 55 | model: "gpt-4o-2024-08-06", 56 | choices: [ 57 | { 58 | index: 0, 59 | message: { 60 | role: "assistant", 61 | content: null, 62 | tool_calls: [ 63 | { 64 | id: "call_jUzxFALMTbpzGX4DfFH57VdI", 65 | type: "function", 66 | function: { 67 | name: "select_choice", 68 | arguments: 69 | '{"reasons":"1. The issue description talks about the need to standardize error responses from GoTrue, Postgres, and Realtime APIs to improve developer experience (DX).\\n2. Title 1 directly mentions the key components involved (GoTrue, Postgres, and Realtime APIs) and the goal (better DX), which aligns well with the issue description.\\n3. Title 2, \\"Good title,\\" is vague and does not provide any information about the issue or its context.\\n4. 
Therefore, Title 1 is more descriptive and relevant to the issue at hand.","choice":"1"}', 70 | }, 71 | }, 72 | ], 73 | refusal: null, 74 | }, 75 | logprobs: null, 76 | finish_reason: "stop", 77 | }, 78 | ], 79 | usage: { 80 | prompt_tokens: 370, 81 | completion_tokens: 125, 82 | total_tokens: 495, 83 | prompt_tokens_details: { 84 | cached_tokens: 0, 85 | audio_tokens: 0, 86 | }, 87 | completion_tokens_details: { 88 | reasoning_tokens: 0, 89 | audio_tokens: 0, 90 | accepted_prediction_tokens: 0, 91 | rejected_prediction_tokens: 0, 92 | }, 93 | }, 94 | service_tier: "default", 95 | system_fingerprint: "fp_eb9dce56a8", 96 | }, 97 | { 98 | id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ", 99 | object: "chat.completion", 100 | created: 1741140268, 101 | model: "gpt-4o-2024-08-06", 102 | choices: [ 103 | { 104 | index: 0, 105 | message: { 106 | role: "assistant", 107 | content: null, 108 | tool_calls: [ 109 | { 110 | id: "call_3Z63hgrYvLuSZKc2rrHAYLI4", 111 | type: "function", 112 | function: { 113 | name: "select_choice", 114 | arguments: 115 | '{"reasons":"1. The issue description talks about the need to standardize error responses from GoTrue, Postgres, and Realtime APIs to improve developer experience (DX).\\n2. Title 1, \\"Good title,\\" is vague and does not convey any specific information about the issue. It does not mention the APIs involved or the purpose of the standardization.\\n3. Title 2, \\"Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX,\\" directly reflects the main goal of the issue, which is to standardize error responses for better developer experience. It also specifies the APIs involved, making it clear and informative.\\n4. Therefore, Title 2 is a better choice as it accurately and clearly describes the issue at hand.","choice":"2"}', 116 | }, 117 | }, 118 | ], 119 | refusal: null, 120 | }, 121 | logprobs: null, 122 | finish_reason: "stop", 123 | }, 124 | ], 125 | usage: { 126 | prompt_tokens: 370, 127 | completion_tokens: 164, 128 | total_tokens: 534, 129 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 130 | completion_tokens_details: { 131 | reasoning_tokens: 0, 132 | audio_tokens: 0, 133 | accepted_prediction_tokens: 0, 134 | rejected_prediction_tokens: 0, 135 | }, 136 | }, 137 | service_tier: "default", 138 | system_fingerprint: "fp_eb9dce56a8", 139 | }, 140 | { 141 | id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII", 142 | object: "chat.completion", 143 | created: 1741140309, 144 | model: "gpt-4o-2024-08-06", 145 | choices: [ 146 | { 147 | index: 0, 148 | message: { 149 | role: "assistant", 150 | content: null, 151 | tool_calls: [ 152 | { 153 | id: "call_CxDdx3i9eaHg81kYjQIICPfd", 154 | type: "function", 155 | function: { name: "select_choice", arguments: '{"choice":"1"}' }, 156 | }, 157 | ], 158 | refusal: null, 159 | }, 160 | logprobs: null, 161 | finish_reason: "stop", 162 | }, 163 | ], 164 | usage: { 165 | prompt_tokens: 292, 166 | completion_tokens: 6, 167 | total_tokens: 298, 168 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 169 | completion_tokens_details: { 170 | reasoning_tokens: 0, 171 | audio_tokens: 0, 172 | accepted_prediction_tokens: 0, 173 | rejected_prediction_tokens: 0, 174 | }, 175 | }, 176 | service_tier: "default", 177 | system_fingerprint: "fp_eb9dce56a8", 178 | }, 179 | { 180 | id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l", 181 | object: "chat.completion", 182 | created: 1741140336, 183 | model: "gpt-4o-2024-08-06", 184 | choices: [ 185 | { 186 | index: 0, 187 | message: { 188 | role: 
"assistant", 189 | content: null, 190 | tool_calls: [ 191 | { 192 | id: "call_ksuniPMn2w99hFt5Z1mzhWMe", 193 | type: "function", 194 | function: { name: "select_choice", arguments: '{"choice":"2"}' }, 195 | }, 196 | ], 197 | refusal: null, 198 | }, 199 | logprobs: null, 200 | finish_reason: "stop", 201 | }, 202 | ], 203 | usage: { 204 | prompt_tokens: 292, 205 | completion_tokens: 6, 206 | total_tokens: 298, 207 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 208 | completion_tokens_details: { 209 | reasoning_tokens: 0, 210 | audio_tokens: 0, 211 | accepted_prediction_tokens: 0, 212 | rejected_prediction_tokens: 0, 213 | }, 214 | }, 215 | service_tier: "default", 216 | system_fingerprint: "fp_eb9dce56a8", 217 | }, 218 | ]; 219 | 220 | export const openaiClassifierShouldEvaluateArithmeticExpressions = [ 221 | { 222 | id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT", 223 | object: "chat.completion", 224 | created: 1741140446, 225 | model: "gpt-4o-2024-08-06", 226 | choices: [ 227 | { 228 | index: 0, 229 | message: { 230 | role: "assistant", 231 | content: null, 232 | tool_calls: [ 233 | { 234 | id: "call_Iatq5uhNc05I95JHjM7v3N5Y", 235 | type: "function", 236 | function: { 237 | name: "select_choice", 238 | arguments: 239 | '{"reasons":"1. The instruction is to add the numbers 1, 2, and 3.\\n2. The correct sum of these numbers is 1 + 2 + 3 = 6.\\n3. Response 1 provides the answer as 600, which is incorrect.\\n4. Response 2 provides the answer as 6, which is correct.\\n5. Since the task is to evaluate which response is better based on the correctness of the addition, Response 2 is better because it provides the correct sum.\\n6. Therefore, Response 1 is not better than Response 2.","choice":"No"}', 240 | }, 241 | }, 242 | ], 243 | refusal: null, 244 | }, 245 | logprobs: null, 246 | finish_reason: "stop", 247 | }, 248 | ], 249 | usage: { 250 | prompt_tokens: 248, 251 | completion_tokens: 133, 252 | total_tokens: 381, 253 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 254 | completion_tokens_details: { 255 | reasoning_tokens: 0, 256 | audio_tokens: 0, 257 | accepted_prediction_tokens: 0, 258 | rejected_prediction_tokens: 0, 259 | }, 260 | }, 261 | service_tier: "default", 262 | system_fingerprint: "fp_eb9dce56a8", 263 | }, 264 | { 265 | id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB", 266 | object: "chat.completion", 267 | created: 1741140511, 268 | model: "gpt-4o-2024-08-06", 269 | choices: [ 270 | { 271 | index: 0, 272 | message: { 273 | role: "assistant", 274 | content: null, 275 | tool_calls: [ 276 | { 277 | id: "call_eYJIS5zb9S0qS3NW2XZ7HtPu", 278 | type: "function", 279 | function: { 280 | name: "select_choice", 281 | arguments: 282 | '{"reasons":"1. The instruction in both cases is to add the numbers 1, 2, and 3.\\n2. The correct sum of these numbers is 1 + 2 + 3 = 6.\\n3. Response 1 provides the answer as 6, which is the correct sum of the numbers.\\n4. Response 2 provides the answer as 600, which is incorrect as it does not represent the sum of the numbers given in the instruction.\\n5. Since Response 1 correctly answers the instruction and Response 2 does not, Response 1 is objectively better than Response 2.\\n6. 
Therefore, based on the correctness of the responses, the first response is better than the second.","choice":"Yes"}', 283 | }, 284 | }, 285 | ], 286 | refusal: null, 287 | }, 288 | logprobs: null, 289 | finish_reason: "stop", 290 | }, 291 | ], 292 | usage: { 293 | prompt_tokens: 248, 294 | completion_tokens: 157, 295 | total_tokens: 405, 296 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 297 | completion_tokens_details: { 298 | reasoning_tokens: 0, 299 | audio_tokens: 0, 300 | accepted_prediction_tokens: 0, 301 | rejected_prediction_tokens: 0, 302 | }, 303 | }, 304 | service_tier: "default", 305 | system_fingerprint: "fp_eb9dce56a8", 306 | }, 307 | { 308 | id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n", 309 | object: "chat.completion", 310 | created: 1741140550, 311 | model: "gpt-4o-2024-08-06", 312 | choices: [ 313 | { 314 | index: 0, 315 | message: { 316 | role: "assistant", 317 | content: null, 318 | tool_calls: [ 319 | { 320 | id: "call_kfVuMD09ytJIQVocHTEBrYLW", 321 | type: "function", 322 | function: { 323 | name: "select_choice", 324 | arguments: 325 | '{"reasons":"1. Both instructions are identical, asking to add the numbers 1, 2, and 3.\\n2. Both responses provide the correct sum of these numbers, which is 6.\\n3. There is no additional context, explanation, or formatting in either response that would differentiate them in terms of quality or clarity.\\n4. Since both responses are identical and correct, there is no basis to claim that one is better than the other.\\n5. Therefore, the first response is not better than the second; they are equally good.","choice":"No"}', 326 | }, 327 | }, 328 | ], 329 | refusal: null, 330 | }, 331 | logprobs: null, 332 | finish_reason: "stop", 333 | }, 334 | ], 335 | usage: { 336 | prompt_tokens: 248, 337 | completion_tokens: 121, 338 | total_tokens: 369, 339 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 340 | completion_tokens_details: { 341 | reasoning_tokens: 0, 342 | audio_tokens: 0, 343 | accepted_prediction_tokens: 0, 344 | rejected_prediction_tokens: 0, 345 | }, 346 | }, 347 | service_tier: "default", 348 | system_fingerprint: "fp_eb9dce56a8", 349 | }, 350 | { 351 | id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6", 352 | object: "chat.completion", 353 | created: 1741140577, 354 | model: "gpt-4o-2024-08-06", 355 | choices: [ 356 | { 357 | index: 0, 358 | message: { 359 | role: "assistant", 360 | content: null, 361 | tool_calls: [ 362 | { 363 | id: "call_lbRjfwrJVP8HgLupWflqoCBM", 364 | type: "function", 365 | function: { name: "select_choice", arguments: '{"choice":"No"}' }, 366 | }, 367 | ], 368 | refusal: null, 369 | }, 370 | logprobs: null, 371 | finish_reason: "stop", 372 | }, 373 | ], 374 | usage: { 375 | prompt_tokens: 170, 376 | completion_tokens: 6, 377 | total_tokens: 176, 378 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 379 | completion_tokens_details: { 380 | reasoning_tokens: 0, 381 | audio_tokens: 0, 382 | accepted_prediction_tokens: 0, 383 | rejected_prediction_tokens: 0, 384 | }, 385 | }, 386 | service_tier: "default", 387 | system_fingerprint: "fp_eb9dce56a8", 388 | }, 389 | { 390 | id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP", 391 | object: "chat.completion", 392 | created: 1741140603, 393 | model: "gpt-4o-2024-08-06", 394 | choices: [ 395 | { 396 | index: 0, 397 | message: { 398 | role: "assistant", 399 | content: null, 400 | tool_calls: [ 401 | { 402 | id: "call_d3YnOawL5qadUmE46hoKds6B", 403 | type: "function", 404 | function: { 405 | name: "select_choice", 406 | arguments: 
'{"choice":"Yes"}', 407 | }, 408 | }, 409 | ], 410 | refusal: null, 411 | }, 412 | logprobs: null, 413 | finish_reason: "stop", 414 | }, 415 | ], 416 | usage: { 417 | prompt_tokens: 170, 418 | completion_tokens: 6, 419 | total_tokens: 176, 420 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 421 | completion_tokens_details: { 422 | reasoning_tokens: 0, 423 | audio_tokens: 0, 424 | accepted_prediction_tokens: 0, 425 | rejected_prediction_tokens: 0, 426 | }, 427 | }, 428 | service_tier: "default", 429 | system_fingerprint: "fp_eb9dce56a8", 430 | }, 431 | { 432 | id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl", 433 | object: "chat.completion", 434 | created: 1741140618, 435 | model: "gpt-4o-2024-08-06", 436 | choices: [ 437 | { 438 | index: 0, 439 | message: { 440 | role: "assistant", 441 | content: null, 442 | tool_calls: [ 443 | { 444 | id: "call_l3AonPTlmEhJ95fbq4M6J0sd", 445 | type: "function", 446 | function: { name: "select_choice", arguments: '{"choice":"No"}' }, 447 | }, 448 | ], 449 | refusal: null, 450 | }, 451 | logprobs: null, 452 | finish_reason: "stop", 453 | }, 454 | ], 455 | usage: { 456 | prompt_tokens: 170, 457 | completion_tokens: 6, 458 | total_tokens: 176, 459 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 460 | completion_tokens_details: { 461 | reasoning_tokens: 0, 462 | audio_tokens: 0, 463 | accepted_prediction_tokens: 0, 464 | rejected_prediction_tokens: 0, 465 | }, 466 | }, 467 | service_tier: "default", 468 | system_fingerprint: "fp_eb9dce56a8", 469 | }, 470 | ]; 471 | -------------------------------------------------------------------------------- /js/llm.test.ts: -------------------------------------------------------------------------------- 1 | import { bypass, http, HttpResponse } from "msw"; 2 | import { setupServer } from "msw/node"; 3 | import { OpenAI } from "openai"; 4 | import { ChatCompletionMessageParam } from "openai/resources"; 5 | import { afterAll, afterEach, beforeAll, describe, expect, test } from "vitest"; 6 | import { 7 | Battle, 8 | buildClassificationTools, 9 | LLMClassifierFromTemplate, 10 | OpenAIClassifier, 11 | } from "../js/llm"; 12 | import { 13 | openaiClassifierShouldEvaluateArithmeticExpressions, 14 | openaiClassifierShouldEvaluateTitles, 15 | openaiClassifierShouldEvaluateTitlesWithCoT, 16 | } from "./llm.fixtures"; 17 | import { init } from "./oai"; 18 | 19 | export const server = setupServer(); 20 | 21 | beforeAll(() => { 22 | server.listen({ 23 | onUnhandledRequest: (req) => { 24 | throw new Error(`Unhandled request ${req.method}, ${req.url}`); 25 | }, 26 | }); 27 | 28 | init({ 29 | client: new OpenAI({ 30 | apiKey: "test-api-key", 31 | baseURL: "https://api.openai.com/v1", 32 | }), 33 | }); 34 | }); 35 | 36 | afterEach(() => { 37 | server.resetHandlers(); 38 | }); 39 | 40 | afterAll(() => { 41 | server.close(); 42 | init(); 43 | }); 44 | 45 | describe("LLM Tests", () => { 46 | test("openai classifier should evaluate titles", async () => { 47 | let callCount = -1; 48 | server.use( 49 | http.post("https://api.openai.com/v1/chat/completions", async () => { 50 | const response = openaiClassifierShouldEvaluateTitles[++callCount]; 51 | return response 52 | ? HttpResponse.json(response) 53 | : HttpResponse.json({}, { status: 500 }); 54 | }), 55 | ); 56 | 57 | const messages: ChatCompletionMessageParam[] = [ 58 | { 59 | role: "system", 60 | content: `You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 
61 | You will look at the issue description, and pick which of two titles better describes it.`, 62 | }, 63 | { 64 | role: "user", 65 | content: `I'm going to provide you with the issue description, and two possible titles. 66 | 67 | Issue Description: {{page_content}} 68 | 69 | 1: {{output}} 70 | 2: {{expected}} 71 | 72 | Please discuss each title briefly (one line for pros, one for cons), and then answer the question by calling 73 | the select_choice function with "1" or "2".`, 74 | }, 75 | ]; 76 | 77 | const page_content = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 78 | 79 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 80 | 81 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification`; 82 | 83 | const output = `Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX`; 84 | const expected = `Good title`; 85 | 86 | const score = await OpenAIClassifier({ 87 | name: "titles", 88 | output, 89 | expected, 90 | messages, 91 | model: "gpt-3.5-turbo", 92 | parseScoreFn: (grade: string) => grade.match(/Winner: (\d+)/)![1], 93 | choiceScores: { "1": 1, "2": 0 }, 94 | classificationTools: buildClassificationTools(true, ["1", "2"]), 95 | page_content, 96 | maxTokens: 500, 97 | openAiApiKey: "test-api-key", 98 | }); 99 | 100 | expect(score.error).toBeUndefined(); 101 | }); 102 | 103 | test("llm classifier should evaluate with and without chain of thought", async () => { 104 | let callCount = -1; 105 | server.use( 106 | http.post( 107 | "https://api.openai.com/v1/chat/completions", 108 | async ({ request }) => { 109 | const response = 110 | openaiClassifierShouldEvaluateTitlesWithCoT[++callCount]; 111 | 112 | if (!response) { 113 | const res = await fetch(bypass(request)); 114 | const body = await res.json(); 115 | return HttpResponse.json(body, { 116 | status: res.status, 117 | headers: res.headers, 118 | }); 119 | } 120 | 121 | return response 122 | ? HttpResponse.json(response) 123 | : HttpResponse.json({}, { status: 500 }); 124 | }, 125 | ), 126 | ); 127 | 128 | const pageContent = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 129 | 130 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 131 | 132 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification`; 133 | const genTitle = `Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX`; 134 | const originalTitle = `Good title`; 135 | 136 | for (const useCoT of [true, false]) { 137 | const classifier = LLMClassifierFromTemplate<{ page_content: string }>({ 138 | name: "titles", 139 | promptTemplate: `You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 140 | You will look at the issue description, and pick which of two titles better describes it. 
141 | 142 | I'm going to provide you with the issue description, and two possible titles. 143 | 144 | Issue Description: {{page_content}} 145 | 146 | 1: {{output}} 147 | 2: {{expected}}`, 148 | choiceScores: { "1": 1, "2": 0 }, 149 | useCoT, 150 | }); 151 | 152 | let response = await classifier({ 153 | output: genTitle, 154 | expected: originalTitle, 155 | page_content: pageContent, 156 | openAiApiKey: "test-api-key", 157 | }); 158 | 159 | expect(response.error).toBeUndefined(); 160 | 161 | response = await classifier({ 162 | output: originalTitle, 163 | expected: genTitle, 164 | page_content: pageContent, 165 | openAiApiKey: "test-api-key", 166 | }); 167 | 168 | expect(response.error).toBeUndefined(); 169 | } 170 | }); 171 | 172 | test("battle should evaluate arithmetic expressions", async () => { 173 | let callCount = -1; 174 | server.use( 175 | http.post("https://api.openai.com/v1/chat/completions", async () => { 176 | const response = 177 | openaiClassifierShouldEvaluateArithmeticExpressions[++callCount]; 178 | 179 | return response 180 | ? HttpResponse.json(response) 181 | : HttpResponse.json({}, { status: 500 }); 182 | }), 183 | ); 184 | 185 | // reset the client to test direct client usage 186 | init(); 187 | 188 | const client = new OpenAI({ 189 | apiKey: "test-api-key", 190 | baseURL: "https://api.openai.com/v1", 191 | }); 192 | 193 | for (const useCoT of [true, false]) { 194 | let response = await Battle({ 195 | useCoT, 196 | instructions: "Add the following numbers: 1, 2, 3", 197 | output: "600", 198 | expected: "6", 199 | client, 200 | }); 201 | 202 | expect(response.error).toBeUndefined(); 203 | 204 | response = await Battle({ 205 | useCoT, 206 | instructions: "Add the following numbers: 1, 2, 3", 207 | output: "6", 208 | expected: "600", 209 | client, 210 | }); 211 | 212 | expect(response.error).toBeUndefined(); 213 | 214 | response = await Battle({ 215 | useCoT, 216 | instructions: "Add the following numbers: 1, 2, 3", 217 | output: "6", 218 | expected: "6", 219 | client, 220 | }); 221 | 222 | expect(response.error).toBeUndefined(); 223 | } 224 | }); 225 | }); 226 | -------------------------------------------------------------------------------- /js/llm.ts: -------------------------------------------------------------------------------- 1 | import { Score, Scorer, ScorerArgs } from "@braintrust/core"; 2 | import { ChatCache, OpenAIAuth, cachedChatCompletion } from "./oai"; 3 | import { ModelGradedSpec, templates } from "./templates"; 4 | import { 5 | ChatCompletionMessage, 6 | ChatCompletionMessageParam, 7 | ChatCompletionTool, 8 | } from "openai/resources"; 9 | import { makePartial, ScorerWithPartial } from "./partial"; 10 | import { renderMessages } from "./render-messages"; 11 | 12 | const NO_COT_SUFFIX = 13 | "Answer the question by calling `select_choice` with a single choice from {{__choices}}."; 14 | 15 | const COT_SUFFIX = 16 | "Answer the question by calling `select_choice` with your reasoning in a step-by-step manner to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. 
Select a single choice by setting the `choice` parameter to a single choice from {{__choices}}."; 17 | 18 | export type LLMArgs = { 19 | maxTokens?: number; 20 | temperature?: number; 21 | } & OpenAIAuth; 22 | 23 | export const DEFAULT_MODEL = "gpt-4o"; 24 | 25 | const PLAIN_RESPONSE_SCHEMA = { 26 | properties: { 27 | choice: { description: "The choice", title: "Choice", type: "string" }, 28 | }, 29 | required: ["choice"], 30 | title: "FunctionResponse", 31 | type: "object", 32 | }; 33 | 34 | const COT_RESPONSE_SCHEMA = { 35 | properties: { 36 | reasons: { 37 | description: 38 | "Write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.", 39 | title: "Reasoning", 40 | type: "string", 41 | }, 42 | choice: { description: "The choice", title: "Choice", type: "string" }, 43 | }, 44 | required: ["reasons", "choice"], 45 | title: "CoTResponse", 46 | type: "object", 47 | }; 48 | 49 | export function buildClassificationTools( 50 | useCoT: boolean, 51 | choiceStrings: string[], 52 | ): ChatCompletionTool[] { 53 | const params = useCoT ? COT_RESPONSE_SCHEMA : PLAIN_RESPONSE_SCHEMA; 54 | const enumParams = { 55 | ...params, 56 | properties: { 57 | ...params.properties, 58 | choice: { ...params.properties.choice, enum: choiceStrings }, 59 | }, 60 | }; 61 | return [ 62 | { 63 | type: "function", 64 | function: { 65 | name: "select_choice", 66 | description: "Call this function to select a choice.", 67 | parameters: enumParams, 68 | }, 69 | }, 70 | ]; 71 | } 72 | 73 | export type OpenAIClassifierArgs = { 74 | name: string; 75 | model: string; 76 | messages: ChatCompletionMessageParam[]; 77 | choiceScores: Record; 78 | classificationTools: ChatCompletionTool[]; 79 | cache?: ChatCache; 80 | } & LLMArgs & 81 | RenderArgs; 82 | 83 | export async function OpenAIClassifier( 84 | args: ScorerArgs>, 85 | ): Promise { 86 | const { 87 | name, 88 | output, 89 | expected, 90 | openAiApiKey, 91 | openAiOrganizationId, 92 | openAiBaseUrl, 93 | openAiDefaultHeaders, 94 | openAiDangerouslyAllowBrowser, 95 | azureOpenAi, 96 | client, 97 | ...remaining 98 | } = args; 99 | 100 | const { 101 | messages: messagesArg, 102 | model, 103 | choiceScores, 104 | classificationTools: classificationTools, 105 | maxTokens, 106 | temperature, 107 | cache, 108 | ...remainingRenderArgs 109 | } = remaining; 110 | 111 | const extraArgs = { 112 | temperature: temperature || 0, 113 | max_tokens: maxTokens, 114 | }; 115 | 116 | const renderArgs = { 117 | output, 118 | expected, 119 | ...remainingRenderArgs, 120 | }; 121 | 122 | const messages = renderMessages(messagesArg, renderArgs); 123 | 124 | const resp = await cachedChatCompletion( 125 | { 126 | model, 127 | messages, 128 | tools: classificationTools, 129 | tool_choice: { 130 | type: "function", 131 | function: { 132 | name: "select_choice", 133 | }, 134 | }, 135 | ...extraArgs, 136 | }, 137 | client 138 | ? 
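// Exactly one option shape is forwarded to cachedChatCompletion: either the caller's
// pre-built client, or the (deprecated) per-call auth fields from which
// buildOpenAIClient will construct one.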
{ client } 139 | : { 140 | cache, 141 | openAiApiKey, 142 | openAiOrganizationId, 143 | openAiBaseUrl, 144 | openAiDefaultHeaders, 145 | openAiDangerouslyAllowBrowser, 146 | azureOpenAi, 147 | }, 148 | ); 149 | 150 | if (resp.choices.length > 0) { 151 | return { 152 | name, 153 | ...parseResponse(resp.choices[0].message!, choiceScores), 154 | }; 155 | } else { 156 | throw new Error("Empty response from OpenAI"); 157 | } 158 | } 159 | 160 | function parseResponse( 161 | resp: ChatCompletionMessage, 162 | choiceScores: Record, 163 | ): Omit { 164 | let score = 0; 165 | const metadata: Record = {}; 166 | 167 | if (!resp.tool_calls || resp.tool_calls.length === 0) { 168 | throw new Error("No tool calls in response"); 169 | } 170 | const toolCall = resp.tool_calls[0]; 171 | if (toolCall.function.name !== "select_choice") { 172 | throw new Error("Unexpected tool call"); 173 | } 174 | 175 | const args = JSON.parse(toolCall.function.arguments); 176 | metadata["rationale"] = args["reasons"]; 177 | const choice = args["choice"]?.trim(); 178 | metadata["choice"] = choice; 179 | if (choice && choiceScores[choice] !== undefined) { 180 | score = choiceScores[choice]; 181 | } else { 182 | throw new Error(`Unknown score choice ${choice}`); 183 | } 184 | return { 185 | score, 186 | metadata, 187 | }; 188 | } 189 | 190 | export type LLMClassifierArgs = { 191 | model?: string; 192 | useCoT?: boolean; 193 | } & LLMArgs & 194 | RenderArgs; 195 | 196 | export function LLMClassifierFromTemplate({ 197 | name, 198 | promptTemplate, 199 | choiceScores, 200 | model = DEFAULT_MODEL, 201 | useCoT: useCoTArg, 202 | temperature, 203 | }: { 204 | name: string; 205 | promptTemplate: string; 206 | choiceScores: Record; 207 | model?: string; 208 | useCoT?: boolean; 209 | temperature?: number; 210 | }): Scorer> { 211 | const choiceStrings = Object.keys(choiceScores); 212 | const ret = async ( 213 | runtimeArgs: ScorerArgs>, 214 | ) => { 215 | const useCoT = runtimeArgs.useCoT ?? useCoTArg ?? true; 216 | 217 | const prompt = 218 | promptTemplate + "\n" + (useCoT ? 
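// useCoT resolves as: runtime arg, then template arg, then true. The CoT suffix asks the
// model to fill in the `reasons` field of `select_choice` before committing to a choice.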
COT_SUFFIX : NO_COT_SUFFIX); 219 | 220 | const maxTokens = 512; 221 | const messages: ChatCompletionMessageParam[] = [ 222 | { 223 | role: "user", 224 | content: prompt, 225 | }, 226 | ]; 227 | 228 | return await OpenAIClassifier({ 229 | name, 230 | messages, 231 | choiceScores, 232 | classificationTools: buildClassificationTools(useCoT, choiceStrings), 233 | model, 234 | maxTokens, 235 | temperature, 236 | __choices: choiceStrings, 237 | ...runtimeArgs, 238 | 239 | // Since the logic is a bit funky for computing this, include 240 | // it at the end to prevent overrides 241 | useCoT, 242 | }); 243 | }; 244 | Object.defineProperty(ret, "name", { 245 | value: name, 246 | configurable: true, 247 | }); 248 | 249 | return ret; 250 | } 251 | 252 | export function LLMClassifierFromSpec( 253 | name: string, 254 | spec: ModelGradedSpec, 255 | ): Scorer> { 256 | return LLMClassifierFromTemplate({ 257 | name, 258 | promptTemplate: spec.prompt, 259 | choiceScores: spec.choice_scores, 260 | model: spec.model, 261 | useCoT: spec.use_cot, 262 | temperature: spec.temperature, 263 | }); 264 | } 265 | 266 | export function LLMClassifierFromSpecFile( 267 | name: string, 268 | templateName: keyof typeof templates, 269 | ): Scorer> { 270 | const doc = templates[templateName]; 271 | return LLMClassifierFromSpec(name, doc); 272 | } 273 | 274 | function buildLLMClassifier( 275 | name: string, 276 | templateName: keyof typeof templates, 277 | ): ScorerWithPartial> { 278 | if (!(templateName in templates)) { 279 | throw new Error(`Model template ${name} not found`); 280 | } 281 | 282 | return makePartial( 283 | LLMClassifierFromSpecFile( 284 | name, 285 | templateName as keyof typeof templates, 286 | ), 287 | name, 288 | ); 289 | } 290 | 291 | /** 292 | * Test whether an output _better_ performs the `instructions` than the original 293 | * (expected) value. 294 | */ 295 | export const Battle = buildLLMClassifier<{ instructions: string }>( 296 | "Battle", 297 | "battle", 298 | ); 299 | 300 | /** 301 | * Test whether an output answers the `input` using knowledge built into the model. 302 | * You can specify `criteria` to further constrain the answer. 303 | */ 304 | export const ClosedQA = buildLLMClassifier<{ input: string; criteria: any }>( 305 | "ClosedQA", 306 | "closed_q_a", 307 | ); 308 | 309 | /** 310 | * Test whether an output is funny. 311 | */ 312 | export const Humor = buildLLMClassifier<{}>("Humor", "humor"); 313 | 314 | /** 315 | * Test whether an output is factual, compared to an original (`expected`) value. 316 | */ 317 | export const Factuality = buildLLMClassifier<{ 318 | input: string; 319 | output: string; 320 | expected?: string; 321 | }>("Factuality", "factuality"); 322 | 323 | /** 324 | * Test whether an output is a possible solution to the challenge posed in the input. 325 | */ 326 | export const Possible = buildLLMClassifier<{ input: string }>( 327 | "Possible", 328 | "possible", 329 | ); 330 | 331 | /** 332 | * Test whether an output is malicious. 333 | */ 334 | export const Security = buildLLMClassifier<{}>("Security", "security"); 335 | 336 | /** 337 | * Test whether a SQL query is semantically the same as a reference (output) query. 338 | */ 339 | export const Sql = buildLLMClassifier<{ input: string }>("Sql", "sql"); 340 | 341 | /** 342 | * Test whether an output is a better summary of the `input` than the original (`expected`) value. 
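 *
 * Example sketch (hypothetical variable names; assumes an OpenAI API key or an initialized
 * client, as with the other LLM classifiers):
 *
 *   const result = await Summary({ input: articleText, output: candidateSummary, expected: referenceSummary });
 *   // result.score reflects the graded preference per templates/summary.yaml's choice_scores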
343 | */ 344 | export const Summary = buildLLMClassifier<{ input: string }>( 345 | "Summary", 346 | "summary", 347 | ); 348 | 349 | /** 350 | * Test whether an `output` is as good of a translation of the `input` in the specified `language` 351 | * as an expert (`expected`) value. 352 | */ 353 | export const Translation = buildLLMClassifier<{ 354 | language: string; 355 | input: string; 356 | }>("Translation", "translation"); 357 | -------------------------------------------------------------------------------- /js/manifest.ts: -------------------------------------------------------------------------------- 1 | import { JSONDiff, ValidJSON } from "./json"; 2 | import { 3 | Battle, 4 | ClosedQA, 5 | Factuality, 6 | Humor, 7 | Possible, 8 | Security, 9 | Sql, 10 | Summary, 11 | Translation, 12 | } from "./llm"; 13 | import { NumericDiff } from "./number"; 14 | import { EmbeddingSimilarity, Levenshtein } from "./string"; 15 | import { 16 | ContextEntityRecall, 17 | ContextRelevancy, 18 | ContextRecall, 19 | ContextPrecision, 20 | AnswerRelevancy, 21 | AnswerSimilarity, 22 | AnswerCorrectness, 23 | } from "./ragas"; 24 | import { ListContains } from "./list"; 25 | import { ScorerWithPartial } from "./partial"; 26 | import { Moderation } from "./moderation"; 27 | import { ExactMatch } from "./value"; 28 | import { ModelGradedSpec, templates } from "./templates"; 29 | 30 | interface AutoevalMethod { 31 | method: ScorerWithPartial; 32 | description: string; 33 | template?: ModelGradedSpec; 34 | requiresExtraParams?: boolean; 35 | } 36 | 37 | export const Evaluators: { 38 | label: string; 39 | methods: AutoevalMethod[]; 40 | }[] = [ 41 | { 42 | label: "LLM-as-a-Judge", 43 | methods: [ 44 | { 45 | method: Battle, 46 | description: 47 | "Test whether an output _better_ performs the `instructions` than the original (expected) value.", 48 | template: templates.battle, 49 | requiresExtraParams: true, 50 | }, 51 | { 52 | method: ClosedQA, 53 | description: 54 | "Test whether an output answers the `input` using knowledge built into the model. 
You can specify `criteria` to further constrain the answer.", 55 | template: templates.closed_q_a, 56 | requiresExtraParams: true, 57 | }, 58 | { 59 | method: Humor, 60 | description: "Test whether an output is funny.", 61 | template: templates.humor, 62 | }, 63 | { 64 | method: Factuality, 65 | description: 66 | "Test whether an output is factual, compared to an original (`expected`) value.", 67 | template: templates.factuality, 68 | }, 69 | { 70 | method: Moderation, 71 | description: 72 | "A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content.", 73 | }, 74 | { 75 | method: Possible, 76 | description: 77 | "Test whether an output is a possible solution to the challenge posed in the input.", 78 | template: templates.possible, 79 | }, 80 | { 81 | method: Security, 82 | description: "Test whether an output is malicious.", 83 | template: templates.security, 84 | }, 85 | { 86 | method: Sql, 87 | description: 88 | "Test whether a SQL query is semantically the same as a reference (output) query.", 89 | template: templates.sql, 90 | }, 91 | { 92 | method: Summary, 93 | description: 94 | "Test whether an output is a better summary of the `input` than the original (`expected`) value.", 95 | template: templates.summary, 96 | }, 97 | { 98 | method: Translation, 99 | description: 100 | "Test whether an `output` is as good of a translation of the `input` in the specified `language` as an expert (`expected`) value.", 101 | template: templates.translation, 102 | requiresExtraParams: true, 103 | }, 104 | ], 105 | }, 106 | { 107 | label: "RAG", 108 | methods: [ 109 | { 110 | method: ContextEntityRecall, 111 | description: 112 | "Estimates context recall by estimating TP and FN using annotated answer and retrieved context.", 113 | requiresExtraParams: true, 114 | }, 115 | { 116 | method: ContextRelevancy, 117 | description: 118 | "Extracts relevant sentences from the provided context that are absolutely required to answer the given question.", 119 | requiresExtraParams: true, 120 | }, 121 | { 122 | method: ContextRecall, 123 | description: 124 | "Analyzes each sentence in the answer and classifies if the sentence can be attributed to the given context or not.", 125 | requiresExtraParams: true, 126 | }, 127 | { 128 | method: ContextPrecision, 129 | description: 130 | "Verifies if the context was useful in arriving at the given answer.", 131 | requiresExtraParams: true, 132 | }, 133 | { 134 | method: AnswerRelevancy, 135 | description: 136 | "Scores the relevancy of the generated answer to the given question.", 137 | requiresExtraParams: true, 138 | }, 139 | { 140 | method: AnswerSimilarity, 141 | description: 142 | "Scores the semantic similarity between the generated answer and ground truth.", 143 | requiresExtraParams: true, 144 | }, 145 | { 146 | method: AnswerCorrectness, 147 | description: 148 | "Measures answer correctness compared to ground truth using a weighted average of factuality and semantic similarity.", 149 | requiresExtraParams: true, 150 | }, 151 | ], 152 | }, 153 | { 154 | label: "Composite", 155 | methods: [ 156 | { 157 | method: ListContains, 158 | description: 159 | "Semantically evaluates the overlap between two lists of strings using pairwise similarity and Linear Sum Assignment.", 160 | }, 161 | { 162 | method: ValidJSON, 163 | description: 164 | "Evaluates the validity of JSON output, optionally validating against a JSON Schema definition.", 165 | }, 166 | ], 167 | }, 168 | { 169 | label: "Embeddings", 170 | methods: [ 171 | { 172 | 
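// Unlike the heuristic scorers further down, embedding similarity calls the OpenAI
// embeddings endpoint, so it needs an API key or configured client (see js/string.ts).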
method: EmbeddingSimilarity, 173 | description: 174 | "Evaluates the semantic similarity between two embeddings using cosine distance.", 175 | }, 176 | ], 177 | }, 178 | { 179 | label: "Heuristic", 180 | methods: [ 181 | { 182 | method: JSONDiff, 183 | description: 184 | "Compares JSON objects using customizable comparison methods for strings and numbers.", 185 | }, 186 | { 187 | method: Levenshtein, 188 | description: "Uses the Levenshtein distance to compare two strings.", 189 | }, 190 | { 191 | method: ExactMatch, 192 | description: 193 | "Compares two values for exact equality. If the values are objects, they are converted to JSON strings before comparison.", 194 | }, 195 | { 196 | method: NumericDiff, 197 | description: "Compares numbers by normalizing their difference.", 198 | }, 199 | ], 200 | }, 201 | ]; 202 | -------------------------------------------------------------------------------- /js/moderation.test.ts: -------------------------------------------------------------------------------- 1 | import { Moderation } from "./moderation"; 2 | import { describe, expect, test } from "vitest"; 3 | 4 | describe("Moderation", () => { 5 | const cases = [ 6 | { 7 | name: "Safe content without threshold", 8 | output: "hello", 9 | threshold: undefined, 10 | expected: 1, 11 | }, 12 | { 13 | name: "Unsafe content without threshold", 14 | output: "kill kill kill kill kill", 15 | threshold: undefined, 16 | expected: 0, 17 | }, 18 | { 19 | name: "Unsafe content with lower threshold", 20 | output: "here are my suicidal thoughts", 21 | threshold: 0.25, 22 | expected: 0, 23 | }, 24 | { 25 | name: "Unsafe content with higher threshold", 26 | output: "here are my suicidal thoughts", 27 | threshold: 0.99, 28 | expected: 1, 29 | }, 30 | ]; 31 | 32 | for (const { name, output, threshold, expected } of cases) { 33 | test(`passes ${name} check`, async () => { 34 | const result = await Moderation({ 35 | output, 36 | threshold, 37 | }); 38 | 39 | expect(result.score).toEqual(expected); 40 | }, 6000); 41 | } 42 | }); 43 | -------------------------------------------------------------------------------- /js/moderation.ts: -------------------------------------------------------------------------------- 1 | import { OpenAIAuth, buildOpenAIClient } from "./oai"; 2 | import { Moderation as ModerationResult } from "openai/resources"; 3 | import { makePartial, ScorerWithPartial } from "./partial"; 4 | 5 | const MODERATION_NAME = "Moderation"; 6 | 7 | function computeScore(result: ModerationResult, threshold?: number): number { 8 | if (threshold === undefined) { 9 | return result.flagged ? 0 : 1; 10 | } 11 | 12 | for (const key of Object.keys(result.category_scores)) { 13 | const score = 14 | result.category_scores[key as keyof typeof result.category_scores]; 15 | if (score > threshold) { 16 | return 0; 17 | } 18 | } 19 | 20 | return 1; 21 | } 22 | 23 | /** 24 | * A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content. 25 | * 26 | * @param args 27 | * @param args.threshold Optional. Threshold to use to determine whether content has exceeded threshold. By 28 | * default, it uses OpenAI's default. (Using `flagged` from the response payload.) 29 | * @param args.categories Optional. Specific categories to look for. If not set, all categories will 30 | * be considered. 31 | * @returns A score between 0 and 1, where 1 means content passed all moderation checks. 
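 *
 * @example
 * // Example sketch, mirroring the unit tests above (assumes an OpenAI API key or client):
 * // const result = await Moderation({ output: "here are my suicidal thoughts", threshold: 0.25 });
 * // result.score is 0 because a category score exceeds the threshold; with no threshold,
 * // OpenAI's own `flagged` field decides.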
32 | */ 33 | export const Moderation: ScorerWithPartial< 34 | string, 35 | { 36 | threshold?: number; 37 | } & OpenAIAuth 38 | > = makePartial(async (args) => { 39 | const threshold = args.threshold ?? undefined; 40 | const output = args.output; 41 | 42 | const openai = buildOpenAIClient(args); 43 | 44 | const moderationResults = await openai.moderations.create({ 45 | input: output, 46 | }); 47 | 48 | const result = moderationResults.results[0]; 49 | 50 | return { 51 | name: MODERATION_NAME, 52 | score: computeScore(result, threshold), 53 | metadata: { 54 | threshold, 55 | // @NOTE: `as unknown ...` is intentional. See https://stackoverflow.com/a/57280262 56 | category_scores: 57 | (result.category_scores as unknown as Record) || 58 | undefined, 59 | }, 60 | }; 61 | }, MODERATION_NAME); 62 | -------------------------------------------------------------------------------- /js/number.ts: -------------------------------------------------------------------------------- 1 | import { makePartial, ScorerWithPartial } from "./partial"; 2 | 3 | /** 4 | * A simple scorer that compares numbers by normalizing their difference. 5 | */ 6 | export const NumericDiff: ScorerWithPartial = makePartial( 7 | async (args) => { 8 | const { output, expected } = args; 9 | 10 | if (expected === undefined) { 11 | throw new Error("NumericDiff requires an expected value"); 12 | } 13 | 14 | const score = 15 | output === 0 && expected === 0 16 | ? 1 17 | : 1 - 18 | Math.abs(expected - output) / (Math.abs(expected) + Math.abs(output)); 19 | 20 | return { 21 | name: "NumericDiff", 22 | score, 23 | }; 24 | }, 25 | "NumericDiff", 26 | ); 27 | -------------------------------------------------------------------------------- /js/oai.test.ts: -------------------------------------------------------------------------------- 1 | import { http, HttpResponse } from "msw"; 2 | import OpenAI from "openai"; 3 | import { 4 | afterAll, 5 | afterEach, 6 | beforeAll, 7 | beforeEach, 8 | describe, 9 | expect, 10 | test, 11 | vi, 12 | } from "vitest"; 13 | import { buildOpenAIClient, init } from "./oai"; 14 | 15 | import { setupServer } from "msw/node"; 16 | 17 | export const server = setupServer(); 18 | 19 | beforeAll(() => { 20 | server.listen({ 21 | onUnhandledRequest: (req) => { 22 | throw new Error(`Unhandled request ${req.method}, ${req.url}`); 23 | }, 24 | }); 25 | }); 26 | 27 | let OPENAI_API_KEY: string | undefined; 28 | let OPENAI_BASE_URL: string | undefined; 29 | 30 | beforeEach(() => { 31 | OPENAI_API_KEY = process.env.OPENAI_API_KEY; 32 | OPENAI_BASE_URL = process.env.OPENAI_BASE_URL; 33 | }); 34 | 35 | afterEach(() => { 36 | server.resetHandlers(); 37 | 38 | process.env.OPENAI_API_KEY = OPENAI_API_KEY; 39 | process.env.OPENAI_BASE_URL = OPENAI_BASE_URL; 40 | }); 41 | 42 | afterAll(() => { 43 | server.close(); 44 | }); 45 | 46 | const MOCK_OPENAI_COMPLETION_RESPONSE = { 47 | choices: [ 48 | { 49 | message: { 50 | content: "Hello, I am a mock response!", 51 | role: "assistant", 52 | }, 53 | finish_reason: "stop", 54 | index: 0, 55 | }, 56 | ], 57 | created: Date.now(), 58 | id: "mock-id", 59 | model: "mock-model", 60 | object: "chat.completion", 61 | usage: { 62 | completion_tokens: 9, 63 | prompt_tokens: 5, 64 | total_tokens: 14, 65 | }, 66 | }; 67 | 68 | describe("OAI", () => { 69 | test("should use Azure OpenAI", async () => { 70 | server.use( 71 | http.post( 72 | "https://*.openai.azure.com/openai/deployments/*/chat/completions*", 73 | () => { 74 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 75 | }, 76 | ), 77 
| ); 78 | 79 | const client = buildOpenAIClient({ 80 | azureOpenAi: { 81 | apiKey: "test-api-key", 82 | endpoint: "https://test-resource.openai.azure.com", 83 | apiVersion: "2024-02-15-preview", 84 | }, 85 | }); 86 | 87 | const response = await client.chat.completions.create({ 88 | model: "test-model", 89 | messages: [{ role: "system", content: "Hello" }], 90 | }); 91 | 92 | expect(response.choices[0].message.content).toBe( 93 | "Hello, I am a mock response!", 94 | ); 95 | expect(response.choices).toHaveLength(1); 96 | }); 97 | 98 | test("should use regular OpenAI", async () => { 99 | server.use( 100 | http.post("https://api.openai.com/v1/chat/completions", () => { 101 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 102 | }), 103 | ); 104 | 105 | const client = buildOpenAIClient({ 106 | openAiApiKey: "test-api-key", 107 | openAiBaseUrl: "https://api.openai.com/v1", 108 | }); 109 | 110 | const response = await client.chat.completions.create({ 111 | model: "gpt-4", 112 | messages: [{ role: "user", content: "Hello" }], 113 | }); 114 | 115 | expect(response.choices[0].message.content).toBe( 116 | "Hello, I am a mock response!", 117 | ); 118 | }); 119 | 120 | test("calls proxy if everything unset", async () => { 121 | delete process.env.OPENAI_API_KEY; 122 | delete process.env.OPENAI_BASE_URL; 123 | 124 | server.use( 125 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 126 | debugger; 127 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 128 | }), 129 | ); 130 | 131 | const client = buildOpenAIClient({}); 132 | const response = await client.chat.completions.create({ 133 | model: "gpt-4", 134 | messages: [{ role: "user", content: "Hello" }], 135 | }); 136 | 137 | debugger; 138 | 139 | expect(response.choices[0].message.content).toBe( 140 | "Hello, I am a mock response!", 141 | ); 142 | }); 143 | 144 | test("default wraps", async () => { 145 | delete process.env.OPENAI_API_KEY; 146 | delete process.env.OPENAI_BASE_URL; 147 | 148 | server.use( 149 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 150 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 151 | }), 152 | ); 153 | 154 | await withMockWrapper(async ({ createSpy }) => { 155 | const client = buildOpenAIClient({}); 156 | 157 | await client.chat.completions.create({ 158 | model: "gpt-4", 159 | messages: [{ role: "user", content: "Hello" }], 160 | }); 161 | 162 | expect(createSpy).toHaveBeenCalledTimes(1); 163 | expect(createSpy).toHaveBeenCalledWith({ 164 | model: "gpt-4", 165 | messages: [{ role: "user", content: "Hello" }], 166 | }); 167 | }); 168 | }); 169 | 170 | test("wraps once", async () => { 171 | delete process.env.OPENAI_API_KEY; 172 | delete process.env.OPENAI_BASE_URL; 173 | 174 | server.use( 175 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 176 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 177 | }), 178 | ); 179 | 180 | await withMockWrapper(async ({ wrapperMock, createSpy }) => { 181 | const client = wrapperMock( 182 | new OpenAI({ 183 | apiKey: "test-api-key", 184 | }), 185 | ); 186 | const builtClient = buildOpenAIClient({ client }); 187 | 188 | expect(builtClient).toBe(client); 189 | 190 | await builtClient.chat.completions.create({ 191 | model: "gpt-4", 192 | messages: [{ role: "user", content: "Hello" }], 193 | }); 194 | 195 | expect(createSpy).toHaveBeenCalledTimes(1); 196 | expect(createSpy).toHaveBeenCalledWith({ 197 | model: "gpt-4", 198 | messages: [{ role: "user", content: "Hello" }], 
199 | }); 200 | }); 201 | }); 202 | 203 | test("wraps client, if possible", async () => { 204 | server.use( 205 | http.post("https://api.openai.com/v1/chat/completions", () => { 206 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 207 | }), 208 | ); 209 | 210 | await withMockWrapper(async ({ wrapperMock, createSpy }) => { 211 | const client = new OpenAI({ apiKey: "test-api-key" }); 212 | const builtClient = buildOpenAIClient({ client }); 213 | 214 | await builtClient.chat.completions.create({ 215 | model: "gpt-4", 216 | messages: [{ role: "user", content: "Hello" }], 217 | }); 218 | 219 | expect(createSpy).toHaveBeenCalledTimes(1); 220 | expect(createSpy).toHaveBeenCalledWith({ 221 | model: "gpt-4", 222 | messages: [{ role: "user", content: "Hello" }], 223 | }); 224 | }); 225 | }); 226 | 227 | test("init sets client", async () => { 228 | server.use( 229 | http.post("https://api.openai.com/v1/chat/completions", () => { 230 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 231 | }), 232 | ); 233 | 234 | const client = new OpenAI({ apiKey: "test-api-key" }); 235 | 236 | init({ client }); 237 | 238 | const builtClient = buildOpenAIClient({}); 239 | 240 | expect(Object.is(builtClient, client)).toBe(true); 241 | }); 242 | 243 | test("client wins against init", async () => { 244 | server.use( 245 | http.post("https://api.openai.com/v1/chat/completions", () => { 246 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 247 | }), 248 | ); 249 | 250 | const client = new OpenAI({ apiKey: "test-api-key" }); 251 | 252 | init({ client }); 253 | 254 | const otherClient = new OpenAI({ apiKey: "other-api-key" }); 255 | 256 | const builtClient = buildOpenAIClient({ client: otherClient }); 257 | 258 | expect(Object.is(builtClient, otherClient)).toBe(true); 259 | }); 260 | }); 261 | 262 | const withMockWrapper = async ( 263 | fn: (args: { 264 | wrapperMock: (client: any) => any; 265 | createSpy: ReturnType; 266 | }) => Promise, 267 | ) => { 268 | const createSpy = vi.fn(); 269 | const wrapperMock = (client: any) => { 270 | return new Proxy(client, { 271 | get(target, prop) { 272 | if (prop === "chat") { 273 | return new Proxy( 274 | {}, 275 | { 276 | get(target, prop) { 277 | if (prop === "completions") { 278 | return new Proxy( 279 | {}, 280 | { 281 | get(target, prop) { 282 | if (prop === "create") { 283 | return createSpy; 284 | } 285 | return Reflect.get(target, prop); 286 | }, 287 | }, 288 | ); 289 | } 290 | return Reflect.get(target, prop); 291 | }, 292 | }, 293 | ); 294 | } 295 | return Reflect.get(target, prop); 296 | }, 297 | }); 298 | }; 299 | 300 | const originalWrapper = globalThis.__inherited_braintrust_wrap_openai; 301 | try { 302 | globalThis.__inherited_braintrust_wrap_openai = wrapperMock; 303 | await fn({ wrapperMock, createSpy }); 304 | } finally { 305 | globalThis.__inherited_braintrust_wrap_openai = originalWrapper; 306 | } 307 | }; 308 | -------------------------------------------------------------------------------- /js/oai.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChatCompletion, 3 | ChatCompletionMessageParam, 4 | ChatCompletionTool, 5 | ChatCompletionToolChoiceOption, 6 | } from "openai/resources"; 7 | import { AzureOpenAI, OpenAI } from "openai"; 8 | 9 | export interface CachedLLMParams { 10 | /** 11 | Model to use for the completion. 12 | Note: If using Azure OpenAI, this should be the deployment name.. 
13 | */ 14 | model: string; 15 | messages: ChatCompletionMessageParam[]; 16 | tools?: ChatCompletionTool[]; 17 | tool_choice?: ChatCompletionToolChoiceOption; 18 | temperature?: number; 19 | max_tokens?: number; 20 | span_info?: { 21 | spanAttributes?: Record; 22 | }; 23 | } 24 | 25 | export interface ChatCache { 26 | get(params: CachedLLMParams): Promise; 27 | set(params: CachedLLMParams, response: ChatCompletion): Promise; 28 | } 29 | 30 | export type OpenAIAuth = 31 | | { 32 | /** @deprecated Use the `client` option instead */ 33 | openAiApiKey?: string; 34 | /** @deprecated Use the `client` option instead */ 35 | openAiOrganizationId?: string; 36 | /** @deprecated Use the `client` option instead */ 37 | openAiBaseUrl?: string; 38 | /** @deprecated Use the `client` option instead */ 39 | openAiDefaultHeaders?: Record; 40 | /** @deprecated Use the `client` option instead */ 41 | openAiDangerouslyAllowBrowser?: boolean; 42 | /** @deprecated Use the `client` option instead */ 43 | azureOpenAi?: AzureOpenAiAuth; 44 | client?: never; 45 | } 46 | | { 47 | client: OpenAI; 48 | /** @deprecated Use the `client` option instead */ 49 | openAiApiKey?: never; 50 | /** @deprecated Use the `client` option instead */ 51 | openAiOrganizationId?: never; 52 | /** @deprecated Use the `client` option instead */ 53 | openAiBaseUrl?: never; 54 | /** @deprecated Use the `client` option instead */ 55 | openAiDefaultHeaders?: never; 56 | /** @deprecated Use the `client` option instead */ 57 | openAiDangerouslyAllowBrowser?: never; 58 | /** @deprecated Use the `client` option instead */ 59 | azureOpenAi?: never; 60 | }; 61 | 62 | export interface AzureOpenAiAuth { 63 | apiKey: string; 64 | endpoint: string; 65 | apiVersion: string; 66 | } 67 | 68 | export function extractOpenAIArgs>( 69 | args: OpenAIAuth & T, 70 | ): OpenAIAuth { 71 | return args.client 72 | ? 
{ client: args.client } 73 | : { 74 | openAiApiKey: args.openAiApiKey, 75 | openAiOrganizationId: args.openAiOrganizationId, 76 | openAiBaseUrl: args.openAiBaseUrl, 77 | openAiDefaultHeaders: args.openAiDefaultHeaders, 78 | openAiDangerouslyAllowBrowser: args.openAiDangerouslyAllowBrowser, 79 | azureOpenAi: args.azureOpenAi, 80 | }; 81 | } 82 | 83 | const PROXY_URL = "https://api.braintrust.dev/v1/proxy"; 84 | 85 | const resolveOpenAIClient = (options: OpenAIAuth): OpenAI => { 86 | const { 87 | openAiApiKey, 88 | openAiOrganizationId, 89 | openAiBaseUrl, 90 | openAiDefaultHeaders, 91 | openAiDangerouslyAllowBrowser, 92 | azureOpenAi, 93 | } = options; 94 | 95 | if (options.client) { 96 | return options.client; 97 | } 98 | 99 | if (globalThis.__client) { 100 | return globalThis.__client; 101 | } 102 | 103 | if (azureOpenAi) { 104 | // if not unset will could raise an exception 105 | delete process.env.OPENAI_BASE_URL; 106 | 107 | return new AzureOpenAI({ 108 | apiKey: azureOpenAi.apiKey, 109 | endpoint: azureOpenAi.endpoint, 110 | apiVersion: azureOpenAi.apiVersion, 111 | defaultHeaders: openAiDefaultHeaders, 112 | dangerouslyAllowBrowser: openAiDangerouslyAllowBrowser, 113 | }); 114 | } 115 | 116 | return new OpenAI({ 117 | apiKey: 118 | openAiApiKey || 119 | process.env.OPENAI_API_KEY || 120 | process.env.BRAINTRUST_API_KEY, 121 | organization: openAiOrganizationId, 122 | baseURL: openAiBaseUrl || process.env.OPENAI_BASE_URL || PROXY_URL, 123 | defaultHeaders: openAiDefaultHeaders, 124 | dangerouslyAllowBrowser: openAiDangerouslyAllowBrowser, 125 | }); 126 | }; 127 | 128 | const isWrapped = (client: OpenAI): boolean => { 129 | const Constructor = Object.getPrototypeOf(client).constructor; 130 | const clean = new Constructor({ apiKey: "dummy" }); 131 | return ( 132 | String(client.chat.completions.create) !== 133 | String(clean.chat.completions.create) 134 | ); 135 | }; 136 | 137 | export function buildOpenAIClient(options: OpenAIAuth): OpenAI { 138 | const client = resolveOpenAIClient(options); 139 | 140 | // avoid re-wrapping if the client is already wrapped (proxied) 141 | if (globalThis.__inherited_braintrust_wrap_openai && !isWrapped(client)) { 142 | return globalThis.__inherited_braintrust_wrap_openai(client); 143 | } 144 | 145 | return client; 146 | } 147 | 148 | declare global { 149 | /* eslint-disable no-var */ 150 | var __inherited_braintrust_wrap_openai: ((openai: any) => any) | undefined; 151 | var __client: OpenAI | undefined; 152 | } 153 | 154 | export const init = ({ client }: { client?: OpenAI } = {}) => { 155 | globalThis.__client = client; 156 | }; 157 | 158 | export async function cachedChatCompletion( 159 | params: CachedLLMParams, 160 | options: { cache?: ChatCache } & OpenAIAuth, 161 | ): Promise { 162 | const openai = buildOpenAIClient(options); 163 | 164 | const fullParams = globalThis.__inherited_braintrust_wrap_openai 165 | ? 
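// When the inherited Braintrust wrapper is active, tag the request with a
// `purpose: "scorer"` span attribute (merged with any caller-supplied span attributes);
// otherwise the params are forwarded unchanged.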
{ 166 | ...params, 167 | span_info: { 168 | spanAttributes: { 169 | ...params.span_info?.spanAttributes, 170 | purpose: "scorer", 171 | }, 172 | }, 173 | } 174 | : params; 175 | 176 | return await openai.chat.completions.create(fullParams); 177 | } 178 | -------------------------------------------------------------------------------- /js/partial.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { ClosedQA } from "./llm"; 3 | import { Levenshtein } from "./string"; 4 | 5 | test("Partial Test", async () => { 6 | const levenshteinBasic = await Levenshtein({ 7 | output: "abc", 8 | expected: "abcd", 9 | }); 10 | const levenshteinPartial = await Levenshtein.partial({ expected: "abcd" })({ 11 | output: "abc", 12 | }); 13 | expect(levenshteinBasic.score).toBeDefined(); 14 | expect(levenshteinPartial.score).toBeDefined(); 15 | expect(levenshteinPartial.score).toEqual(levenshteinBasic.score); 16 | expect(levenshteinBasic.name).toEqual(levenshteinPartial.name); 17 | expect(levenshteinBasic.name).toEqual("Levenshtein"); 18 | 19 | // Now do the same with ClosedQA which is an "LLM" scorer 20 | const closedQABasic = await ClosedQA({ 21 | criteria: "Is the answer correct?", 22 | input: "What is 1+1?", 23 | output: "2", 24 | }); 25 | const closedQAPartial = await ClosedQA.partial({ 26 | criteria: "Is the answer correct?", 27 | })({ 28 | input: "What is 1+1?", 29 | output: "2", 30 | }); 31 | expect(closedQABasic.score).toBeDefined(); 32 | expect(closedQAPartial.score).toBeDefined(); 33 | expect(closedQAPartial.score).toEqual(closedQABasic.score); 34 | expect(closedQABasic.name).toEqual(closedQAPartial.name); 35 | expect(closedQABasic.name).toEqual("ClosedQA"); 36 | }); 37 | -------------------------------------------------------------------------------- /js/partial.ts: -------------------------------------------------------------------------------- 1 | import { Scorer, ScorerArgs } from "@braintrust/core"; 2 | 3 | export interface ScorerWithPartial 4 | extends Scorer { 5 | partial: (args: { [K in T]: Extra[K] }) => Scorer< 6 | Output, 7 | Omit & Partial> 8 | >; 9 | } 10 | 11 | export function makePartial( 12 | fn: Scorer, 13 | name?: string, 14 | ): ScorerWithPartial { 15 | const ret: any = fn.bind({}); 16 | ret.partial = (args: Partial>) => { 17 | const newFn = (newArgs: ScorerArgs) => 18 | ret({ ...args, ...newArgs }); 19 | if (name) { 20 | Object.defineProperty(newFn, "name", { 21 | value: name, 22 | configurable: true, 23 | }); 24 | } 25 | return newFn; 26 | }; 27 | if (name) { 28 | Object.defineProperty(ret, "name", { 29 | value: name, 30 | configurable: true, 31 | }); 32 | } 33 | return ret; 34 | } 35 | -------------------------------------------------------------------------------- /js/ragas.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { 3 | AnswerCorrectness, 4 | AnswerRelevancy, 5 | AnswerSimilarity, 6 | ContextEntityRecall, 7 | ContextPrecision, 8 | ContextRecall, 9 | ContextRelevancy, 10 | Faithfulness, 11 | } from "./ragas"; 12 | 13 | const data = { 14 | input: "Can starred docs from different workspaces be accessed in one place?", 15 | output: 16 | "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 17 | expected: 18 | "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 19 | context: [ 20 | "Not all 
Coda docs are used in the same way. You'll inevitably have a few that you use every week, and some that you'll only use once. This is where starred docs can help you stay organized.\n\n\n\nStarring docs is a great way to mark docs of personal importance. After you star a doc, it will live in a section on your doc list called **[My Shortcuts](https://coda.io/shortcuts)**. All starred docs, even from multiple different workspaces, will live in this section.\n\n\n\nStarring docs only saves them to your personal My Shortcuts. It doesn\u2019t affect the view for others in your workspace. If you\u2019re wanting to shortcut docs not just for yourself but also for others in your team or workspace, you\u2019ll [use pinning](https://help.coda.io/en/articles/2865511-starred-pinned-docs) instead.", 21 | ], 22 | }; 23 | 24 | const retrievalMetrics = [ 25 | { scorer: ContextEntityRecall, score: 0.69525 }, 26 | { scorer: ContextRelevancy, score: 0.7423 }, 27 | { scorer: ContextRecall, score: 1 }, 28 | { scorer: ContextPrecision, score: 1 }, 29 | ]; 30 | 31 | test("Ragas retrieval test", async () => { 32 | for (const { scorer, score } of retrievalMetrics) { 33 | const actualScore = await scorer({ 34 | output: data.output, 35 | input: data.input, 36 | expected: data.expected, 37 | context: data.context, 38 | }); 39 | 40 | if (score === 1) { 41 | expect(actualScore.score).toBeCloseTo(score, 4); 42 | } 43 | } 44 | }, 600000); 45 | 46 | const generationMetrics = [ 47 | { scorer: AnswerRelevancy, score: 0.59 }, 48 | { scorer: Faithfulness, score: 1 }, 49 | ]; 50 | 51 | test("Ragas generation test", async () => { 52 | for (const { scorer, score } of generationMetrics) { 53 | const actualScore = await scorer({ 54 | input: data.input, 55 | output: data.output, 56 | expected: data.expected, 57 | context: data.context, 58 | temperature: 0, 59 | }); 60 | 61 | if (score === 1) { 62 | expect(actualScore.score).toBeCloseTo(score, 4); 63 | } 64 | } 65 | }, 600000); 66 | 67 | const endToEndMetrics = [ 68 | { scorer: AnswerSimilarity, score: 1 }, 69 | { scorer: AnswerCorrectness, score: 1 }, 70 | ]; 71 | 72 | test("Ragas end-to-end test", async () => { 73 | for (const { scorer, score } of endToEndMetrics) { 74 | const actualScore = await scorer({ 75 | input: data.input, 76 | output: data.output, 77 | expected: data.expected, 78 | context: data.context, 79 | }); 80 | 81 | if (score === 1) { 82 | expect(actualScore.score).toBeCloseTo(score, 4); 83 | expect(actualScore.score).toBeLessThanOrEqual(1); 84 | } 85 | } 86 | }, 600000); 87 | -------------------------------------------------------------------------------- /js/render-messages.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { renderMessages } from "./render-messages"; 3 | import { ChatCompletionMessageParam } from "openai/resources"; 4 | 5 | describe("renderMessages", () => { 6 | it("should never HTML-escape values, regardless of mustache syntax", () => { 7 | const messages: ChatCompletionMessageParam[] = [ 8 | { role: "user", content: "{{value}} and {{{value}}}" }, 9 | ]; 10 | const rendered = renderMessages(messages, { value: "bold" }); 11 | expect(rendered[0].content).toBe("bold and bold"); 12 | }); 13 | 14 | it("should stringify objects when using {{...}}", () => { 15 | const messages: ChatCompletionMessageParam[] = [ 16 | { role: "user", content: "Data: {{data}}" }, 17 | ]; 18 | const data = { foo: "bar", num: 42 }; 19 | const rendered = 
renderMessages(messages, { data }); 20 | expect(rendered[0].content).toBe('Data: {"foo":"bar","num":42}'); 21 | }); 22 | 23 | it("should output [object Object] when using {{{...}}} with objects", () => { 24 | const messages: ChatCompletionMessageParam[] = [ 25 | { role: "user", content: "Data: {{{data}}}" }, 26 | ]; 27 | const data = { foo: "bar", num: 42 }; 28 | const rendered = renderMessages(messages, { data }); 29 | expect(rendered[0].content).toBe("Data: [object Object]"); 30 | }); 31 | 32 | it("should handle empty content", () => { 33 | const messages: ChatCompletionMessageParam[] = [ 34 | { role: "user", content: "" }, 35 | ]; 36 | const rendered = renderMessages(messages, {}); 37 | expect(rendered[0].content).toBe(""); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /js/render-messages.ts: -------------------------------------------------------------------------------- 1 | import mustache from "mustache"; 2 | import { ChatCompletionMessageParam } from "openai/resources"; 3 | 4 | export function renderMessages( 5 | messages: ChatCompletionMessageParam[], 6 | renderArgs: Record, 7 | ): ChatCompletionMessageParam[] { 8 | return messages.map((m) => ({ 9 | ...m, 10 | content: m.content 11 | ? mustache.render(m.content as string, renderArgs, undefined, { 12 | escape: (v: unknown) => 13 | typeof v === "string" ? v : JSON.stringify(v), 14 | }) 15 | : "", 16 | })); 17 | } 18 | -------------------------------------------------------------------------------- /js/string.ts: -------------------------------------------------------------------------------- 1 | import { Scorer, ScorerArgs } from "@braintrust/core"; 2 | import levenshtein from "js-levenshtein"; 3 | import { OpenAIAuth, buildOpenAIClient } from "./oai"; 4 | import cossim from "compute-cosine-similarity"; 5 | import { makePartial, ScorerWithPartial } from "./partial"; 6 | 7 | /** 8 | * A simple scorer that uses the Levenshtein distance to compare two strings. 9 | */ 10 | export const Levenshtein: ScorerWithPartial = makePartial( 11 | (args) => { 12 | if (args.expected === undefined) { 13 | throw new Error("LevenshteinScorer requires an expected value"); 14 | } 15 | 16 | const [output, expected] = [`${args.output}`, `${args.expected}`]; 17 | const maxLen = Math.max(output.length, expected.length); 18 | 19 | let score = 1; 20 | if (maxLen > 0) { 21 | score = 1 - levenshtein(output, expected) / maxLen; 22 | } 23 | 24 | return { 25 | name: "Levenshtein", 26 | score, 27 | }; 28 | }, 29 | 30 | "Levenshtein", 31 | ); 32 | 33 | // For back-compat 34 | export const LevenshteinScorer: ScorerWithPartial = Levenshtein; 35 | 36 | /** 37 | * A scorer that uses cosine similarity to compare two strings. 38 | * 39 | * @param args 40 | * @param args.prefix A prefix to prepend to the prompt. This is useful for specifying the domain of the inputs. 41 | * @param args.model The model to use for the embedding distance. Defaults to "text-embedding-ada-002". 42 | * @param args.expectedMin The minimum expected score. Defaults to 0.7. Values below this will be scored as 0, and 43 | * values between this and 1 will be scaled linearly. 44 | * @returns A score between 0 and 1, where 1 is a perfect match. 
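 *
 * @example
 * // Example sketch (assumes an OpenAI-compatible client or API key for the embeddings endpoint):
 * // const result = await EmbeddingSimilarity({ output: "Paris is the capital of France", expected: "France's capital is Paris" });
 * // The raw cosine similarity is rescaled against `expectedMin` before being returned as `result.score`.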
45 | */ 46 | export const EmbeddingSimilarity: ScorerWithPartial< 47 | string, 48 | { 49 | prefix?: string; 50 | expectedMin?: number; 51 | model?: string; 52 | } & OpenAIAuth 53 | > = makePartial(async (args) => { 54 | if (args.expected === undefined) { 55 | throw new Error("EmbeddingSimilarity requires an expected value"); 56 | } 57 | 58 | const prefix = args.prefix ?? ""; 59 | const expectedMin = args.expectedMin ?? 0.7; 60 | 61 | const [output, expected] = [ 62 | `${prefix}${args.output}`, 63 | `${prefix}${args.expected}`, 64 | ]; 65 | 66 | const openai = buildOpenAIClient(args); 67 | 68 | const [outputResult, expectedResult] = await Promise.all( 69 | [output, expected].map((input) => 70 | openai.embeddings.create({ 71 | input, 72 | model: args.model ?? "text-embedding-ada-002", 73 | }), 74 | ), 75 | ); 76 | 77 | const score = cossim( 78 | outputResult.data[0].embedding, 79 | expectedResult.data[0].embedding, 80 | ); 81 | 82 | return { 83 | name: "EmbeddingSimilarity", 84 | score: scaleScore(score ?? 0, expectedMin), 85 | error: score === null ? "EmbeddingSimilarity failed" : undefined, 86 | }; 87 | }, "EmbeddingSimilarity"); 88 | 89 | function scaleScore(score: number, expectedMin: number): number { 90 | return Math.min(Math.max((score - expectedMin) / (1 - expectedMin), 0), 1); 91 | } 92 | -------------------------------------------------------------------------------- /js/templates.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import * as yaml from "js-yaml"; 3 | 4 | import battle from "../templates/battle.yaml"; 5 | import closed_q_a from "../templates/closed_q_a.yaml"; 6 | import factuality from "../templates/factuality.yaml"; 7 | import humor from "../templates/humor.yaml"; 8 | import possible from "../templates/possible.yaml"; 9 | import security from "../templates/security.yaml"; 10 | import sql from "../templates/sql.yaml"; 11 | import summary from "../templates/summary.yaml"; 12 | import translation from "../templates/translation.yaml"; 13 | 14 | export const modelGradedSpecSchema = z.object({ 15 | prompt: z.string(), 16 | choice_scores: z.record(z.number()), 17 | model: z.string().optional(), 18 | use_cot: z.boolean().optional(), 19 | temperature: z.number().optional(), 20 | }); 21 | 22 | export type ModelGradedSpec = z.infer; 23 | 24 | const templateStrings = { 25 | battle, 26 | closed_q_a, 27 | factuality, 28 | humor, 29 | possible, 30 | security, 31 | sql, 32 | summary, 33 | translation, 34 | } as const; 35 | 36 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 37 | export const templates = Object.fromEntries( 38 | Object.entries(templateStrings).map(([name, template]) => [ 39 | name, 40 | modelGradedSpecSchema.parse( 41 | typeof template === "string" ? 
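// A .yaml import may arrive as a raw string (per js/yaml.d.ts) or, depending on the
// bundler's YAML plugin, as an already-parsed object, so both shapes are handled here.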
yaml.load(template) : template, 42 | ), 43 | ]), 44 | ) as Record; 45 | -------------------------------------------------------------------------------- /js/value.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { ListContains } from "./list"; 3 | import { NumericDiff } from "./number"; 4 | import { LevenshteinScorer } from "./string"; 5 | import { ExactMatch } from "./value"; 6 | 7 | test("Levenshtein Test", async () => { 8 | const cases = [ 9 | { a: "", b: "", expected: 1 }, 10 | { a: "", b: "a", expected: 0 }, 11 | { a: "a", b: "", expected: 0 }, 12 | { a: "a", b: "a", expected: 1 }, 13 | { a: "a", b: "b", expected: 0 }, 14 | { a: "ab", b: "ac", expected: 0.5 }, 15 | { a: "ac", b: "bc", expected: 0.5 }, 16 | { a: "abc", b: "axc", expected: 0.6666666666666667 }, 17 | { a: "xabxcdxxefxgx", b: "1ab2cd34ef5g6", expected: 0.5384615384615384 }, 18 | ]; 19 | 20 | for (const { a, b, expected } of cases) { 21 | const score = (await LevenshteinScorer({ output: a, expected: b })).score; 22 | expect(score).toBeCloseTo(expected); 23 | } 24 | }); 25 | 26 | test("Numeric Test", async () => { 27 | const cases = [ 28 | { a: 0, b: 0, expected: 1 }, 29 | { a: 0, b: 1, expected: 0 }, 30 | { a: 1, b: 2, expected: 0.66667 }, 31 | { a: 1.0, b: 2.0, expected: 0.66667 }, 32 | { a: -1, b: 2, expected: 0 }, 33 | ]; 34 | 35 | for (const { a, b, expected } of cases) { 36 | const score = (await NumericDiff({ output: a, expected: b })).score; 37 | expect(score).toBeCloseTo(expected); 38 | } 39 | }); 40 | 41 | test("ListContains Test", async () => { 42 | const cases = [ 43 | { a: [], b: [], expected: 1 }, 44 | { a: ["0"], b: [], expected: 0 }, 45 | { a: [], b: ["0"], expected: 0 }, 46 | { a: ["a"], b: ["a"], expected: 1 }, 47 | { a: ["a"], b: ["a", "b"], expected: 0.5 }, 48 | { a: ["a", "b"], b: ["a"], expected: 0.5 }, 49 | { 50 | a: [ 51 | "workspaces", 52 | "section", 53 | "view", 54 | "others", 55 | "workspace", 56 | "team", 57 | "pinning", 58 | ], 59 | b: ["starred", "multiple different workspaces", "shortcuts"], 60 | expected: 0.1218, 61 | }, 62 | { 63 | a: ["starred", "multiple different workspaces", "shortcuts"], 64 | b: [ 65 | "workspaces", 66 | "section", 67 | "view", 68 | "others", 69 | "workspace", 70 | "team", 71 | "pinning", 72 | ], 73 | expected: 0.1218, 74 | }, 75 | ]; 76 | 77 | for (const { a, b, expected } of cases) { 78 | const score = (await ListContains({ output: a, expected: b })).score; 79 | expect(score).toBeCloseTo(expected, 4); 80 | } 81 | 82 | expect( 83 | ( 84 | await ListContains({ 85 | output: ["a", "b"], 86 | expected: ["b"], 87 | allowExtraEntities: true, 88 | }) 89 | ).score, 90 | ).toBe(1); 91 | }); 92 | 93 | test("ExactMatch", async () => { 94 | const cases = [ 95 | { output: "hello", expected: "hello", expectedScore: 1 }, 96 | { output: "hello", expected: "world", expectedScore: 0 }, 97 | { output: 123, expected: 123, expectedScore: 1 }, 98 | { output: 123, expected: "123", expectedScore: 1 }, 99 | { output: { a: 1, b: 2 }, expected: { a: 1, b: 2 }, expectedScore: 1 }, 100 | { output: { a: 1, b: 2 }, expected: { a: 1, b: 3 }, expectedScore: 0 }, 101 | { output: [1, 2, 3], expected: [1, 2, 3], expectedScore: 1 }, 102 | { output: [1, 2, 3], expected: [3, 2, 1], expectedScore: 0 }, 103 | { output: { a: 1, b: 2 }, expected: { b: 2, a: 1 }, expectedScore: 0 }, // Order matters 104 | { output: { a: 1, b: 2 }, expected: '{"a": 1, "b": 2}', expectedScore: 1 }, // String representation matches dict 
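// ExactMatch parses a string operand when the other side is an object and re-serializes
// both (see normalizeValue in js/value.ts), so whitespace differences are ignored but
// key order still changes the serialized form.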
105 | { output: { a: 1, b: 2 }, expected: '{"a":1, "b":2}', expectedScore: 1 }, // String representation matches dict 106 | { output: { a: 1, b: 2 }, expected: '{"b":2, "a":1}', expectedScore: 0 }, 107 | { 108 | output: { a: 1, b: 2 }, 109 | expected: { b: 2, a: 1, c: 3 }, 110 | expectedScore: 0, 111 | }, // Extra key, not equal 112 | { output: null, expected: null, expectedScore: 1 }, 113 | { output: null, expected: undefined, expectedScore: 1 }, 114 | ]; 115 | 116 | for (const { output, expected, expectedScore } of cases) { 117 | const score = (await ExactMatch({ output, expected })).score; 118 | expect(score).toBeCloseTo(expectedScore, 4); 119 | } 120 | }); 121 | -------------------------------------------------------------------------------- /js/value.ts: -------------------------------------------------------------------------------- 1 | import { makePartial, ScorerWithPartial } from "./partial"; 2 | 3 | /** 4 | * A simple scorer that tests whether two values are equal. If the value is an object or array, 5 | * it will be JSON-serialized and the strings compared for equality. 6 | */ 7 | export const ExactMatch: ScorerWithPartial = makePartial( 8 | (args) => { 9 | const maybeObject = needsJSON(args.output) || needsJSON(args.expected); 10 | const [output, expected] = [ 11 | normalizeValue(args.output ?? null, maybeObject), 12 | normalizeValue(args.expected ?? null, maybeObject), 13 | ]; 14 | 15 | const score = output === expected ? 1 : 0; 16 | 17 | return { 18 | name: "ExactMatch", 19 | score, 20 | }; 21 | }, 22 | "ExactMatch", 23 | ); 24 | 25 | function needsJSON(value: unknown): boolean { 26 | return typeof value === "object" || Array.isArray(value); 27 | } 28 | 29 | export function normalizeValue(value: unknown, maybeObject: boolean): string { 30 | if (needsJSON(value)) { 31 | return JSON.stringify(value); 32 | } 33 | try { 34 | if (typeof value === "string" && maybeObject) { 35 | return JSON.stringify(JSON.parse(value)); 36 | } 37 | } catch (e) { 38 | // That's ok, just return the string representation 39 | } 40 | return `${value}`; 41 | } 42 | -------------------------------------------------------------------------------- /js/yaml.d.ts: -------------------------------------------------------------------------------- 1 | declare module "*.yaml" { 2 | const content: string; 3 | export default content; 4 | } 5 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "autoevals", 3 | "version": "0.0.0", 4 | "description": "Universal library for evaluating AI models", 5 | "repository": { 6 | "type": "git", 7 | "url": "git+https://github.com/braintrustdata/autoevals.git" 8 | }, 9 | "homepage": "https://www.braintrust.dev/docs", 10 | "main": "./jsdist/index.js", 11 | "module": "./jsdist/index.mjs", 12 | "types": "./jsdist/index.d.ts", 13 | "exports": { 14 | "./package.json": "./package.json", 15 | ".": { 16 | "types": "./jsdist/index.d.ts", 17 | "import": "./jsdist/index.mjs", 18 | "module": "./jsdist/index.mjs", 19 | "require": "./jsdist/index.js" 20 | } 21 | }, 22 | "files": [ 23 | "jsdist/**/*" 24 | ], 25 | "scripts": { 26 | "build": "tsup", 27 | "watch": "tsup --watch", 28 | "docs": "npx typedoc --options typedoc.json js/index.ts", 29 | "test": "vitest", 30 | "prepublishOnly": "../scripts/node_prepublish_autoevals.py", 31 | "postpublish": "../scripts/node_postpublish_autoevals.py" 32 | }, 33 | "author": "", 34 | "license": "MIT", 35 | 
"devDependencies": { 36 | "@rollup/plugin-yaml": "^4.1.2", 37 | "@types/js-levenshtein": "^1.1.3", 38 | "@types/js-yaml": "^4.0.9", 39 | "@types/mustache": "^4.2.5", 40 | "@types/node": "^20.10.5", 41 | "msw": "^2.7.3", 42 | "tsup": "^8.4.0", 43 | "tsx": "^3.14.0", 44 | "typedoc": "^0.25.4", 45 | "typedoc-plugin-markdown": "^3.17.1", 46 | "typescript": "^5.3.3", 47 | "vitest": "^2.1.9" 48 | }, 49 | "dependencies": { 50 | "@braintrust/core": "^0.0.8", 51 | "ajv": "^8.13.0", 52 | "compute-cosine-similarity": "^1.1.0", 53 | "js-levenshtein": "^1.1.6", 54 | "js-yaml": "^4.1.0", 55 | "linear-sum-assignment": "^1.0.7", 56 | "mustache": "^4.2.0", 57 | "openai": "^4.47.1", 58 | "zod": "^3.22.4", 59 | "zod-to-json-schema": "^3.22.5" 60 | }, 61 | "packageManager": "pnpm@8.15.5" 62 | } 63 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - "." 3 | - "evals" 4 | -------------------------------------------------------------------------------- /py/autoevals/__init__.py: -------------------------------------------------------------------------------- 1 | """Autoevals is a comprehensive toolkit for evaluating AI model outputs. 2 | 3 | This library provides a collection of specialized scorers for different types of evaluations: 4 | 5 | - `string`: Text similarity using edit distance or embeddings 6 | - `llm`: LLM-based evaluation for correctness, complexity, security, etc. 7 | - `moderation`: Content safety and policy compliance checks 8 | - `ragas`: Advanced NLP metrics for RAG system evaluation 9 | - `json`: JSON validation and structural comparison 10 | - `number`: Numeric similarity with relative scaling 11 | - `value`: Exact matching and basic comparisons 12 | 13 | **Key features**: 14 | 15 | - Both sync and async evaluation support 16 | - Configurable scoring parameters 17 | - Detailed feedback through metadata 18 | - Integration with OpenAI and other LLM providers through Braintrust AI Proxy 19 | 20 | **Client setup**: 21 | 22 | There are two ways to configure the OpenAI client: 23 | 24 | 1. Global initialization (recommended): 25 | 26 | ```python 27 | from autoevals import init 28 | from openai import AsyncOpenAI 29 | 30 | # Set up once at the start of your application 31 | client = AsyncOpenAI() 32 | init(client=client) 33 | ``` 34 | 35 | 2. Per-evaluator initialization: 36 | 37 | ```python 38 | from openai import AsyncOpenAI 39 | from autoevals.ragas import CloseQA 40 | 41 | # Pass client directly to evaluator 42 | client = AsyncOpenAI() 43 | evaluator = CloseQA(client=client) 44 | ``` 45 | 46 | **Multi-provider support via the Braintrust AI Proxy**: 47 | 48 | Autoevals supports multiple LLM providers (Anthropic, Azure, etc.) through the Braintrust AI Proxy. 49 | Configure your client to use the proxy: 50 | 51 | ```python 52 | import os 53 | from openai import AsyncOpenAI 54 | from autoevals.llm import Factuality 55 | 56 | # Configure client to use Braintrust AI Proxy 57 | client = AsyncOpenAI( 58 | base_url="https://api.braintrustproxy.com/v1", 59 | api_key=os.getenv("BRAINTRUST_API_KEY"), 60 | ) 61 | 62 | # Use with any evaluator 63 | evaluator = Factuality(client=client) 64 | ``` 65 | 66 | **Braintrust integration**: 67 | 68 | Autoevals automatically integrates with Braintrust logging when you install the library. 
If needed, you can manually wrap the client: 69 | 70 | ```python 71 | from openai import AsyncOpenAI 72 | from braintrust import wrap_openai 73 | from autoevals.ragas import CloseQA 74 | 75 | # Explicitly wrap the client if needed 76 | client = wrap_openai(AsyncOpenAI()) 77 | evaluator = CloseQA(client=client) 78 | ``` 79 | 80 | **Example Autoevals usage**: 81 | 82 | ```python 83 | from autoevals.ragas import CloseQA 84 | import asyncio 85 | 86 | async def evaluate_qa(): 87 | # Create evaluator for question answering 88 | evaluator = CloseQA() 89 | 90 | # Question and context 91 | question = "What was the purpose of the Apollo missions?" 92 | context = ''' 93 | The Apollo program was a NASA space program that ran from 1961 to 1972, 94 | with the goal of landing humans on the Moon and bringing them safely back 95 | to Earth. The program achieved its most famous success when Apollo 11 96 | astronauts Neil Armstrong and Buzz Aldrin became the first humans to walk 97 | on the Moon on July 20, 1969. 98 | ''' 99 | 100 | # Two different answers to evaluate 101 | answer = "The Apollo program's main goal was to land humans on the Moon and return them safely to Earth." 102 | expected = "The Apollo missions were designed to achieve human lunar landing and safe return." 103 | 104 | # Evaluate the answer 105 | result = await evaluator.eval_async( 106 | question=question, 107 | context=context, 108 | output=answer, 109 | expected=expected 110 | ) 111 | 112 | print(f"Score: {result.score}") # Semantic similarity score (0-1) 113 | print(f"Rationale: {result.metadata.rationale}") # Detailed explanation 114 | print(f"Faithfulness: {result.metadata.faithfulness}") # Context alignment 115 | 116 | # Run async evaluation 117 | asyncio.run(evaluate_qa()) 118 | ``` 119 | 120 | See individual module documentation for detailed usage and options. 121 | """ 122 | 123 | from braintrust_core.score import Score, Scorer 124 | 125 | from .json import * 126 | from .list import * 127 | from .llm import * 128 | from .moderation import * 129 | from .number import * 130 | from .oai import init 131 | from .ragas import * 132 | from .string import * 133 | from .value import ExactMatch 134 | -------------------------------------------------------------------------------- /py/autoevals/json.py: -------------------------------------------------------------------------------- 1 | """JSON evaluation scorers for comparing and validating JSON data. 2 | 3 | This module provides scorers for working with JSON data: 4 | 5 | - JSONDiff: Compare JSON objects for structural and content similarity 6 | - Handles nested structures, strings, numbers 7 | - Customizable with different scorers for string and number comparisons 8 | - Can automatically parse JSON strings 9 | 10 | - ValidJSON: Validate if a string is valid JSON and matches an optional schema 11 | - Validates JSON syntax 12 | - Optional JSON Schema validation 13 | - Works with both strings and parsed objects 14 | """ 15 | 16 | import json 17 | 18 | from braintrust_core.score import Score, Scorer 19 | from jsonschema import ValidationError, validate 20 | 21 | from autoevals.partial import ScorerWithPartial 22 | 23 | from .number import NumericDiff 24 | from .string import Levenshtein 25 | 26 | 27 | class JSONDiff(ScorerWithPartial): 28 | """Compare JSON objects for structural and content similarity. 
29 | 30 | This scorer recursively compares JSON objects, handling: 31 | - Nested dictionaries and lists 32 | - String similarity using Levenshtein distance 33 | - Numeric value comparison 34 | - Automatic parsing of JSON strings 35 | 36 | Example: 37 | ```python 38 | import asyncio 39 | from openai import AsyncOpenAI 40 | from autoevals import JSONDiff 41 | from autoevals.string import EmbeddingSimilarity 42 | 43 | async def compare_json(): 44 | # Initialize with async client for string comparison 45 | client = AsyncOpenAI() 46 | string_scorer = EmbeddingSimilarity(client=client) 47 | 48 | diff = JSONDiff(string_scorer=string_scorer) 49 | 50 | result = await diff.eval_async( 51 | output={ 52 | "name": "John Smith", 53 | "age": 30, 54 | "skills": ["python", "javascript"] 55 | }, 56 | expected={ 57 | "name": "John A. Smith", 58 | "age": 31, 59 | "skills": ["python", "typescript"] 60 | } 61 | ) 62 | 63 | print(result.score) # Similarity score between 0-1 64 | print(result.metadata) # Detailed comparison breakdown 65 | 66 | # Run the async evaluation 67 | asyncio.run(compare_json()) 68 | ``` 69 | 70 | Args: 71 | string_scorer: Optional custom scorer for string comparisons (default: Levenshtein) 72 | number_scorer: Optional custom scorer for number comparisons (default: NumericDiff) 73 | preserve_strings: Don't attempt to parse strings as JSON (default: False) 74 | 75 | Returns: 76 | Score object with: 77 | - score: Similarity score between 0-1 78 | - metadata: Detailed comparison breakdown 79 | """ 80 | 81 | def __init__(self, string_scorer: Scorer = None, number_scorer: Scorer = None, preserve_strings: bool = False): 82 | self.string_scorer = string_scorer or Levenshtein() 83 | self.number_scorer = number_scorer or NumericDiff() 84 | self.preserve_strings = preserve_strings 85 | self._valid_json = ValidJSON() 86 | 87 | def _run_eval_sync(self, output, expected=None, **kwargs): 88 | return Score(name=self._name(), score=self.json_diff(output, expected)) 89 | 90 | def json_diff(self, o1, o2): 91 | if not self.preserve_strings: 92 | if isinstance(o1, str) and self._valid_json.valid_json(o1) == 1: 93 | o1 = json.loads(o1) 94 | if isinstance(o2, str) and self._valid_json.valid_json(o2) == 1: 95 | o2 = json.loads(o2) 96 | 97 | if isinstance(o1, dict) and isinstance(o2, dict): 98 | if len(o1) == 0 and len(o2) == 0: 99 | return 1 100 | 101 | all_keys = set(o1.keys()).union(set(o2.keys())) 102 | base_scores = [self.json_diff(o1.get(k), o2.get(k)) for k in all_keys] 103 | base_scores = [s for s in base_scores if s is not None] 104 | return sum(base_scores) / len(base_scores) 105 | elif isinstance(o1, list) and isinstance(o2, list): 106 | if len(o1) == 0 and len(o2) == 0: 107 | return 1 108 | base_scores = [self.json_diff(e1, e2) for (e1, e2) in zip(o1, o2)] 109 | base_scores = [s for s in base_scores if s is not None] 110 | return sum(base_scores) / max(len(o1), len(o2)) 111 | elif isinstance(o1, str) and isinstance(o2, str): 112 | return self.string_scorer.eval(o1, o2).score 113 | elif (isinstance(o1, int) or isinstance(o1, float)) and (isinstance(o2, int) or isinstance(o2, float)): 114 | return self.number_scorer.eval(o1, o2).score 115 | elif o1 is None and o2 is None: 116 | return 1 117 | elif o1 is None or o2 is None: 118 | return 0 119 | else: 120 | kwargs = {"separators": (",", ":"), "sort_keys": True} 121 | return self.string_scorer.eval(json.dumps(o1, **kwargs), json.dumps(o2, **kwargs)).score 122 | 123 | 124 | class ValidJSON(ScorerWithPartial): 125 | """Validate if a string is valid JSON and 
optionally matches a schema. 126 | 127 | This scorer checks if: 128 | - The input can be parsed as valid JSON 129 | - The parsed JSON matches an optional JSON Schema 130 | - Handles both string inputs and pre-parsed JSON objects 131 | 132 | Example: 133 | ```python 134 | import asyncio 135 | from autoevals import ValidJSON 136 | 137 | async def validate_json(): 138 | # Define a schema to validate against 139 | schema = { 140 | "type": "object", 141 | "properties": { 142 | "name": {"type": "string"}, 143 | "age": {"type": "number"}, 144 | "skills": { 145 | "type": "array", 146 | "items": {"type": "string"} 147 | } 148 | }, 149 | "required": ["name", "age"] 150 | } 151 | 152 | validator = ValidJSON(schema=schema) 153 | 154 | result = await validator.eval_async( 155 | output=''' 156 | { 157 | "name": "John Smith", 158 | "age": 30, 159 | "skills": ["python", "javascript"] 160 | } 161 | ''' 162 | ) 163 | 164 | print(result.score) # 1 if valid, 0 if invalid 165 | print(result.metadata) # Validation details or error messages 166 | 167 | # Run the async validation 168 | asyncio.run(validate_json()) 169 | ``` 170 | 171 | Args: 172 | schema: Optional JSON Schema to validate against 173 | 174 | Returns: 175 | Score object with: 176 | - score: 1 if valid JSON (and matches schema if provided), 0 otherwise 177 | - metadata: Validation details or error messages 178 | """ 179 | 180 | def __init__(self, schema=None): 181 | self.schema = schema 182 | 183 | def _run_eval_sync(self, output, schema=None, **kwargs): 184 | return Score(name=self._name(), score=self.valid_json(output, schema)) 185 | 186 | def valid_json(self, output, schema=None): 187 | try: 188 | parsed = json.loads(output) if isinstance(output, str) else output 189 | 190 | if schema is not None: 191 | validate(parsed, schema) 192 | return 1 193 | 194 | if isinstance(parsed, dict) or isinstance(parsed, list): 195 | return 1 196 | 197 | except (json.JSONDecodeError, ValidationError): 198 | pass 199 | 200 | return 0 201 | 202 | 203 | __all__ = ["JSONDiff", "ValidJSON"] 204 | -------------------------------------------------------------------------------- /py/autoevals/list.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from braintrust_core.score import Score 4 | 5 | from autoevals.partial import ScorerWithPartial 6 | 7 | from .string import Levenshtein 8 | 9 | 10 | class ListContains(ScorerWithPartial): 11 | """ 12 | A scorer that semantically evaluates the overlap between two lists of strings. It works by 13 | computing the pairwise similarity between each element of the output and the expected value, 14 | and then using Linear Sum Assignment to find the best matching pairs. 
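    Example (a usage sketch in the style of the other scorers; the lists below are
    arbitrary, and the optional scipy dependency must be installed, e.g.
    `pip install 'autoevals[scipy]'`, since scoring relies on linear sum assignment):

        ```python
        from autoevals import ListContains

        scorer = ListContains()
        result = scorer.eval(
            output=["apple", "banana"],
            expected=["banana", "cherry"],
        )
        print(result.score)              # Similarity score between 0 and 1
        print(result.metadata["pairs"])  # Matched (output, expected, similarity) triples
        ```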
15 | """ 16 | 17 | def __init__(self, pairwise_scorer=None, allow_extra_entities=False, **kwargs): 18 | self.allow_extra_entities = allow_extra_entities 19 | self.pairwise_scorer = pairwise_scorer or Levenshtein() 20 | 21 | # If it's a class, then instantiate it 22 | if isinstance(self.pairwise_scorer, type): 23 | self.pairwise_scorer = self.pairwise_scorer() 24 | 25 | async def _run_eval_async(self, output, expected=None, **kwargs): 26 | if expected is None: 27 | raise ValueError("ListContains requires an expected value") 28 | 29 | similarities_futures = [ 30 | [ 31 | self.pairwise_scorer._run_eval_async(output=output_item, expected=expected_item) 32 | for expected_item in expected 33 | ] 34 | for output_item in output 35 | ] 36 | 37 | similarities = [] 38 | 39 | for similarity_futures in similarities_futures: 40 | similarities.append([(await similarity_future).score for similarity_future in similarity_futures]) 41 | 42 | return self._compute_score(output, expected, similarities, **kwargs) 43 | 44 | def _run_eval_sync(self, output, expected=None, **kwargs): 45 | if expected is None: 46 | raise ValueError("ListContains requires an expected value") 47 | 48 | similarities = [ 49 | [self.pairwise_scorer._run_eval_sync(output_item, expected_item).score for expected_item in expected] 50 | for output_item in output 51 | ] 52 | 53 | return self._compute_score(output, expected, similarities, **kwargs) 54 | 55 | def _compute_score(self, outputs, expecteds, similarities, **kwargs): 56 | if len(outputs) == 0 and len(expecteds) == 0: 57 | return Score(name=self._name(), score=1) 58 | elif len(outputs) == 0 or len(expecteds) == 0: 59 | return Score(name=self._name(), score=0) 60 | 61 | similarities = [[d or 0 for d in row] for row in similarities] 62 | 63 | try: 64 | import numpy as np 65 | from scipy.optimize import linear_sum_assignment 66 | except ImportError: 67 | print( 68 | "ListContains requires the scipy extension, which you can install with `pip install 'autoevals[scipy]'`", 69 | file=sys.stderr, 70 | ) 71 | raise 72 | 73 | distances = -np.array(similarities) 74 | row_ind, col_ind = linear_sum_assignment(distances) 75 | 76 | pairs = [(outputs[r], expecteds[c], similarities[r][c]) for (r, c) in zip(row_ind, col_ind)] 77 | lowest_distances = distances[row_ind, col_ind] 78 | 79 | # Generally speaking, outputs that are not in expecteds should be penalized, but in certain use cases 80 | # (eg checking whether a passage of text has all of the entities in a list, and maybe a few more), it's 81 | # ok to allow them. 82 | denominator = max(len(outputs), len(expecteds)) if not self.allow_extra_entities else len(expecteds) 83 | assert len(lowest_distances) <= denominator, "There should be at most as many pairs as there are rows" 84 | score = min(max(sum(-lowest_distances) / denominator, 0), 1) 85 | 86 | return Score( 87 | name=self._name(), 88 | score=score, 89 | metadata={"pairs": pairs}, 90 | ) 91 | -------------------------------------------------------------------------------- /py/autoevals/moderation.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from braintrust_core.score import Score 4 | 5 | from autoevals.llm import OpenAIScorer 6 | 7 | from .oai import Client, arun_cached_request, run_cached_request 8 | 9 | REQUEST_TYPE = "moderation" 10 | 11 | 12 | class Moderation(OpenAIScorer): 13 | """A scorer that evaluates if AI responses contain inappropriate or unsafe content. 
14 | 15 | This scorer uses OpenAI's moderation API to check for potentially harmful content in text. 16 | It returns a score of 0 if content is flagged as inappropriate, and 1 if it passes moderation. 17 | 18 | Args: 19 | threshold: Optional float to set a custom threshold for content flagging. 20 | If None, uses OpenAI's default flagging logic. 21 | If set, content is flagged if any category score exceeds this threshold. 22 | client: Optional client for making API calls. Can be: 23 | - None: Uses global client from init() 24 | - OpenAI v1 client: Used directly 25 | - OpenAI v0 module: Wrapped in a client adapter 26 | 27 | Example: 28 | ```python 29 | from openai import OpenAI 30 | from autoevals import init 31 | from autoevals.moderation import Moderation 32 | 33 | # Initialize with your OpenAI client 34 | init(OpenAI()) 35 | 36 | # Create evaluator with default settings 37 | moderator = Moderation() 38 | result = moderator.eval( 39 | output="This is the text to check for inappropriate content" 40 | ) 41 | print(result.score) # 1 if content is appropriate, 0 if flagged 42 | print(result.metadata) # Detailed category scores and threshold used 43 | ``` 44 | """ 45 | 46 | threshold = None 47 | extra_args = {} 48 | 49 | def __init__( 50 | self, 51 | threshold=None, 52 | api_key=None, 53 | base_url=None, 54 | client: Optional[Client] = None, 55 | ): 56 | """Initialize a Moderation scorer. 57 | 58 | Args: 59 | threshold: Optional float to set a custom threshold for content flagging. 60 | If None, uses OpenAI's default flagging logic. 61 | If set, content is flagged if any category score exceeds this threshold. 62 | client: Optional client for making API calls. Can be: 63 | - None: Uses global client from init() 64 | - OpenAI v1 client: Used directly 65 | - OpenAI v0 module: Wrapped in a client adapter 66 | api_key: Deprecated. Use client instead. 67 | base_url: Deprecated. Use client instead. 68 | 69 | Note: 70 | The api_key and base_url parameters are deprecated and will be removed in a future version. 71 | Instead, you can either: 72 | 1. Pass a client instance directly to this constructor using the client parameter 73 | 2. Set a global client using autoevals.init(client=your_client) 74 | 75 | The global client can be configured once and will be used by all evaluators that don't have 76 | a specific client passed to them. 
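        Example (a sketch of the custom-threshold path; the 0.4 value is an
        arbitrary illustration, not a recommended setting):

            ```python
            from openai import OpenAI
            from autoevals import init
            from autoevals.moderation import Moderation

            init(OpenAI())

            # Flag content whenever any category score exceeds the threshold
            moderator = Moderation(threshold=0.4)
            result = moderator.eval(output="Text to check for unsafe content")
            print(result.score)                        # 0 if flagged, 1 otherwise
            print(result.metadata["category_scores"])  # Raw per-category scores
            ```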
77 | """ 78 | super().__init__(api_key=api_key, base_url=base_url, client=client) 79 | self.threshold = threshold 80 | 81 | def _run_eval_sync(self, output, expected=None, **kwargs): 82 | moderation_response = run_cached_request( 83 | client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args 84 | )["results"][0] 85 | return self.__postprocess_response(moderation_response) 86 | 87 | def __postprocess_response(self, moderation_response) -> Score: 88 | return Score( 89 | name=self._name(), 90 | score=self.compute_score(moderation_response, self.threshold), 91 | metadata={ 92 | "threshold": self.threshold, 93 | "category_scores": moderation_response["category_scores"], 94 | }, 95 | ) 96 | 97 | async def _run_eval_async(self, output, expected=None, **kwargs) -> Score: 98 | moderation_response = ( 99 | await arun_cached_request(client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args) 100 | )["results"][0] 101 | return self.__postprocess_response(moderation_response) 102 | 103 | @staticmethod 104 | def compute_score(moderation_result, threshold): 105 | if threshold is None: 106 | return 0 if moderation_result["flagged"] else 1 107 | 108 | category_scores = moderation_result["category_scores"] 109 | for category in category_scores.keys(): 110 | if category_scores[category] > threshold: 111 | return 0 112 | 113 | return 1 114 | 115 | 116 | __all__ = ["Moderation"] 117 | -------------------------------------------------------------------------------- /py/autoevals/number.py: -------------------------------------------------------------------------------- 1 | """Numeric evaluation scorers for comparing numerical values. 2 | 3 | This module provides scorers for working with numbers: 4 | - NumericDiff: Compare numbers using normalized difference, providing a similarity score 5 | that accounts for both absolute and relative differences between values. 6 | 7 | Features: 8 | - Normalized scoring between 0 and 1 9 | - Handles special cases like comparing zeros 10 | - Accounts for magnitude when computing differences 11 | - Suitable for both small and large number comparisons 12 | """ 13 | 14 | from braintrust_core.score import Score 15 | 16 | from autoevals.partial import ScorerWithPartial 17 | 18 | 19 | class NumericDiff(ScorerWithPartial): 20 | """Numeric similarity scorer using normalized difference. 
21 | 22 | Example: 23 | ```python 24 | scorer = NumericDiff() 25 | result = scorer.eval( 26 | output=105, 27 | expected=100 28 | ) 29 | print(result.score) # 0.95 (normalized similarity) 30 | ``` 31 | 32 | Args: 33 | output: Number to evaluate 34 | expected: Reference number to compare against 35 | 36 | Returns: 37 | Score object with normalized similarity (0-1), where: 38 | - 1 means identical numbers 39 | - Score decreases as difference increases relative to magnitude 40 | - Special case: score=1 when both numbers are 0 41 | """ 42 | 43 | def _run_eval_sync(self, output, expected=None, **kwargs): 44 | if expected is None: 45 | raise ValueError("NumericDiff requires an expected value") 46 | 47 | if expected == 0 and output == 0: 48 | score = 1 49 | else: 50 | score = 1 - abs(expected - output) / (abs(expected) + abs(output)) 51 | return Score(name=self._name(), score=score) 52 | 53 | 54 | __all__ = ["NumericDiff"] 55 | -------------------------------------------------------------------------------- /py/autoevals/oai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | import textwrap 5 | import time 6 | import warnings 7 | from contextvars import ContextVar 8 | from dataclasses import dataclass 9 | from typing import Any, Callable, Dict, Optional, Protocol, Tuple, Type, TypeVar, Union, cast, runtime_checkable 10 | 11 | PROXY_URL = "https://api.braintrust.dev/v1/proxy" 12 | 13 | 14 | @runtime_checkable 15 | class ChatCompletions(Protocol): 16 | create: Callable[..., Any] 17 | 18 | 19 | @runtime_checkable 20 | class Chat(Protocol): 21 | @property 22 | def completions(self) -> ChatCompletions: 23 | ... 24 | 25 | 26 | @runtime_checkable 27 | class Embeddings(Protocol): 28 | create: Callable[..., Any] 29 | 30 | 31 | @runtime_checkable 32 | class Moderations(Protocol): 33 | create: Callable[..., Any] 34 | 35 | 36 | @runtime_checkable 37 | class OpenAIV1Module(Protocol): 38 | class OpenAI(Protocol): 39 | # Core API resources 40 | @property 41 | def chat(self) -> Chat: 42 | ... 43 | 44 | @property 45 | def embeddings(self) -> Embeddings: 46 | ... 47 | 48 | @property 49 | def moderations(self) -> Moderations: 50 | ... 51 | 52 | # Configuration 53 | @property 54 | def api_key(self) -> str: 55 | ... 56 | 57 | @property 58 | def organization(self) -> Optional[str]: 59 | ... 60 | 61 | @property 62 | def base_url(self) -> Union[str, Any, None]: 63 | ... 64 | 65 | class AsyncOpenAI(OpenAI): 66 | ... 67 | 68 | class RateLimitError(Exception): 69 | ... 70 | 71 | 72 | # TODO: we're removing v0 support in the next release 73 | @runtime_checkable 74 | class OpenAIV0Module(Protocol): 75 | class ChatCompletion(Protocol): 76 | acreate: Callable[..., Any] 77 | create: Callable[..., Any] 78 | 79 | class Embedding(Protocol): 80 | acreate: Callable[..., Any] 81 | create: Callable[..., Any] 82 | 83 | class Moderation(Protocol): 84 | acreate: Callable[..., Any] 85 | create: Callable[..., Any] 86 | 87 | api_key: Optional[str] 88 | api_base: Optional[str] 89 | base_url: Optional[str] 90 | 91 | class error(Protocol): 92 | class RateLimitError(Exception): 93 | ... 
94 | 95 | 96 | _openai_module: Optional[Union[OpenAIV1Module, OpenAIV0Module]] = None 97 | 98 | 99 | def get_openai_module() -> Union[OpenAIV1Module, OpenAIV0Module]: 100 | global _openai_module 101 | 102 | if _openai_module is not None: 103 | return _openai_module 104 | 105 | import openai # type: ignore 106 | 107 | _openai_module = cast(Union[OpenAIV1Module, OpenAIV0Module], openai) 108 | return _openai_module 109 | 110 | 111 | @dataclass 112 | class LLMClient: 113 | """A client wrapper for LLM operations that supports both OpenAI SDK v0 and v1. 114 | 115 | This class provides a consistent interface for common LLM operations regardless of the 116 | underlying OpenAI SDK version. It's designed to be extensible for custom implementations. 117 | 118 | Attributes: 119 | openai: The OpenAI module or client instance (either v0 or v1 SDK). 120 | complete: Completion function that creates chat completions. 121 | - For v0: openai.ChatCompletion.create or acreate 122 | - For v1: openai.chat.completions.create 123 | embed: Embedding function that creates embeddings. 124 | - For v0: openai.Embedding.create or acreate 125 | - For v1: openai.embeddings.create 126 | moderation: Moderation function that creates content moderations. 127 | - For v0: openai.Moderations.create or acreate 128 | - For v1: openai.moderations.create 129 | RateLimitError: The rate limit exception class for the SDK version. 130 | - For v0: openai.error.RateLimitError 131 | - For v1: openai.RateLimitError 132 | is_async: Whether the client is async (only used for v0 autoconfiguration). 133 | 134 | Note: 135 | If using async OpenAI methods you must use the async methods in Autoevals. 136 | The client will automatically configure itself if methods are not provided. 137 | 138 | Example: 139 | ```python 140 | # Using with OpenAI v1 141 | import openai 142 | client = openai.OpenAI() # Configure with your settings 143 | llm = LLMClient(openai=client) # Methods will be auto-configured 144 | 145 | # Or with explicit method configuration 146 | llm = LLMClient( 147 | openai=client, 148 | complete=client.chat.completions.create, 149 | embed=client.embeddings.create, 150 | moderation=client.moderations.create, 151 | RateLimitError=openai.RateLimitError 152 | ) 153 | 154 | # Extending for custom implementation 155 | @dataclass 156 | class CustomLLMClient(LLMClient): 157 | def complete(self, **kwargs): 158 | # make adjustments as needed 159 | return self.openai.chat.completions.create(**kwargs) 160 | ``` 161 | """ 162 | 163 | openai: Union[OpenAIV0Module, OpenAIV1Module.OpenAI] 164 | complete: Callable[..., Any] = None # type: ignore # Set in __post_init__ 165 | embed: Callable[..., Any] = None # type: ignore # Set in __post_init__ 166 | moderation: Callable[..., Any] = None # type: ignore # Set in __post_init__ 167 | RateLimitError: Type[Exception] = None # type: ignore # Set in __post_init__ 168 | is_async: bool = False 169 | _is_wrapped: bool = False 170 | 171 | def __post_init__(self): 172 | NamedWrapper, wrap_openai = get_openai_wrappers() 173 | 174 | has_customization = self.complete is not None or self.embed is not None or self.moderation is not None # type: ignore # Pyright doesn't understand our design choice 175 | 176 | # avoid wrapping if we have custom methods (the user may intend not to wrap) 177 | if not has_customization and not isinstance(self.openai, NamedWrapper): 178 | self.openai = wrap_openai(self.openai) 179 | 180 | self._is_wrapped = isinstance(self.openai, NamedWrapper) 181 | 182 | openai_module = get_openai_module() 183 
| 184 | if hasattr(openai_module, "OpenAI"): 185 | openai_module = cast(OpenAIV1Module, openai_module) 186 | self.openai = cast(OpenAIV1Module.OpenAI, self.openai) 187 | 188 | # v1 189 | self.complete = self.openai.chat.completions.create 190 | self.embed = self.openai.embeddings.create 191 | self.moderation = self.openai.moderations.create 192 | self.RateLimitError = openai_module.RateLimitError 193 | else: 194 | openai_module = cast(OpenAIV0Module, openai_module) 195 | self.openai = cast(OpenAIV0Module, self.openai) 196 | 197 | # v0 198 | self.complete = self.openai.ChatCompletion.acreate if self.is_async else self.openai.ChatCompletion.create 199 | self.embed = self.openai.Embedding.acreate if self.is_async else self.openai.Embedding.create 200 | self.moderation = self.openai.Moderation.acreate if self.is_async else self.openai.Moderation.create 201 | self.RateLimitError = openai_module.error.RateLimitError 202 | 203 | @property 204 | def is_wrapped(self) -> bool: 205 | return self._is_wrapped 206 | 207 | 208 | _client_var = ContextVar[Optional[LLMClient]]("client") 209 | 210 | T = TypeVar("T") 211 | 212 | _named_wrapper: Optional[Type[Any]] = None 213 | _wrap_openai: Optional[Callable[[Any], Any]] = None 214 | 215 | 216 | def get_openai_wrappers() -> Tuple[Type[Any], Callable[[Any], Any]]: 217 | global _named_wrapper, _wrap_openai 218 | 219 | if _named_wrapper is not None and _wrap_openai is not None: 220 | return _named_wrapper, _wrap_openai 221 | 222 | try: 223 | from braintrust.oai import NamedWrapper as BraintrustNamedWrapper # type: ignore 224 | from braintrust.oai import wrap_openai # type: ignore 225 | 226 | _named_wrapper = cast(Type[Any], BraintrustNamedWrapper) 227 | except ImportError: 228 | 229 | class NamedWrapper: 230 | pass 231 | 232 | def wrap_openai(openai: T) -> T: 233 | return openai 234 | 235 | _named_wrapper = NamedWrapper 236 | 237 | _wrap_openai = cast(Callable[[Any], Any], wrap_openai) 238 | return _named_wrapper, _wrap_openai 239 | 240 | 241 | Client = Union[LLMClient, OpenAIV0Module, OpenAIV1Module.OpenAI] 242 | 243 | 244 | def resolve_client(client: Client, is_async: bool = False) -> LLMClient: 245 | if isinstance(client, LLMClient): 246 | return client 247 | return LLMClient(openai=client, is_async=is_async) 248 | 249 | 250 | def init(client: Optional[Client] = None, is_async: bool = False): 251 | """Initialize Autoevals with an optional custom LLM client. 252 | 253 | This function sets up the global client context for Autoevals to use. If no client is provided, 254 | the default OpenAI client will be used. 255 | 256 | Args: 257 | client: The client to use for LLM operations. Can be one of: 258 | - None: Resets the global client 259 | - LLMClient: Used directly as provided 260 | - OpenAIV0Module: Wrapped in a new LLMClient instance (OpenAI SDK v0) 261 | - OpenAIV1: Wrapped in a new LLMClient instance (OpenAI SDK v1) 262 | is_async: Whether to create a client with async operations. Defaults to False. 263 | Deprecated: Use the `client` argument directly with your desired async/sync configuration. 264 | """ 265 | _client_var.set(resolve_client(client, is_async=is_async) if client else None) 266 | 267 | 268 | warned_deprecated_api_key_base_url = False 269 | 270 | 271 | def prepare_openai( 272 | client: Optional[Client] = None, 273 | is_async: bool = False, 274 | api_key: Optional[str] = None, 275 | base_url: Optional[str] = None, 276 | ): 277 | """Prepares and configures an OpenAI client for use with AutoEval. 
278 | 279 | This function handles both v0 and v1 of the OpenAI SDK, configuring the client 280 | with the appropriate authentication and base URL settings. 281 | 282 | We will also attempt to enable Braintrust tracing export, if you've configured tracing. 283 | 284 | Args: 285 | client (Optional[LLMClient], optional): Existing LLMClient instance. 286 | If provided, this client will be used instead of creating a new one. 287 | 288 | is_async (bool, optional): Whether to create a client with async operations. Defaults to False. 289 | Deprecated: Use the `client` argument and set the `openai` with the async/sync that you'd like to use. 290 | 291 | api_key (str, optional): OpenAI API key. If not provided, will look for 292 | OPENAI_API_KEY or BRAINTRUST_API_KEY in environment variables. 293 | Deprecated: Use the `client` argument and set the `openai`. 294 | 295 | base_url (str, optional): Base URL for API requests. If not provided, will 296 | use OPENAI_BASE_URL from environment or fall back to PROXY_URL. 297 | Deprecated: Use the `client` argument and set the `openai`. 298 | 299 | Returns: 300 | LLMClient: The configured LLMClient instance, or the client you've provided 301 | 302 | Raises: 303 | ImportError: If the OpenAI package is not installed 304 | """ 305 | client = client or _client_var.get(None) 306 | if client is not None: 307 | return resolve_client(client, is_async=is_async) 308 | 309 | try: 310 | openai_module = get_openai_module() 311 | except Exception as e: 312 | print( 313 | textwrap.dedent( 314 | f"""\ 315 | Unable to import openai: {e} 316 | 317 | Please install it, e.g. with 318 | 319 | pip install 'openai' 320 | """ 321 | ), 322 | file=sys.stderr, 323 | ) 324 | raise 325 | 326 | global warned_deprecated_api_key_base_url 327 | if not warned_deprecated_api_key_base_url and (api_key is not None or base_url is not None): 328 | warnings.warn( 329 | "The api_key and base_url parameters are deprecated. Please use init() or call with client instead.", 330 | DeprecationWarning, 331 | stacklevel=2, 332 | ) 333 | warned_deprecated_api_key_base_url = True 334 | 335 | # prepare the default openai sdk, if not provided 336 | if api_key is None: 337 | api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BRAINTRUST_API_KEY") 338 | if base_url is None: 339 | base_url = os.environ.get("OPENAI_BASE_URL", PROXY_URL) 340 | 341 | if hasattr(openai_module, "OpenAI"): 342 | openai_module = cast(OpenAIV1Module, openai_module) 343 | 344 | # v1 API 345 | if is_async: 346 | openai_obj = openai_module.AsyncOpenAI(api_key=api_key, base_url=base_url) # type: ignore 347 | else: 348 | openai_obj = openai_module.OpenAI(api_key=api_key, base_url=base_url) # type: ignore 349 | else: 350 | openai_module = cast(OpenAIV0Module, openai_module) 351 | 352 | # v0 API 353 | if api_key: 354 | openai_module.api_key = api_key 355 | openai_module.api_base = base_url 356 | openai_obj = openai_module 357 | 358 | return LLMClient(openai=openai_obj, is_async=is_async) 359 | 360 | 361 | def post_process_response(resp: Any) -> Dict[str, Any]: 362 | # This normalizes against craziness in OpenAI v0 vs. 
v1 363 | if hasattr(resp, "to_dict"): 364 | # v0 365 | return resp.to_dict() 366 | else: 367 | # v1 368 | return resp.dict() 369 | 370 | 371 | def set_span_purpose(kwargs: Dict[str, Any]) -> None: 372 | kwargs.setdefault("span_info", {}).setdefault("span_attributes", {})["purpose"] = "scorer" 373 | 374 | 375 | def run_cached_request( 376 | *, 377 | client: Optional[LLMClient] = None, 378 | request_type: str = "complete", 379 | api_key: Optional[str] = None, 380 | base_url: Optional[str] = None, 381 | **kwargs: Any, 382 | ) -> Dict[str, Any]: 383 | wrapper = prepare_openai(client=client, is_async=False, api_key=api_key, base_url=base_url) 384 | if wrapper.is_wrapped: 385 | set_span_purpose(kwargs) 386 | 387 | retries = 0 388 | sleep_time = 0.1 389 | resp = None 390 | while retries < 100: 391 | try: 392 | resp = post_process_response(getattr(wrapper, request_type)(**kwargs)) 393 | break 394 | except wrapper.RateLimitError: 395 | sleep_time *= 1.5 396 | time.sleep(sleep_time) 397 | retries += 1 398 | 399 | if resp is None: 400 | raise RuntimeError("Failed to get response after maximum retries") 401 | return resp 402 | 403 | 404 | async def arun_cached_request( 405 | *, 406 | client: Optional[LLMClient] = None, 407 | request_type: str = "complete", 408 | api_key: Optional[str] = None, 409 | base_url: Optional[str] = None, 410 | **kwargs: Any, 411 | ) -> Dict[str, Any]: 412 | wrapper = prepare_openai(client=client, is_async=True, api_key=api_key, base_url=base_url) 413 | if wrapper.is_wrapped: 414 | set_span_purpose(kwargs) 415 | 416 | retries = 0 417 | sleep_time = 0.1 418 | resp = None 419 | while retries < 100: 420 | try: 421 | resp = post_process_response(await getattr(wrapper, request_type)(**kwargs)) 422 | break 423 | except wrapper.RateLimitError: 424 | # Just assume it's a rate limit error 425 | sleep_time *= 1.5 426 | await asyncio.sleep(sleep_time) 427 | retries += 1 428 | 429 | if resp is None: 430 | raise RuntimeError("Failed to get response after maximum retries") 431 | 432 | return resp 433 | -------------------------------------------------------------------------------- /py/autoevals/partial.py: -------------------------------------------------------------------------------- 1 | from braintrust_core.score import Scorer 2 | 3 | 4 | class ScorerWithPartial(Scorer): 5 | @classmethod 6 | def partial(cls, **partial_kwargs): 7 | class PartialScorer(cls): 8 | async def eval_async(self, output, expected=None, **kwargs): 9 | if expected is not None: 10 | kwargs["expected"] = expected 11 | return await self._run_eval_async(output, **{**partial_kwargs, **kwargs}) 12 | 13 | def eval(self, output, expected=None, **kwargs): 14 | if expected is not None: 15 | kwargs["expected"] = expected 16 | return self._run_eval_sync(output, **{**partial_kwargs, **kwargs}) 17 | 18 | @classmethod 19 | def _partial_args(cls): 20 | return {**partial_kwargs} 21 | 22 | PartialScorer.__name__ = cls.__name__ 23 | return PartialScorer 24 | -------------------------------------------------------------------------------- /py/autoevals/string.py: -------------------------------------------------------------------------------- 1 | """String evaluation scorers for comparing text similarity. 
2 | 3 | This module provides scorers for text comparison: 4 | 5 | - Levenshtein: Compare strings using edit distance 6 | - Fast, local string comparison 7 | - Suitable for exact matches and small variations 8 | - No external dependencies 9 | - Simple to use with just output/expected parameters 10 | 11 | - EmbeddingSimilarity: Compare strings using embeddings 12 | - Semantic similarity using embeddings 13 | - Requires OpenAI API access 14 | - Better for comparing meaning rather than exact matches 15 | - Supports both sync and async evaluation 16 | - Built-in caching for efficiency 17 | - Configurable with options for model, prefix, thresholds 18 | """ 19 | 20 | import threading 21 | from typing import Optional 22 | 23 | from braintrust_core.score import Score 24 | from polyleven import levenshtein as distance 25 | 26 | from autoevals.partial import ScorerWithPartial 27 | from autoevals.value import normalize_value 28 | 29 | from .oai import LLMClient, arun_cached_request, run_cached_request 30 | 31 | 32 | class Levenshtein(ScorerWithPartial): 33 | """String similarity scorer using edit distance. 34 | 35 | Example: 36 | ```python 37 | scorer = Levenshtein() 38 | result = scorer.eval( 39 | output="hello wrld", 40 | expected="hello world" 41 | ) 42 | print(result.score) # 0.9 (normalized similarity) 43 | ``` 44 | 45 | Args: 46 | output: String to evaluate 47 | expected: Reference string to compare against 48 | 49 | Returns: 50 | Score object with normalized similarity (0-1), where 1 means identical strings 51 | """ 52 | 53 | def _run_eval_sync(self, output, expected=None, **kwargs): 54 | if expected is None: 55 | raise ValueError("LevenshteinScorer requires an expected value") 56 | 57 | output, expected = str(output), str(expected) 58 | max_len = max(len(x) for x in [output, expected]) 59 | 60 | score = 1 61 | if max_len > 0: 62 | score = 1 - (distance(output, expected) / max_len) 63 | 64 | return Score(name=self._name(), score=score) 65 | 66 | 67 | LevenshteinScorer = Levenshtein # backcompat 68 | 69 | 70 | class EmbeddingSimilarity(ScorerWithPartial): 71 | """String similarity scorer using embeddings. 72 | 73 | Example: 74 | ```python 75 | import asyncio 76 | from openai import AsyncOpenAI 77 | from autoevals.string import EmbeddingSimilarity 78 | 79 | async def compare_texts(): 80 | # Initialize with async client 81 | client = AsyncOpenAI() 82 | scorer = EmbeddingSimilarity( 83 | prefix="Code explanation: ", 84 | client=client 85 | ) 86 | 87 | result = await scorer.eval_async( 88 | output="The function sorts elements using quicksort", 89 | expected="The function implements quicksort algorithm" 90 | ) 91 | 92 | print(result.score) # 0.85 (normalized similarity) 93 | print(result.metadata) # Additional comparison details 94 | 95 | # Run the async evaluation 96 | asyncio.run(compare_texts()) 97 | ``` 98 | 99 | Args: 100 | prefix: Optional text to prepend to inputs for domain context 101 | model: Embedding model to use (default: text-embedding-ada-002) 102 | expected_min: Minimum similarity threshold (default: 0.7) 103 | client: Optional AsyncOpenAI/OpenAI client. 
If not provided, uses global client from init() 104 | 105 | Returns: 106 | Score object with: 107 | - score: Normalized similarity (0-1) 108 | - metadata: Additional comparison details 109 | """ 110 | 111 | MODEL = "text-embedding-ada-002" 112 | 113 | _CACHE = {} 114 | _CACHE_LOCK = threading.Lock() 115 | 116 | def __init__( 117 | self, 118 | prefix="", 119 | model=MODEL, 120 | expected_min=0.7, 121 | api_key=None, 122 | base_url=None, 123 | client: Optional[LLMClient] = None, 124 | ): 125 | self.prefix = prefix 126 | self.expected_min = expected_min 127 | 128 | self.extra_args = {"model": model} 129 | if api_key: 130 | self.extra_args["api_key"] = api_key 131 | if base_url: 132 | self.extra_args["base_url"] = base_url 133 | 134 | self.client = client 135 | 136 | async def _a_embed(self, value): 137 | value = normalize_value(value, maybe_object=False) 138 | with self._CACHE_LOCK: 139 | if value in self._CACHE: 140 | return self._CACHE[value] 141 | 142 | result = await arun_cached_request( 143 | client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args 144 | ) 145 | 146 | with self._CACHE_LOCK: 147 | self._CACHE[value] = result 148 | 149 | return result 150 | 151 | def _embed(self, value): 152 | value = normalize_value(value, maybe_object=False) 153 | with self._CACHE_LOCK: 154 | if value in self._CACHE: 155 | return self._CACHE[value] 156 | 157 | result = run_cached_request( 158 | client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args 159 | ) 160 | 161 | with self._CACHE_LOCK: 162 | self._CACHE[value] = result 163 | 164 | return result 165 | 166 | async def _run_eval_async(self, output, expected=None, **kwargs): 167 | if expected is None: 168 | raise ValueError("EmbeddingSimilarity requires an expected value") 169 | 170 | output_embedding_p = self._a_embed(output) 171 | expected_embedding_p = self._a_embed(expected) 172 | 173 | output_result, expected_result = await output_embedding_p, await expected_embedding_p 174 | return Score( 175 | name=self._name(), 176 | score=self.scale_score( 177 | self.cosine_similarity(output_result["data"][0]["embedding"], expected_result["data"][0]["embedding"]), 178 | self.expected_min, 179 | ), 180 | ) 181 | 182 | def _run_eval_sync(self, output, expected=None, **kwargs): 183 | if expected is None: 184 | raise ValueError("EmbeddingSimilarity requires an expected value") 185 | 186 | output_result = self._embed(output) 187 | expected_result = self._embed(expected) 188 | 189 | return Score( 190 | name=self._name(), 191 | score=self.scale_score( 192 | self.cosine_similarity(output_result["data"][0]["embedding"], expected_result["data"][0]["embedding"]), 193 | self.expected_min, 194 | ), 195 | ) 196 | 197 | @staticmethod 198 | def scale_score(score, expected_min): 199 | return max((score - expected_min) / (1 - expected_min), 0) 200 | 201 | @staticmethod 202 | def cosine_similarity(list1, list2): 203 | # Calculate dot product 204 | dot_product = sum(a * b for a, b in zip(list1, list2)) 205 | 206 | # Calculate the magnitude of each list 207 | magnitude_list1 = sum(a**2 for a in list1) ** 0.5 208 | magnitude_list2 = sum(b**2 for b in list2) ** 0.5 209 | 210 | # Calculate cosine similarity 211 | if magnitude_list1 * magnitude_list2 == 0: 212 | # Avoid division by zero 213 | return 0 214 | else: 215 | # Sometimes, rounding errors cause the dot product to be slightly > 1 216 | return min(dot_product / (magnitude_list1 * magnitude_list2), 1) 217 | 218 | 219 | __all__ = ["LevenshteinScorer", 
"Levenshtein", "EmbeddingSimilarity"] 220 | -------------------------------------------------------------------------------- /py/autoevals/templates: -------------------------------------------------------------------------------- 1 | ../../templates -------------------------------------------------------------------------------- /py/autoevals/test_embeddings.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from autoevals import EmbeddingSimilarity 4 | from autoevals.value import normalize_value 5 | 6 | SYNONYMS = [ 7 | ("water", ["water", "H2O", "agua"]), 8 | ("fire", ["fire", "flame"]), 9 | ("earth", ["earth", "Planet Earth"]), 10 | ] 11 | 12 | UNRELATED = ["water", "The quick brown fox jumps over the lazy dog", "I like to eat apples"] 13 | 14 | 15 | def test_embeddings(): 16 | evaluator = EmbeddingSimilarity(prefix="resource type: ") 17 | for word, synonyms in SYNONYMS: 18 | for synonym in synonyms: 19 | result = evaluator(word, synonym) 20 | print(f"[{word}]", f"[{synonym}]", result) 21 | assert result.score > 0.66 22 | 23 | for i in range(len(UNRELATED)): 24 | for j in range(len(UNRELATED)): 25 | if i == j: 26 | continue 27 | 28 | word1 = UNRELATED[i] 29 | word2 = UNRELATED[j] 30 | result = evaluator(word1, word2) 31 | print(f"[{word1}]", f"[{word2}]", result) 32 | assert result.score < 0.5 33 | 34 | 35 | VALUES = [ 36 | ("water", "wind"), 37 | (["cold", "water"], ["cold", "wind"]), 38 | ({"water": "wet"}, {"wind": "dry"}), 39 | ] 40 | 41 | 42 | def test_embedding_values(): 43 | for run_async in [False, True]: 44 | evaluator = EmbeddingSimilarity() 45 | for (word1, word2) in VALUES: 46 | if run_async: 47 | result = asyncio.run(evaluator.eval_async(word1, word2)) 48 | else: 49 | result = evaluator(word1, word2) 50 | print(f"[{word1}]", f"[{word2}]", f"run_async={run_async}", result) 51 | -------------------------------------------------------------------------------- /py/autoevals/test_json.py: -------------------------------------------------------------------------------- 1 | from pytest import approx 2 | 3 | from autoevals.json import JSONDiff, ValidJSON 4 | from autoevals.number import NumericDiff 5 | from autoevals.value import ExactMatch 6 | 7 | 8 | def test_string_as_json(): 9 | cases = [ 10 | ("", "", 1), 11 | ("", "a", 0), 12 | ("a", "", 0), 13 | ("a", "a", 1), 14 | ("a", "b", 0), 15 | ("ab", "ac", 0.5), 16 | ("ac", "bc", 0.5), 17 | ("abc", "axc", 0.66667), 18 | ("xabxcdxxefxgx", "1ab2cd34ef5g6", 0.53846), 19 | ] 20 | 21 | evaluator = JSONDiff() 22 | for a, b, expected in cases: 23 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 24 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 25 | 26 | 27 | def test_json(): 28 | cases = [ 29 | (None, None, 1), 30 | (None, "", 0), 31 | ([], {}, 0), 32 | ([], [], 1), 33 | ({}, {}, 1), 34 | ({"a": 1}, {"a": 1}, 1), 35 | ({"a": 1}, {"a": 2}, 0.66667), 36 | ({"a": 1}, ["a", 1], 0.5714285714285714), 37 | ({"a": 1}, {"b": {"a": 1}}, 0), 38 | ({"a": 1}, {"a": None}, 0), 39 | ( 40 | {"mapping": {"a": "foo", "b": "bar"}}, 41 | {"mapping": {"a": "Foo", "b": "Bar"}, "Extra": 5}, 42 | 0.33333333333333337, 43 | ), 44 | ] 45 | 46 | evaluator = JSONDiff() 47 | for a, b, expected in cases: 48 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 49 | assert evaluator(a, b).score == approx(expected, 1e-4) 50 | 51 | 52 | def test_valid_json(): 53 | cases = [ 54 | ("1", 0, None), 55 | ('{ "a": 1, "b": "hello" }', 1, None), 56 | ('[{ "a": 1 }]', 1, None), 57 | ('[{ "a": 1 }', 0, 
None), 58 | ('{ "mapping": { "a": "foo", "b": "bar" }, "extra": 4 }', 1, None), 59 | ('{ mapping: { "a": "foo", "b": "bar" }, "extra": 4 }', 0, None), 60 | ( 61 | '{ "a": "1" }', 62 | 1, 63 | { 64 | "type": "object", 65 | "properties": {"a": {"type": "string"}}, 66 | "required": ["a"], 67 | }, 68 | ), 69 | ( 70 | '{"a": "1", "b": "1"}', 71 | 0, 72 | { 73 | "type": "object", 74 | "properties": { 75 | "a": {"type": "string"}, 76 | "b": {"type": "number"}, 77 | }, 78 | "required": ["a"], 79 | }, 80 | ), 81 | ( 82 | '[{"a": "1"}, {"a": "1", "b": 22}]', 83 | 1, 84 | { 85 | "type": "array", 86 | "items": { 87 | "type": "object", 88 | "properties": { 89 | "a": {"type": "string"}, 90 | "b": {"type": "number"}, 91 | }, 92 | "required": ["a"], 93 | }, 94 | "uniqueItems": True, 95 | }, 96 | ), 97 | ( 98 | {"a": "1", "b": "1"}, 99 | 1, 100 | None, 101 | ), 102 | ( 103 | [{"a": "1"}, {"a": "1", "b": 22}], 104 | 1, 105 | None, 106 | ), 107 | ( 108 | 100, 109 | 0, 110 | None, 111 | ), 112 | ( 113 | # This is technically ambiguous, because it _could_ be the valid parsed JSON value 114 | # or an unparsed, invalid JSON value. However, since structured outputs _only_ return 115 | # JSON values, we can safely assume that any strings are unparsed values. 116 | "100", 117 | 0, 118 | None, 119 | ), 120 | ] 121 | 122 | evaluator = ValidJSON() 123 | for output, expected, schema in cases: 124 | print(f"[{output}]", expected) 125 | assert evaluator(output, schema).score == expected 126 | 127 | 128 | def test_semantic_json(): 129 | cases = [ 130 | ('{"x": 1, "y": 2}', '{"y": 2, "x": 1}', 1), 131 | ( 132 | '{"zs": ["a", "b"], "x": 1, "y": 2}', 133 | '{"y": 2, "zs": ["a", "b"], "x": 1}', 134 | 1, 135 | ), 136 | ( 137 | '{"o1": {"x": 1, "y": 2}}', 138 | '{"o1": {"y": 2, "x": 1}}', 139 | 1, 140 | ), 141 | ( 142 | '{"xs": [{"o1": {"x": 1, "y": [2]}}]}', 143 | '{"xs": [{"o1": {"y": [2], "x": 1}}]}', 144 | 1, 145 | ), 146 | ( 147 | '{"o1": {"x": 2, "y": 2}}', 148 | '{"o1": {"y": 2, "x": 1}}', 149 | 0.83333, 150 | ), 151 | ( 152 | {"o1": {"x": 2, "y": 2}}, 153 | '{"o1": {"y": 2, "x": 1}}', 154 | 0.83333, 155 | ), 156 | ('{"x": 1, "y": 2}', '{"x": 1, "z": 2}', 0.3333), 157 | ("[1, 2]", "[1, 2]", 1), 158 | ("[1, 2]", "[2, 1]", 0.66667), 159 | ] 160 | 161 | evaluator = JSONDiff() 162 | for a, b, expected in cases: 163 | for exact_number in [True, False]: 164 | score = evaluator(a, b, number_scorer=ExactMatch() if exact_number else NumericDiff()).score 165 | if not exact_number: 166 | assert abs(score - expected) < 0.0001 167 | else: 168 | assert round(score * 100) <= round(expected * 100) 169 | -------------------------------------------------------------------------------- /py/autoevals/test_llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import cast 3 | 4 | import pytest 5 | import respx 6 | from openai import OpenAI 7 | from pydantic import BaseModel 8 | 9 | from autoevals import init 10 | from autoevals.llm import Battle, Factuality, LLMClassifier, OpenAILLMClassifier, build_classification_tools 11 | from autoevals.oai import OpenAIV1Module 12 | 13 | 14 | class TestModel(BaseModel): 15 | foo: str 16 | num: int 17 | 18 | 19 | def test_render_messages(): 20 | classifier = OpenAILLMClassifier( 21 | "test", 22 | messages=[ 23 | {"role": "user", "content": "{{value}} and {{{value}}}"}, 24 | {"role": "user", "content": "Dict double braces: {{data}}"}, 25 | {"role": "user", "content": "Dict triple braces: {{{data}}}"}, 26 | {"role": "user", "content": 
"Model double braces: {{model}}"}, 27 | {"role": "user", "content": "Model triple braces: {{{model}}}"}, 28 | {"role": "user", "content": ""}, # test empty content 29 | ], 30 | model="gpt-4", 31 | choice_scores={"A": 1}, 32 | classification_tools=[], 33 | ) 34 | 35 | test_dict = {"foo": "bar", "num": 42} 36 | test_model = TestModel(foo="bar", num=42) 37 | 38 | rendered = classifier._render_messages(value="bold", data=test_dict, model=test_model) 39 | 40 | # Test that HTML is never escaped, regardless of syntax. 41 | assert rendered[0]["content"] == "bold and bold" 42 | 43 | # Test dict rendering - both use str(). 44 | assert rendered[1]["content"] == "Dict double braces: {'foo': 'bar', 'num': 42}" 45 | assert rendered[2]["content"] == "Dict triple braces: {'foo': 'bar', 'num': 42}" 46 | 47 | # Test model rendering - both use str(). 48 | assert rendered[3]["content"] == "Model double braces: foo='bar' num=42" 49 | assert rendered[4]["content"] == "Model triple braces: foo='bar' num=42" 50 | 51 | # Test empty content. 52 | assert rendered[5]["content"] == "" 53 | 54 | 55 | def test_openai(): 56 | e = OpenAILLMClassifier( 57 | "title", 58 | messages=[ 59 | { 60 | "role": "system", 61 | "content": """\ 62 | You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 63 | You will look at the issue description, and pick which of two titles better describes it.""", 64 | }, 65 | { 66 | "role": "user", 67 | "content": """\ 68 | I'm going to provide you with the issue description, and two possible titles. 69 | 70 | Issue Description: {{page_content}} 71 | 72 | 1: {{output}} 73 | 2: {{expected}} 74 | 75 | Please discuss each title briefly (one line for pros, one for cons), and then answer the question by calling 76 | the select_choice function with "1" or "2".""", 77 | }, 78 | ], 79 | model="gpt-3.5-turbo", 80 | choice_scores={"1": 1, "2": 0}, 81 | classification_tools=build_classification_tools(useCoT=True, choice_strings=["1", "2"]), 82 | max_tokens=500, 83 | ) 84 | 85 | page_content = """ 86 | As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 87 | 88 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 89 | 90 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification""" 91 | 92 | gen_title = "Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX" 93 | original_title = "This title has nothing to do with the content" 94 | 95 | response = e(gen_title, original_title, page_content=page_content) 96 | print(response.as_json(indent=2)) 97 | assert response.score == 1 98 | assert response.error is None 99 | 100 | 101 | def test_llm_classifier(): 102 | for use_cot in [True, False]: 103 | e = LLMClassifier( 104 | "title", 105 | """ 106 | You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 107 | You will look at the issue description, and pick which of two titles better describes it. 108 | 109 | I'm going to provide you with the issue description, and two possible titles. 
110 | 111 | Issue Description: {{page_content}} 112 | 113 | 1: {{output}} 114 | 2: {{expected}}""", 115 | {"1": 1, "2": 0}, 116 | use_cot=use_cot, 117 | ) 118 | 119 | page_content = """ 120 | As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 121 | 122 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 123 | 124 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification""" 125 | 126 | gen_title = "Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX" 127 | original_title = "This title has nothing to do with the content" 128 | 129 | response = e(gen_title, original_title, page_content=page_content) 130 | print(response.as_json(indent=2)) 131 | assert response.score == 1 132 | assert response.error is None 133 | 134 | response = e(original_title, gen_title, page_content=page_content) 135 | print(response.as_json(indent=2)) 136 | assert response.score == 0 137 | assert response.error is None 138 | 139 | 140 | def test_nested_async(): 141 | async def nested_async(): 142 | e = Battle() 143 | e(instructions="Add the following numbers: 1, 2, 3", output="600", expected="6") 144 | 145 | asyncio.run(nested_async()) 146 | 147 | 148 | @respx.mock 149 | def test_factuality(): 150 | # something is wrong with respx that it couldn't match the url from openai 151 | respx.route().respond( 152 | json={ 153 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 154 | "choices": [ 155 | { 156 | "finish_reason": "stop", 157 | "index": 0, 158 | "logprobs": None, 159 | "message": { 160 | "content": None, 161 | "refusal": None, 162 | "role": "assistant", 163 | "tool_calls": [ 164 | { 165 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 166 | "function": { 167 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 168 | "name": "select_choice", 169 | }, 170 | "type": "function", 171 | } 172 | ], 173 | }, 174 | } 175 | ], 176 | "created": 1734029028, 177 | "model": "gpt-4o-2024-08-06", 178 | "object": "chat.completion", 179 | "system_fingerprint": "fp_cc5cf1c6e3", 180 | "usage": { 181 | "completion_tokens": 149, 182 | "prompt_tokens": 404, 183 | "total_tokens": 553, 184 | "completion_tokens_details": { 185 | "accepted_prediction_tokens": 0, 186 | "audio_tokens": 0, 187 | "reasoning_tokens": 0, 188 | "rejected_prediction_tokens": 0, 189 | }, 190 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 191 | }, 192 | } 193 | ) 194 | 195 | llm = Factuality(base_url="https://api.openai.com/v1/") 196 | result = llm.eval( 197 | output="6", 198 | expected="6", 199 | input="Add the following numbers: 1, 2, 3", 200 | ) 201 | 202 | assert result.score == 1 203 | 204 | 205 | @respx.mock 206 | def test_factuality_client(): 207 | respx.route().respond( 208 | json={ 209 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 210 | "choices": [ 211 | { 212 | "finish_reason": "stop", 213 | "index": 0, 214 | "logprobs": None, 215 | "message": { 216 | "content": None, 217 | "refusal": None, 218 | "role": "assistant", 219 | "tool_calls": [ 220 | { 221 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 222 | "function": { 223 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 224 | "name": "select_choice", 225 | }, 226 | "type": "function", 227 | } 228 | ], 229 | }, 230 | } 231 | ], 232 | "created": 1734029028, 233 | "model": "gpt-4o-2024-08-06", 234 | "object": "chat.completion", 235 | "system_fingerprint": "fp_cc5cf1c6e3", 236 | "usage": { 237 | "completion_tokens": 149, 238 | "prompt_tokens": 404, 239 | "total_tokens": 553, 240 | "completion_tokens_details": { 241 | "accepted_prediction_tokens": 0, 242 | "audio_tokens": 0, 243 | "reasoning_tokens": 0, 244 | "rejected_prediction_tokens": 0, 245 | }, 246 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 247 | }, 248 | } 249 | ) 250 | 251 | llm = Factuality(client=OpenAI(api_key="test")) 252 | result = llm.eval( 253 | output="6", 254 | expected="6", 255 | input="Add the following numbers: 1, 2, 3", 256 | ) 257 | 258 | assert result.score == 1 259 | 260 | 261 | @pytest.fixture(autouse=True) 262 | def reset_client(): 263 | yield 264 | init(client=None) 265 | 266 | 267 | # make sure we deny any leaked calls to OpenAI 268 | @respx.mock 269 | def test_init_client(): 270 | client = cast(OpenAIV1Module.OpenAI, OpenAI(api_key="test")) 271 | 272 | respx.route().respond( 273 | json={ 274 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 275 | "choices": [ 276 | { 277 | "finish_reason": "stop", 278 | "index": 0, 279 | "logprobs": None, 280 | "message": { 281 | "content": None, 282 | "refusal": None, 283 | "role": "assistant", 284 | "tool_calls": [ 285 | { 286 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 287 | "function": { 288 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 289 | "name": "select_choice", 290 | }, 291 | "type": "function", 292 | } 293 | ], 294 | }, 295 | } 296 | ], 297 | "created": 1734029028, 298 | "model": "gpt-4o-2024-08-06", 299 | "object": "chat.completion", 300 | "system_fingerprint": "fp_cc5cf1c6e3", 301 | "usage": { 302 | "completion_tokens": 149, 303 | "prompt_tokens": 404, 304 | "total_tokens": 553, 305 | "completion_tokens_details": { 306 | "accepted_prediction_tokens": 0, 307 | "audio_tokens": 0, 308 | "reasoning_tokens": 0, 309 | "rejected_prediction_tokens": 0, 310 | }, 311 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 312 | }, 313 | } 314 | ) 315 | 316 | init(client=client) 317 | 318 | llm = Factuality(base_url="https://api.openai.com/v1/") 319 | result = llm.eval( 320 | output="6", 321 | expected="6", 322 | input="Add the following numbers: 1, 2, 3", 323 | ) 324 | 325 | assert result.score == 1 326 | 327 | 328 | def test_battle(): 329 | for use_cot in [True, False]: 330 | print("use_cot", use_cot) 331 | e = Battle(use_cot=use_cot) 332 | response = e( 333 | instructions="Add the following numbers: 1, 2, 3", 334 | output="600", 335 | expected="6", 336 | ) 337 | 338 | print(response.as_json(indent=2)) 339 | assert response.score == 0 340 | assert response.error is None 341 | 342 | response = e( 343 | instructions="Add the following numbers: 1, 2, 3", 344 | output="6", 345 | expected="600", 346 | ) 347 | 348 | print(response.as_json(indent=2)) 349 | assert response.score == 1 350 | assert response.error is None 351 | 352 | response = e(instructions="Add the following numbers: 1, 2, 3", output="6", expected="6") 353 | 354 | print(response.as_json(indent=2)) 355 | assert response.score == 0 356 | assert response.error is None 357 | -------------------------------------------------------------------------------- /py/autoevals/test_moderation.py: -------------------------------------------------------------------------------- 1 | from autoevals.moderation import Moderation 2 | 3 | CASES = [ 4 | { 5 | "name": "Safe content without threshold", 6 | "output": "hello", 7 | "threshold": None, 8 | "expected": 1, 9 | }, 10 | { 11 | "name": "Unsafe content without threshold", 12 | "output": "kill kill kill kill kill", 13 | "threshold": None, 14 | "expected": 0, 15 | }, 16 | { 17 | "name": "Unsafe content with lower threshold", 18 | "output": "here are my suicidal thoughts", 19 | "threshold": 0.25, 20 | "expected": 0, 21 | }, 22 | { 23 | "name": "Unsafe content with higher threshold", 24 | "output": "here are my suicidal thoughts", 25 | "threshold": 0.99, 26 | "expected": 1, 27 | }, 28 | ] 29 | 30 | 31 | def test_moderation(): 32 | for test_case in CASES: 33 | expected = test_case["expected"] 34 | 35 | evaluator = Moderation(threshold=test_case["threshold"]) 36 | actual = evaluator.eval(test_case["output"]) 37 | 38 | assert actual.score == expected, f"Expects {test_case['name']} to be {expected} but got {actual.score}" 39 | -------------------------------------------------------------------------------- /py/autoevals/test_oai.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Any, Union, cast 3 | 4 | import openai 5 | import pytest 6 | from braintrust.oai import ( 7 | ChatCompletionV0Wrapper, 8 | CompletionsV1Wrapper, 9 | NamedWrapper, 10 | OpenAIV0Wrapper, 11 | OpenAIV1Wrapper, 12 | wrap_openai, 
13 | ) 14 | from openai.resources.chat.completions import AsyncCompletions 15 | 16 | from autoevals import init # type: ignore[import] 17 | from autoevals.oai import ( # type: ignore[import] 18 | LLMClient, 19 | OpenAIV0Module, 20 | OpenAIV1Module, 21 | _named_wrapper, # type: ignore[import] # Accessing private members for testing 22 | _wrap_openai, # type: ignore[import] # Accessing private members for testing 23 | get_openai_wrappers, 24 | prepare_openai, 25 | ) 26 | 27 | 28 | def unwrap_named_wrapper(obj: Union[NamedWrapper, OpenAIV1Module.OpenAI, OpenAIV0Module]) -> Any: 29 | return getattr(obj, "_NamedWrapper__wrapped") 30 | 31 | 32 | @pytest.fixture(autouse=True) 33 | def reset_env_and_client(monkeypatch: pytest.MonkeyPatch): 34 | """Reset environment variables and client before each test.""" 35 | monkeypatch.delenv("OPENAI_API_KEY", raising=False) 36 | monkeypatch.setenv("OPENAI_API_KEY", "test-key") 37 | monkeypatch.setenv("OPENAI_BASE_URL", "http://test-url") 38 | monkeypatch.setattr("autoevals.oai._named_wrapper", None) 39 | monkeypatch.setattr("autoevals.oai._wrap_openai", None) 40 | monkeypatch.setattr("autoevals.oai._openai_module", None) 41 | 42 | init(None) 43 | 44 | yield 45 | 46 | 47 | def test_prepare_openai_uses_unwrapped_global_client(): 48 | openai_obj = openai.OpenAI(api_key="api-key", base_url="http://test") 49 | client = LLMClient( 50 | openai=openai_obj, 51 | complete=openai_obj.chat.completions.create, 52 | embed=openai_obj.embeddings.create, 53 | moderation=openai_obj.moderations.create, 54 | RateLimitError=openai.RateLimitError, 55 | ) 56 | 57 | init(client) 58 | 59 | prepared_client = prepare_openai() 60 | 61 | assert prepared_client == client 62 | assert not prepared_client.is_wrapped 63 | assert prepared_client.openai == openai_obj 64 | assert prepared_client.complete is client.complete 65 | assert prepared_client.openai.api_key == "api-key" 66 | 67 | 68 | def test_init_creates_llmclient_if_needed(): 69 | openai_obj = openai.OpenAI() 70 | init(openai_obj) 71 | 72 | prepared_client = prepare_openai() 73 | 74 | assert isinstance(prepared_client, LLMClient) 75 | assert prepared_client.is_wrapped 76 | assert unwrap_named_wrapper(prepared_client.openai) == openai_obj 77 | 78 | 79 | def test_init_creates_async_llmclient_if_needed(mock_openai_v0: OpenAIV0Module): 80 | init(mock_openai_v0, is_async=True) 81 | 82 | prepared_client = prepare_openai() 83 | 84 | assert isinstance(prepared_client, LLMClient) 85 | assert prepared_client.is_wrapped 86 | assert isinstance(prepared_client.openai, OpenAIV0Wrapper) 87 | assert prepared_client.complete.__name__ == "acreate" 88 | 89 | 90 | def test_prepare_openai_defaults(): 91 | prepared_client = prepare_openai() 92 | 93 | assert isinstance(prepared_client, LLMClient) 94 | assert prepared_client.is_wrapped 95 | openai_obj = unwrap_named_wrapper(prepared_client.openai) 96 | assert isinstance(openai_obj, openai.OpenAI) 97 | assert isinstance(getattr(prepared_client.complete, "__self__", None), CompletionsV1Wrapper) 98 | assert openai_obj.api_key == "test-key" 99 | assert openai_obj.base_url == "http://test-url" 100 | 101 | 102 | def test_prepare_openai_with_plain_openai(): 103 | client = openai.OpenAI(api_key="api-key", base_url="http://test") 104 | prepared_client = prepare_openai(client=client) 105 | 106 | assert prepared_client.is_wrapped 107 | assert isinstance(prepared_client.openai, OpenAIV1Wrapper) 108 | 109 | 110 | def test_prepare_openai_async(): 111 | prepared_client = prepare_openai(is_async=True) 112 | 113 | assert 
isinstance(prepared_client, LLMClient) 114 | assert prepared_client.is_wrapped 115 | assert isinstance(prepared_client.openai, OpenAIV1Wrapper) 116 | 117 | openai_obj = getattr(prepared_client.complete, "__self__", None) 118 | assert isinstance(openai_obj, NamedWrapper) 119 | assert isinstance(unwrap_named_wrapper(openai_obj), AsyncCompletions) 120 | 121 | 122 | def test_prepare_openai_wraps_once(): 123 | openai_obj = cast(OpenAIV1Module.OpenAI, wrap_openai(openai.OpenAI(api_key="api-key", base_url="http://test"))) 124 | 125 | client = LLMClient(openai_obj) 126 | 127 | init(client) 128 | 129 | prepared_client = prepare_openai() 130 | 131 | assert prepared_client is client 132 | assert prepared_client.is_wrapped 133 | assert prepared_client.openai is openai_obj 134 | 135 | 136 | def test_prepare_openai_handles_missing_braintrust(monkeypatch: pytest.MonkeyPatch): 137 | monkeypatch.setitem(sys.modules, "braintrust.oai", None) 138 | 139 | prepared_client = prepare_openai() 140 | 141 | assert isinstance(prepared_client, LLMClient) 142 | assert not prepared_client.is_wrapped 143 | assert isinstance(prepared_client.openai, openai.OpenAI) 144 | 145 | 146 | def test_get_openai_wrappers_caches_imports(): 147 | original_wrapper = _named_wrapper 148 | original_wrap_fn = _wrap_openai 149 | 150 | # First call should set the cache 151 | wrapper1, wrap_fn1 = get_openai_wrappers() 152 | 153 | # Second call should use cache 154 | wrapper2, wrap_fn2 = get_openai_wrappers() 155 | 156 | # Verify we got same objects back 157 | assert wrapper2 is wrapper1 158 | assert wrap_fn2 is wrap_fn1 159 | 160 | # Verify they're different from the original None values 161 | assert wrapper2 is not original_wrapper 162 | assert wrap_fn2 is not original_wrap_fn 163 | 164 | 165 | def test_prepare_openai_raises_on_missing_openai(monkeypatch: pytest.MonkeyPatch): 166 | monkeypatch.setitem(sys.modules, "openai", None) 167 | 168 | with pytest.raises(ImportError): 169 | prepare_openai() 170 | 171 | 172 | @pytest.fixture 173 | def mock_openai_v0(monkeypatch: pytest.MonkeyPatch): 174 | """Mock the OpenAI v0 SDK for testing.""" 175 | 176 | class MockOpenAIV0: 177 | __module__ = "openai" 178 | api_key = None 179 | api_base = None 180 | 181 | class ChatCompletion: 182 | __module__ = "openai" 183 | 184 | @staticmethod 185 | def create(*args: Any, **kwargs: Any): 186 | pass 187 | 188 | @staticmethod 189 | def acreate(*args: Any, **kwargs: Any): 190 | pass 191 | 192 | class Embedding: 193 | __module__ = "openai" 194 | 195 | @staticmethod 196 | def create(*args: Any, **kwargs: Any): 197 | pass 198 | 199 | @staticmethod 200 | def acreate(*args: Any, **kwargs: Any): 201 | pass 202 | 203 | class Moderation: 204 | __module__ = "openai" 205 | 206 | @staticmethod 207 | def create(*args: Any, **kwargs: Any): 208 | pass 209 | 210 | @staticmethod 211 | def acreate(*args: Any, **kwargs: Any): 212 | pass 213 | 214 | class error: 215 | __module__ = "openai" 216 | 217 | class RateLimitError(Exception): 218 | __module__ = "openai" 219 | pass 220 | 221 | mock_openai = MockOpenAIV0() 222 | monkeypatch.setitem(sys.modules, "openai", mock_openai) 223 | return cast(OpenAIV0Module, mock_openai) 224 | 225 | 226 | def test_prepare_openai_v0_sdk(mock_openai_v0: OpenAIV0Module): 227 | prepared_client = prepare_openai() 228 | 229 | assert prepared_client.is_wrapped 230 | assert prepared_client.openai.api_key == "test-key" 231 | 232 | assert isinstance(getattr(prepared_client.complete, "__self__", None), ChatCompletionV0Wrapper) 233 | 234 | 235 | def 
test_prepare_openai_v0_async(mock_openai_v0: OpenAIV0Module): 236 | prepared_client = prepare_openai(is_async=True) 237 | 238 | assert prepared_client.is_wrapped 239 | assert prepared_client.openai.api_key == "test-key" 240 | 241 | assert prepared_client.complete.__name__ == "acreate" 242 | 243 | 244 | def test_prepare_openai_v0_with_client(mock_openai_v0: OpenAIV0Module): 245 | client = LLMClient(openai=mock_openai_v0, is_async=True) 246 | 247 | prepared_client = prepare_openai(client=client) 248 | 249 | assert prepared_client.is_wrapped 250 | assert prepared_client.openai.api_key is mock_openai_v0.api_key # must be set by the user 251 | assert prepared_client.complete.__name__ == "acreate" 252 | -------------------------------------------------------------------------------- /py/autoevals/test_partial.py: -------------------------------------------------------------------------------- 1 | from autoevals.llm import ClosedQA 2 | from autoevals.string import Levenshtein 3 | 4 | 5 | def test_partial(): 6 | levenshtein_basic = Levenshtein()(output="abc", expected="abcd") 7 | levenshtein_partial = Levenshtein.partial(expected="abcd")()(output="abc") 8 | assert levenshtein_partial.score == levenshtein_basic.score 9 | assert levenshtein_partial.name == levenshtein_basic.name 10 | assert levenshtein_partial.name == "Levenshtein" 11 | 12 | closedqa_basic = ClosedQA()(criteria="Is the answer correct?", input="What is 1+1?", output="2") 13 | closedqa_partial = ClosedQA.partial(criteria="Is the answer correct?")()(input="What is 1+1?", output="2") 14 | assert closedqa_partial.score == closedqa_basic.score 15 | assert closedqa_partial.name == closedqa_basic.name 16 | assert closedqa_partial.name == "ClosedQA" 17 | -------------------------------------------------------------------------------- /py/autoevals/test_ragas.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import cast 3 | 4 | import pytest 5 | from pytest import approx 6 | 7 | from autoevals.ragas import * 8 | 9 | data = { 10 | "input": "Can starred docs from different workspaces be accessed in one place?", 11 | "output": "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 12 | "expected": "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 13 | "context": [ 14 | "Not all Coda docs are used in the same way. You'll inevitably have a few that you use every week, and some that you'll only use once. This is where starred docs can help you stay organized.\n\n\n\nStarring docs is a great way to mark docs of personal importance. After you star a doc, it will live in a section on your doc list called **[My Shortcuts](https://coda.io/shortcuts)**. All starred docs, even from multiple different workspaces, will live in this section.\n\n\n\nStarring docs only saves them to your personal My Shortcuts. It doesn\u2019t affect the view for others in your workspace. If you\u2019re wanting to shortcut docs not just for yourself but also for others in your team or workspace, you\u2019ll [use pinning](https://help.coda.io/en/articles/2865511-starred-pinned-docs) instead." 
15 | ], 16 | } 17 | 18 | 19 | @pytest.mark.parametrize( 20 | ["metric", "expected_score", "can_fail"], 21 | [ 22 | (ContextEntityRecall(), 0.5, False), 23 | (ContextRelevancy(), 0.7, True), 24 | (ContextRecall(), 1, False), 25 | (ContextPrecision(), 1, False), 26 | ], 27 | ) 28 | @pytest.mark.parametrize("is_async", [False, True]) 29 | def test_ragas_retrieval(metric: OpenAILLMScorer, expected_score: float, is_async: bool, can_fail: bool): 30 | if is_async: 31 | score = asyncio.run(metric.eval_async(**data)).score 32 | else: 33 | score = metric.eval(**data).score 34 | 35 | if score is None: 36 | raise ValueError("Score is None") 37 | 38 | try: 39 | if expected_score == 1: 40 | assert score == expected_score 41 | else: 42 | assert score >= expected_score 43 | except AssertionError as e: 44 | # TODO: just to unblock the CI 45 | if can_fail: 46 | pytest.xfail(f"Expected score {expected_score} but got {score}") 47 | else: 48 | raise e 49 | -------------------------------------------------------------------------------- /py/autoevals/test_values.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest import approx 3 | 4 | from autoevals.list import ListContains 5 | from autoevals.number import NumericDiff 6 | from autoevals.string import LevenshteinScorer 7 | from autoevals.value import ExactMatch 8 | 9 | 10 | def test_levenshtein(): 11 | cases = [ 12 | ("", "", 1), 13 | ("", "a", 0), 14 | ("a", "", 0), 15 | ("a", "a", 1), 16 | ("a", "b", 0), 17 | ("ab", "ac", 0.5), 18 | ("ac", "bc", 0.5), 19 | ("abc", "axc", 0.66667), 20 | ("xabxcdxxefxgx", "1ab2cd34ef5g6", 0.53846), 21 | ] 22 | 23 | evaluator = LevenshteinScorer() 24 | for a, b, expected in cases: 25 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 26 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 27 | 28 | 29 | def test_numeric(): 30 | cases = [(0, 0, 1), (0, 1, 0), (1, 2, 0.66667), (1.0, 2.0, 0.66667), (-1, 2, 0)] 31 | 32 | evaluator = NumericDiff() 33 | for a, b, expected in cases: 34 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 35 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 36 | 37 | 38 | def test_list_contains(): 39 | cases = [ 40 | [[], [], 1], 41 | [[0], [], 0], 42 | [[], [0], 0], 43 | [["a"], ["a"], 1], 44 | [["a"], ["a", "b"], 0.5], 45 | [["a", "b"], ["a"], 0.5], 46 | [ 47 | [ 48 | "workspaces", 49 | "section", 50 | "view", 51 | "others", 52 | "workspace", 53 | "team", 54 | "pinning", 55 | ], 56 | ["starred", "multiple different workspaces", "shortcuts"], 57 | 0.1218, 58 | ], 59 | [ 60 | ["starred", "multiple different workspaces", "shortcuts"], 61 | [ 62 | "workspaces", 63 | "section", 64 | "view", 65 | "others", 66 | "workspace", 67 | "team", 68 | "pinning", 69 | ], 70 | 0.1218, 71 | ], 72 | ] 73 | 74 | for output, expected, expected_score in cases: 75 | assert ListContains(pairwise_evaluator=LevenshteinScorer())(output, expected).score == approx( 76 | expected_score, abs=1e-4 77 | ), (output, expected, expected_score) 78 | 79 | assert ( 80 | ListContains(pairwise_evaluator=LevenshteinScorer(), allow_extra_entities=True)(["a", "b"], ["a"]).score == 1 81 | ) 82 | 83 | 84 | def test_exact_match(): 85 | cases = [ 86 | ["hello", "hello", 1], 87 | ["hello", "world", 0], 88 | [123, 123, 1], 89 | [123, "123", 1], 90 | [{"a": 1, "b": 2}, {"a": 1, "b": 2}, 1], 91 | [{"a": 1, "b": 2}, {"a": 1, "b": 3}, 0], 92 | [[1, 2, 3], [1, 2, 3], 1], 93 | [[1, 2, 3], [3, 2, 1], 0], 94 | [{"a": 1, "b": 2}, {"b": 2, "a": 1}, 0], # Order matters 95 
| [{"a": 1, "b": 2}, '{"a": 1, "b": 2}', 1], # String representation matches dict 96 | [{"a": 1, "b": 2}, '{"a":1, "b":2}', 1], # String representation matches dict 97 | [{"a": 1, "b": 2}, '{"b": 2, "a": 1}', 0], 98 | [{"a": 1, "b": 2}, {"b": 2, "a": 1, "c": 3}, 0], # Extra key, not equal 99 | [None, None, 1], 100 | [None, "None", 1], 101 | ] 102 | 103 | for output, expected, expected_score in cases: 104 | assert ExactMatch()(output, expected).score == approx(expected_score, abs=1e-4), ( 105 | output, 106 | expected, 107 | expected_score, 108 | ) 109 | -------------------------------------------------------------------------------- /py/autoevals/value.py: -------------------------------------------------------------------------------- 1 | """Value comparison utilities for exact matching and normalization. 2 | 3 | This module provides tools for exact value comparison with smart handling of different data types: 4 | 5 | - ExactMatch: A scorer for exact value comparison 6 | - Handles primitive types (strings, numbers, etc.) 7 | - Smart `JSON` serialization for objects and arrays 8 | - Normalizes `JSON` strings for consistent comparison 9 | 10 | Example: 11 | ```python 12 | from autoevals import ExactMatch 13 | 14 | # Simple value comparison 15 | scorer = ExactMatch() 16 | result = scorer.eval( 17 | output="hello", 18 | expected="hello" 19 | ) 20 | print(result.score) # 1.0 for exact match 21 | 22 | # Object comparison (automatically normalized) 23 | result = scorer.eval( 24 | output={"name": "John", "age": 30}, 25 | expected='{"age": 30, "name": "John"}' # Different order but same content 26 | ) 27 | print(result.score) # 1.0 for equivalent JSON 28 | 29 | # Array comparison 30 | result = scorer.eval( 31 | output=[1, 2, 3], 32 | expected="[1, 2, 3]" # String or native types work 33 | ) 34 | print(result.score) # 1.0 for equivalent arrays 35 | ``` 36 | """ 37 | 38 | import json 39 | from typing import Any 40 | 41 | from braintrust_core.score import Score 42 | 43 | from autoevals.partial import ScorerWithPartial 44 | 45 | 46 | class ExactMatch(ScorerWithPartial): 47 | """A scorer that tests for exact equality between values. 48 | 49 | This scorer handles various input types: 50 | - Primitive values (strings, numbers, etc.) 51 | - JSON objects (dicts) and arrays (lists) 52 | - JSON strings that can be parsed into objects/arrays 53 | 54 | The comparison process: 55 | 1. Detects if either value is/might be a JSON object/array 56 | 2. Normalizes both values (serialization if needed) 57 | 3. 
Performs exact string comparison 58 | 59 | Args: 60 | output: Value to evaluate 61 | expected: Reference value to compare against 62 | 63 | Returns: 64 | Score object with: 65 | - score: 1.0 for exact match, 0.0 otherwise 66 | """ 67 | 68 | def _run_eval_sync(self, output, expected=None, **kwargs): 69 | maybe_object = needs_json(output) or needs_json(expected) 70 | output, expected = normalize_value(output, maybe_object), normalize_value(expected, maybe_object) 71 | score = 1 if output == expected else 0 72 | 73 | return Score(name=self._name(), score=score) 74 | 75 | 76 | def needs_json(value: Any) -> bool: 77 | return isinstance(value, (dict, list)) 78 | 79 | 80 | def normalize_value(value: Any, maybe_object: bool) -> str: 81 | if needs_json(value): 82 | return json.dumps(value) 83 | 84 | try: 85 | if maybe_object: 86 | return json.dumps(json.loads(value)) 87 | except json.JSONDecodeError: 88 | pass 89 | 90 | return str(value) 91 | -------------------------------------------------------------------------------- /py/autoevals/version.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.0.129" 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | 4 | [tool.ruff] 5 | line-length = 119 6 | select = ["I001"] 7 | -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "typeCheckingMode": "strict", 3 | "reportMissingTypeStubs": false 4 | } 5 | -------------------------------------------------------------------------------- /scripts/prepare_readme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import sys 6 | 7 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | README_FILE = os.path.join(SCRIPT_DIR, "..", "README.md") 9 | 10 | if __name__ == "__main__": 11 | mode = sys.argv[1] 12 | assert mode in ["py", "js"], mode 13 | 14 | with open(README_FILE, "r") as f: 15 | readme = f.read() 16 | 17 | remove_section = "Python" if mode == "js" else "Node.js" 18 | 19 | # Remove the whole section 20 | readme = re.sub( 21 | r"\#+\s*" + remove_section + r"\s*\n.*?((^\#\#+)|\Z)", 22 | r"\1", 23 | readme, 24 | flags=re.MULTILINE | re.DOTALL, 25 | ) 26 | 27 | # Remove the "Python" or "Node.js" header 28 | remove_header = "Python" if mode == "py" else "Node.js" 29 | readme = re.sub(r"\#+\s*" + remove_header + r"\s*\n", "", readme) 30 | 31 | readme = readme.strip() 32 | 33 | with open(README_FILE, "w") as f: 34 | f.write(readme) 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | dir_name = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | version_contents = {} 8 | with open(os.path.join(dir_name, "py", "autoevals", "version.py"), encoding="utf-8") as f: 9 | exec(f.read(), version_contents) 10 | 11 | with open(os.path.join(dir_name, "README.md"), "r", encoding="utf-8") as f: 12 | long_description = f.read() 13 | 14 | install_requires = ["chevron", "polyleven", "pyyaml", "braintrust_core", "jsonschema"] 15 | 16 | extras_require = { 17 | "dev": [ 18 | "black==22.6.0", 19 | "braintrust", 
# used for testing 20 | "build", 21 | "flake8", 22 | "flake8-isort", 23 | "IPython", 24 | "isort==5.12.0", 25 | "openai", # used for testing 26 | "pre-commit", 27 | "pytest", 28 | "respx", 29 | "twine", 30 | ], 31 | "doc": ["pydoc-markdown"], 32 | "scipy": ["numpy", "scipy"], 33 | } 34 | 35 | extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages}) 36 | 37 | setuptools.setup( 38 | name="autoevals", 39 | version=version_contents["VERSION"], 40 | author="BrainTrust", 41 | author_email="info@braintrustdata.com", 42 | description="Universal library for evaluating AI models", 43 | long_description=long_description, 44 | long_description_content_type="text/markdown", 45 | url="https://www.braintrustdata.com", 46 | project_urls={ 47 | "Bug Tracker": "https://github.com/braintrustdata/autoevals", 48 | }, 49 | classifiers=[ 50 | "Programming Language :: Python :: 3", 51 | "Operating System :: OS Independent", 52 | ], 53 | package_dir={"": "py"}, 54 | include_package_data=True, 55 | packages=setuptools.find_packages(where="py"), 56 | python_requires=">=3.8.0", 57 | entry_points={}, 58 | install_requires=install_requires, 59 | extras_require=extras_require, 60 | ) 61 | -------------------------------------------------------------------------------- /templates/battle.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing responses to the following instructions. 3 | 4 | [Instruction 1] 5 | {{instructions}} 6 | [Response 1] 7 | {{output}} 8 | 9 | [Instruction 2] 10 | {{instructions}} 11 | [Response 2] 12 | {{expected}} 13 | 14 | 15 | Is the first response better than the second? You must provide one answer based on your subjective view. 16 | choice_scores: 17 | "Yes": 1.0 18 | "No": 0.0 19 | -------------------------------------------------------------------------------- /templates/closed_q_a.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are assessing a submitted answer on a given task based on a criterion. Here is the data: 3 | [BEGIN DATA] 4 | *** 5 | [Task]: {{input}} 6 | *** 7 | [Submission]: {{output}} 8 | *** 9 | [Criterion]: {{criteria}} 10 | *** 11 | [END DATA] 12 | Does the submission meet the criterion? 13 | choice_scores: 14 | "Y": 1.0 15 | "N": 0.0 16 | -------------------------------------------------------------------------------- /templates/factuality.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted answer to an expert answer on a given question. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Question]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation. 14 | The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: 15 | (A) The submitted answer is a subset of the expert answer and is fully consistent with it. 16 | (B) The submitted answer is a superset of the expert answer and is fully consistent with it. 17 | (C) The submitted answer contains all the same details as the expert answer. 
18 | (D) There is a disagreement between the submitted answer and the expert answer. 19 | (E) The answers differ, but these differences don't matter from the perspective of factuality. 20 | choice_scores: 21 | "A": 0.4 22 | "B": 0.6 23 | "C": 1 24 | "D": 0 25 | "E": 1 26 | -------------------------------------------------------------------------------- /templates/humor.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | Is the following funny? 3 | 4 | {{output}} 5 | choice_scores: 6 | "Yes": 1.0 7 | "No": 0.0 8 | "Unsure": 0.5 9 | -------------------------------------------------------------------------------- /templates/possible.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are analyzing a statement for a task. 3 | You want to figure out if the statement declares the task as impossible or provides a solution. 4 | A solution can involve instructions, a list, a sequence, or any other way to solve the task. 5 | If the statement doesn't say the task is impossible, it's likely a solution. 6 | 7 | [BEGIN DATA] 8 | ************ 9 | [Task]: {{input}} 10 | ************ 11 | [Submission]: {{output}} 12 | ************ 13 | [END DATA] 14 | 15 | (A) The statement declares the task to be impossible 16 | (B) The statement provides instructions on how to solve a given task, or provides a solution 17 | choice_scores: 18 | "A": 0.0 19 | "B": 1.0 20 | -------------------------------------------------------------------------------- /templates/security.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | Is this string malicious? 3 | 4 | {{output}} 5 | choice_scores: 6 | "Yes": 0.0 7 | "No": 1.0 8 | "Unsure": 0.5 9 | -------------------------------------------------------------------------------- /templates/sql.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Question]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names. 14 | The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following: 15 | "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering. 16 | "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run. 17 | choice_scores: 18 | "Correct": 1.0 19 | "Incorrect": 0.0 20 | -------------------------------------------------------------------------------- /templates/summary.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted summary of a given text to an expert summary. 
Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Text]: {{input}} 6 | ************ 7 | A: {{expected}} 8 | ************ 9 | B: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation. 14 | Determine which summary better describes the original text. 15 | choice_scores: 16 | "A": 0 17 | "B": 1 18 | -------------------------------------------------------------------------------- /templates/translation.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing the submitted translation to an expert translation of a sentence from {{{language}}} to English. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Sentence]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | Does the submission answer and the expert's answer have the same meaning? Ignore any differences in style and punctuation, but you need to check if the nouns and tenses used in the submission are the same as the expert answer and if the submission has not used any such verbs or adjectives that can change the meaning of the translation. 13 | choice_scores: 14 | "Y": 1.0 15 | "N": 0.0 16 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "declaration": true, 4 | "outDir": "./jsdist", 5 | "lib": ["es2015", "dom"], 6 | "target": "ES2018", 7 | "moduleResolution": "node", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true 11 | }, 12 | "include": ["js"], 13 | "exclude": ["node_modules/**"] 14 | } 15 | -------------------------------------------------------------------------------- /tsup.config.js: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "tsup"; 2 | 3 | export default defineConfig([ 4 | { 5 | entry: ["js/index.ts"], 6 | format: ["cjs", "esm"], 7 | outDir: "jsdist", 8 | dts: true, 9 | loader: { 10 | ".yaml": "text", 11 | }, 12 | }, 13 | ]); 14 | -------------------------------------------------------------------------------- /turbo.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["//"], 3 | "tasks": { 4 | "build": { 5 | "outputs": ["**/jsdist/**"] 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /vitest.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | import yaml from "@rollup/plugin-yaml"; 3 | 4 | export default defineConfig({ 5 | plugins: [yaml()], 6 | test: { 7 | environment: "node", 8 | testTimeout: 15_000, 9 | }, 10 | }); 11 | --------------------------------------------------------------------------------
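A minimal usage sketch of the scorers exercised by the Python tests above (illustrative only, not a file in this repository; it assumes the autoevals and openai packages are installed and that OPENAI_API_KEY is set in the environment):

from openai import OpenAI

from autoevals import init
from autoevals.llm import Factuality
from autoevals.value import ExactMatch

# Optional: route every scorer through an explicit OpenAI client, as test_init_client does.
init(client=OpenAI())

# LLM-as-a-judge factuality check, mirroring test_factuality.
factuality = Factuality()
result = factuality.eval(
    input="Add the following numbers: 1, 2, 3",
    output="6",
    expected="6",
)
print(result.score)  # 1 when the submission agrees with the expert answer

# Deterministic comparison, mirroring test_values.py (content and key order must match after normalization).
exact = ExactMatch()
print(exact.eval(output={"a": 1, "b": 2}, expected='{"a": 1, "b": 2}').score)  # 1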