├── .env.example ├── .envrc ├── .eslintrc.js ├── .flake8 ├── .github └── workflows │ ├── eval.yaml │ ├── js.yaml │ ├── lint.yaml │ └── python.yaml ├── .gitignore ├── .isort.cfg ├── .npmignore ├── .pre-commit-config.yaml ├── .prettierrc ├── .tool-versions ├── .vscode └── settings.json ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── env.sh ├── evals ├── .eslintrc.js ├── .prettierrc ├── datasets │ ├── coqa-closed-qa.json │ ├── coqa-context-relevancy.json │ └── coqa-factuality.json ├── package.json ├── src │ ├── autoevals.eval.ts │ ├── datasets.ts │ ├── duckdb.ts │ └── sync_datasets.ts └── tsconfig.json ├── js ├── embeddings.test.ts ├── index.ts ├── json.test.ts ├── json.ts ├── list.ts ├── llm.fixtures.ts ├── llm.test.ts ├── llm.ts ├── manifest.ts ├── moderation.test.ts ├── moderation.ts ├── number.ts ├── oai.test.ts ├── oai.ts ├── partial.test.ts ├── partial.ts ├── ragas.test.ts ├── ragas.ts ├── render-messages.test.ts ├── render-messages.ts ├── string.ts ├── templates.ts ├── value.test.ts ├── value.ts └── yaml.d.ts ├── package.json ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── py └── autoevals │ ├── __init__.py │ ├── json.py │ ├── list.py │ ├── llm.py │ ├── moderation.py │ ├── number.py │ ├── oai.py │ ├── partial.py │ ├── ragas.py │ ├── string.py │ ├── templates │ ├── test_embeddings.py │ ├── test_json.py │ ├── test_llm.py │ ├── test_moderation.py │ ├── test_oai.py │ ├── test_partial.py │ ├── test_ragas.py │ ├── test_values.py │ ├── value.py │ └── version.py ├── pyproject.toml ├── pyrightconfig.json ├── scripts └── prepare_readme.py ├── setup.py ├── templates ├── battle.yaml ├── closed_q_a.yaml ├── factuality.yaml ├── humor.yaml ├── possible.yaml ├── security.yaml ├── sql.yaml ├── summary.yaml └── translation.yaml ├── tsconfig.json ├── tsup.config.js ├── turbo.json └── vitest.config.ts /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=your_openai_api_key 2 | BRAINTRUST_API_KEY=your_braintrust_api_key 3 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | source_up 2 | dotenv 3 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | module.exports = { 4 | extends: ["plugin:@typescript-eslint/recommended", "prettier"], 5 | plugins: ["@typescript-eslint"], 6 | rules: { 7 | "@typescript-eslint/no-unused-vars": [ 8 | "error", 9 | { 10 | vars: "all", 11 | args: "none", 12 | ignoreRestSiblings: false, 13 | argsIgnorePattern: "^_", 14 | varsIgnorePattern: "^_", 15 | }, 16 | ], 17 | "prefer-const": "error", 18 | "@typescript-eslint/no-explicit-any": "off", 19 | "@typescript-eslint/ban-types": "off", 20 | "@typescript-eslint/ban-ts-comment": "off", 21 | "@typescript-eslint/no-var-requires": "off", 22 | }, 23 | }; 24 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | ignore = E402, E203, E501, W503 4 | -------------------------------------------------------------------------------- /.github/workflows/eval.yaml: -------------------------------------------------------------------------------- 1 | name: Run pnpm evals 2 | 3 | on: 4 | push: 5 | # Uncomment to run only 
when files in the 'evals' directory change 6 | # - paths: 7 | # - "evals/**" 8 | 9 | permissions: 10 | pull-requests: write 11 | contents: read 12 | 13 | jobs: 14 | eval: 15 | name: Run evals 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Checkout 20 | id: checkout 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Node.js 26 | id: setup-node 27 | uses: actions/setup-node@v4 28 | with: 29 | node-version: 22 30 | 31 | - uses: pnpm/action-setup@v4 32 | 33 | - name: Install Dependencies 34 | id: install 35 | run: pnpm install 36 | 37 | - name: Build packages 38 | id: build 39 | run: pnpm build 40 | 41 | - name: Run Evals 42 | uses: braintrustdata/eval-action@v1 43 | with: 44 | api_key: ${{ secrets.BRAINTRUST_API_KEY }} 45 | runtime: node 46 | root: evals 47 | -------------------------------------------------------------------------------- /.github/workflows/js.yaml: -------------------------------------------------------------------------------- 1 | name: js 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | matrix: 14 | # duckdb has an incredibly slow install with 24.x 15 | node-version: [20.x, 22.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Cache node_modules 20 | uses: actions/cache@v4 21 | with: 22 | path: | 23 | node_modules 24 | !node_modules/.cache/turbo 25 | key: ${{ matrix.runner }}-${{ matrix.node_version }}-node-${{ env.nodeModulesCacheHash }} 26 | restore-keys: | 27 | ${{ matrix.runner }}-${{ matrix.node_version }}-node- 28 | - name: Use Node.js ${{ matrix.node-version }} 29 | uses: actions/setup-node@v3 30 | with: 31 | node-version: ${{ matrix.node-version }} 32 | - uses: pnpm/action-setup@v4 33 | - run: pnpm install 34 | - run: pnpm run test 35 | env: 36 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 37 | OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }} 38 | BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} 39 | - run: pnpm run build 40 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/marketplace/actions/pre-commit 2 | name: lint 3 | 4 | on: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: actions/setup-python@v3 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /.github/workflows/python.yaml: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/actions/starter-workflows/blob/main/ci/python-app.yml 2 | name: python 3 | 4 | on: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip setuptools build twine openai 25 | python -m pip install -e .[all] 26 | - name: Test with pytest 27 | run: | 28 | pytest 29 | env: 30 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 31 | OPENAI_BASE_URL: ${{ 
secrets.OPENAI_BASE_URL }} 32 | BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.swo 4 | data 5 | venv 6 | .env 7 | .direnv 8 | .DS_STORE 9 | node_modules 10 | py/*.egg-info/ 11 | pydist 12 | jsdist 13 | dist 14 | autoevals-*.tar.gz 15 | autoevals-*.tgz 16 | typedoc.json 17 | build 18 | .turbo 19 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=119 3 | multi_line_output=3 4 | use_parentheses=true 5 | lines_after_imports=2 6 | include_trailing_comma=True 7 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | tsconfig.json 2 | MANIFEST.in 3 | Makefile 4 | README.md 5 | js 6 | py 7 | pyproject.toml 8 | setup.py 9 | venv 10 | pydist 11 | autoevals-*.tgz 12 | .testcache 13 | .flake8 14 | .isort.cfg 15 | .pre-commit-config.yaml 16 | .pytest_cache 17 | .testcache 18 | env.sh 19 | scripts 20 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: "https://github.com/pre-commit/pre-commit-hooks" 3 | rev: v4.4.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: "https://github.com/psf/black" 8 | rev: 22.6.0 9 | hooks: 10 | - id: black 11 | files: ./ 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | # Ruff version. 14 | rev: v0.0.282 15 | hooks: 16 | - id: ruff 17 | args: [--fix, --exit-non-zero-on-fix] 18 | - repo: https://github.com/codespell-project/codespell 19 | rev: v2.2.1 20 | hooks: 21 | - id: codespell 22 | exclude: > 23 | (?x)^( 24 | .*\.(json|prisma|yaml) 25 | )$ 26 | args: 27 | - "-L" 28 | - "rouge,afterall" 29 | 30 | - repo: https://github.com/rbubley/mirrors-prettier 31 | rev: v3.3.2 32 | hooks: 33 | - id: prettier 34 | exclude: ^(extension/|.*\.json$) 35 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": false 3 | } 4 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | python 3.9.21 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSave": true, 4 | "editor.defaultFormatter": "ms-python.black-formatter" 5 | }, 6 | "black-formatter.path": ["${workspaceFolder}/venv/bin/black"] 7 | } 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | Release notes can be found [here](https://www.braintrust.dev/docs/reference/release-notes). 
4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BrainTrust Data 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include py/autoevals/templates * 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 3 | VENV_PRE_COMMIT := ${ROOT_DIR}/venv/.pre_commit 4 | 5 | .PHONY: all 6 | all: ${VENV_PRE_COMMIT} 7 | 8 | .PHONY: py 9 | py: ${VENV_PYTHON_PACKAGES} 10 | bash -c 'source venv/bin/activate' 11 | 12 | VENV_INITIALIZED := venv/.initialized 13 | 14 | ${VENV_INITIALIZED}: 15 | rm -rf venv && python -m venv venv 16 | @touch ${VENV_INITIALIZED} 17 | 18 | VENV_PYTHON_PACKAGES := venv/.python_packages 19 | 20 | ${VENV_PYTHON_PACKAGES}: ${VENV_INITIALIZED} 21 | bash -c 'source venv/bin/activate && python -m pip install --upgrade pip setuptools build twine openai' 22 | bash -c 'source venv/bin/activate && python -m pip install -e ".[dev]"' 23 | bash -c 'source venv/bin/activate && python -m pip install -e ".[scipy]"' # for local tests 24 | @touch $@ 25 | 26 | ${VENV_PRE_COMMIT}: ${VENV_PYTHON_PACKAGES} 27 | bash -c 'source venv/bin/activate && pre-commit install' 28 | @touch $@ 29 | 30 | develop: ${VENV_PRE_COMMIT} 31 | @echo "--\nRun "source env.sh" to enter development mode!" 
32 | 33 | fixup: 34 | pre-commit run --all-files 35 | 36 | .PHONY: test test-py test-js 37 | 38 | test: test-py test-js 39 | 40 | test-py: 41 | source env.sh && python3 -m pytest 42 | 43 | test-js: 44 | pnpm install && pnpm run test 45 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | SRC_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 2 | 3 | ps1_old="$PS1" 4 | source $SRC_ROOT/venv/bin/activate 5 | export PS1="(autoevals) $ps1_old" 6 | -------------------------------------------------------------------------------- /evals/.eslintrc.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | module.exports = { 4 | extends: ["plugin:@typescript-eslint/recommended", "prettier"], 5 | plugins: ["@typescript-eslint"], 6 | rules: { 7 | "@typescript-eslint/no-unused-vars": [ 8 | "error", 9 | { 10 | vars: "all", 11 | args: "none", 12 | ignoreRestSiblings: false, 13 | argsIgnorePattern: "^_", 14 | varsIgnorePattern: "^_", 15 | }, 16 | ], 17 | "prefer-const": "error", 18 | "@typescript-eslint/no-explicit-any": "off", 19 | "@typescript-eslint/ban-types": "off", 20 | "@typescript-eslint/ban-ts-comment": "off", 21 | "@typescript-eslint/no-var-requires": "off", 22 | }, 23 | }; 24 | -------------------------------------------------------------------------------- /evals/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": false 3 | } 4 | -------------------------------------------------------------------------------- /evals/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@braintrust/autoevals-evals", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "sync": "tsx src/sync_datasets.ts", 9 | "eval": "braintrust eval" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "autoevals": "workspace:*", 16 | "braintrust": "^0.0.140", 17 | "zod": "^3.22.4" 18 | }, 19 | "devDependencies": { 20 | "@types/node": "^20.10.5", 21 | "duckdb": "^1.0.0", 22 | "tsx": "^3.14.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /evals/src/autoevals.eval.ts: -------------------------------------------------------------------------------- 1 | import { Eval, EvalCase, wrapTraced } from "braintrust"; 2 | import path from "path"; 3 | import fs from "fs"; 4 | import { 5 | closedQACaseSchema, 6 | contextRelevancyCaseSchema, 7 | coqaCaseSchema, 8 | dataDir, 9 | } from "./datasets"; 10 | import { z } from "zod"; 11 | import { 12 | AnswerCorrectness, 13 | ClosedQA, 14 | ContextRelevancy, 15 | DEFAULT_MODEL, 16 | Factuality, 17 | NumericDiff, 18 | Score, 19 | } from "autoevals"; 20 | 21 | const experimentNamePrefix = process.env.EXPERIMENT_NAME; 22 | 23 | const datasets = [ 24 | { 25 | name: "Factuality", 26 | path: path.join(dataDir, "coqa-factuality.json"), 27 | parser: coqaCaseSchema, 28 | }, 29 | { 30 | name: "ClosedQA", 31 | path: path.join(dataDir, "coqa-closed-qa.json"), 32 | parser: closedQACaseSchema, 33 | }, 34 | { 35 | name: "AnswerCorrectness", 36 | path: path.join(dataDir, "coqa-factuality.json"), 37 | parser: coqaCaseSchema, 38 | tags: ["ragas"], 39 | }, 40 | { 41 | name: "ContextRelevancy", 42 | path: 
path.join(dataDir, "coqa-context-relevancy.json"), 43 | parser: contextRelevancyCaseSchema, 44 | tags: ["ragas"], 45 | }, 46 | ]; 47 | 48 | const runScorerT = wrapTraced(async function runScorer( 49 | scorer: string, 50 | input: any, 51 | ) { 52 | switch (scorer) { 53 | case "Factuality": 54 | return Factuality(input); 55 | case "ClosedQA": 56 | return ClosedQA(input); 57 | case "AnswerCorrectness": 58 | return AnswerCorrectness(input); 59 | case "ContextRelevancy": 60 | return ContextRelevancy(input); 61 | default: 62 | throw new Error(`Unknown scorer: ${scorer}`); 63 | } 64 | }); 65 | 66 | Eval("Autoevals", { 67 | data: () => 68 | datasets.flatMap(({ name, path, parser, tags }) => { 69 | const data = fs.readFileSync(path, "utf-8"); 70 | return z 71 | .array(parser) 72 | .parse(JSON.parse(data)) 73 | .map((d: EvalCase) => ({ 74 | ...d, 75 | input: { ...d.input, scorer: name }, 76 | metadata: { ...d.metadata, scorer: name }, 77 | tags: [...(tags ?? []), name], 78 | })); 79 | }), 80 | task: async (input, hooks) => { 81 | const { scorer, ...rest } = input; 82 | let result: Score | null = null; 83 | try { 84 | result = await runScorerT(scorer, rest); 85 | } catch (e) { 86 | hooks.meta({ error: `${e}` }); 87 | } 88 | return result?.score ?? -1; 89 | }, 90 | scores: [NumericDiff], 91 | experimentName: experimentNamePrefix ?? undefined, 92 | metadata: { 93 | model: DEFAULT_MODEL, 94 | }, 95 | }); 96 | -------------------------------------------------------------------------------- /evals/src/datasets.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | import path from "path"; 4 | 5 | export const dataDir = path.join(__dirname, "../datasets"); 6 | 7 | export const coqaSchema = z.object({ 8 | source: z.string(), 9 | story: z.string(), 10 | questions: z.array(z.string()), 11 | answers: z.object({ 12 | input_text: z.array(z.string()), 13 | answer_start: z.array(z.number()), 14 | answer_end: z.array(z.number()), 15 | }), 16 | }); 17 | 18 | export const coqaCaseSchema = z.object({ 19 | input: z.object({ 20 | input: z.string(), 21 | output: z.string(), 22 | expected: z.string(), 23 | }), 24 | expected: z.number(), 25 | metadata: coqaSchema, 26 | }); 27 | 28 | export type FactualityCase = z.infer<typeof coqaCaseSchema>; 29 | 30 | export const contextRelevancyCaseSchema = z.object({ 31 | input: z.object({ 32 | input: z.string(), 33 | context: z.string(), 34 | }), 35 | expected: z.number(), 36 | metadata: coqaSchema, 37 | }); 38 | export type ContextRelevancyCase = z.infer<typeof contextRelevancyCaseSchema>; 39 | 40 | export const closedQACaseSchema = z.object({ 41 | input: z.object({ 42 | input: z.string(), 43 | output: z.string(), 44 | criteria: z.string(), 45 | }), 46 | expected: z.number(), 47 | metadata: coqaSchema, 48 | }); 49 | 50 | export type ClosedQACase = z.infer<typeof closedQACaseSchema>; 51 | -------------------------------------------------------------------------------- /evals/src/duckdb.ts: -------------------------------------------------------------------------------- 1 | import type { TableData, Connection } from "duckdb"; 2 | import * as duckdb from "duckdb"; 3 | 4 | let _duckdb_db: duckdb.Database | null = null; 5 | export function getDuckDBConn() { 6 | if (!_duckdb_db) { 7 | _duckdb_db = new duckdb.Database(":memory:"); 8 | } 9 | return _duckdb_db.connect(); 10 | } 11 | 12 | export async function duckq(con: Connection, sql: string): Promise<TableData> { 13 | return new Promise((resolve, reject) => { 14 | con.all(sql, (err, rows) => { 15 | if (err) { 16 | reject(err); 17 | } else { 18 | resolve(rows); 19 | }
20 | }); 21 | }); 22 | } 23 | -------------------------------------------------------------------------------- /evals/src/sync_datasets.ts: -------------------------------------------------------------------------------- 1 | import { duckq, getDuckDBConn } from "./duckdb"; 2 | 3 | import { z } from "zod"; 4 | import { 5 | coqaSchema, 6 | dataDir, 7 | FactualityCase, 8 | ContextRelevancyCase, 9 | ClosedQACase, 10 | } from "./datasets"; 11 | import path from "path"; 12 | import fs from "fs"; 13 | 14 | async function getCoqa(): Promise[]> { 15 | const conn = getDuckDBConn(); 16 | return z.array(coqaSchema).parse( 17 | await duckq( 18 | conn, 19 | `SELECT * FROM 'hf://datasets/stanfordnlp/coqa/data/validation-00000-of-00001.parquet' 20 | LIMIT 20`, 21 | ), 22 | ); 23 | } 24 | 25 | async function coqaFactuality(): Promise { 26 | const df = await getCoqa(); 27 | 28 | // For each question, capture the correct answer, make a superset by concatenating answers 29 | // together, and pick a different answer as a completely wrong one 30 | const cases: FactualityCase[] = []; 31 | for (let document = 0; document < df.length; document++) { 32 | const metadata = df[document]; 33 | const { questions, answers } = metadata; 34 | 35 | cases.push({ 36 | input: { 37 | input: questions[0], 38 | output: answers.input_text[0], 39 | expected: answers.input_text[0], 40 | }, 41 | expected: 1, 42 | metadata, 43 | }); 44 | 45 | cases.push({ 46 | input: { 47 | input: questions[0], 48 | output: answers.input_text[1], 49 | expected: answers.input_text[0], 50 | }, 51 | expected: 0, 52 | metadata, 53 | }); 54 | 55 | cases.push({ 56 | input: { 57 | input: questions[0], 58 | output: `${answers.input_text[1]} ${answers.input_text[0]} ${answers.input_text[2]}`, 59 | expected: answers.input_text[0], 60 | }, 61 | expected: 0.6, 62 | metadata, 63 | }); 64 | } 65 | 66 | return cases; 67 | } 68 | 69 | async function coqaContextRelevancy(): Promise { 70 | const df = await getCoqa(); 71 | 72 | const cases: ContextRelevancyCase[] = []; 73 | for (const metadata of df) { 74 | const { story, questions, answers } = metadata; 75 | 76 | const input = questions[0]; 77 | const contexts = answers.answer_start.map((answer_start, i) => 78 | story.substring(answer_start, answers.answer_end[i]), 79 | ); 80 | 81 | cases.push({ 82 | input: { 83 | input, 84 | context: contexts[0], 85 | }, 86 | expected: 1, 87 | metadata, 88 | }); 89 | 90 | cases.push({ 91 | input: { 92 | input, 93 | context: contexts[1], 94 | }, 95 | expected: 0, 96 | metadata, 97 | }); 98 | 99 | const concat = `${contexts[0]} ${contexts[1]}`; 100 | cases.push({ 101 | input: { 102 | input, 103 | context: concat, 104 | }, 105 | expected: contexts[0].length / concat.length, 106 | metadata, 107 | }); 108 | } 109 | 110 | return cases; 111 | } 112 | 113 | async function coqaClosedQA(): Promise { 114 | const df = await getCoqa(); 115 | 116 | const cases: ClosedQACase[] = []; 117 | for (const metadata of df) { 118 | const { questions, answers, story } = metadata; 119 | 120 | const input = `Given the following context: ${story}, \n\n Answer the question: ${questions[0]}`; 121 | const criteria = "Is the answer correct?"; 122 | cases.push({ 123 | input: { input, output: answers.input_text[0], criteria }, 124 | expected: 1, 125 | metadata, 126 | }); 127 | cases.push({ 128 | input: { input, output: answers.input_text[1], criteria }, 129 | expected: 0, 130 | metadata, 131 | }); 132 | } 133 | return cases; 134 | } 135 | 136 | function saveFile(cases: unknown[], fname: string) { 137 | 
fs.writeFileSync(path.join(dataDir, fname), JSON.stringify(cases, null, 2)); 138 | } 139 | 140 | async function main() { 141 | if (!fs.existsSync(dataDir)) { 142 | fs.mkdirSync(dataDir, { recursive: true }); 143 | } 144 | 145 | saveFile(await coqaFactuality(), "coqa-factuality.json"); 146 | saveFile(await coqaContextRelevancy(), "coqa-context-relevancy.json"); 147 | saveFile(await coqaClosedQA(), "coqa-closed-qa.json"); 148 | } 149 | 150 | main(); 151 | -------------------------------------------------------------------------------- /evals/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "declaration": true, 4 | "outDir": "./jsdist", 5 | "lib": ["es2015", "dom"], 6 | "target": "ES2018", 7 | "moduleResolution": "node", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true 11 | }, 12 | "include": ["js"], 13 | "exclude": ["node_modules/**"] 14 | } 15 | -------------------------------------------------------------------------------- /js/embeddings.test.ts: -------------------------------------------------------------------------------- 1 | import { EmbeddingSimilarity } from "./string"; 2 | 3 | const SYNONYMS = [ 4 | { 5 | word: "water", 6 | synonyms: ["water", "H2O", "agua"], 7 | }, 8 | { 9 | word: "fire", 10 | synonyms: ["fire", "flame"], 11 | }, 12 | { 13 | word: "earth", 14 | synonyms: ["earth", "Planet Earth"], 15 | }, 16 | ]; 17 | 18 | const UNRELATED = [ 19 | "water", 20 | "The quick brown fox jumps over the lazy dog", 21 | "I like to eat apples", 22 | ]; 23 | 24 | import { test, expect } from "vitest"; 25 | 26 | test("Embeddings Test", async () => { 27 | const prefix = "resource type: "; 28 | for (const { word, synonyms } of SYNONYMS) { 29 | for (const synonym of synonyms) { 30 | const result = await EmbeddingSimilarity({ 31 | prefix, 32 | output: word, 33 | expected: synonym, 34 | }); 35 | expect(result.score).toBeGreaterThan(0.6); 36 | } 37 | } 38 | 39 | for (let i = 0; i < UNRELATED.length; i++) { 40 | for (let j = 0; j < UNRELATED.length; j++) { 41 | if (i == j) { 42 | continue; 43 | } 44 | 45 | const word1 = UNRELATED[i]; 46 | const word2 = UNRELATED[j]; 47 | const result = await EmbeddingSimilarity({ 48 | prefix, 49 | output: word1, 50 | expected: word2, 51 | }); 52 | expect(result.score).toBeLessThan(0.5); 53 | } 54 | } 55 | }, 600000); 56 | -------------------------------------------------------------------------------- /js/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * AutoEvals is a tool to quickly and easily evaluate AI model outputs. 3 | * 4 | * ### Quickstart 5 | * ```bash 6 | * npm install autoevals 7 | * ``` 8 | * 9 | * ### Example 10 | * 11 | * Use AutoEvals to model-grade an example LLM completion using the [factuality prompt](templates/factuality.yaml). 
12 | * 13 | * ```javascript 14 | * import { Factuality } from "autoevals"; 15 | * 16 | * (async () => { 17 | * const input = "Which country has the highest population?"; 18 | * const output = "People's Republic of China"; 19 | * const expected = "China"; 20 | * 21 | * const result = await Factuality({ output, expected, input }); 22 | * console.log(`Factuality score: ${result.score}`); 23 | * console.log(`Factuality metadata: ${result.metadata?.rationale}`); 24 | * })(); 25 | * ``` 26 | * 27 | * @module autoevals 28 | */ 29 | 30 | export type { Score, ScorerArgs, Scorer } from "@braintrust/core"; 31 | export * from "./llm"; 32 | export { init } from "./oai"; 33 | export * from "./string"; 34 | export * from "./list"; 35 | export * from "./moderation"; 36 | export * from "./number"; 37 | export * from "./json"; 38 | export * from "./templates"; 39 | export * from "./ragas"; 40 | export * from "./value"; 41 | export { Evaluators } from "./manifest"; 42 | export { makePartial, ScorerWithPartial } from "./partial"; 43 | -------------------------------------------------------------------------------- /js/json.test.ts: -------------------------------------------------------------------------------- 1 | import { JSONDiff, ValidJSON } from "./json"; 2 | import { NumericDiff } from "./number"; 3 | import { ExactMatch } from "./value"; 4 | 5 | import { test, expect } from "vitest"; 6 | 7 | test("JSON String Test", async () => { 8 | const cases = [ 9 | { a: "", b: "", expected: 1 }, 10 | { a: "", b: "a", expected: 0 }, 11 | { a: "a", b: "", expected: 0 }, 12 | { a: "a", b: "a", expected: 1 }, 13 | { a: "a", b: "b", expected: 0 }, 14 | { a: "ab", b: "ac", expected: 0.5 }, 15 | { a: "ac", b: "bc", expected: 0.5 }, 16 | { a: "abc", b: "axc", expected: 0.66667 }, 17 | { a: "xabxcdxxefxgx", b: "1ab2cd34ef5g6", expected: 0.53846 }, 18 | ]; 19 | 20 | for (const { a, b, expected } of cases) { 21 | const score = (await JSONDiff({ output: a, expected: b })).score; 22 | expect(score).toBeCloseTo(expected); 23 | } 24 | }); 25 | 26 | test("JSON Object Test", async () => { 27 | const cases = [ 28 | { a: null, b: null, expected: 1 }, 29 | { a: undefined, b: null, expected: 1 }, 30 | { a: "", b: null, expected: 0 }, 31 | { a: [], b: {}, expected: 0 }, 32 | { a: [], b: [], expected: 1 }, 33 | { a: {}, b: {}, expected: 1 }, 34 | { a: { a: 1 }, b: { a: 1 }, expected: 1 }, 35 | { a: { a: 1 }, b: { a: 2 }, expected: 0.66667 }, 36 | { a: { a: 1 }, b: ["a", 1], expected: 0.5714285714285714 }, 37 | { a: { a: 1 }, b: { b: { a: 1 } }, expected: 0 }, 38 | { a: { a: 1 }, b: { a: null }, expected: 0 }, 39 | { 40 | a: { mapping: { a: "foo", b: "bar" } }, 41 | b: { mapping: { a: "Foo", b: "Bar" }, Extra: 5 }, 42 | expected: 0.33333333333333337, 43 | }, 44 | ]; 45 | 46 | for (const { a, b, expected } of cases) { 47 | const score = (await JSONDiff({ output: a, expected: b })).score; 48 | expect(score).toBeCloseTo(expected); 49 | } 50 | }); 51 | 52 | test("Valid JSON Test", async () => { 53 | const cases = [ 54 | { output: "1", expected: 0 }, 55 | { output: '{ "a": 1, "b": "hello" }', expected: 1 }, 56 | { output: '[{ "a": 1 }]', expected: 1 }, 57 | { output: '[{ "a": 1 }', expected: 0 }, 58 | { 59 | output: '{ "mapping": { "a": "foo", "b": "bar" }, "extra": 4 }', 60 | expected: 1, 61 | }, 62 | { 63 | output: '{ mapping: { "a": "foo", "b": "bar" }, "extra": 4 }', 64 | expected: 0, 65 | }, 66 | { 67 | output: '{"a":"1"}', 68 | expected: 1, 69 | schema: { 70 | type: "object", 71 | properties: { 72 | a: { type: "string" }, 73 | }, 74 | 
required: ["a"], 75 | }, 76 | }, 77 | { 78 | output: '{ "a": "1", "b": "1" }', 79 | expected: 0, 80 | schema: { 81 | type: "object", 82 | properties: { 83 | a: { type: "string" }, 84 | b: { type: "number" }, 85 | }, 86 | required: ["a", "b"], 87 | }, 88 | }, 89 | { 90 | output: '[{ "a": "1" }, { "a": "1", "b": 22 }]', 91 | expected: 1, 92 | schema: { 93 | type: "array", 94 | items: { 95 | type: "object", 96 | properties: { 97 | a: { type: "string" }, 98 | b: { type: "number" }, 99 | }, 100 | required: ["a"], 101 | }, 102 | uniqueItems: true, 103 | }, 104 | }, 105 | { 106 | output: { a: "1", b: "1" }, 107 | expected: 1, 108 | }, 109 | { 110 | output: [{ a: "1" }, { a: "1", b: 22 }], 111 | expected: 1, 112 | }, 113 | { 114 | output: 100, 115 | expected: 0, 116 | }, 117 | { 118 | // This is technically ambiguous, because it _could_ be the valid parsed JSON value 119 | // or an unparsed, invalid JSON value. However, since structured outputs _only_ return 120 | // JSON values, we can safely assume that any strings are unparsed values. 121 | output: "100", 122 | expected: 0, 123 | }, 124 | ]; 125 | 126 | for (const { output, expected, schema } of cases) { 127 | const score = (await ValidJSON({ output, schema })).score; 128 | expect(score).toEqual(expected); 129 | } 130 | }); 131 | 132 | test("Semantic JSON Test", async () => { 133 | const cases = [ 134 | { a: '{"x": 1, "y": 2}', b: '{"y": 2, "x": 1}', expected: 1 }, 135 | { 136 | a: '{"zs": ["a", "b"], "x": 1, "y": 2}', 137 | b: '{"y": 2, "zs": ["a", "b"], "x": 1}', 138 | expected: 1, 139 | }, 140 | { 141 | a: '{"o1": {"x": 1, "y": 2}}', 142 | b: '{"o1": {"y": 2, "x": 1}}', 143 | expected: 1, 144 | }, 145 | { 146 | a: '{"xs": [{"o1": {"x": 1, "y": [2]}}]}', 147 | b: '{"xs": [{"o1": {"y": [2], "x": 1}}]}', 148 | expected: 1, 149 | }, 150 | { 151 | a: '{"o1": {"x": 2, "y": 2}}', 152 | b: '{"o1": {"y": 2, "x": 1}}', 153 | expected: 0.83333, 154 | }, 155 | { 156 | a: { o1: { x: 2, y: 2 } }, 157 | b: '{"o1": {"y": 2, "x": 1}}', 158 | expected: 0.83333, 159 | }, 160 | { a: '{"x": 1, "y": 2}', b: '{"x": 1, "z": 2}', expected: 0.3333 }, 161 | { a: "[1, 2]", b: "[1, 2]", expected: 1 }, 162 | { a: "[1, 2]", b: "[2, 1]", expected: 0.66667 }, 163 | ]; 164 | 165 | for (const { a, b, expected } of cases) { 166 | for (const exactNumber of [true, false]) { 167 | const score = ( 168 | await JSONDiff({ 169 | output: a, 170 | expected: b, 171 | numberScorer: exactNumber ? ExactMatch : NumericDiff, 172 | }) 173 | ).score; 174 | if (!exactNumber) { 175 | expect(score).toBeCloseTo(expected); 176 | } else { 177 | expect(Math.round((score ?? 0) * 100)).toBeLessThanOrEqual( 178 | Math.round(expected * 100), 179 | ); 180 | } 181 | } 182 | } 183 | }); 184 | -------------------------------------------------------------------------------- /js/json.ts: -------------------------------------------------------------------------------- 1 | import { Scorer } from "@braintrust/core"; 2 | import { NumericDiff } from "./number"; 3 | import { LevenshteinScorer } from "./string"; 4 | import Ajv, { JSONSchemaType, Schema } from "ajv"; 5 | import { makePartial, ScorerWithPartial } from "./partial"; 6 | 7 | /** 8 | * A simple scorer that compares JSON objects, using a customizable comparison method for strings 9 | * (defaults to Levenshtein) and numbers (defaults to NumericDiff). 
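 *
 * For example, a minimal usage sketch (the values here are made up for illustration; with the
 * default Levenshtein and NumericDiff comparisons the result is partial credit rather than all-or-nothing):
 *
 * ```javascript
 * const result = await JSONDiff({
 *   output: { name: "Acme", employees: 100 },
 *   expected: { name: "Acme Inc", employees: 105 },
 * });
 * console.log(result.score); // a fraction between 0 and 1, not a strict 0 or 1
 * ```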
10 | */ 11 | export const JSONDiff: ScorerWithPartial< 12 | any, 13 | { 14 | stringScorer?: Scorer<string, object>; 15 | numberScorer?: Scorer<number, object>; 16 | preserveStrings?: boolean; 17 | } 18 | > = makePartial( 19 | async ({ 20 | output, 21 | expected, 22 | stringScorer = LevenshteinScorer, 23 | numberScorer = NumericDiff, 24 | preserveStrings = false, 25 | }) => { 26 | return { 27 | name: "JSONDiff", 28 | score: await jsonDiff( 29 | output, 30 | expected, 31 | stringScorer, 32 | numberScorer, 33 | preserveStrings, 34 | ), 35 | }; 36 | }, 37 | "JSONDiff", 38 | ); 39 | 40 | /** 41 | * A binary scorer that evaluates the validity of JSON output, optionally validating against a 42 | * JSON Schema definition (see https://json-schema.org/learn/getting-started-step-by-step#create). 43 | */ 44 | export const ValidJSON: ScorerWithPartial<any, { schema?: any }> = makePartial( 45 | async ({ output, schema }) => { 46 | return { 47 | name: "ValidJSON", 48 | score: validJSON(output, schema), 49 | metadata: { schema }, 50 | }; 51 | }, 52 | "ValidJSON", 53 | ); 54 | 55 | async function jsonDiff( 56 | o1: any, 57 | o2: any, 58 | stringScorer: Scorer<string, object>, 59 | numberScorer: Scorer<number, object>, 60 | preserveStrings: boolean, 61 | ): Promise<number | null> { 62 | if (!preserveStrings) { 63 | if (typeof o1 === "string" && validJSON(o1) === 1) { 64 | o1 = JSON.parse(o1); 65 | } 66 | if (typeof o2 === "string" && validJSON(o2) === 1) { 67 | o2 = JSON.parse(o2); 68 | } 69 | } 70 | 71 | if (isObject(o1) && isObject(o2)) { 72 | if (Object.keys(o1).length == 0 && Object.keys(o2).length == 0) { 73 | return 1; 74 | } 75 | 76 | const allKeys = Object.keys( 77 | Object.fromEntries( 78 | Object.keys(o1) 79 | .concat(Object.keys(o2)) 80 | .map((k) => [k, true]), 81 | ), 82 | ); 83 | 84 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 85 | const baseScores = ( 86 | await Promise.all( 87 | allKeys.map((k) => 88 | jsonDiff(o1[k], o2[k], stringScorer, numberScorer, preserveStrings), 89 | ), 90 | ) 91 | ).filter((s) => s !== null) as number[]; 92 | return baseScores.reduce((acc, s) => acc + s, 0) / baseScores.length; 93 | } else if (isArray(o1) && isArray(o2)) { 94 | if (o1.length === 0 && o2.length === 0) { 95 | return 1; 96 | } 97 | 98 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 99 | const baseScores = ( 100 | await Promise.all( 101 | Array.from({ 102 | length: Math.min(o1.length, o2.length), 103 | }).map((_, i) => 104 | jsonDiff(o1[i], o2[i], stringScorer, numberScorer, preserveStrings), 105 | ), 106 | ) 107 | ).filter((s) => s !== null) as number[]; 108 | return ( 109 | baseScores.reduce((acc, s) => acc + s, 0) / Math.max(o1.length, o2.length) 110 | ); 111 | } else if (typeof o1 === "string" && typeof o2 === "string") { 112 | return (await stringScorer({ output: o1, expected: o2 })).score; 113 | } else if (typeof o1 === "number" && typeof o2 === "number") { 114 | return (await numberScorer({ output: o1, expected: o2 })).score; 115 | } else if ( 116 | (o1 === null || o1 === undefined) && 117 | (o2 === null || o2 === undefined) 118 | ) { 119 | return 1; 120 | } else if ( 121 | o1 === null || 122 | o1 === undefined || 123 | o2 === null || 124 | o2 === undefined 125 | ) { 126 | return 0; 127 | } else { 128 | return ( 129 | await stringScorer({ 130 | output: JSON.stringify(o1, replacer), 131 | expected: JSON.stringify(o2, replacer), 132 | }) 133 | ).score; 134 | } 135 | } 136 | 137 | function isObject(value: any): value is { [key: string]: any } { 138 | return value instanceof Object && !(value instanceof Array); 139 | } 140 | 141 | function
isArray(value: any): value is Array<unknown> { 142 | return value instanceof Array; 143 | } 144 | 145 | // https://gist.github.com/davidfurlong/463a83a33b70a3b6618e97ec9679e490 146 | const replacer = (key: string, value: any) => 147 | isObject(value) 148 | ? Object.keys(value) 149 | .sort() 150 | .reduce((sorted: { [key: string]: any }, key) => { 151 | sorted[key] = value[key]; 152 | return sorted; 153 | }, {}) 154 | : value; 155 | 156 | function validJSON(output: any, schema?: Schema | JSONSchemaType<any>) { 157 | try { 158 | const parsed = typeof output === "string" ? JSON.parse(output) : output; 159 | 160 | if (schema) { 161 | return validateSchema(parsed, schema); 162 | } 163 | if (isObject(parsed) || isArray(parsed)) { 164 | return 1; 165 | } 166 | } catch { 167 | // Ignore errors 168 | } 169 | 170 | return 0; 171 | } 172 | 173 | function validateSchema(data: any, schema: any) { 174 | const ajv = new Ajv(); 175 | const validate = ajv.compile(schema); 176 | const valid = validate(data); 177 | return valid ? 1 : 0; 178 | } 179 | -------------------------------------------------------------------------------- /js/list.ts: -------------------------------------------------------------------------------- 1 | import { Scorer } from "@braintrust/core"; 2 | import { Levenshtein } from "./string"; 3 | import { linearSumAssignment } from "linear-sum-assignment"; 4 | import { makePartial, ScorerWithPartial } from "./partial"; 5 | 6 | /** 7 | * A scorer that semantically evaluates the overlap between two lists of strings. It works by 8 | * computing the pairwise similarity between each element of the output and the expected value, 9 | * and then using Linear Sum Assignment to find the best matching pairs. 10 | */ 11 | export const ListContains: ScorerWithPartial< 12 | string[], 13 | { 14 | pairwiseScorer?: Scorer<string, object>; 15 | allowExtraEntities?: boolean; 16 | } 17 | > = makePartial(async (args) => { 18 | const { output, expected, allowExtraEntities } = args; 19 | if (expected === undefined) { 20 | throw new Error("ListContains requires an expected value"); 21 | } 22 | 23 | if (output.length == 0 && expected.length == 0) { 24 | return { 25 | name: "ListContains", 26 | score: 1, 27 | }; 28 | } else if (output.length == 0 || expected.length == 0) { 29 | return { 30 | name: "ListContains", 31 | score: 0, 32 | }; 33 | } 34 | 35 | const pairwiseScorer = args.pairwiseScorer || Levenshtein; 36 | 37 | const similarities = await Promise.all( 38 | args.output.map(async (output_item) => 39 | Promise.all( 40 | expected.map( 41 | async (expected_item) => 42 | ( 43 | await pairwiseScorer({ 44 | output: output_item, 45 | expected: expected_item, 46 | }) 47 | ).score ?? 0, 48 | ), 49 | ), 50 | ), 51 | ); 52 | 53 | if (similarities.length === 1 && similarities[0].length === 1) { 54 | // There appears to be a bug in the linearSumAssignment library when there is only one element 55 | return { 56 | name: "ListContains", 57 | score: similarities[0][0], 58 | }; 59 | } 60 | 61 | const result = linearSumAssignment(similarities, { maximaze: true }); 62 | 63 | const pairs = Array.from(result.rowAssignments) 64 | .map((c, r) => 65 | c >= 0 66 | ? { 67 | output: output[r], 68 | expected: expected[c], 69 | score: similarities[r][c], 70 | } 71 | : null, 72 | ) 73 | .filter((pair) => pair !== null) as Array<{ 74 | output: string; 75 | expected: string; 76 | score: number; 77 | }>; 78 | 79 | const denominator = allowExtraEntities ?
expected.length 81 | : Math.max(output.length, expected.length); 82 | 83 | const avgScore = 84 | pairs.reduce((acc, pair) => acc + pair.score, 0) / denominator; 85 | 86 | return { 87 | name: "ListContains", 88 | score: Math.min(Math.max(avgScore, 0), 1), 89 | metadata: { 90 | pairs, 91 | }, 92 | }; 93 | }, "ListContains"); 94 | -------------------------------------------------------------------------------- /js/llm.fixtures.ts: -------------------------------------------------------------------------------- 1 | export const openaiClassifierShouldEvaluateTitles = [ 2 | { 3 | id: "chatcmpl-B7WxpqqPbHYiAOPDl3ViYNalDFbce", 4 | object: "chat.completion", 5 | created: 1741134709, 6 | model: "gpt-3.5-turbo-0125", 7 | choices: [ 8 | { 9 | index: 0, 10 | message: { 11 | role: "assistant", 12 | content: null, 13 | tool_calls: [ 14 | { 15 | id: "call_OlUJAex0cWI84acfE0XydrHz", 16 | type: "function", 17 | function: { 18 | name: "select_choice", 19 | arguments: 20 | '{"reasons":"Title 1: Pros - Clearly states the goal of standardizing error responses for better developer experience. Cons - Might be too specific and not catchy. Title 2: Pros - Short and simple. Cons - Lacks information about the issue.","choice":"1"}', 21 | }, 22 | }, 23 | ], 24 | refusal: null, 25 | }, 26 | logprobs: null, 27 | finish_reason: "stop", 28 | }, 29 | ], 30 | usage: { 31 | prompt_tokens: 354, 32 | completion_tokens: 58, 33 | total_tokens: 412, 34 | prompt_tokens_details: { 35 | cached_tokens: 0, 36 | audio_tokens: 0, 37 | }, 38 | completion_tokens_details: { 39 | reasoning_tokens: 0, 40 | audio_tokens: 0, 41 | accepted_prediction_tokens: 0, 42 | rejected_prediction_tokens: 0, 43 | }, 44 | }, 45 | service_tier: "default", 46 | system_fingerprint: null, 47 | }, 48 | ]; 49 | 50 | export const openaiClassifierShouldEvaluateTitlesWithCoT = [ 51 | { 52 | id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0", 53 | object: "chat.completion", 54 | created: 1741135832, 55 | model: "gpt-4o-2024-08-06", 56 | choices: [ 57 | { 58 | index: 0, 59 | message: { 60 | role: "assistant", 61 | content: null, 62 | tool_calls: [ 63 | { 64 | id: "call_jUzxFALMTbpzGX4DfFH57VdI", 65 | type: "function", 66 | function: { 67 | name: "select_choice", 68 | arguments: 69 | '{"reasons":"1. The issue description talks about the need to standardize error responses from GoTrue, Postgres, and Realtime APIs to improve developer experience (DX).\\n2. Title 1 directly mentions the key components involved (GoTrue, Postgres, and Realtime APIs) and the goal (better DX), which aligns well with the issue description.\\n3. Title 2, \\"Good title,\\" is vague and does not provide any information about the issue or its context.\\n4. 
Therefore, Title 1 is more descriptive and relevant to the issue at hand.","choice":"1"}', 70 | }, 71 | }, 72 | ], 73 | refusal: null, 74 | }, 75 | logprobs: null, 76 | finish_reason: "stop", 77 | }, 78 | ], 79 | usage: { 80 | prompt_tokens: 370, 81 | completion_tokens: 125, 82 | total_tokens: 495, 83 | prompt_tokens_details: { 84 | cached_tokens: 0, 85 | audio_tokens: 0, 86 | }, 87 | completion_tokens_details: { 88 | reasoning_tokens: 0, 89 | audio_tokens: 0, 90 | accepted_prediction_tokens: 0, 91 | rejected_prediction_tokens: 0, 92 | }, 93 | }, 94 | service_tier: "default", 95 | system_fingerprint: "fp_eb9dce56a8", 96 | }, 97 | { 98 | id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ", 99 | object: "chat.completion", 100 | created: 1741140268, 101 | model: "gpt-4o-2024-08-06", 102 | choices: [ 103 | { 104 | index: 0, 105 | message: { 106 | role: "assistant", 107 | content: null, 108 | tool_calls: [ 109 | { 110 | id: "call_3Z63hgrYvLuSZKc2rrHAYLI4", 111 | type: "function", 112 | function: { 113 | name: "select_choice", 114 | arguments: 115 | '{"reasons":"1. The issue description talks about the need to standardize error responses from GoTrue, Postgres, and Realtime APIs to improve developer experience (DX).\\n2. Title 1, \\"Good title,\\" is vague and does not convey any specific information about the issue. It does not mention the APIs involved or the purpose of the standardization.\\n3. Title 2, \\"Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX,\\" directly reflects the main goal of the issue, which is to standardize error responses for better developer experience. It also specifies the APIs involved, making it clear and informative.\\n4. Therefore, Title 2 is a better choice as it accurately and clearly describes the issue at hand.","choice":"2"}', 116 | }, 117 | }, 118 | ], 119 | refusal: null, 120 | }, 121 | logprobs: null, 122 | finish_reason: "stop", 123 | }, 124 | ], 125 | usage: { 126 | prompt_tokens: 370, 127 | completion_tokens: 164, 128 | total_tokens: 534, 129 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 130 | completion_tokens_details: { 131 | reasoning_tokens: 0, 132 | audio_tokens: 0, 133 | accepted_prediction_tokens: 0, 134 | rejected_prediction_tokens: 0, 135 | }, 136 | }, 137 | service_tier: "default", 138 | system_fingerprint: "fp_eb9dce56a8", 139 | }, 140 | { 141 | id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII", 142 | object: "chat.completion", 143 | created: 1741140309, 144 | model: "gpt-4o-2024-08-06", 145 | choices: [ 146 | { 147 | index: 0, 148 | message: { 149 | role: "assistant", 150 | content: null, 151 | tool_calls: [ 152 | { 153 | id: "call_CxDdx3i9eaHg81kYjQIICPfd", 154 | type: "function", 155 | function: { name: "select_choice", arguments: '{"choice":"1"}' }, 156 | }, 157 | ], 158 | refusal: null, 159 | }, 160 | logprobs: null, 161 | finish_reason: "stop", 162 | }, 163 | ], 164 | usage: { 165 | prompt_tokens: 292, 166 | completion_tokens: 6, 167 | total_tokens: 298, 168 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 169 | completion_tokens_details: { 170 | reasoning_tokens: 0, 171 | audio_tokens: 0, 172 | accepted_prediction_tokens: 0, 173 | rejected_prediction_tokens: 0, 174 | }, 175 | }, 176 | service_tier: "default", 177 | system_fingerprint: "fp_eb9dce56a8", 178 | }, 179 | { 180 | id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l", 181 | object: "chat.completion", 182 | created: 1741140336, 183 | model: "gpt-4o-2024-08-06", 184 | choices: [ 185 | { 186 | index: 0, 187 | message: { 188 | role: 
"assistant", 189 | content: null, 190 | tool_calls: [ 191 | { 192 | id: "call_ksuniPMn2w99hFt5Z1mzhWMe", 193 | type: "function", 194 | function: { name: "select_choice", arguments: '{"choice":"2"}' }, 195 | }, 196 | ], 197 | refusal: null, 198 | }, 199 | logprobs: null, 200 | finish_reason: "stop", 201 | }, 202 | ], 203 | usage: { 204 | prompt_tokens: 292, 205 | completion_tokens: 6, 206 | total_tokens: 298, 207 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 208 | completion_tokens_details: { 209 | reasoning_tokens: 0, 210 | audio_tokens: 0, 211 | accepted_prediction_tokens: 0, 212 | rejected_prediction_tokens: 0, 213 | }, 214 | }, 215 | service_tier: "default", 216 | system_fingerprint: "fp_eb9dce56a8", 217 | }, 218 | ]; 219 | 220 | export const openaiClassifierShouldEvaluateArithmeticExpressions = [ 221 | { 222 | id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT", 223 | object: "chat.completion", 224 | created: 1741140446, 225 | model: "gpt-4o-2024-08-06", 226 | choices: [ 227 | { 228 | index: 0, 229 | message: { 230 | role: "assistant", 231 | content: null, 232 | tool_calls: [ 233 | { 234 | id: "call_Iatq5uhNc05I95JHjM7v3N5Y", 235 | type: "function", 236 | function: { 237 | name: "select_choice", 238 | arguments: 239 | '{"reasons":"1. The instruction is to add the numbers 1, 2, and 3.\\n2. The correct sum of these numbers is 1 + 2 + 3 = 6.\\n3. Response 1 provides the answer as 600, which is incorrect.\\n4. Response 2 provides the answer as 6, which is correct.\\n5. Since the task is to evaluate which response is better based on the correctness of the addition, Response 2 is better because it provides the correct sum.\\n6. Therefore, Response 1 is not better than Response 2.","choice":"No"}', 240 | }, 241 | }, 242 | ], 243 | refusal: null, 244 | }, 245 | logprobs: null, 246 | finish_reason: "stop", 247 | }, 248 | ], 249 | usage: { 250 | prompt_tokens: 248, 251 | completion_tokens: 133, 252 | total_tokens: 381, 253 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 254 | completion_tokens_details: { 255 | reasoning_tokens: 0, 256 | audio_tokens: 0, 257 | accepted_prediction_tokens: 0, 258 | rejected_prediction_tokens: 0, 259 | }, 260 | }, 261 | service_tier: "default", 262 | system_fingerprint: "fp_eb9dce56a8", 263 | }, 264 | { 265 | id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB", 266 | object: "chat.completion", 267 | created: 1741140511, 268 | model: "gpt-4o-2024-08-06", 269 | choices: [ 270 | { 271 | index: 0, 272 | message: { 273 | role: "assistant", 274 | content: null, 275 | tool_calls: [ 276 | { 277 | id: "call_eYJIS5zb9S0qS3NW2XZ7HtPu", 278 | type: "function", 279 | function: { 280 | name: "select_choice", 281 | arguments: 282 | '{"reasons":"1. The instruction in both cases is to add the numbers 1, 2, and 3.\\n2. The correct sum of these numbers is 1 + 2 + 3 = 6.\\n3. Response 1 provides the answer as 6, which is the correct sum of the numbers.\\n4. Response 2 provides the answer as 600, which is incorrect as it does not represent the sum of the numbers given in the instruction.\\n5. Since Response 1 correctly answers the instruction and Response 2 does not, Response 1 is objectively better than Response 2.\\n6. 
Therefore, based on the correctness of the responses, the first response is better than the second.","choice":"Yes"}', 283 | }, 284 | }, 285 | ], 286 | refusal: null, 287 | }, 288 | logprobs: null, 289 | finish_reason: "stop", 290 | }, 291 | ], 292 | usage: { 293 | prompt_tokens: 248, 294 | completion_tokens: 157, 295 | total_tokens: 405, 296 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 297 | completion_tokens_details: { 298 | reasoning_tokens: 0, 299 | audio_tokens: 0, 300 | accepted_prediction_tokens: 0, 301 | rejected_prediction_tokens: 0, 302 | }, 303 | }, 304 | service_tier: "default", 305 | system_fingerprint: "fp_eb9dce56a8", 306 | }, 307 | { 308 | id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n", 309 | object: "chat.completion", 310 | created: 1741140550, 311 | model: "gpt-4o-2024-08-06", 312 | choices: [ 313 | { 314 | index: 0, 315 | message: { 316 | role: "assistant", 317 | content: null, 318 | tool_calls: [ 319 | { 320 | id: "call_kfVuMD09ytJIQVocHTEBrYLW", 321 | type: "function", 322 | function: { 323 | name: "select_choice", 324 | arguments: 325 | '{"reasons":"1. Both instructions are identical, asking to add the numbers 1, 2, and 3.\\n2. Both responses provide the correct sum of these numbers, which is 6.\\n3. There is no additional context, explanation, or formatting in either response that would differentiate them in terms of quality or clarity.\\n4. Since both responses are identical and correct, there is no basis to claim that one is better than the other.\\n5. Therefore, the first response is not better than the second; they are equally good.","choice":"No"}', 326 | }, 327 | }, 328 | ], 329 | refusal: null, 330 | }, 331 | logprobs: null, 332 | finish_reason: "stop", 333 | }, 334 | ], 335 | usage: { 336 | prompt_tokens: 248, 337 | completion_tokens: 121, 338 | total_tokens: 369, 339 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 340 | completion_tokens_details: { 341 | reasoning_tokens: 0, 342 | audio_tokens: 0, 343 | accepted_prediction_tokens: 0, 344 | rejected_prediction_tokens: 0, 345 | }, 346 | }, 347 | service_tier: "default", 348 | system_fingerprint: "fp_eb9dce56a8", 349 | }, 350 | { 351 | id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6", 352 | object: "chat.completion", 353 | created: 1741140577, 354 | model: "gpt-4o-2024-08-06", 355 | choices: [ 356 | { 357 | index: 0, 358 | message: { 359 | role: "assistant", 360 | content: null, 361 | tool_calls: [ 362 | { 363 | id: "call_lbRjfwrJVP8HgLupWflqoCBM", 364 | type: "function", 365 | function: { name: "select_choice", arguments: '{"choice":"No"}' }, 366 | }, 367 | ], 368 | refusal: null, 369 | }, 370 | logprobs: null, 371 | finish_reason: "stop", 372 | }, 373 | ], 374 | usage: { 375 | prompt_tokens: 170, 376 | completion_tokens: 6, 377 | total_tokens: 176, 378 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 379 | completion_tokens_details: { 380 | reasoning_tokens: 0, 381 | audio_tokens: 0, 382 | accepted_prediction_tokens: 0, 383 | rejected_prediction_tokens: 0, 384 | }, 385 | }, 386 | service_tier: "default", 387 | system_fingerprint: "fp_eb9dce56a8", 388 | }, 389 | { 390 | id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP", 391 | object: "chat.completion", 392 | created: 1741140603, 393 | model: "gpt-4o-2024-08-06", 394 | choices: [ 395 | { 396 | index: 0, 397 | message: { 398 | role: "assistant", 399 | content: null, 400 | tool_calls: [ 401 | { 402 | id: "call_d3YnOawL5qadUmE46hoKds6B", 403 | type: "function", 404 | function: { 405 | name: "select_choice", 406 | arguments: 
'{"choice":"Yes"}', 407 | }, 408 | }, 409 | ], 410 | refusal: null, 411 | }, 412 | logprobs: null, 413 | finish_reason: "stop", 414 | }, 415 | ], 416 | usage: { 417 | prompt_tokens: 170, 418 | completion_tokens: 6, 419 | total_tokens: 176, 420 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 421 | completion_tokens_details: { 422 | reasoning_tokens: 0, 423 | audio_tokens: 0, 424 | accepted_prediction_tokens: 0, 425 | rejected_prediction_tokens: 0, 426 | }, 427 | }, 428 | service_tier: "default", 429 | system_fingerprint: "fp_eb9dce56a8", 430 | }, 431 | { 432 | id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl", 433 | object: "chat.completion", 434 | created: 1741140618, 435 | model: "gpt-4o-2024-08-06", 436 | choices: [ 437 | { 438 | index: 0, 439 | message: { 440 | role: "assistant", 441 | content: null, 442 | tool_calls: [ 443 | { 444 | id: "call_l3AonPTlmEhJ95fbq4M6J0sd", 445 | type: "function", 446 | function: { name: "select_choice", arguments: '{"choice":"No"}' }, 447 | }, 448 | ], 449 | refusal: null, 450 | }, 451 | logprobs: null, 452 | finish_reason: "stop", 453 | }, 454 | ], 455 | usage: { 456 | prompt_tokens: 170, 457 | completion_tokens: 6, 458 | total_tokens: 176, 459 | prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, 460 | completion_tokens_details: { 461 | reasoning_tokens: 0, 462 | audio_tokens: 0, 463 | accepted_prediction_tokens: 0, 464 | rejected_prediction_tokens: 0, 465 | }, 466 | }, 467 | service_tier: "default", 468 | system_fingerprint: "fp_eb9dce56a8", 469 | }, 470 | ]; 471 | -------------------------------------------------------------------------------- /js/llm.test.ts: -------------------------------------------------------------------------------- 1 | import { bypass, http, HttpResponse } from "msw"; 2 | import { setupServer } from "msw/node"; 3 | import { OpenAI } from "openai"; 4 | import { ChatCompletionMessageParam } from "openai/resources"; 5 | import { afterAll, afterEach, beforeAll, describe, expect, test } from "vitest"; 6 | import { 7 | Battle, 8 | buildClassificationTools, 9 | LLMClassifierFromTemplate, 10 | OpenAIClassifier, 11 | } from "../js/llm"; 12 | import { 13 | openaiClassifierShouldEvaluateArithmeticExpressions, 14 | openaiClassifierShouldEvaluateTitles, 15 | openaiClassifierShouldEvaluateTitlesWithCoT, 16 | } from "./llm.fixtures"; 17 | import { init } from "./oai"; 18 | 19 | export const server = setupServer(); 20 | 21 | beforeAll(() => { 22 | server.listen({ 23 | onUnhandledRequest: (req) => { 24 | throw new Error(`Unhandled request ${req.method}, ${req.url}`); 25 | }, 26 | }); 27 | 28 | init({ 29 | client: new OpenAI({ 30 | apiKey: "test-api-key", 31 | baseURL: "https://api.openai.com/v1", 32 | }), 33 | }); 34 | }); 35 | 36 | afterEach(() => { 37 | server.resetHandlers(); 38 | }); 39 | 40 | afterAll(() => { 41 | server.close(); 42 | init(); 43 | }); 44 | 45 | describe("LLM Tests", () => { 46 | test("openai classifier should evaluate titles", async () => { 47 | let callCount = -1; 48 | server.use( 49 | http.post("https://api.openai.com/v1/chat/completions", async () => { 50 | const response = openaiClassifierShouldEvaluateTitles[++callCount]; 51 | return response 52 | ? HttpResponse.json(response) 53 | : HttpResponse.json({}, { status: 500 }); 54 | }), 55 | ); 56 | 57 | const messages: ChatCompletionMessageParam[] = [ 58 | { 59 | role: "system", 60 | content: `You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 
61 | You will look at the issue description, and pick which of two titles better describes it.`, 62 | }, 63 | { 64 | role: "user", 65 | content: `I'm going to provide you with the issue description, and two possible titles. 66 | 67 | Issue Description: {{page_content}} 68 | 69 | 1: {{output}} 70 | 2: {{expected}} 71 | 72 | Please discuss each title briefly (one line for pros, one for cons), and then answer the question by calling 73 | the select_choice function with "1" or "2".`, 74 | }, 75 | ]; 76 | 77 | const page_content = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 78 | 79 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 80 | 81 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification`; 82 | 83 | const output = `Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX`; 84 | const expected = `Good title`; 85 | 86 | const score = await OpenAIClassifier({ 87 | name: "titles", 88 | output, 89 | expected, 90 | messages, 91 | model: "gpt-3.5-turbo", 92 | parseScoreFn: (grade: string) => grade.match(/Winner: (\d+)/)![1], 93 | choiceScores: { "1": 1, "2": 0 }, 94 | classificationTools: buildClassificationTools(true, ["1", "2"]), 95 | page_content, 96 | maxTokens: 500, 97 | openAiApiKey: "test-api-key", 98 | }); 99 | 100 | expect(score.error).toBeUndefined(); 101 | }); 102 | 103 | test("llm classifier should evaluate with and without chain of thought", async () => { 104 | let callCount = -1; 105 | server.use( 106 | http.post( 107 | "https://api.openai.com/v1/chat/completions", 108 | async ({ request }) => { 109 | const response = 110 | openaiClassifierShouldEvaluateTitlesWithCoT[++callCount]; 111 | 112 | if (!response) { 113 | const res = await fetch(bypass(request)); 114 | const body = await res.json(); 115 | return HttpResponse.json(body, { 116 | status: res.status, 117 | headers: res.headers, 118 | }); 119 | } 120 | 121 | return response 122 | ? HttpResponse.json(response) 123 | : HttpResponse.json({}, { status: 500 }); 124 | }, 125 | ), 126 | ); 127 | 128 | const pageContent = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 129 | 130 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 131 | 132 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification`; 133 | const genTitle = `Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX`; 134 | const originalTitle = `Good title`; 135 | 136 | for (const useCoT of [true, false]) { 137 | const classifier = LLMClassifierFromTemplate<{ page_content: string }>({ 138 | name: "titles", 139 | promptTemplate: `You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 140 | You will look at the issue description, and pick which of two titles better describes it. 
141 | 142 | I'm going to provide you with the issue description, and two possible titles. 143 | 144 | Issue Description: {{page_content}} 145 | 146 | 1: {{output}} 147 | 2: {{expected}}`, 148 | choiceScores: { "1": 1, "2": 0 }, 149 | useCoT, 150 | }); 151 | 152 | let response = await classifier({ 153 | output: genTitle, 154 | expected: originalTitle, 155 | page_content: pageContent, 156 | openAiApiKey: "test-api-key", 157 | }); 158 | 159 | expect(response.error).toBeUndefined(); 160 | 161 | response = await classifier({ 162 | output: originalTitle, 163 | expected: genTitle, 164 | page_content: pageContent, 165 | openAiApiKey: "test-api-key", 166 | }); 167 | 168 | expect(response.error).toBeUndefined(); 169 | } 170 | }); 171 | 172 | test("battle should evaluate arithmetic expressions", async () => { 173 | let callCount = -1; 174 | server.use( 175 | http.post("https://api.openai.com/v1/chat/completions", async () => { 176 | const response = 177 | openaiClassifierShouldEvaluateArithmeticExpressions[++callCount]; 178 | 179 | return response 180 | ? HttpResponse.json(response) 181 | : HttpResponse.json({}, { status: 500 }); 182 | }), 183 | ); 184 | 185 | // reset the client to test direct client usage 186 | init(); 187 | 188 | const client = new OpenAI({ 189 | apiKey: "test-api-key", 190 | baseURL: "https://api.openai.com/v1", 191 | }); 192 | 193 | for (const useCoT of [true, false]) { 194 | let response = await Battle({ 195 | useCoT, 196 | instructions: "Add the following numbers: 1, 2, 3", 197 | output: "600", 198 | expected: "6", 199 | client, 200 | }); 201 | 202 | expect(response.error).toBeUndefined(); 203 | 204 | response = await Battle({ 205 | useCoT, 206 | instructions: "Add the following numbers: 1, 2, 3", 207 | output: "6", 208 | expected: "600", 209 | client, 210 | }); 211 | 212 | expect(response.error).toBeUndefined(); 213 | 214 | response = await Battle({ 215 | useCoT, 216 | instructions: "Add the following numbers: 1, 2, 3", 217 | output: "6", 218 | expected: "6", 219 | client, 220 | }); 221 | 222 | expect(response.error).toBeUndefined(); 223 | } 224 | }); 225 | }); 226 | -------------------------------------------------------------------------------- /js/llm.ts: -------------------------------------------------------------------------------- 1 | import { Score, Scorer, ScorerArgs } from "@braintrust/core"; 2 | import { ChatCache, OpenAIAuth, cachedChatCompletion } from "./oai"; 3 | import { ModelGradedSpec, templates } from "./templates"; 4 | import { 5 | ChatCompletionMessage, 6 | ChatCompletionMessageParam, 7 | ChatCompletionTool, 8 | } from "openai/resources"; 9 | import { makePartial, ScorerWithPartial } from "./partial"; 10 | import { renderMessages } from "./render-messages"; 11 | 12 | const NO_COT_SUFFIX = 13 | "Answer the question by calling `select_choice` with a single choice from {{__choices}}."; 14 | 15 | const COT_SUFFIX = 16 | "Answer the question by calling `select_choice` with your reasoning in a step-by-step manner to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. 
Select a single choice by setting the `choice` parameter to a single choice from {{__choices}}."; 17 | 18 | export type LLMArgs = { 19 | maxTokens?: number; 20 | temperature?: number; 21 | } & OpenAIAuth; 22 | 23 | export const DEFAULT_MODEL = "gpt-4o"; 24 | 25 | const PLAIN_RESPONSE_SCHEMA = { 26 | properties: { 27 | choice: { description: "The choice", title: "Choice", type: "string" }, 28 | }, 29 | required: ["choice"], 30 | title: "FunctionResponse", 31 | type: "object", 32 | }; 33 | 34 | const COT_RESPONSE_SCHEMA = { 35 | properties: { 36 | reasons: { 37 | description: 38 | "Write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.", 39 | title: "Reasoning", 40 | type: "string", 41 | }, 42 | choice: { description: "The choice", title: "Choice", type: "string" }, 43 | }, 44 | required: ["reasons", "choice"], 45 | title: "CoTResponse", 46 | type: "object", 47 | }; 48 | 49 | export function buildClassificationTools( 50 | useCoT: boolean, 51 | choiceStrings: string[], 52 | ): ChatCompletionTool[] { 53 | const params = useCoT ? COT_RESPONSE_SCHEMA : PLAIN_RESPONSE_SCHEMA; 54 | const enumParams = { 55 | ...params, 56 | properties: { 57 | ...params.properties, 58 | choice: { ...params.properties.choice, enum: choiceStrings }, 59 | }, 60 | }; 61 | return [ 62 | { 63 | type: "function", 64 | function: { 65 | name: "select_choice", 66 | description: "Call this function to select a choice.", 67 | parameters: enumParams, 68 | }, 69 | }, 70 | ]; 71 | } 72 | 73 | export type OpenAIClassifierArgs = { 74 | name: string; 75 | model: string; 76 | messages: ChatCompletionMessageParam[]; 77 | choiceScores: Record; 78 | classificationTools: ChatCompletionTool[]; 79 | cache?: ChatCache; 80 | } & LLMArgs & 81 | RenderArgs; 82 | 83 | export async function OpenAIClassifier( 84 | args: ScorerArgs>, 85 | ): Promise { 86 | const { 87 | name, 88 | output, 89 | expected, 90 | openAiApiKey, 91 | openAiOrganizationId, 92 | openAiBaseUrl, 93 | openAiDefaultHeaders, 94 | openAiDangerouslyAllowBrowser, 95 | azureOpenAi, 96 | client, 97 | ...remaining 98 | } = args; 99 | 100 | const { 101 | messages: messagesArg, 102 | model, 103 | choiceScores, 104 | classificationTools: classificationTools, 105 | maxTokens, 106 | temperature, 107 | cache, 108 | ...remainingRenderArgs 109 | } = remaining; 110 | 111 | const extraArgs = { 112 | temperature: temperature || 0, 113 | max_tokens: maxTokens, 114 | }; 115 | 116 | const renderArgs = { 117 | output, 118 | expected, 119 | ...remainingRenderArgs, 120 | }; 121 | 122 | const messages = renderMessages(messagesArg, renderArgs); 123 | 124 | const resp = await cachedChatCompletion( 125 | { 126 | model, 127 | messages, 128 | tools: classificationTools, 129 | tool_choice: { 130 | type: "function", 131 | function: { 132 | name: "select_choice", 133 | }, 134 | }, 135 | ...extraArgs, 136 | }, 137 | client 138 | ? 
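// Exactly one option shape is forwarded to cachedChatCompletion: either the caller's
// pre-built client, or the (deprecated) per-call auth fields from which
// buildOpenAIClient will construct one.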
{ client } 139 | : { 140 | cache, 141 | openAiApiKey, 142 | openAiOrganizationId, 143 | openAiBaseUrl, 144 | openAiDefaultHeaders, 145 | openAiDangerouslyAllowBrowser, 146 | azureOpenAi, 147 | }, 148 | ); 149 | 150 | if (resp.choices.length > 0) { 151 | return { 152 | name, 153 | ...parseResponse(resp.choices[0].message!, choiceScores), 154 | }; 155 | } else { 156 | throw new Error("Empty response from OpenAI"); 157 | } 158 | } 159 | 160 | function parseResponse( 161 | resp: ChatCompletionMessage, 162 | choiceScores: Record, 163 | ): Omit { 164 | let score = 0; 165 | const metadata: Record = {}; 166 | 167 | if (!resp.tool_calls || resp.tool_calls.length === 0) { 168 | throw new Error("No tool calls in response"); 169 | } 170 | const toolCall = resp.tool_calls[0]; 171 | if (toolCall.function.name !== "select_choice") { 172 | throw new Error("Unexpected tool call"); 173 | } 174 | 175 | const args = JSON.parse(toolCall.function.arguments); 176 | metadata["rationale"] = args["reasons"]; 177 | const choice = args["choice"]?.trim(); 178 | metadata["choice"] = choice; 179 | if (choice && choiceScores[choice] !== undefined) { 180 | score = choiceScores[choice]; 181 | } else { 182 | throw new Error(`Unknown score choice ${choice}`); 183 | } 184 | return { 185 | score, 186 | metadata, 187 | }; 188 | } 189 | 190 | export type LLMClassifierArgs = { 191 | model?: string; 192 | useCoT?: boolean; 193 | } & LLMArgs & 194 | RenderArgs; 195 | 196 | export function LLMClassifierFromTemplate({ 197 | name, 198 | promptTemplate, 199 | choiceScores, 200 | model = DEFAULT_MODEL, 201 | useCoT: useCoTArg, 202 | temperature, 203 | }: { 204 | name: string; 205 | promptTemplate: string; 206 | choiceScores: Record; 207 | model?: string; 208 | useCoT?: boolean; 209 | temperature?: number; 210 | }): Scorer> { 211 | const choiceStrings = Object.keys(choiceScores); 212 | const ret = async ( 213 | runtimeArgs: ScorerArgs>, 214 | ) => { 215 | const useCoT = runtimeArgs.useCoT ?? useCoTArg ?? true; 216 | 217 | const prompt = 218 | promptTemplate + "\n" + (useCoT ? 
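// useCoT resolves as: runtime arg, then template arg, then true. The CoT suffix asks the
// model to fill in the `reasons` field of `select_choice` before committing to a choice.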
COT_SUFFIX : NO_COT_SUFFIX); 219 | 220 | const maxTokens = 512; 221 | const messages: ChatCompletionMessageParam[] = [ 222 | { 223 | role: "user", 224 | content: prompt, 225 | }, 226 | ]; 227 | 228 | return await OpenAIClassifier({ 229 | name, 230 | messages, 231 | choiceScores, 232 | classificationTools: buildClassificationTools(useCoT, choiceStrings), 233 | model, 234 | maxTokens, 235 | temperature, 236 | __choices: choiceStrings, 237 | ...runtimeArgs, 238 | 239 | // Since the logic is a bit funky for computing this, include 240 | // it at the end to prevent overrides 241 | useCoT, 242 | }); 243 | }; 244 | Object.defineProperty(ret, "name", { 245 | value: name, 246 | configurable: true, 247 | }); 248 | 249 | return ret; 250 | } 251 | 252 | export function LLMClassifierFromSpec( 253 | name: string, 254 | spec: ModelGradedSpec, 255 | ): Scorer> { 256 | return LLMClassifierFromTemplate({ 257 | name, 258 | promptTemplate: spec.prompt, 259 | choiceScores: spec.choice_scores, 260 | model: spec.model, 261 | useCoT: spec.use_cot, 262 | temperature: spec.temperature, 263 | }); 264 | } 265 | 266 | export function LLMClassifierFromSpecFile( 267 | name: string, 268 | templateName: keyof typeof templates, 269 | ): Scorer> { 270 | const doc = templates[templateName]; 271 | return LLMClassifierFromSpec(name, doc); 272 | } 273 | 274 | function buildLLMClassifier( 275 | name: string, 276 | templateName: keyof typeof templates, 277 | ): ScorerWithPartial> { 278 | if (!(templateName in templates)) { 279 | throw new Error(`Model template ${name} not found`); 280 | } 281 | 282 | return makePartial( 283 | LLMClassifierFromSpecFile( 284 | name, 285 | templateName as keyof typeof templates, 286 | ), 287 | name, 288 | ); 289 | } 290 | 291 | /** 292 | * Test whether an output _better_ performs the `instructions` than the original 293 | * (expected) value. 294 | */ 295 | export const Battle = buildLLMClassifier<{ instructions: string }>( 296 | "Battle", 297 | "battle", 298 | ); 299 | 300 | /** 301 | * Test whether an output answers the `input` using knowledge built into the model. 302 | * You can specify `criteria` to further constrain the answer. 303 | */ 304 | export const ClosedQA = buildLLMClassifier<{ input: string; criteria: any }>( 305 | "ClosedQA", 306 | "closed_q_a", 307 | ); 308 | 309 | /** 310 | * Test whether an output is funny. 311 | */ 312 | export const Humor = buildLLMClassifier<{}>("Humor", "humor"); 313 | 314 | /** 315 | * Test whether an output is factual, compared to an original (`expected`) value. 316 | */ 317 | export const Factuality = buildLLMClassifier<{ 318 | input: string; 319 | output: string; 320 | expected?: string; 321 | }>("Factuality", "factuality"); 322 | 323 | /** 324 | * Test whether an output is a possible solution to the challenge posed in the input. 325 | */ 326 | export const Possible = buildLLMClassifier<{ input: string }>( 327 | "Possible", 328 | "possible", 329 | ); 330 | 331 | /** 332 | * Test whether an output is malicious. 333 | */ 334 | export const Security = buildLLMClassifier<{}>("Security", "security"); 335 | 336 | /** 337 | * Test whether a SQL query is semantically the same as a reference (output) query. 338 | */ 339 | export const Sql = buildLLMClassifier<{ input: string }>("Sql", "sql"); 340 | 341 | /** 342 | * Test whether an output is a better summary of the `input` than the original (`expected`) value. 
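 *
 * Example sketch (hypothetical variable names; assumes an OpenAI API key or an initialized
 * client, as with the other LLM classifiers):
 *
 *   const result = await Summary({ input: articleText, output: candidateSummary, expected: referenceSummary });
 *   // result.score reflects the graded preference per templates/summary.yaml's choice_scores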
343 | */ 344 | export const Summary = buildLLMClassifier<{ input: string }>( 345 | "Summary", 346 | "summary", 347 | ); 348 | 349 | /** 350 | * Test whether an `output` is as good of a translation of the `input` in the specified `language` 351 | * as an expert (`expected`) value. 352 | */ 353 | export const Translation = buildLLMClassifier<{ 354 | language: string; 355 | input: string; 356 | }>("Translation", "translation"); 357 | -------------------------------------------------------------------------------- /js/manifest.ts: -------------------------------------------------------------------------------- 1 | import { JSONDiff, ValidJSON } from "./json"; 2 | import { 3 | Battle, 4 | ClosedQA, 5 | Factuality, 6 | Humor, 7 | Possible, 8 | Security, 9 | Sql, 10 | Summary, 11 | Translation, 12 | } from "./llm"; 13 | import { NumericDiff } from "./number"; 14 | import { EmbeddingSimilarity, Levenshtein } from "./string"; 15 | import { 16 | ContextEntityRecall, 17 | ContextRelevancy, 18 | ContextRecall, 19 | ContextPrecision, 20 | AnswerRelevancy, 21 | AnswerSimilarity, 22 | AnswerCorrectness, 23 | } from "./ragas"; 24 | import { ListContains } from "./list"; 25 | import { ScorerWithPartial } from "./partial"; 26 | import { Moderation } from "./moderation"; 27 | import { ExactMatch } from "./value"; 28 | import { ModelGradedSpec, templates } from "./templates"; 29 | 30 | interface AutoevalMethod { 31 | method: ScorerWithPartial; 32 | description: string; 33 | template?: ModelGradedSpec; 34 | requiresExtraParams?: boolean; 35 | } 36 | 37 | export const Evaluators: { 38 | label: string; 39 | methods: AutoevalMethod[]; 40 | }[] = [ 41 | { 42 | label: "LLM-as-a-Judge", 43 | methods: [ 44 | { 45 | method: Battle, 46 | description: 47 | "Test whether an output _better_ performs the `instructions` than the original (expected) value.", 48 | template: templates.battle, 49 | requiresExtraParams: true, 50 | }, 51 | { 52 | method: ClosedQA, 53 | description: 54 | "Test whether an output answers the `input` using knowledge built into the model. 
You can specify `criteria` to further constrain the answer.", 55 | template: templates.closed_q_a, 56 | requiresExtraParams: true, 57 | }, 58 | { 59 | method: Humor, 60 | description: "Test whether an output is funny.", 61 | template: templates.humor, 62 | }, 63 | { 64 | method: Factuality, 65 | description: 66 | "Test whether an output is factual, compared to an original (`expected`) value.", 67 | template: templates.factuality, 68 | }, 69 | { 70 | method: Moderation, 71 | description: 72 | "A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content.", 73 | }, 74 | { 75 | method: Possible, 76 | description: 77 | "Test whether an output is a possible solution to the challenge posed in the input.", 78 | template: templates.possible, 79 | }, 80 | { 81 | method: Security, 82 | description: "Test whether an output is malicious.", 83 | template: templates.security, 84 | }, 85 | { 86 | method: Sql, 87 | description: 88 | "Test whether a SQL query is semantically the same as a reference (output) query.", 89 | template: templates.sql, 90 | }, 91 | { 92 | method: Summary, 93 | description: 94 | "Test whether an output is a better summary of the `input` than the original (`expected`) value.", 95 | template: templates.summary, 96 | }, 97 | { 98 | method: Translation, 99 | description: 100 | "Test whether an `output` is as good of a translation of the `input` in the specified `language` as an expert (`expected`) value.", 101 | template: templates.translation, 102 | requiresExtraParams: true, 103 | }, 104 | ], 105 | }, 106 | { 107 | label: "RAG", 108 | methods: [ 109 | { 110 | method: ContextEntityRecall, 111 | description: 112 | "Estimates context recall by estimating TP and FN using annotated answer and retrieved context.", 113 | requiresExtraParams: true, 114 | }, 115 | { 116 | method: ContextRelevancy, 117 | description: 118 | "Extracts relevant sentences from the provided context that are absolutely required to answer the given question.", 119 | requiresExtraParams: true, 120 | }, 121 | { 122 | method: ContextRecall, 123 | description: 124 | "Analyzes each sentence in the answer and classifies if the sentence can be attributed to the given context or not.", 125 | requiresExtraParams: true, 126 | }, 127 | { 128 | method: ContextPrecision, 129 | description: 130 | "Verifies if the context was useful in arriving at the given answer.", 131 | requiresExtraParams: true, 132 | }, 133 | { 134 | method: AnswerRelevancy, 135 | description: 136 | "Scores the relevancy of the generated answer to the given question.", 137 | requiresExtraParams: true, 138 | }, 139 | { 140 | method: AnswerSimilarity, 141 | description: 142 | "Scores the semantic similarity between the generated answer and ground truth.", 143 | requiresExtraParams: true, 144 | }, 145 | { 146 | method: AnswerCorrectness, 147 | description: 148 | "Measures answer correctness compared to ground truth using a weighted average of factuality and semantic similarity.", 149 | requiresExtraParams: true, 150 | }, 151 | ], 152 | }, 153 | { 154 | label: "Composite", 155 | methods: [ 156 | { 157 | method: ListContains, 158 | description: 159 | "Semantically evaluates the overlap between two lists of strings using pairwise similarity and Linear Sum Assignment.", 160 | }, 161 | { 162 | method: ValidJSON, 163 | description: 164 | "Evaluates the validity of JSON output, optionally validating against a JSON Schema definition.", 165 | }, 166 | ], 167 | }, 168 | { 169 | label: "Embeddings", 170 | methods: [ 171 | { 172 | 
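// Unlike the heuristic scorers further down, embedding similarity calls the OpenAI
// embeddings endpoint, so it needs an API key or configured client (see js/string.ts).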
method: EmbeddingSimilarity, 173 | description: 174 | "Evaluates the semantic similarity between two embeddings using cosine distance.", 175 | }, 176 | ], 177 | }, 178 | { 179 | label: "Heuristic", 180 | methods: [ 181 | { 182 | method: JSONDiff, 183 | description: 184 | "Compares JSON objects using customizable comparison methods for strings and numbers.", 185 | }, 186 | { 187 | method: Levenshtein, 188 | description: "Uses the Levenshtein distance to compare two strings.", 189 | }, 190 | { 191 | method: ExactMatch, 192 | description: 193 | "Compares two values for exact equality. If the values are objects, they are converted to JSON strings before comparison.", 194 | }, 195 | { 196 | method: NumericDiff, 197 | description: "Compares numbers by normalizing their difference.", 198 | }, 199 | ], 200 | }, 201 | ]; 202 | -------------------------------------------------------------------------------- /js/moderation.test.ts: -------------------------------------------------------------------------------- 1 | import { Moderation } from "./moderation"; 2 | import { describe, expect, test } from "vitest"; 3 | 4 | describe("Moderation", () => { 5 | const cases = [ 6 | { 7 | name: "Safe content without threshold", 8 | output: "hello", 9 | threshold: undefined, 10 | expected: 1, 11 | }, 12 | { 13 | name: "Unsafe content without threshold", 14 | output: "kill kill kill kill kill", 15 | threshold: undefined, 16 | expected: 0, 17 | }, 18 | { 19 | name: "Unsafe content with lower threshold", 20 | output: "here are my suicidal thoughts", 21 | threshold: 0.25, 22 | expected: 0, 23 | }, 24 | { 25 | name: "Unsafe content with higher threshold", 26 | output: "here are my suicidal thoughts", 27 | threshold: 0.99, 28 | expected: 1, 29 | }, 30 | ]; 31 | 32 | for (const { name, output, threshold, expected } of cases) { 33 | test(`passes ${name} check`, async () => { 34 | const result = await Moderation({ 35 | output, 36 | threshold, 37 | }); 38 | 39 | expect(result.score).toEqual(expected); 40 | }, 6000); 41 | } 42 | }); 43 | -------------------------------------------------------------------------------- /js/moderation.ts: -------------------------------------------------------------------------------- 1 | import { OpenAIAuth, buildOpenAIClient } from "./oai"; 2 | import { Moderation as ModerationResult } from "openai/resources"; 3 | import { makePartial, ScorerWithPartial } from "./partial"; 4 | 5 | const MODERATION_NAME = "Moderation"; 6 | 7 | function computeScore(result: ModerationResult, threshold?: number): number { 8 | if (threshold === undefined) { 9 | return result.flagged ? 0 : 1; 10 | } 11 | 12 | for (const key of Object.keys(result.category_scores)) { 13 | const score = 14 | result.category_scores[key as keyof typeof result.category_scores]; 15 | if (score > threshold) { 16 | return 0; 17 | } 18 | } 19 | 20 | return 1; 21 | } 22 | 23 | /** 24 | * A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content. 25 | * 26 | * @param args 27 | * @param args.threshold Optional. Threshold to use to determine whether content has exceeded threshold. By 28 | * default, it uses OpenAI's default. (Using `flagged` from the response payload.) 29 | * @param args.categories Optional. Specific categories to look for. If not set, all categories will 30 | * be considered. 31 | * @returns A score between 0 and 1, where 1 means content passed all moderation checks. 
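 *
 * @example
 * // Example sketch, mirroring the unit tests above (assumes an OpenAI API key or client):
 * // const result = await Moderation({ output: "here are my suicidal thoughts", threshold: 0.25 });
 * // result.score is 0 because a category score exceeds the threshold; with no threshold,
 * // OpenAI's own `flagged` field decides.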
32 | */ 33 | export const Moderation: ScorerWithPartial< 34 | string, 35 | { 36 | threshold?: number; 37 | } & OpenAIAuth 38 | > = makePartial(async (args) => { 39 | const threshold = args.threshold ?? undefined; 40 | const output = args.output; 41 | 42 | const openai = buildOpenAIClient(args); 43 | 44 | const moderationResults = await openai.moderations.create({ 45 | input: output, 46 | }); 47 | 48 | const result = moderationResults.results[0]; 49 | 50 | return { 51 | name: MODERATION_NAME, 52 | score: computeScore(result, threshold), 53 | metadata: { 54 | threshold, 55 | // @NOTE: `as unknown ...` is intentional. See https://stackoverflow.com/a/57280262 56 | category_scores: 57 | (result.category_scores as unknown as Record) || 58 | undefined, 59 | }, 60 | }; 61 | }, MODERATION_NAME); 62 | -------------------------------------------------------------------------------- /js/number.ts: -------------------------------------------------------------------------------- 1 | import { makePartial, ScorerWithPartial } from "./partial"; 2 | 3 | /** 4 | * A simple scorer that compares numbers by normalizing their difference. 5 | */ 6 | export const NumericDiff: ScorerWithPartial = makePartial( 7 | async (args) => { 8 | const { output, expected } = args; 9 | 10 | if (expected === undefined) { 11 | throw new Error("NumericDiff requires an expected value"); 12 | } 13 | 14 | const score = 15 | output === 0 && expected === 0 16 | ? 1 17 | : 1 - 18 | Math.abs(expected - output) / (Math.abs(expected) + Math.abs(output)); 19 | 20 | return { 21 | name: "NumericDiff", 22 | score, 23 | }; 24 | }, 25 | "NumericDiff", 26 | ); 27 | -------------------------------------------------------------------------------- /js/oai.test.ts: -------------------------------------------------------------------------------- 1 | import { http, HttpResponse } from "msw"; 2 | import OpenAI from "openai"; 3 | import { 4 | afterAll, 5 | afterEach, 6 | beforeAll, 7 | beforeEach, 8 | describe, 9 | expect, 10 | test, 11 | vi, 12 | } from "vitest"; 13 | import { buildOpenAIClient, init } from "./oai"; 14 | 15 | import { setupServer } from "msw/node"; 16 | 17 | export const server = setupServer(); 18 | 19 | beforeAll(() => { 20 | server.listen({ 21 | onUnhandledRequest: (req) => { 22 | throw new Error(`Unhandled request ${req.method}, ${req.url}`); 23 | }, 24 | }); 25 | }); 26 | 27 | let OPENAI_API_KEY: string | undefined; 28 | let OPENAI_BASE_URL: string | undefined; 29 | 30 | beforeEach(() => { 31 | OPENAI_API_KEY = process.env.OPENAI_API_KEY; 32 | OPENAI_BASE_URL = process.env.OPENAI_BASE_URL; 33 | }); 34 | 35 | afterEach(() => { 36 | server.resetHandlers(); 37 | 38 | process.env.OPENAI_API_KEY = OPENAI_API_KEY; 39 | process.env.OPENAI_BASE_URL = OPENAI_BASE_URL; 40 | }); 41 | 42 | afterAll(() => { 43 | server.close(); 44 | }); 45 | 46 | const MOCK_OPENAI_COMPLETION_RESPONSE = { 47 | choices: [ 48 | { 49 | message: { 50 | content: "Hello, I am a mock response!", 51 | role: "assistant", 52 | }, 53 | finish_reason: "stop", 54 | index: 0, 55 | }, 56 | ], 57 | created: Date.now(), 58 | id: "mock-id", 59 | model: "mock-model", 60 | object: "chat.completion", 61 | usage: { 62 | completion_tokens: 9, 63 | prompt_tokens: 5, 64 | total_tokens: 14, 65 | }, 66 | }; 67 | 68 | describe("OAI", () => { 69 | test("should use Azure OpenAI", async () => { 70 | server.use( 71 | http.post( 72 | "https://*.openai.azure.com/openai/deployments/*/chat/completions*", 73 | () => { 74 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 75 | }, 76 | ), 77 
| ); 78 | 79 | const client = buildOpenAIClient({ 80 | azureOpenAi: { 81 | apiKey: "test-api-key", 82 | endpoint: "https://test-resource.openai.azure.com", 83 | apiVersion: "2024-02-15-preview", 84 | }, 85 | }); 86 | 87 | const response = await client.chat.completions.create({ 88 | model: "test-model", 89 | messages: [{ role: "system", content: "Hello" }], 90 | }); 91 | 92 | expect(response.choices[0].message.content).toBe( 93 | "Hello, I am a mock response!", 94 | ); 95 | expect(response.choices).toHaveLength(1); 96 | }); 97 | 98 | test("should use regular OpenAI", async () => { 99 | server.use( 100 | http.post("https://api.openai.com/v1/chat/completions", () => { 101 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 102 | }), 103 | ); 104 | 105 | const client = buildOpenAIClient({ 106 | openAiApiKey: "test-api-key", 107 | openAiBaseUrl: "https://api.openai.com/v1", 108 | }); 109 | 110 | const response = await client.chat.completions.create({ 111 | model: "gpt-4", 112 | messages: [{ role: "user", content: "Hello" }], 113 | }); 114 | 115 | expect(response.choices[0].message.content).toBe( 116 | "Hello, I am a mock response!", 117 | ); 118 | }); 119 | 120 | test("calls proxy if everything unset", async () => { 121 | delete process.env.OPENAI_API_KEY; 122 | delete process.env.OPENAI_BASE_URL; 123 | 124 | server.use( 125 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 126 | debugger; 127 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 128 | }), 129 | ); 130 | 131 | const client = buildOpenAIClient({}); 132 | const response = await client.chat.completions.create({ 133 | model: "gpt-4", 134 | messages: [{ role: "user", content: "Hello" }], 135 | }); 136 | 137 | debugger; 138 | 139 | expect(response.choices[0].message.content).toBe( 140 | "Hello, I am a mock response!", 141 | ); 142 | }); 143 | 144 | test("default wraps", async () => { 145 | delete process.env.OPENAI_API_KEY; 146 | delete process.env.OPENAI_BASE_URL; 147 | 148 | server.use( 149 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 150 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 151 | }), 152 | ); 153 | 154 | await withMockWrapper(async ({ createSpy }) => { 155 | const client = buildOpenAIClient({}); 156 | 157 | await client.chat.completions.create({ 158 | model: "gpt-4", 159 | messages: [{ role: "user", content: "Hello" }], 160 | }); 161 | 162 | expect(createSpy).toHaveBeenCalledTimes(1); 163 | expect(createSpy).toHaveBeenCalledWith({ 164 | model: "gpt-4", 165 | messages: [{ role: "user", content: "Hello" }], 166 | }); 167 | }); 168 | }); 169 | 170 | test("wraps once", async () => { 171 | delete process.env.OPENAI_API_KEY; 172 | delete process.env.OPENAI_BASE_URL; 173 | 174 | server.use( 175 | http.post("https://api.braintrust.dev/v1/proxy/chat/completions", () => { 176 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 177 | }), 178 | ); 179 | 180 | await withMockWrapper(async ({ wrapperMock, createSpy }) => { 181 | const client = wrapperMock( 182 | new OpenAI({ 183 | apiKey: "test-api-key", 184 | }), 185 | ); 186 | const builtClient = buildOpenAIClient({ client }); 187 | 188 | expect(builtClient).toBe(client); 189 | 190 | await builtClient.chat.completions.create({ 191 | model: "gpt-4", 192 | messages: [{ role: "user", content: "Hello" }], 193 | }); 194 | 195 | expect(createSpy).toHaveBeenCalledTimes(1); 196 | expect(createSpy).toHaveBeenCalledWith({ 197 | model: "gpt-4", 198 | messages: [{ role: "user", content: "Hello" }], 
199 | }); 200 | }); 201 | }); 202 | 203 | test("wraps client, if possible", async () => { 204 | server.use( 205 | http.post("https://api.openai.com/v1/chat/completions", () => { 206 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 207 | }), 208 | ); 209 | 210 | await withMockWrapper(async ({ wrapperMock, createSpy }) => { 211 | const client = new OpenAI({ apiKey: "test-api-key" }); 212 | const builtClient = buildOpenAIClient({ client }); 213 | 214 | await builtClient.chat.completions.create({ 215 | model: "gpt-4", 216 | messages: [{ role: "user", content: "Hello" }], 217 | }); 218 | 219 | expect(createSpy).toHaveBeenCalledTimes(1); 220 | expect(createSpy).toHaveBeenCalledWith({ 221 | model: "gpt-4", 222 | messages: [{ role: "user", content: "Hello" }], 223 | }); 224 | }); 225 | }); 226 | 227 | test("init sets client", async () => { 228 | server.use( 229 | http.post("https://api.openai.com/v1/chat/completions", () => { 230 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 231 | }), 232 | ); 233 | 234 | const client = new OpenAI({ apiKey: "test-api-key" }); 235 | 236 | init({ client }); 237 | 238 | const builtClient = buildOpenAIClient({}); 239 | 240 | expect(Object.is(builtClient, client)).toBe(true); 241 | }); 242 | 243 | test("client wins against init", async () => { 244 | server.use( 245 | http.post("https://api.openai.com/v1/chat/completions", () => { 246 | return HttpResponse.json(MOCK_OPENAI_COMPLETION_RESPONSE); 247 | }), 248 | ); 249 | 250 | const client = new OpenAI({ apiKey: "test-api-key" }); 251 | 252 | init({ client }); 253 | 254 | const otherClient = new OpenAI({ apiKey: "other-api-key" }); 255 | 256 | const builtClient = buildOpenAIClient({ client: otherClient }); 257 | 258 | expect(Object.is(builtClient, otherClient)).toBe(true); 259 | }); 260 | }); 261 | 262 | const withMockWrapper = async ( 263 | fn: (args: { 264 | wrapperMock: (client: any) => any; 265 | createSpy: ReturnType; 266 | }) => Promise, 267 | ) => { 268 | const createSpy = vi.fn(); 269 | const wrapperMock = (client: any) => { 270 | return new Proxy(client, { 271 | get(target, prop) { 272 | if (prop === "chat") { 273 | return new Proxy( 274 | {}, 275 | { 276 | get(target, prop) { 277 | if (prop === "completions") { 278 | return new Proxy( 279 | {}, 280 | { 281 | get(target, prop) { 282 | if (prop === "create") { 283 | return createSpy; 284 | } 285 | return Reflect.get(target, prop); 286 | }, 287 | }, 288 | ); 289 | } 290 | return Reflect.get(target, prop); 291 | }, 292 | }, 293 | ); 294 | } 295 | return Reflect.get(target, prop); 296 | }, 297 | }); 298 | }; 299 | 300 | const originalWrapper = globalThis.__inherited_braintrust_wrap_openai; 301 | try { 302 | globalThis.__inherited_braintrust_wrap_openai = wrapperMock; 303 | await fn({ wrapperMock, createSpy }); 304 | } finally { 305 | globalThis.__inherited_braintrust_wrap_openai = originalWrapper; 306 | } 307 | }; 308 | -------------------------------------------------------------------------------- /js/oai.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChatCompletion, 3 | ChatCompletionMessageParam, 4 | ChatCompletionTool, 5 | ChatCompletionToolChoiceOption, 6 | } from "openai/resources"; 7 | import { AzureOpenAI, OpenAI } from "openai"; 8 | 9 | export interface CachedLLMParams { 10 | /** 11 | Model to use for the completion. 12 | Note: If using Azure OpenAI, this should be the deployment name.. 
13 | */ 14 | model: string; 15 | messages: ChatCompletionMessageParam[]; 16 | tools?: ChatCompletionTool[]; 17 | tool_choice?: ChatCompletionToolChoiceOption; 18 | temperature?: number; 19 | max_tokens?: number; 20 | span_info?: { 21 | spanAttributes?: Record; 22 | }; 23 | } 24 | 25 | export interface ChatCache { 26 | get(params: CachedLLMParams): Promise; 27 | set(params: CachedLLMParams, response: ChatCompletion): Promise; 28 | } 29 | 30 | export type OpenAIAuth = 31 | | { 32 | /** @deprecated Use the `client` option instead */ 33 | openAiApiKey?: string; 34 | /** @deprecated Use the `client` option instead */ 35 | openAiOrganizationId?: string; 36 | /** @deprecated Use the `client` option instead */ 37 | openAiBaseUrl?: string; 38 | /** @deprecated Use the `client` option instead */ 39 | openAiDefaultHeaders?: Record; 40 | /** @deprecated Use the `client` option instead */ 41 | openAiDangerouslyAllowBrowser?: boolean; 42 | /** @deprecated Use the `client` option instead */ 43 | azureOpenAi?: AzureOpenAiAuth; 44 | client?: never; 45 | } 46 | | { 47 | client: OpenAI; 48 | /** @deprecated Use the `client` option instead */ 49 | openAiApiKey?: never; 50 | /** @deprecated Use the `client` option instead */ 51 | openAiOrganizationId?: never; 52 | /** @deprecated Use the `client` option instead */ 53 | openAiBaseUrl?: never; 54 | /** @deprecated Use the `client` option instead */ 55 | openAiDefaultHeaders?: never; 56 | /** @deprecated Use the `client` option instead */ 57 | openAiDangerouslyAllowBrowser?: never; 58 | /** @deprecated Use the `client` option instead */ 59 | azureOpenAi?: never; 60 | }; 61 | 62 | export interface AzureOpenAiAuth { 63 | apiKey: string; 64 | endpoint: string; 65 | apiVersion: string; 66 | } 67 | 68 | export function extractOpenAIArgs>( 69 | args: OpenAIAuth & T, 70 | ): OpenAIAuth { 71 | return args.client 72 | ? 
{ client: args.client } 73 | : { 74 | openAiApiKey: args.openAiApiKey, 75 | openAiOrganizationId: args.openAiOrganizationId, 76 | openAiBaseUrl: args.openAiBaseUrl, 77 | openAiDefaultHeaders: args.openAiDefaultHeaders, 78 | openAiDangerouslyAllowBrowser: args.openAiDangerouslyAllowBrowser, 79 | azureOpenAi: args.azureOpenAi, 80 | }; 81 | } 82 | 83 | const PROXY_URL = "https://api.braintrust.dev/v1/proxy"; 84 | 85 | const resolveOpenAIClient = (options: OpenAIAuth): OpenAI => { 86 | const { 87 | openAiApiKey, 88 | openAiOrganizationId, 89 | openAiBaseUrl, 90 | openAiDefaultHeaders, 91 | openAiDangerouslyAllowBrowser, 92 | azureOpenAi, 93 | } = options; 94 | 95 | if (options.client) { 96 | return options.client; 97 | } 98 | 99 | if (globalThis.__client) { 100 | return globalThis.__client; 101 | } 102 | 103 | if (azureOpenAi) { 104 | // if not unset will could raise an exception 105 | delete process.env.OPENAI_BASE_URL; 106 | 107 | return new AzureOpenAI({ 108 | apiKey: azureOpenAi.apiKey, 109 | endpoint: azureOpenAi.endpoint, 110 | apiVersion: azureOpenAi.apiVersion, 111 | defaultHeaders: openAiDefaultHeaders, 112 | dangerouslyAllowBrowser: openAiDangerouslyAllowBrowser, 113 | }); 114 | } 115 | 116 | return new OpenAI({ 117 | apiKey: 118 | openAiApiKey || 119 | process.env.OPENAI_API_KEY || 120 | process.env.BRAINTRUST_API_KEY, 121 | organization: openAiOrganizationId, 122 | baseURL: openAiBaseUrl || process.env.OPENAI_BASE_URL || PROXY_URL, 123 | defaultHeaders: openAiDefaultHeaders, 124 | dangerouslyAllowBrowser: openAiDangerouslyAllowBrowser, 125 | }); 126 | }; 127 | 128 | const isWrapped = (client: OpenAI): boolean => { 129 | const Constructor = Object.getPrototypeOf(client).constructor; 130 | const clean = new Constructor({ apiKey: "dummy" }); 131 | return ( 132 | String(client.chat.completions.create) !== 133 | String(clean.chat.completions.create) 134 | ); 135 | }; 136 | 137 | export function buildOpenAIClient(options: OpenAIAuth): OpenAI { 138 | const client = resolveOpenAIClient(options); 139 | 140 | // avoid re-wrapping if the client is already wrapped (proxied) 141 | if (globalThis.__inherited_braintrust_wrap_openai && !isWrapped(client)) { 142 | return globalThis.__inherited_braintrust_wrap_openai(client); 143 | } 144 | 145 | return client; 146 | } 147 | 148 | declare global { 149 | /* eslint-disable no-var */ 150 | var __inherited_braintrust_wrap_openai: ((openai: any) => any) | undefined; 151 | var __client: OpenAI | undefined; 152 | } 153 | 154 | export const init = ({ client }: { client?: OpenAI } = {}) => { 155 | globalThis.__client = client; 156 | }; 157 | 158 | export async function cachedChatCompletion( 159 | params: CachedLLMParams, 160 | options: { cache?: ChatCache } & OpenAIAuth, 161 | ): Promise { 162 | const openai = buildOpenAIClient(options); 163 | 164 | const fullParams = globalThis.__inherited_braintrust_wrap_openai 165 | ? 
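// When the inherited Braintrust wrapper is active, tag the request with a
// `purpose: "scorer"` span attribute (merged with any caller-supplied span attributes);
// otherwise the params are forwarded unchanged.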
{ 166 | ...params, 167 | span_info: { 168 | spanAttributes: { 169 | ...params.span_info?.spanAttributes, 170 | purpose: "scorer", 171 | }, 172 | }, 173 | } 174 | : params; 175 | 176 | return await openai.chat.completions.create(fullParams); 177 | } 178 | -------------------------------------------------------------------------------- /js/partial.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { ClosedQA } from "./llm"; 3 | import { Levenshtein } from "./string"; 4 | 5 | test("Partial Test", async () => { 6 | const levenshteinBasic = await Levenshtein({ 7 | output: "abc", 8 | expected: "abcd", 9 | }); 10 | const levenshteinPartial = await Levenshtein.partial({ expected: "abcd" })({ 11 | output: "abc", 12 | }); 13 | expect(levenshteinBasic.score).toBeDefined(); 14 | expect(levenshteinPartial.score).toBeDefined(); 15 | expect(levenshteinPartial.score).toEqual(levenshteinBasic.score); 16 | expect(levenshteinBasic.name).toEqual(levenshteinPartial.name); 17 | expect(levenshteinBasic.name).toEqual("Levenshtein"); 18 | 19 | // Now do the same with ClosedQA which is an "LLM" scorer 20 | const closedQABasic = await ClosedQA({ 21 | criteria: "Is the answer correct?", 22 | input: "What is 1+1?", 23 | output: "2", 24 | }); 25 | const closedQAPartial = await ClosedQA.partial({ 26 | criteria: "Is the answer correct?", 27 | })({ 28 | input: "What is 1+1?", 29 | output: "2", 30 | }); 31 | expect(closedQABasic.score).toBeDefined(); 32 | expect(closedQAPartial.score).toBeDefined(); 33 | expect(closedQAPartial.score).toEqual(closedQABasic.score); 34 | expect(closedQABasic.name).toEqual(closedQAPartial.name); 35 | expect(closedQABasic.name).toEqual("ClosedQA"); 36 | }); 37 | -------------------------------------------------------------------------------- /js/partial.ts: -------------------------------------------------------------------------------- 1 | import { Scorer, ScorerArgs } from "@braintrust/core"; 2 | 3 | export interface ScorerWithPartial 4 | extends Scorer { 5 | partial: (args: { [K in T]: Extra[K] }) => Scorer< 6 | Output, 7 | Omit & Partial> 8 | >; 9 | } 10 | 11 | export function makePartial( 12 | fn: Scorer, 13 | name?: string, 14 | ): ScorerWithPartial { 15 | const ret: any = fn.bind({}); 16 | ret.partial = (args: Partial>) => { 17 | const newFn = (newArgs: ScorerArgs) => 18 | ret({ ...args, ...newArgs }); 19 | if (name) { 20 | Object.defineProperty(newFn, "name", { 21 | value: name, 22 | configurable: true, 23 | }); 24 | } 25 | return newFn; 26 | }; 27 | if (name) { 28 | Object.defineProperty(ret, "name", { 29 | value: name, 30 | configurable: true, 31 | }); 32 | } 33 | return ret; 34 | } 35 | -------------------------------------------------------------------------------- /js/ragas.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { 3 | AnswerCorrectness, 4 | AnswerRelevancy, 5 | AnswerSimilarity, 6 | ContextEntityRecall, 7 | ContextPrecision, 8 | ContextRecall, 9 | ContextRelevancy, 10 | Faithfulness, 11 | } from "./ragas"; 12 | 13 | const data = { 14 | input: "Can starred docs from different workspaces be accessed in one place?", 15 | output: 16 | "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 17 | expected: 18 | "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 19 | context: [ 20 | "Not all 
Coda docs are used in the same way. You'll inevitably have a few that you use every week, and some that you'll only use once. This is where starred docs can help you stay organized.\n\n\n\nStarring docs is a great way to mark docs of personal importance. After you star a doc, it will live in a section on your doc list called **[My Shortcuts](https://coda.io/shortcuts)**. All starred docs, even from multiple different workspaces, will live in this section.\n\n\n\nStarring docs only saves them to your personal My Shortcuts. It doesn\u2019t affect the view for others in your workspace. If you\u2019re wanting to shortcut docs not just for yourself but also for others in your team or workspace, you\u2019ll [use pinning](https://help.coda.io/en/articles/2865511-starred-pinned-docs) instead.", 21 | ], 22 | }; 23 | 24 | const retrievalMetrics = [ 25 | { scorer: ContextEntityRecall, score: 0.69525 }, 26 | { scorer: ContextRelevancy, score: 0.7423 }, 27 | { scorer: ContextRecall, score: 1 }, 28 | { scorer: ContextPrecision, score: 1 }, 29 | ]; 30 | 31 | test("Ragas retrieval test", async () => { 32 | for (const { scorer, score } of retrievalMetrics) { 33 | const actualScore = await scorer({ 34 | output: data.output, 35 | input: data.input, 36 | expected: data.expected, 37 | context: data.context, 38 | }); 39 | 40 | if (score === 1) { 41 | expect(actualScore.score).toBeCloseTo(score, 4); 42 | } 43 | } 44 | }, 600000); 45 | 46 | const generationMetrics = [ 47 | { scorer: AnswerRelevancy, score: 0.59 }, 48 | { scorer: Faithfulness, score: 1 }, 49 | ]; 50 | 51 | test("Ragas generation test", async () => { 52 | for (const { scorer, score } of generationMetrics) { 53 | const actualScore = await scorer({ 54 | input: data.input, 55 | output: data.output, 56 | expected: data.expected, 57 | context: data.context, 58 | temperature: 0, 59 | }); 60 | 61 | if (score === 1) { 62 | expect(actualScore.score).toBeCloseTo(score, 4); 63 | } 64 | } 65 | }, 600000); 66 | 67 | const endToEndMetrics = [ 68 | { scorer: AnswerSimilarity, score: 1 }, 69 | { scorer: AnswerCorrectness, score: 1 }, 70 | ]; 71 | 72 | test("Ragas end-to-end test", async () => { 73 | for (const { scorer, score } of endToEndMetrics) { 74 | const actualScore = await scorer({ 75 | input: data.input, 76 | output: data.output, 77 | expected: data.expected, 78 | context: data.context, 79 | }); 80 | 81 | if (score === 1) { 82 | expect(actualScore.score).toBeCloseTo(score, 4); 83 | expect(actualScore.score).toBeLessThanOrEqual(1); 84 | } 85 | } 86 | }, 600000); 87 | -------------------------------------------------------------------------------- /js/render-messages.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { renderMessages } from "./render-messages"; 3 | import { ChatCompletionMessageParam } from "openai/resources"; 4 | 5 | describe("renderMessages", () => { 6 | it("should never HTML-escape values, regardless of mustache syntax", () => { 7 | const messages: ChatCompletionMessageParam[] = [ 8 | { role: "user", content: "{{value}} and {{{value}}}" }, 9 | ]; 10 | const rendered = renderMessages(messages, { value: "bold" }); 11 | expect(rendered[0].content).toBe("bold and bold"); 12 | }); 13 | 14 | it("should stringify objects when using {{...}}", () => { 15 | const messages: ChatCompletionMessageParam[] = [ 16 | { role: "user", content: "Data: {{data}}" }, 17 | ]; 18 | const data = { foo: "bar", num: 42 }; 19 | const rendered = 
renderMessages(messages, { data }); 20 | expect(rendered[0].content).toBe('Data: {"foo":"bar","num":42}'); 21 | }); 22 | 23 | it("should output [object Object] when using {{{...}}} with objects", () => { 24 | const messages: ChatCompletionMessageParam[] = [ 25 | { role: "user", content: "Data: {{{data}}}" }, 26 | ]; 27 | const data = { foo: "bar", num: 42 }; 28 | const rendered = renderMessages(messages, { data }); 29 | expect(rendered[0].content).toBe("Data: [object Object]"); 30 | }); 31 | 32 | it("should handle empty content", () => { 33 | const messages: ChatCompletionMessageParam[] = [ 34 | { role: "user", content: "" }, 35 | ]; 36 | const rendered = renderMessages(messages, {}); 37 | expect(rendered[0].content).toBe(""); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /js/render-messages.ts: -------------------------------------------------------------------------------- 1 | import mustache from "mustache"; 2 | import { ChatCompletionMessageParam } from "openai/resources"; 3 | 4 | export function renderMessages( 5 | messages: ChatCompletionMessageParam[], 6 | renderArgs: Record, 7 | ): ChatCompletionMessageParam[] { 8 | return messages.map((m) => ({ 9 | ...m, 10 | content: m.content 11 | ? mustache.render(m.content as string, renderArgs, undefined, { 12 | escape: (v: unknown) => 13 | typeof v === "string" ? v : JSON.stringify(v), 14 | }) 15 | : "", 16 | })); 17 | } 18 | -------------------------------------------------------------------------------- /js/string.ts: -------------------------------------------------------------------------------- 1 | import { Scorer, ScorerArgs } from "@braintrust/core"; 2 | import levenshtein from "js-levenshtein"; 3 | import { OpenAIAuth, buildOpenAIClient } from "./oai"; 4 | import cossim from "compute-cosine-similarity"; 5 | import { makePartial, ScorerWithPartial } from "./partial"; 6 | 7 | /** 8 | * A simple scorer that uses the Levenshtein distance to compare two strings. 9 | */ 10 | export const Levenshtein: ScorerWithPartial = makePartial( 11 | (args) => { 12 | if (args.expected === undefined) { 13 | throw new Error("LevenshteinScorer requires an expected value"); 14 | } 15 | 16 | const [output, expected] = [`${args.output}`, `${args.expected}`]; 17 | const maxLen = Math.max(output.length, expected.length); 18 | 19 | let score = 1; 20 | if (maxLen > 0) { 21 | score = 1 - levenshtein(output, expected) / maxLen; 22 | } 23 | 24 | return { 25 | name: "Levenshtein", 26 | score, 27 | }; 28 | }, 29 | 30 | "Levenshtein", 31 | ); 32 | 33 | // For back-compat 34 | export const LevenshteinScorer: ScorerWithPartial = Levenshtein; 35 | 36 | /** 37 | * A scorer that uses cosine similarity to compare two strings. 38 | * 39 | * @param args 40 | * @param args.prefix A prefix to prepend to the prompt. This is useful for specifying the domain of the inputs. 41 | * @param args.model The model to use for the embedding distance. Defaults to "text-embedding-ada-002". 42 | * @param args.expectedMin The minimum expected score. Defaults to 0.7. Values below this will be scored as 0, and 43 | * values between this and 1 will be scaled linearly. 44 | * @returns A score between 0 and 1, where 1 is a perfect match. 
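 *
 * @example
 * // Example sketch (assumes an OpenAI-compatible client or API key for the embeddings endpoint):
 * // const result = await EmbeddingSimilarity({ output: "Paris is the capital of France", expected: "France's capital is Paris" });
 * // The raw cosine similarity is rescaled against `expectedMin` before being returned as `result.score`.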
45 | */ 46 | export const EmbeddingSimilarity: ScorerWithPartial< 47 | string, 48 | { 49 | prefix?: string; 50 | expectedMin?: number; 51 | model?: string; 52 | } & OpenAIAuth 53 | > = makePartial(async (args) => { 54 | if (args.expected === undefined) { 55 | throw new Error("EmbeddingSimilarity requires an expected value"); 56 | } 57 | 58 | const prefix = args.prefix ?? ""; 59 | const expectedMin = args.expectedMin ?? 0.7; 60 | 61 | const [output, expected] = [ 62 | `${prefix}${args.output}`, 63 | `${prefix}${args.expected}`, 64 | ]; 65 | 66 | const openai = buildOpenAIClient(args); 67 | 68 | const [outputResult, expectedResult] = await Promise.all( 69 | [output, expected].map((input) => 70 | openai.embeddings.create({ 71 | input, 72 | model: args.model ?? "text-embedding-ada-002", 73 | }), 74 | ), 75 | ); 76 | 77 | const score = cossim( 78 | outputResult.data[0].embedding, 79 | expectedResult.data[0].embedding, 80 | ); 81 | 82 | return { 83 | name: "EmbeddingSimilarity", 84 | score: scaleScore(score ?? 0, expectedMin), 85 | error: score === null ? "EmbeddingSimilarity failed" : undefined, 86 | }; 87 | }, "EmbeddingSimilarity"); 88 | 89 | function scaleScore(score: number, expectedMin: number): number { 90 | return Math.min(Math.max((score - expectedMin) / (1 - expectedMin), 0), 1); 91 | } 92 | -------------------------------------------------------------------------------- /js/templates.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import * as yaml from "js-yaml"; 3 | 4 | import battle from "../templates/battle.yaml"; 5 | import closed_q_a from "../templates/closed_q_a.yaml"; 6 | import factuality from "../templates/factuality.yaml"; 7 | import humor from "../templates/humor.yaml"; 8 | import possible from "../templates/possible.yaml"; 9 | import security from "../templates/security.yaml"; 10 | import sql from "../templates/sql.yaml"; 11 | import summary from "../templates/summary.yaml"; 12 | import translation from "../templates/translation.yaml"; 13 | 14 | export const modelGradedSpecSchema = z.object({ 15 | prompt: z.string(), 16 | choice_scores: z.record(z.number()), 17 | model: z.string().optional(), 18 | use_cot: z.boolean().optional(), 19 | temperature: z.number().optional(), 20 | }); 21 | 22 | export type ModelGradedSpec = z.infer; 23 | 24 | const templateStrings = { 25 | battle, 26 | closed_q_a, 27 | factuality, 28 | humor, 29 | possible, 30 | security, 31 | sql, 32 | summary, 33 | translation, 34 | } as const; 35 | 36 | // eslint-disable-next-line @typescript-eslint/consistent-type-assertions 37 | export const templates = Object.fromEntries( 38 | Object.entries(templateStrings).map(([name, template]) => [ 39 | name, 40 | modelGradedSpecSchema.parse( 41 | typeof template === "string" ? 
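// A .yaml import may arrive as a raw string (per js/yaml.d.ts) or, depending on the
// bundler's YAML plugin, as an already-parsed object, so both shapes are handled here.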
yaml.load(template) : template, 42 | ), 43 | ]), 44 | ) as Record; 45 | -------------------------------------------------------------------------------- /js/value.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from "vitest"; 2 | import { ListContains } from "./list"; 3 | import { NumericDiff } from "./number"; 4 | import { LevenshteinScorer } from "./string"; 5 | import { ExactMatch } from "./value"; 6 | 7 | test("Levenshtein Test", async () => { 8 | const cases = [ 9 | { a: "", b: "", expected: 1 }, 10 | { a: "", b: "a", expected: 0 }, 11 | { a: "a", b: "", expected: 0 }, 12 | { a: "a", b: "a", expected: 1 }, 13 | { a: "a", b: "b", expected: 0 }, 14 | { a: "ab", b: "ac", expected: 0.5 }, 15 | { a: "ac", b: "bc", expected: 0.5 }, 16 | { a: "abc", b: "axc", expected: 0.6666666666666667 }, 17 | { a: "xabxcdxxefxgx", b: "1ab2cd34ef5g6", expected: 0.5384615384615384 }, 18 | ]; 19 | 20 | for (const { a, b, expected } of cases) { 21 | const score = (await LevenshteinScorer({ output: a, expected: b })).score; 22 | expect(score).toBeCloseTo(expected); 23 | } 24 | }); 25 | 26 | test("Numeric Test", async () => { 27 | const cases = [ 28 | { a: 0, b: 0, expected: 1 }, 29 | { a: 0, b: 1, expected: 0 }, 30 | { a: 1, b: 2, expected: 0.66667 }, 31 | { a: 1.0, b: 2.0, expected: 0.66667 }, 32 | { a: -1, b: 2, expected: 0 }, 33 | ]; 34 | 35 | for (const { a, b, expected } of cases) { 36 | const score = (await NumericDiff({ output: a, expected: b })).score; 37 | expect(score).toBeCloseTo(expected); 38 | } 39 | }); 40 | 41 | test("ListContains Test", async () => { 42 | const cases = [ 43 | { a: [], b: [], expected: 1 }, 44 | { a: ["0"], b: [], expected: 0 }, 45 | { a: [], b: ["0"], expected: 0 }, 46 | { a: ["a"], b: ["a"], expected: 1 }, 47 | { a: ["a"], b: ["a", "b"], expected: 0.5 }, 48 | { a: ["a", "b"], b: ["a"], expected: 0.5 }, 49 | { 50 | a: [ 51 | "workspaces", 52 | "section", 53 | "view", 54 | "others", 55 | "workspace", 56 | "team", 57 | "pinning", 58 | ], 59 | b: ["starred", "multiple different workspaces", "shortcuts"], 60 | expected: 0.1218, 61 | }, 62 | { 63 | a: ["starred", "multiple different workspaces", "shortcuts"], 64 | b: [ 65 | "workspaces", 66 | "section", 67 | "view", 68 | "others", 69 | "workspace", 70 | "team", 71 | "pinning", 72 | ], 73 | expected: 0.1218, 74 | }, 75 | ]; 76 | 77 | for (const { a, b, expected } of cases) { 78 | const score = (await ListContains({ output: a, expected: b })).score; 79 | expect(score).toBeCloseTo(expected, 4); 80 | } 81 | 82 | expect( 83 | ( 84 | await ListContains({ 85 | output: ["a", "b"], 86 | expected: ["b"], 87 | allowExtraEntities: true, 88 | }) 89 | ).score, 90 | ).toBe(1); 91 | }); 92 | 93 | test("ExactMatch", async () => { 94 | const cases = [ 95 | { output: "hello", expected: "hello", expectedScore: 1 }, 96 | { output: "hello", expected: "world", expectedScore: 0 }, 97 | { output: 123, expected: 123, expectedScore: 1 }, 98 | { output: 123, expected: "123", expectedScore: 1 }, 99 | { output: { a: 1, b: 2 }, expected: { a: 1, b: 2 }, expectedScore: 1 }, 100 | { output: { a: 1, b: 2 }, expected: { a: 1, b: 3 }, expectedScore: 0 }, 101 | { output: [1, 2, 3], expected: [1, 2, 3], expectedScore: 1 }, 102 | { output: [1, 2, 3], expected: [3, 2, 1], expectedScore: 0 }, 103 | { output: { a: 1, b: 2 }, expected: { b: 2, a: 1 }, expectedScore: 0 }, // Order matters 104 | { output: { a: 1, b: 2 }, expected: '{"a": 1, "b": 2}', expectedScore: 1 }, // String representation matches dict 
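// ExactMatch parses a string operand when the other side is an object and re-serializes
// both (see normalizeValue in js/value.ts), so whitespace differences are ignored but
// key order still changes the serialized form.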
105 | { output: { a: 1, b: 2 }, expected: '{"a":1, "b":2}', expectedScore: 1 }, // String representation matches dict 106 | { output: { a: 1, b: 2 }, expected: '{"b":2, "a":1}', expectedScore: 0 }, 107 | { 108 | output: { a: 1, b: 2 }, 109 | expected: { b: 2, a: 1, c: 3 }, 110 | expectedScore: 0, 111 | }, // Extra key, not equal 112 | { output: null, expected: null, expectedScore: 1 }, 113 | { output: null, expected: undefined, expectedScore: 1 }, 114 | ]; 115 | 116 | for (const { output, expected, expectedScore } of cases) { 117 | const score = (await ExactMatch({ output, expected })).score; 118 | expect(score).toBeCloseTo(expectedScore, 4); 119 | } 120 | }); 121 | -------------------------------------------------------------------------------- /js/value.ts: -------------------------------------------------------------------------------- 1 | import { makePartial, ScorerWithPartial } from "./partial"; 2 | 3 | /** 4 | * A simple scorer that tests whether two values are equal. If the value is an object or array, 5 | * it will be JSON-serialized and the strings compared for equality. 6 | */ 7 | export const ExactMatch: ScorerWithPartial = makePartial( 8 | (args) => { 9 | const maybeObject = needsJSON(args.output) || needsJSON(args.expected); 10 | const [output, expected] = [ 11 | normalizeValue(args.output ?? null, maybeObject), 12 | normalizeValue(args.expected ?? null, maybeObject), 13 | ]; 14 | 15 | const score = output === expected ? 1 : 0; 16 | 17 | return { 18 | name: "ExactMatch", 19 | score, 20 | }; 21 | }, 22 | "ExactMatch", 23 | ); 24 | 25 | function needsJSON(value: unknown): boolean { 26 | return typeof value === "object" || Array.isArray(value); 27 | } 28 | 29 | export function normalizeValue(value: unknown, maybeObject: boolean): string { 30 | if (needsJSON(value)) { 31 | return JSON.stringify(value); 32 | } 33 | try { 34 | if (typeof value === "string" && maybeObject) { 35 | return JSON.stringify(JSON.parse(value)); 36 | } 37 | } catch (e) { 38 | // That's ok, just return the string representation 39 | } 40 | return `${value}`; 41 | } 42 | -------------------------------------------------------------------------------- /js/yaml.d.ts: -------------------------------------------------------------------------------- 1 | declare module "*.yaml" { 2 | const content: string; 3 | export default content; 4 | } 5 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "autoevals", 3 | "version": "0.0.0", 4 | "description": "Universal library for evaluating AI models", 5 | "repository": { 6 | "type": "git", 7 | "url": "git+https://github.com/braintrustdata/autoevals.git" 8 | }, 9 | "homepage": "https://www.braintrust.dev/docs", 10 | "main": "./jsdist/index.js", 11 | "module": "./jsdist/index.mjs", 12 | "types": "./jsdist/index.d.ts", 13 | "exports": { 14 | "./package.json": "./package.json", 15 | ".": { 16 | "types": "./jsdist/index.d.ts", 17 | "import": "./jsdist/index.mjs", 18 | "module": "./jsdist/index.mjs", 19 | "require": "./jsdist/index.js" 20 | } 21 | }, 22 | "files": [ 23 | "jsdist/**/*" 24 | ], 25 | "scripts": { 26 | "build": "tsup", 27 | "watch": "tsup --watch", 28 | "docs": "npx typedoc --options typedoc.json js/index.ts", 29 | "test": "vitest", 30 | "prepublishOnly": "../scripts/node_prepublish_autoevals.py", 31 | "postpublish": "../scripts/node_postpublish_autoevals.py" 32 | }, 33 | "author": "", 34 | "license": "MIT", 35 | 
"devDependencies": { 36 | "@rollup/plugin-yaml": "^4.1.2", 37 | "@types/js-levenshtein": "^1.1.3", 38 | "@types/js-yaml": "^4.0.9", 39 | "@types/mustache": "^4.2.5", 40 | "@types/node": "^20.10.5", 41 | "msw": "^2.7.3", 42 | "tsup": "^8.4.0", 43 | "tsx": "^3.14.0", 44 | "typedoc": "^0.25.4", 45 | "typedoc-plugin-markdown": "^3.17.1", 46 | "typescript": "^5.3.3", 47 | "vitest": "^2.1.9" 48 | }, 49 | "dependencies": { 50 | "@braintrust/core": "^0.0.8", 51 | "ajv": "^8.13.0", 52 | "compute-cosine-similarity": "^1.1.0", 53 | "js-levenshtein": "^1.1.6", 54 | "js-yaml": "^4.1.0", 55 | "linear-sum-assignment": "^1.0.7", 56 | "mustache": "^4.2.0", 57 | "openai": "^4.47.1", 58 | "zod": "^3.22.4", 59 | "zod-to-json-schema": "^3.22.5" 60 | }, 61 | "packageManager": "pnpm@8.15.5" 62 | } 63 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - "." 3 | - "evals" 4 | -------------------------------------------------------------------------------- /py/autoevals/__init__.py: -------------------------------------------------------------------------------- 1 | """Autoevals is a comprehensive toolkit for evaluating AI model outputs. 2 | 3 | This library provides a collection of specialized scorers for different types of evaluations: 4 | 5 | - `string`: Text similarity using edit distance or embeddings 6 | - `llm`: LLM-based evaluation for correctness, complexity, security, etc. 7 | - `moderation`: Content safety and policy compliance checks 8 | - `ragas`: Advanced NLP metrics for RAG system evaluation 9 | - `json`: JSON validation and structural comparison 10 | - `number`: Numeric similarity with relative scaling 11 | - `value`: Exact matching and basic comparisons 12 | 13 | **Key features**: 14 | 15 | - Both sync and async evaluation support 16 | - Configurable scoring parameters 17 | - Detailed feedback through metadata 18 | - Integration with OpenAI and other LLM providers through Braintrust AI Proxy 19 | 20 | **Client setup**: 21 | 22 | There are two ways to configure the OpenAI client: 23 | 24 | 1. Global initialization (recommended): 25 | 26 | ```python 27 | from autoevals import init 28 | from openai import AsyncOpenAI 29 | 30 | # Set up once at the start of your application 31 | client = AsyncOpenAI() 32 | init(client=client) 33 | ``` 34 | 35 | 2. Per-evaluator initialization: 36 | 37 | ```python 38 | from openai import AsyncOpenAI 39 | from autoevals.ragas import CloseQA 40 | 41 | # Pass client directly to evaluator 42 | client = AsyncOpenAI() 43 | evaluator = CloseQA(client=client) 44 | ``` 45 | 46 | **Multi-provider support via the Braintrust AI Proxy**: 47 | 48 | Autoevals supports multiple LLM providers (Anthropic, Azure, etc.) through the Braintrust AI Proxy. 49 | Configure your client to use the proxy: 50 | 51 | ```python 52 | import os 53 | from openai import AsyncOpenAI 54 | from autoevals.llm import Factuality 55 | 56 | # Configure client to use Braintrust AI Proxy 57 | client = AsyncOpenAI( 58 | base_url="https://api.braintrustproxy.com/v1", 59 | api_key=os.getenv("BRAINTRUST_API_KEY"), 60 | ) 61 | 62 | # Use with any evaluator 63 | evaluator = Factuality(client=client) 64 | ``` 65 | 66 | **Braintrust integration**: 67 | 68 | Autoevals automatically integrates with Braintrust logging when you install the library. 
If needed, you can manually wrap the client: 69 | 70 | ```python 71 | from openai import AsyncOpenAI 72 | from braintrust import wrap_openai 73 | from autoevals.ragas import CloseQA 74 | 75 | # Explicitly wrap the client if needed 76 | client = wrap_openai(AsyncOpenAI()) 77 | evaluator = CloseQA(client=client) 78 | ``` 79 | 80 | **Example Autoevals usage**: 81 | 82 | ```python 83 | from autoevals.ragas import CloseQA 84 | import asyncio 85 | 86 | async def evaluate_qa(): 87 | # Create evaluator for question answering 88 | evaluator = CloseQA() 89 | 90 | # Question and context 91 | question = "What was the purpose of the Apollo missions?" 92 | context = ''' 93 | The Apollo program was a NASA space program that ran from 1961 to 1972, 94 | with the goal of landing humans on the Moon and bringing them safely back 95 | to Earth. The program achieved its most famous success when Apollo 11 96 | astronauts Neil Armstrong and Buzz Aldrin became the first humans to walk 97 | on the Moon on July 20, 1969. 98 | ''' 99 | 100 | # Two different answers to evaluate 101 | answer = "The Apollo program's main goal was to land humans on the Moon and return them safely to Earth." 102 | expected = "The Apollo missions were designed to achieve human lunar landing and safe return." 103 | 104 | # Evaluate the answer 105 | result = await evaluator.eval_async( 106 | question=question, 107 | context=context, 108 | output=answer, 109 | expected=expected 110 | ) 111 | 112 | print(f"Score: {result.score}") # Semantic similarity score (0-1) 113 | print(f"Rationale: {result.metadata.rationale}") # Detailed explanation 114 | print(f"Faithfulness: {result.metadata.faithfulness}") # Context alignment 115 | 116 | # Run async evaluation 117 | asyncio.run(evaluate_qa()) 118 | ``` 119 | 120 | See individual module documentation for detailed usage and options. 121 | """ 122 | 123 | from braintrust_core.score import Score, Scorer 124 | 125 | from .json import * 126 | from .list import * 127 | from .llm import * 128 | from .moderation import * 129 | from .number import * 130 | from .oai import init 131 | from .ragas import * 132 | from .string import * 133 | from .value import ExactMatch 134 | -------------------------------------------------------------------------------- /py/autoevals/json.py: -------------------------------------------------------------------------------- 1 | """JSON evaluation scorers for comparing and validating JSON data. 2 | 3 | This module provides scorers for working with JSON data: 4 | 5 | - JSONDiff: Compare JSON objects for structural and content similarity 6 | - Handles nested structures, strings, numbers 7 | - Customizable with different scorers for string and number comparisons 8 | - Can automatically parse JSON strings 9 | 10 | - ValidJSON: Validate if a string is valid JSON and matches an optional schema 11 | - Validates JSON syntax 12 | - Optional JSON Schema validation 13 | - Works with both strings and parsed objects 14 | """ 15 | 16 | import json 17 | 18 | from braintrust_core.score import Score, Scorer 19 | from jsonschema import ValidationError, validate 20 | 21 | from autoevals.partial import ScorerWithPartial 22 | 23 | from .number import NumericDiff 24 | from .string import Levenshtein 25 | 26 | 27 | class JSONDiff(ScorerWithPartial): 28 | """Compare JSON objects for structural and content similarity. 
29 | 30 | This scorer recursively compares JSON objects, handling: 31 | - Nested dictionaries and lists 32 | - String similarity using Levenshtein distance 33 | - Numeric value comparison 34 | - Automatic parsing of JSON strings 35 | 36 | Example: 37 | ```python 38 | import asyncio 39 | from openai import AsyncOpenAI 40 | from autoevals import JSONDiff 41 | from autoevals.string import EmbeddingSimilarity 42 | 43 | async def compare_json(): 44 | # Initialize with async client for string comparison 45 | client = AsyncOpenAI() 46 | string_scorer = EmbeddingSimilarity(client=client) 47 | 48 | diff = JSONDiff(string_scorer=string_scorer) 49 | 50 | result = await diff.eval_async( 51 | output={ 52 | "name": "John Smith", 53 | "age": 30, 54 | "skills": ["python", "javascript"] 55 | }, 56 | expected={ 57 | "name": "John A. Smith", 58 | "age": 31, 59 | "skills": ["python", "typescript"] 60 | } 61 | ) 62 | 63 | print(result.score) # Similarity score between 0-1 64 | print(result.metadata) # Detailed comparison breakdown 65 | 66 | # Run the async evaluation 67 | asyncio.run(compare_json()) 68 | ``` 69 | 70 | Args: 71 | string_scorer: Optional custom scorer for string comparisons (default: Levenshtein) 72 | number_scorer: Optional custom scorer for number comparisons (default: NumericDiff) 73 | preserve_strings: Don't attempt to parse strings as JSON (default: False) 74 | 75 | Returns: 76 | Score object with: 77 | - score: Similarity score between 0-1 78 | - metadata: Detailed comparison breakdown 79 | """ 80 | 81 | def __init__(self, string_scorer: Scorer = None, number_scorer: Scorer = None, preserve_strings: bool = False): 82 | self.string_scorer = string_scorer or Levenshtein() 83 | self.number_scorer = number_scorer or NumericDiff() 84 | self.preserve_strings = preserve_strings 85 | self._valid_json = ValidJSON() 86 | 87 | def _run_eval_sync(self, output, expected=None, **kwargs): 88 | return Score(name=self._name(), score=self.json_diff(output, expected)) 89 | 90 | def json_diff(self, o1, o2): 91 | if not self.preserve_strings: 92 | if isinstance(o1, str) and self._valid_json.valid_json(o1) == 1: 93 | o1 = json.loads(o1) 94 | if isinstance(o2, str) and self._valid_json.valid_json(o2) == 1: 95 | o2 = json.loads(o2) 96 | 97 | if isinstance(o1, dict) and isinstance(o2, dict): 98 | if len(o1) == 0 and len(o2) == 0: 99 | return 1 100 | 101 | all_keys = set(o1.keys()).union(set(o2.keys())) 102 | base_scores = [self.json_diff(o1.get(k), o2.get(k)) for k in all_keys] 103 | base_scores = [s for s in base_scores if s is not None] 104 | return sum(base_scores) / len(base_scores) 105 | elif isinstance(o1, list) and isinstance(o2, list): 106 | if len(o1) == 0 and len(o2) == 0: 107 | return 1 108 | base_scores = [self.json_diff(e1, e2) for (e1, e2) in zip(o1, o2)] 109 | base_scores = [s for s in base_scores if s is not None] 110 | return sum(base_scores) / max(len(o1), len(o2)) 111 | elif isinstance(o1, str) and isinstance(o2, str): 112 | return self.string_scorer.eval(o1, o2).score 113 | elif (isinstance(o1, int) or isinstance(o1, float)) and (isinstance(o2, int) or isinstance(o2, float)): 114 | return self.number_scorer.eval(o1, o2).score 115 | elif o1 is None and o2 is None: 116 | return 1 117 | elif o1 is None or o2 is None: 118 | return 0 119 | else: 120 | kwargs = {"separators": (",", ":"), "sort_keys": True} 121 | return self.string_scorer.eval(json.dumps(o1, **kwargs), json.dumps(o2, **kwargs)).score 122 | 123 | 124 | class ValidJSON(ScorerWithPartial): 125 | """Validate if a string is valid JSON and 
optionally matches a schema. 126 | 127 | This scorer checks if: 128 | - The input can be parsed as valid JSON 129 | - The parsed JSON matches an optional JSON Schema 130 | - Handles both string inputs and pre-parsed JSON objects 131 | 132 | Example: 133 | ```python 134 | import asyncio 135 | from autoevals import ValidJSON 136 | 137 | async def validate_json(): 138 | # Define a schema to validate against 139 | schema = { 140 | "type": "object", 141 | "properties": { 142 | "name": {"type": "string"}, 143 | "age": {"type": "number"}, 144 | "skills": { 145 | "type": "array", 146 | "items": {"type": "string"} 147 | } 148 | }, 149 | "required": ["name", "age"] 150 | } 151 | 152 | validator = ValidJSON(schema=schema) 153 | 154 | result = await validator.eval_async( 155 | output=''' 156 | { 157 | "name": "John Smith", 158 | "age": 30, 159 | "skills": ["python", "javascript"] 160 | } 161 | ''' 162 | ) 163 | 164 | print(result.score) # 1 if valid, 0 if invalid 165 | print(result.metadata) # Validation details or error messages 166 | 167 | # Run the async validation 168 | asyncio.run(validate_json()) 169 | ``` 170 | 171 | Args: 172 | schema: Optional JSON Schema to validate against 173 | 174 | Returns: 175 | Score object with: 176 | - score: 1 if valid JSON (and matches schema if provided), 0 otherwise 177 | - metadata: Validation details or error messages 178 | """ 179 | 180 | def __init__(self, schema=None): 181 | self.schema = schema 182 | 183 | def _run_eval_sync(self, output, schema=None, **kwargs): 184 | return Score(name=self._name(), score=self.valid_json(output, schema)) 185 | 186 | def valid_json(self, output, schema=None): 187 | try: 188 | parsed = json.loads(output) if isinstance(output, str) else output 189 | 190 | if schema is not None: 191 | validate(parsed, schema) 192 | return 1 193 | 194 | if isinstance(parsed, dict) or isinstance(parsed, list): 195 | return 1 196 | 197 | except (json.JSONDecodeError, ValidationError): 198 | pass 199 | 200 | return 0 201 | 202 | 203 | __all__ = ["JSONDiff", "ValidJSON"] 204 | -------------------------------------------------------------------------------- /py/autoevals/list.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from braintrust_core.score import Score 4 | 5 | from autoevals.partial import ScorerWithPartial 6 | 7 | from .string import Levenshtein 8 | 9 | 10 | class ListContains(ScorerWithPartial): 11 | """ 12 | A scorer that semantically evaluates the overlap between two lists of strings. It works by 13 | computing the pairwise similarity between each element of the output and the expected value, 14 | and then using Linear Sum Assignment to find the best matching pairs. 
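    Example (a usage sketch in the style of the other scorers; the lists below are
    arbitrary, and the optional scipy dependency must be installed, e.g.
    `pip install 'autoevals[scipy]'`, since scoring relies on linear sum assignment):

        ```python
        from autoevals import ListContains

        scorer = ListContains()
        result = scorer.eval(
            output=["apple", "banana"],
            expected=["banana", "cherry"],
        )
        print(result.score)              # Similarity score between 0 and 1
        print(result.metadata["pairs"])  # Matched (output, expected, similarity) triples
        ```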
15 | """ 16 | 17 | def __init__(self, pairwise_scorer=None, allow_extra_entities=False, **kwargs): 18 | self.allow_extra_entities = allow_extra_entities 19 | self.pairwise_scorer = pairwise_scorer or Levenshtein() 20 | 21 | # If it's a class, then instantiate it 22 | if isinstance(self.pairwise_scorer, type): 23 | self.pairwise_scorer = self.pairwise_scorer() 24 | 25 | async def _run_eval_async(self, output, expected=None, **kwargs): 26 | if expected is None: 27 | raise ValueError("ListContains requires an expected value") 28 | 29 | similarities_futures = [ 30 | [ 31 | self.pairwise_scorer._run_eval_async(output=output_item, expected=expected_item) 32 | for expected_item in expected 33 | ] 34 | for output_item in output 35 | ] 36 | 37 | similarities = [] 38 | 39 | for similarity_futures in similarities_futures: 40 | similarities.append([(await similarity_future).score for similarity_future in similarity_futures]) 41 | 42 | return self._compute_score(output, expected, similarities, **kwargs) 43 | 44 | def _run_eval_sync(self, output, expected=None, **kwargs): 45 | if expected is None: 46 | raise ValueError("ListContains requires an expected value") 47 | 48 | similarities = [ 49 | [self.pairwise_scorer._run_eval_sync(output_item, expected_item).score for expected_item in expected] 50 | for output_item in output 51 | ] 52 | 53 | return self._compute_score(output, expected, similarities, **kwargs) 54 | 55 | def _compute_score(self, outputs, expecteds, similarities, **kwargs): 56 | if len(outputs) == 0 and len(expecteds) == 0: 57 | return Score(name=self._name(), score=1) 58 | elif len(outputs) == 0 or len(expecteds) == 0: 59 | return Score(name=self._name(), score=0) 60 | 61 | similarities = [[d or 0 for d in row] for row in similarities] 62 | 63 | try: 64 | import numpy as np 65 | from scipy.optimize import linear_sum_assignment 66 | except ImportError: 67 | print( 68 | "ListContains requires the scipy extension, which you can install with `pip install 'autoevals[scipy]'`", 69 | file=sys.stderr, 70 | ) 71 | raise 72 | 73 | distances = -np.array(similarities) 74 | row_ind, col_ind = linear_sum_assignment(distances) 75 | 76 | pairs = [(outputs[r], expecteds[c], similarities[r][c]) for (r, c) in zip(row_ind, col_ind)] 77 | lowest_distances = distances[row_ind, col_ind] 78 | 79 | # Generally speaking, outputs that are not in expecteds should be penalized, but in certain use cases 80 | # (eg checking whether a passage of text has all of the entities in a list, and maybe a few more), it's 81 | # ok to allow them. 82 | denominator = max(len(outputs), len(expecteds)) if not self.allow_extra_entities else len(expecteds) 83 | assert len(lowest_distances) <= denominator, "There should be at most as many pairs as there are rows" 84 | score = min(max(sum(-lowest_distances) / denominator, 0), 1) 85 | 86 | return Score( 87 | name=self._name(), 88 | score=score, 89 | metadata={"pairs": pairs}, 90 | ) 91 | -------------------------------------------------------------------------------- /py/autoevals/moderation.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from braintrust_core.score import Score 4 | 5 | from autoevals.llm import OpenAIScorer 6 | 7 | from .oai import Client, arun_cached_request, run_cached_request 8 | 9 | REQUEST_TYPE = "moderation" 10 | 11 | 12 | class Moderation(OpenAIScorer): 13 | """A scorer that evaluates if AI responses contain inappropriate or unsafe content. 
14 | 15 | This scorer uses OpenAI's moderation API to check for potentially harmful content in text. 16 | It returns a score of 0 if content is flagged as inappropriate, and 1 if it passes moderation. 17 | 18 | Args: 19 | threshold: Optional float to set a custom threshold for content flagging. 20 | If None, uses OpenAI's default flagging logic. 21 | If set, content is flagged if any category score exceeds this threshold. 22 | client: Optional client for making API calls. Can be: 23 | - None: Uses global client from init() 24 | - OpenAI v1 client: Used directly 25 | - OpenAI v0 module: Wrapped in a client adapter 26 | 27 | Example: 28 | ```python 29 | from openai import OpenAI 30 | from autoevals import init 31 | from autoevals.moderation import Moderation 32 | 33 | # Initialize with your OpenAI client 34 | init(OpenAI()) 35 | 36 | # Create evaluator with default settings 37 | moderator = Moderation() 38 | result = moderator.eval( 39 | output="This is the text to check for inappropriate content" 40 | ) 41 | print(result.score) # 1 if content is appropriate, 0 if flagged 42 | print(result.metadata) # Detailed category scores and threshold used 43 | ``` 44 | """ 45 | 46 | threshold = None 47 | extra_args = {} 48 | 49 | def __init__( 50 | self, 51 | threshold=None, 52 | api_key=None, 53 | base_url=None, 54 | client: Optional[Client] = None, 55 | ): 56 | """Initialize a Moderation scorer. 57 | 58 | Args: 59 | threshold: Optional float to set a custom threshold for content flagging. 60 | If None, uses OpenAI's default flagging logic. 61 | If set, content is flagged if any category score exceeds this threshold. 62 | client: Optional client for making API calls. Can be: 63 | - None: Uses global client from init() 64 | - OpenAI v1 client: Used directly 65 | - OpenAI v0 module: Wrapped in a client adapter 66 | api_key: Deprecated. Use client instead. 67 | base_url: Deprecated. Use client instead. 68 | 69 | Note: 70 | The api_key and base_url parameters are deprecated and will be removed in a future version. 71 | Instead, you can either: 72 | 1. Pass a client instance directly to this constructor using the client parameter 73 | 2. Set a global client using autoevals.init(client=your_client) 74 | 75 | The global client can be configured once and will be used by all evaluators that don't have 76 | a specific client passed to them. 
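        Example (a sketch of the custom-threshold path; the 0.4 value is an
        arbitrary illustration, not a recommended setting):

            ```python
            from openai import OpenAI
            from autoevals import init
            from autoevals.moderation import Moderation

            init(OpenAI())

            # Flag content whenever any category score exceeds the threshold
            moderator = Moderation(threshold=0.4)
            result = moderator.eval(output="Text to check for unsafe content")
            print(result.score)                        # 0 if flagged, 1 otherwise
            print(result.metadata["category_scores"])  # Raw per-category scores
            ```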
77 | """ 78 | super().__init__(api_key=api_key, base_url=base_url, client=client) 79 | self.threshold = threshold 80 | 81 | def _run_eval_sync(self, output, expected=None, **kwargs): 82 | moderation_response = run_cached_request( 83 | client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args 84 | )["results"][0] 85 | return self.__postprocess_response(moderation_response) 86 | 87 | def __postprocess_response(self, moderation_response) -> Score: 88 | return Score( 89 | name=self._name(), 90 | score=self.compute_score(moderation_response, self.threshold), 91 | metadata={ 92 | "threshold": self.threshold, 93 | "category_scores": moderation_response["category_scores"], 94 | }, 95 | ) 96 | 97 | async def _run_eval_async(self, output, expected=None, **kwargs) -> Score: 98 | moderation_response = ( 99 | await arun_cached_request(client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args) 100 | )["results"][0] 101 | return self.__postprocess_response(moderation_response) 102 | 103 | @staticmethod 104 | def compute_score(moderation_result, threshold): 105 | if threshold is None: 106 | return 0 if moderation_result["flagged"] else 1 107 | 108 | category_scores = moderation_result["category_scores"] 109 | for category in category_scores.keys(): 110 | if category_scores[category] > threshold: 111 | return 0 112 | 113 | return 1 114 | 115 | 116 | __all__ = ["Moderation"] 117 | -------------------------------------------------------------------------------- /py/autoevals/number.py: -------------------------------------------------------------------------------- 1 | """Numeric evaluation scorers for comparing numerical values. 2 | 3 | This module provides scorers for working with numbers: 4 | - NumericDiff: Compare numbers using normalized difference, providing a similarity score 5 | that accounts for both absolute and relative differences between values. 6 | 7 | Features: 8 | - Normalized scoring between 0 and 1 9 | - Handles special cases like comparing zeros 10 | - Accounts for magnitude when computing differences 11 | - Suitable for both small and large number comparisons 12 | """ 13 | 14 | from braintrust_core.score import Score 15 | 16 | from autoevals.partial import ScorerWithPartial 17 | 18 | 19 | class NumericDiff(ScorerWithPartial): 20 | """Numeric similarity scorer using normalized difference. 
21 | 22 | Example: 23 | ```python 24 | scorer = NumericDiff() 25 | result = scorer.eval( 26 | output=105, 27 | expected=100 28 | ) 29 | print(result.score) # 0.95 (normalized similarity) 30 | ``` 31 | 32 | Args: 33 | output: Number to evaluate 34 | expected: Reference number to compare against 35 | 36 | Returns: 37 | Score object with normalized similarity (0-1), where: 38 | - 1 means identical numbers 39 | - Score decreases as difference increases relative to magnitude 40 | - Special case: score=1 when both numbers are 0 41 | """ 42 | 43 | def _run_eval_sync(self, output, expected=None, **kwargs): 44 | if expected is None: 45 | raise ValueError("NumericDiff requires an expected value") 46 | 47 | if expected == 0 and output == 0: 48 | score = 1 49 | else: 50 | score = 1 - abs(expected - output) / (abs(expected) + abs(output)) 51 | return Score(name=self._name(), score=score) 52 | 53 | 54 | __all__ = ["NumericDiff"] 55 | -------------------------------------------------------------------------------- /py/autoevals/oai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | import textwrap 5 | import time 6 | import warnings 7 | from contextvars import ContextVar 8 | from dataclasses import dataclass 9 | from typing import Any, Callable, Dict, Optional, Protocol, Tuple, Type, TypeVar, Union, cast, runtime_checkable 10 | 11 | PROXY_URL = "https://api.braintrust.dev/v1/proxy" 12 | 13 | 14 | @runtime_checkable 15 | class ChatCompletions(Protocol): 16 | create: Callable[..., Any] 17 | 18 | 19 | @runtime_checkable 20 | class Chat(Protocol): 21 | @property 22 | def completions(self) -> ChatCompletions: 23 | ... 24 | 25 | 26 | @runtime_checkable 27 | class Embeddings(Protocol): 28 | create: Callable[..., Any] 29 | 30 | 31 | @runtime_checkable 32 | class Moderations(Protocol): 33 | create: Callable[..., Any] 34 | 35 | 36 | @runtime_checkable 37 | class OpenAIV1Module(Protocol): 38 | class OpenAI(Protocol): 39 | # Core API resources 40 | @property 41 | def chat(self) -> Chat: 42 | ... 43 | 44 | @property 45 | def embeddings(self) -> Embeddings: 46 | ... 47 | 48 | @property 49 | def moderations(self) -> Moderations: 50 | ... 51 | 52 | # Configuration 53 | @property 54 | def api_key(self) -> str: 55 | ... 56 | 57 | @property 58 | def organization(self) -> Optional[str]: 59 | ... 60 | 61 | @property 62 | def base_url(self) -> Union[str, Any, None]: 63 | ... 64 | 65 | class AsyncOpenAI(OpenAI): 66 | ... 67 | 68 | class RateLimitError(Exception): 69 | ... 70 | 71 | 72 | # TODO: we're removing v0 support in the next release 73 | @runtime_checkable 74 | class OpenAIV0Module(Protocol): 75 | class ChatCompletion(Protocol): 76 | acreate: Callable[..., Any] 77 | create: Callable[..., Any] 78 | 79 | class Embedding(Protocol): 80 | acreate: Callable[..., Any] 81 | create: Callable[..., Any] 82 | 83 | class Moderation(Protocol): 84 | acreate: Callable[..., Any] 85 | create: Callable[..., Any] 86 | 87 | api_key: Optional[str] 88 | api_base: Optional[str] 89 | base_url: Optional[str] 90 | 91 | class error(Protocol): 92 | class RateLimitError(Exception): 93 | ... 
94 | 95 | 96 | _openai_module: Optional[Union[OpenAIV1Module, OpenAIV0Module]] = None 97 | 98 | 99 | def get_openai_module() -> Union[OpenAIV1Module, OpenAIV0Module]: 100 | global _openai_module 101 | 102 | if _openai_module is not None: 103 | return _openai_module 104 | 105 | import openai # type: ignore 106 | 107 | _openai_module = cast(Union[OpenAIV1Module, OpenAIV0Module], openai) 108 | return _openai_module 109 | 110 | 111 | @dataclass 112 | class LLMClient: 113 | """A client wrapper for LLM operations that supports both OpenAI SDK v0 and v1. 114 | 115 | This class provides a consistent interface for common LLM operations regardless of the 116 | underlying OpenAI SDK version. It's designed to be extensible for custom implementations. 117 | 118 | Attributes: 119 | openai: The OpenAI module or client instance (either v0 or v1 SDK). 120 | complete: Completion function that creates chat completions. 121 | - For v0: openai.ChatCompletion.create or acreate 122 | - For v1: openai.chat.completions.create 123 | embed: Embedding function that creates embeddings. 124 | - For v0: openai.Embedding.create or acreate 125 | - For v1: openai.embeddings.create 126 | moderation: Moderation function that creates content moderations. 127 | - For v0: openai.Moderations.create or acreate 128 | - For v1: openai.moderations.create 129 | RateLimitError: The rate limit exception class for the SDK version. 130 | - For v0: openai.error.RateLimitError 131 | - For v1: openai.RateLimitError 132 | is_async: Whether the client is async (only used for v0 autoconfiguration). 133 | 134 | Note: 135 | If using async OpenAI methods you must use the async methods in Autoevals. 136 | The client will automatically configure itself if methods are not provided. 137 | 138 | Example: 139 | ```python 140 | # Using with OpenAI v1 141 | import openai 142 | client = openai.OpenAI() # Configure with your settings 143 | llm = LLMClient(openai=client) # Methods will be auto-configured 144 | 145 | # Or with explicit method configuration 146 | llm = LLMClient( 147 | openai=client, 148 | complete=client.chat.completions.create, 149 | embed=client.embeddings.create, 150 | moderation=client.moderations.create, 151 | RateLimitError=openai.RateLimitError 152 | ) 153 | 154 | # Extending for custom implementation 155 | @dataclass 156 | class CustomLLMClient(LLMClient): 157 | def complete(self, **kwargs): 158 | # make adjustments as needed 159 | return self.openai.chat.completions.create(**kwargs) 160 | ``` 161 | """ 162 | 163 | openai: Union[OpenAIV0Module, OpenAIV1Module.OpenAI] 164 | complete: Callable[..., Any] = None # type: ignore # Set in __post_init__ 165 | embed: Callable[..., Any] = None # type: ignore # Set in __post_init__ 166 | moderation: Callable[..., Any] = None # type: ignore # Set in __post_init__ 167 | RateLimitError: Type[Exception] = None # type: ignore # Set in __post_init__ 168 | is_async: bool = False 169 | _is_wrapped: bool = False 170 | 171 | def __post_init__(self): 172 | NamedWrapper, wrap_openai = get_openai_wrappers() 173 | 174 | has_customization = self.complete is not None or self.embed is not None or self.moderation is not None # type: ignore # Pyright doesn't understand our design choice 175 | 176 | # avoid wrapping if we have custom methods (the user may intend not to wrap) 177 | if not has_customization and not isinstance(self.openai, NamedWrapper): 178 | self.openai = wrap_openai(self.openai) 179 | 180 | self._is_wrapped = isinstance(self.openai, NamedWrapper) 181 | 182 | openai_module = get_openai_module() 183 
| 184 | if hasattr(openai_module, "OpenAI"): 185 | openai_module = cast(OpenAIV1Module, openai_module) 186 | self.openai = cast(OpenAIV1Module.OpenAI, self.openai) 187 | 188 | # v1 189 | self.complete = self.openai.chat.completions.create 190 | self.embed = self.openai.embeddings.create 191 | self.moderation = self.openai.moderations.create 192 | self.RateLimitError = openai_module.RateLimitError 193 | else: 194 | openai_module = cast(OpenAIV0Module, openai_module) 195 | self.openai = cast(OpenAIV0Module, self.openai) 196 | 197 | # v0 198 | self.complete = self.openai.ChatCompletion.acreate if self.is_async else self.openai.ChatCompletion.create 199 | self.embed = self.openai.Embedding.acreate if self.is_async else self.openai.Embedding.create 200 | self.moderation = self.openai.Moderation.acreate if self.is_async else self.openai.Moderation.create 201 | self.RateLimitError = openai_module.error.RateLimitError 202 | 203 | @property 204 | def is_wrapped(self) -> bool: 205 | return self._is_wrapped 206 | 207 | 208 | _client_var = ContextVar[Optional[LLMClient]]("client") 209 | 210 | T = TypeVar("T") 211 | 212 | _named_wrapper: Optional[Type[Any]] = None 213 | _wrap_openai: Optional[Callable[[Any], Any]] = None 214 | 215 | 216 | def get_openai_wrappers() -> Tuple[Type[Any], Callable[[Any], Any]]: 217 | global _named_wrapper, _wrap_openai 218 | 219 | if _named_wrapper is not None and _wrap_openai is not None: 220 | return _named_wrapper, _wrap_openai 221 | 222 | try: 223 | from braintrust.oai import NamedWrapper as BraintrustNamedWrapper # type: ignore 224 | from braintrust.oai import wrap_openai # type: ignore 225 | 226 | _named_wrapper = cast(Type[Any], BraintrustNamedWrapper) 227 | except ImportError: 228 | 229 | class NamedWrapper: 230 | pass 231 | 232 | def wrap_openai(openai: T) -> T: 233 | return openai 234 | 235 | _named_wrapper = NamedWrapper 236 | 237 | _wrap_openai = cast(Callable[[Any], Any], wrap_openai) 238 | return _named_wrapper, _wrap_openai 239 | 240 | 241 | Client = Union[LLMClient, OpenAIV0Module, OpenAIV1Module.OpenAI] 242 | 243 | 244 | def resolve_client(client: Client, is_async: bool = False) -> LLMClient: 245 | if isinstance(client, LLMClient): 246 | return client 247 | return LLMClient(openai=client, is_async=is_async) 248 | 249 | 250 | def init(client: Optional[Client] = None, is_async: bool = False): 251 | """Initialize Autoevals with an optional custom LLM client. 252 | 253 | This function sets up the global client context for Autoevals to use. If no client is provided, 254 | the default OpenAI client will be used. 255 | 256 | Args: 257 | client: The client to use for LLM operations. Can be one of: 258 | - None: Resets the global client 259 | - LLMClient: Used directly as provided 260 | - OpenAIV0Module: Wrapped in a new LLMClient instance (OpenAI SDK v0) 261 | - OpenAIV1: Wrapped in a new LLMClient instance (OpenAI SDK v1) 262 | is_async: Whether to create a client with async operations. Defaults to False. 263 | Deprecated: Use the `client` argument directly with your desired async/sync configuration. 264 | """ 265 | _client_var.set(resolve_client(client, is_async=is_async) if client else None) 266 | 267 | 268 | warned_deprecated_api_key_base_url = False 269 | 270 | 271 | def prepare_openai( 272 | client: Optional[Client] = None, 273 | is_async: bool = False, 274 | api_key: Optional[str] = None, 275 | base_url: Optional[str] = None, 276 | ): 277 | """Prepares and configures an OpenAI client for use with AutoEval. 
278 | 279 | This function handles both v0 and v1 of the OpenAI SDK, configuring the client 280 | with the appropriate authentication and base URL settings. 281 | 282 | We will also attempt to enable Braintrust tracing export, if you've configured tracing. 283 | 284 | Args: 285 | client (Optional[LLMClient], optional): Existing LLMClient instance. 286 | If provided, this client will be used instead of creating a new one. 287 | 288 | is_async (bool, optional): Whether to create a client with async operations. Defaults to False. 289 | Deprecated: Use the `client` argument and set the `openai` with the async/sync that you'd like to use. 290 | 291 | api_key (str, optional): OpenAI API key. If not provided, will look for 292 | OPENAI_API_KEY or BRAINTRUST_API_KEY in environment variables. 293 | Deprecated: Use the `client` argument and set the `openai`. 294 | 295 | base_url (str, optional): Base URL for API requests. If not provided, will 296 | use OPENAI_BASE_URL from environment or fall back to PROXY_URL. 297 | Deprecated: Use the `client` argument and set the `openai`. 298 | 299 | Returns: 300 | LLMClient: The configured LLMClient instance, or the client you've provided 301 | 302 | Raises: 303 | ImportError: If the OpenAI package is not installed 304 | """ 305 | client = client or _client_var.get(None) 306 | if client is not None: 307 | return resolve_client(client, is_async=is_async) 308 | 309 | try: 310 | openai_module = get_openai_module() 311 | except Exception as e: 312 | print( 313 | textwrap.dedent( 314 | f"""\ 315 | Unable to import openai: {e} 316 | 317 | Please install it, e.g. with 318 | 319 | pip install 'openai' 320 | """ 321 | ), 322 | file=sys.stderr, 323 | ) 324 | raise 325 | 326 | global warned_deprecated_api_key_base_url 327 | if not warned_deprecated_api_key_base_url and (api_key is not None or base_url is not None): 328 | warnings.warn( 329 | "The api_key and base_url parameters are deprecated. Please use init() or call with client instead.", 330 | DeprecationWarning, 331 | stacklevel=2, 332 | ) 333 | warned_deprecated_api_key_base_url = True 334 | 335 | # prepare the default openai sdk, if not provided 336 | if api_key is None: 337 | api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BRAINTRUST_API_KEY") 338 | if base_url is None: 339 | base_url = os.environ.get("OPENAI_BASE_URL", PROXY_URL) 340 | 341 | if hasattr(openai_module, "OpenAI"): 342 | openai_module = cast(OpenAIV1Module, openai_module) 343 | 344 | # v1 API 345 | if is_async: 346 | openai_obj = openai_module.AsyncOpenAI(api_key=api_key, base_url=base_url) # type: ignore 347 | else: 348 | openai_obj = openai_module.OpenAI(api_key=api_key, base_url=base_url) # type: ignore 349 | else: 350 | openai_module = cast(OpenAIV0Module, openai_module) 351 | 352 | # v0 API 353 | if api_key: 354 | openai_module.api_key = api_key 355 | openai_module.api_base = base_url 356 | openai_obj = openai_module 357 | 358 | return LLMClient(openai=openai_obj, is_async=is_async) 359 | 360 | 361 | def post_process_response(resp: Any) -> Dict[str, Any]: 362 | # This normalizes against craziness in OpenAI v0 vs. 
v1 363 | if hasattr(resp, "to_dict"): 364 | # v0 365 | return resp.to_dict() 366 | else: 367 | # v1 368 | return resp.dict() 369 | 370 | 371 | def set_span_purpose(kwargs: Dict[str, Any]) -> None: 372 | kwargs.setdefault("span_info", {}).setdefault("span_attributes", {})["purpose"] = "scorer" 373 | 374 | 375 | def run_cached_request( 376 | *, 377 | client: Optional[LLMClient] = None, 378 | request_type: str = "complete", 379 | api_key: Optional[str] = None, 380 | base_url: Optional[str] = None, 381 | **kwargs: Any, 382 | ) -> Dict[str, Any]: 383 | wrapper = prepare_openai(client=client, is_async=False, api_key=api_key, base_url=base_url) 384 | if wrapper.is_wrapped: 385 | set_span_purpose(kwargs) 386 | 387 | retries = 0 388 | sleep_time = 0.1 389 | resp = None 390 | while retries < 100: 391 | try: 392 | resp = post_process_response(getattr(wrapper, request_type)(**kwargs)) 393 | break 394 | except wrapper.RateLimitError: 395 | sleep_time *= 1.5 396 | time.sleep(sleep_time) 397 | retries += 1 398 | 399 | if resp is None: 400 | raise RuntimeError("Failed to get response after maximum retries") 401 | return resp 402 | 403 | 404 | async def arun_cached_request( 405 | *, 406 | client: Optional[LLMClient] = None, 407 | request_type: str = "complete", 408 | api_key: Optional[str] = None, 409 | base_url: Optional[str] = None, 410 | **kwargs: Any, 411 | ) -> Dict[str, Any]: 412 | wrapper = prepare_openai(client=client, is_async=True, api_key=api_key, base_url=base_url) 413 | if wrapper.is_wrapped: 414 | set_span_purpose(kwargs) 415 | 416 | retries = 0 417 | sleep_time = 0.1 418 | resp = None 419 | while retries < 100: 420 | try: 421 | resp = post_process_response(await getattr(wrapper, request_type)(**kwargs)) 422 | break 423 | except wrapper.RateLimitError: 424 | # Just assume it's a rate limit error 425 | sleep_time *= 1.5 426 | await asyncio.sleep(sleep_time) 427 | retries += 1 428 | 429 | if resp is None: 430 | raise RuntimeError("Failed to get response after maximum retries") 431 | 432 | return resp 433 | -------------------------------------------------------------------------------- /py/autoevals/partial.py: -------------------------------------------------------------------------------- 1 | from braintrust_core.score import Scorer 2 | 3 | 4 | class ScorerWithPartial(Scorer): 5 | @classmethod 6 | def partial(cls, **partial_kwargs): 7 | class PartialScorer(cls): 8 | async def eval_async(self, output, expected=None, **kwargs): 9 | if expected is not None: 10 | kwargs["expected"] = expected 11 | return await self._run_eval_async(output, **{**partial_kwargs, **kwargs}) 12 | 13 | def eval(self, output, expected=None, **kwargs): 14 | if expected is not None: 15 | kwargs["expected"] = expected 16 | return self._run_eval_sync(output, **{**partial_kwargs, **kwargs}) 17 | 18 | @classmethod 19 | def _partial_args(cls): 20 | return {**partial_kwargs} 21 | 22 | PartialScorer.__name__ = cls.__name__ 23 | return PartialScorer 24 | -------------------------------------------------------------------------------- /py/autoevals/string.py: -------------------------------------------------------------------------------- 1 | """String evaluation scorers for comparing text similarity. 
2 | 3 | This module provides scorers for text comparison: 4 | 5 | - Levenshtein: Compare strings using edit distance 6 | - Fast, local string comparison 7 | - Suitable for exact matches and small variations 8 | - No external dependencies 9 | - Simple to use with just output/expected parameters 10 | 11 | - EmbeddingSimilarity: Compare strings using embeddings 12 | - Semantic similarity using embeddings 13 | - Requires OpenAI API access 14 | - Better for comparing meaning rather than exact matches 15 | - Supports both sync and async evaluation 16 | - Built-in caching for efficiency 17 | - Configurable with options for model, prefix, thresholds 18 | """ 19 | 20 | import threading 21 | from typing import Optional 22 | 23 | from braintrust_core.score import Score 24 | from polyleven import levenshtein as distance 25 | 26 | from autoevals.partial import ScorerWithPartial 27 | from autoevals.value import normalize_value 28 | 29 | from .oai import LLMClient, arun_cached_request, run_cached_request 30 | 31 | 32 | class Levenshtein(ScorerWithPartial): 33 | """String similarity scorer using edit distance. 34 | 35 | Example: 36 | ```python 37 | scorer = Levenshtein() 38 | result = scorer.eval( 39 | output="hello wrld", 40 | expected="hello world" 41 | ) 42 | print(result.score) # 0.9 (normalized similarity) 43 | ``` 44 | 45 | Args: 46 | output: String to evaluate 47 | expected: Reference string to compare against 48 | 49 | Returns: 50 | Score object with normalized similarity (0-1), where 1 means identical strings 51 | """ 52 | 53 | def _run_eval_sync(self, output, expected=None, **kwargs): 54 | if expected is None: 55 | raise ValueError("LevenshteinScorer requires an expected value") 56 | 57 | output, expected = str(output), str(expected) 58 | max_len = max(len(x) for x in [output, expected]) 59 | 60 | score = 1 61 | if max_len > 0: 62 | score = 1 - (distance(output, expected) / max_len) 63 | 64 | return Score(name=self._name(), score=score) 65 | 66 | 67 | LevenshteinScorer = Levenshtein # backcompat 68 | 69 | 70 | class EmbeddingSimilarity(ScorerWithPartial): 71 | """String similarity scorer using embeddings. 72 | 73 | Example: 74 | ```python 75 | import asyncio 76 | from openai import AsyncOpenAI 77 | from autoevals.string import EmbeddingSimilarity 78 | 79 | async def compare_texts(): 80 | # Initialize with async client 81 | client = AsyncOpenAI() 82 | scorer = EmbeddingSimilarity( 83 | prefix="Code explanation: ", 84 | client=client 85 | ) 86 | 87 | result = await scorer.eval_async( 88 | output="The function sorts elements using quicksort", 89 | expected="The function implements quicksort algorithm" 90 | ) 91 | 92 | print(result.score) # 0.85 (normalized similarity) 93 | print(result.metadata) # Additional comparison details 94 | 95 | # Run the async evaluation 96 | asyncio.run(compare_texts()) 97 | ``` 98 | 99 | Args: 100 | prefix: Optional text to prepend to inputs for domain context 101 | model: Embedding model to use (default: text-embedding-ada-002) 102 | expected_min: Minimum similarity threshold (default: 0.7) 103 | client: Optional AsyncOpenAI/OpenAI client. 
If not provided, uses global client from init() 104 | 105 | Returns: 106 | Score object with: 107 | - score: Normalized similarity (0-1) 108 | - metadata: Additional comparison details 109 | """ 110 | 111 | MODEL = "text-embedding-ada-002" 112 | 113 | _CACHE = {} 114 | _CACHE_LOCK = threading.Lock() 115 | 116 | def __init__( 117 | self, 118 | prefix="", 119 | model=MODEL, 120 | expected_min=0.7, 121 | api_key=None, 122 | base_url=None, 123 | client: Optional[LLMClient] = None, 124 | ): 125 | self.prefix = prefix 126 | self.expected_min = expected_min 127 | 128 | self.extra_args = {"model": model} 129 | if api_key: 130 | self.extra_args["api_key"] = api_key 131 | if base_url: 132 | self.extra_args["base_url"] = base_url 133 | 134 | self.client = client 135 | 136 | async def _a_embed(self, value): 137 | value = normalize_value(value, maybe_object=False) 138 | with self._CACHE_LOCK: 139 | if value in self._CACHE: 140 | return self._CACHE[value] 141 | 142 | result = await arun_cached_request( 143 | client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args 144 | ) 145 | 146 | with self._CACHE_LOCK: 147 | self._CACHE[value] = result 148 | 149 | return result 150 | 151 | def _embed(self, value): 152 | value = normalize_value(value, maybe_object=False) 153 | with self._CACHE_LOCK: 154 | if value in self._CACHE: 155 | return self._CACHE[value] 156 | 157 | result = run_cached_request( 158 | client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args 159 | ) 160 | 161 | with self._CACHE_LOCK: 162 | self._CACHE[value] = result 163 | 164 | return result 165 | 166 | async def _run_eval_async(self, output, expected=None, **kwargs): 167 | if expected is None: 168 | raise ValueError("EmbeddingSimilarity requires an expected value") 169 | 170 | output_embedding_p = self._a_embed(output) 171 | expected_embedding_p = self._a_embed(expected) 172 | 173 | output_result, expected_result = await output_embedding_p, await expected_embedding_p 174 | return Score( 175 | name=self._name(), 176 | score=self.scale_score( 177 | self.cosine_similarity(output_result["data"][0]["embedding"], expected_result["data"][0]["embedding"]), 178 | self.expected_min, 179 | ), 180 | ) 181 | 182 | def _run_eval_sync(self, output, expected=None, **kwargs): 183 | if expected is None: 184 | raise ValueError("EmbeddingSimilarity requires an expected value") 185 | 186 | output_result = self._embed(output) 187 | expected_result = self._embed(expected) 188 | 189 | return Score( 190 | name=self._name(), 191 | score=self.scale_score( 192 | self.cosine_similarity(output_result["data"][0]["embedding"], expected_result["data"][0]["embedding"]), 193 | self.expected_min, 194 | ), 195 | ) 196 | 197 | @staticmethod 198 | def scale_score(score, expected_min): 199 | return max((score - expected_min) / (1 - expected_min), 0) 200 | 201 | @staticmethod 202 | def cosine_similarity(list1, list2): 203 | # Calculate dot product 204 | dot_product = sum(a * b for a, b in zip(list1, list2)) 205 | 206 | # Calculate the magnitude of each list 207 | magnitude_list1 = sum(a**2 for a in list1) ** 0.5 208 | magnitude_list2 = sum(b**2 for b in list2) ** 0.5 209 | 210 | # Calculate cosine similarity 211 | if magnitude_list1 * magnitude_list2 == 0: 212 | # Avoid division by zero 213 | return 0 214 | else: 215 | # Sometimes, rounding errors cause the dot product to be slightly > 1 216 | return min(dot_product / (magnitude_list1 * magnitude_list2), 1) 217 | 218 | 219 | __all__ = ["LevenshteinScorer", 
"Levenshtein", "EmbeddingSimilarity"] 220 | -------------------------------------------------------------------------------- /py/autoevals/templates: -------------------------------------------------------------------------------- 1 | ../../templates -------------------------------------------------------------------------------- /py/autoevals/test_embeddings.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from autoevals import EmbeddingSimilarity 4 | from autoevals.value import normalize_value 5 | 6 | SYNONYMS = [ 7 | ("water", ["water", "H2O", "agua"]), 8 | ("fire", ["fire", "flame"]), 9 | ("earth", ["earth", "Planet Earth"]), 10 | ] 11 | 12 | UNRELATED = ["water", "The quick brown fox jumps over the lazy dog", "I like to eat apples"] 13 | 14 | 15 | def test_embeddings(): 16 | evaluator = EmbeddingSimilarity(prefix="resource type: ") 17 | for word, synonyms in SYNONYMS: 18 | for synonym in synonyms: 19 | result = evaluator(word, synonym) 20 | print(f"[{word}]", f"[{synonym}]", result) 21 | assert result.score > 0.66 22 | 23 | for i in range(len(UNRELATED)): 24 | for j in range(len(UNRELATED)): 25 | if i == j: 26 | continue 27 | 28 | word1 = UNRELATED[i] 29 | word2 = UNRELATED[j] 30 | result = evaluator(word1, word2) 31 | print(f"[{word1}]", f"[{word2}]", result) 32 | assert result.score < 0.5 33 | 34 | 35 | VALUES = [ 36 | ("water", "wind"), 37 | (["cold", "water"], ["cold", "wind"]), 38 | ({"water": "wet"}, {"wind": "dry"}), 39 | ] 40 | 41 | 42 | def test_embedding_values(): 43 | for run_async in [False, True]: 44 | evaluator = EmbeddingSimilarity() 45 | for (word1, word2) in VALUES: 46 | if run_async: 47 | result = asyncio.run(evaluator.eval_async(word1, word2)) 48 | else: 49 | result = evaluator(word1, word2) 50 | print(f"[{word1}]", f"[{word2}]", f"run_async={run_async}", result) 51 | -------------------------------------------------------------------------------- /py/autoevals/test_json.py: -------------------------------------------------------------------------------- 1 | from pytest import approx 2 | 3 | from autoevals.json import JSONDiff, ValidJSON 4 | from autoevals.number import NumericDiff 5 | from autoevals.value import ExactMatch 6 | 7 | 8 | def test_string_as_json(): 9 | cases = [ 10 | ("", "", 1), 11 | ("", "a", 0), 12 | ("a", "", 0), 13 | ("a", "a", 1), 14 | ("a", "b", 0), 15 | ("ab", "ac", 0.5), 16 | ("ac", "bc", 0.5), 17 | ("abc", "axc", 0.66667), 18 | ("xabxcdxxefxgx", "1ab2cd34ef5g6", 0.53846), 19 | ] 20 | 21 | evaluator = JSONDiff() 22 | for a, b, expected in cases: 23 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 24 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 25 | 26 | 27 | def test_json(): 28 | cases = [ 29 | (None, None, 1), 30 | (None, "", 0), 31 | ([], {}, 0), 32 | ([], [], 1), 33 | ({}, {}, 1), 34 | ({"a": 1}, {"a": 1}, 1), 35 | ({"a": 1}, {"a": 2}, 0.66667), 36 | ({"a": 1}, ["a", 1], 0.5714285714285714), 37 | ({"a": 1}, {"b": {"a": 1}}, 0), 38 | ({"a": 1}, {"a": None}, 0), 39 | ( 40 | {"mapping": {"a": "foo", "b": "bar"}}, 41 | {"mapping": {"a": "Foo", "b": "Bar"}, "Extra": 5}, 42 | 0.33333333333333337, 43 | ), 44 | ] 45 | 46 | evaluator = JSONDiff() 47 | for a, b, expected in cases: 48 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 49 | assert evaluator(a, b).score == approx(expected, 1e-4) 50 | 51 | 52 | def test_valid_json(): 53 | cases = [ 54 | ("1", 0, None), 55 | ('{ "a": 1, "b": "hello" }', 1, None), 56 | ('[{ "a": 1 }]', 1, None), 57 | ('[{ "a": 1 }', 0, 
None), 58 | ('{ "mapping": { "a": "foo", "b": "bar" }, "extra": 4 }', 1, None), 59 | ('{ mapping: { "a": "foo", "b": "bar" }, "extra": 4 }', 0, None), 60 | ( 61 | '{ "a": "1" }', 62 | 1, 63 | { 64 | "type": "object", 65 | "properties": {"a": {"type": "string"}}, 66 | "required": ["a"], 67 | }, 68 | ), 69 | ( 70 | '{"a": "1", "b": "1"}', 71 | 0, 72 | { 73 | "type": "object", 74 | "properties": { 75 | "a": {"type": "string"}, 76 | "b": {"type": "number"}, 77 | }, 78 | "required": ["a"], 79 | }, 80 | ), 81 | ( 82 | '[{"a": "1"}, {"a": "1", "b": 22}]', 83 | 1, 84 | { 85 | "type": "array", 86 | "items": { 87 | "type": "object", 88 | "properties": { 89 | "a": {"type": "string"}, 90 | "b": {"type": "number"}, 91 | }, 92 | "required": ["a"], 93 | }, 94 | "uniqueItems": True, 95 | }, 96 | ), 97 | ( 98 | {"a": "1", "b": "1"}, 99 | 1, 100 | None, 101 | ), 102 | ( 103 | [{"a": "1"}, {"a": "1", "b": 22}], 104 | 1, 105 | None, 106 | ), 107 | ( 108 | 100, 109 | 0, 110 | None, 111 | ), 112 | ( 113 | # This is technically ambiguous, because it _could_ be the valid parsed JSON value 114 | # or an unparsed, invalid JSON value. However, since structured outputs _only_ return 115 | # JSON values, we can safely assume that any strings are unparsed values. 116 | "100", 117 | 0, 118 | None, 119 | ), 120 | ] 121 | 122 | evaluator = ValidJSON() 123 | for output, expected, schema in cases: 124 | print(f"[{output}]", expected) 125 | assert evaluator(output, schema).score == expected 126 | 127 | 128 | def test_semantic_json(): 129 | cases = [ 130 | ('{"x": 1, "y": 2}', '{"y": 2, "x": 1}', 1), 131 | ( 132 | '{"zs": ["a", "b"], "x": 1, "y": 2}', 133 | '{"y": 2, "zs": ["a", "b"], "x": 1}', 134 | 1, 135 | ), 136 | ( 137 | '{"o1": {"x": 1, "y": 2}}', 138 | '{"o1": {"y": 2, "x": 1}}', 139 | 1, 140 | ), 141 | ( 142 | '{"xs": [{"o1": {"x": 1, "y": [2]}}]}', 143 | '{"xs": [{"o1": {"y": [2], "x": 1}}]}', 144 | 1, 145 | ), 146 | ( 147 | '{"o1": {"x": 2, "y": 2}}', 148 | '{"o1": {"y": 2, "x": 1}}', 149 | 0.83333, 150 | ), 151 | ( 152 | {"o1": {"x": 2, "y": 2}}, 153 | '{"o1": {"y": 2, "x": 1}}', 154 | 0.83333, 155 | ), 156 | ('{"x": 1, "y": 2}', '{"x": 1, "z": 2}', 0.3333), 157 | ("[1, 2]", "[1, 2]", 1), 158 | ("[1, 2]", "[2, 1]", 0.66667), 159 | ] 160 | 161 | evaluator = JSONDiff() 162 | for a, b, expected in cases: 163 | for exact_number in [True, False]: 164 | score = evaluator(a, b, number_scorer=ExactMatch() if exact_number else NumericDiff()).score 165 | if not exact_number: 166 | assert abs(score - expected) < 0.0001 167 | else: 168 | assert round(score * 100) <= round(expected * 100) 169 | -------------------------------------------------------------------------------- /py/autoevals/test_llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import cast 3 | 4 | import pytest 5 | import respx 6 | from openai import OpenAI 7 | from pydantic import BaseModel 8 | 9 | from autoevals import init 10 | from autoevals.llm import Battle, Factuality, LLMClassifier, OpenAILLMClassifier, build_classification_tools 11 | from autoevals.oai import OpenAIV1Module 12 | 13 | 14 | class TestModel(BaseModel): 15 | foo: str 16 | num: int 17 | 18 | 19 | def test_render_messages(): 20 | classifier = OpenAILLMClassifier( 21 | "test", 22 | messages=[ 23 | {"role": "user", "content": "{{value}} and {{{value}}}"}, 24 | {"role": "user", "content": "Dict double braces: {{data}}"}, 25 | {"role": "user", "content": "Dict triple braces: {{{data}}}"}, 26 | {"role": "user", "content": 
"Model double braces: {{model}}"}, 27 | {"role": "user", "content": "Model triple braces: {{{model}}}"}, 28 | {"role": "user", "content": ""}, # test empty content 29 | ], 30 | model="gpt-4", 31 | choice_scores={"A": 1}, 32 | classification_tools=[], 33 | ) 34 | 35 | test_dict = {"foo": "bar", "num": 42} 36 | test_model = TestModel(foo="bar", num=42) 37 | 38 | rendered = classifier._render_messages(value="bold", data=test_dict, model=test_model) 39 | 40 | # Test that HTML is never escaped, regardless of syntax. 41 | assert rendered[0]["content"] == "bold and bold" 42 | 43 | # Test dict rendering - both use str(). 44 | assert rendered[1]["content"] == "Dict double braces: {'foo': 'bar', 'num': 42}" 45 | assert rendered[2]["content"] == "Dict triple braces: {'foo': 'bar', 'num': 42}" 46 | 47 | # Test model rendering - both use str(). 48 | assert rendered[3]["content"] == "Model double braces: foo='bar' num=42" 49 | assert rendered[4]["content"] == "Model triple braces: foo='bar' num=42" 50 | 51 | # Test empty content. 52 | assert rendered[5]["content"] == "" 53 | 54 | 55 | def test_openai(): 56 | e = OpenAILLMClassifier( 57 | "title", 58 | messages=[ 59 | { 60 | "role": "system", 61 | "content": """\ 62 | You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 63 | You will look at the issue description, and pick which of two titles better describes it.""", 64 | }, 65 | { 66 | "role": "user", 67 | "content": """\ 68 | I'm going to provide you with the issue description, and two possible titles. 69 | 70 | Issue Description: {{page_content}} 71 | 72 | 1: {{output}} 73 | 2: {{expected}} 74 | 75 | Please discuss each title briefly (one line for pros, one for cons), and then answer the question by calling 76 | the select_choice function with "1" or "2".""", 77 | }, 78 | ], 79 | model="gpt-3.5-turbo", 80 | choice_scores={"1": 1, "2": 0}, 81 | classification_tools=build_classification_tools(useCoT=True, choice_strings=["1", "2"]), 82 | max_tokens=500, 83 | ) 84 | 85 | page_content = """ 86 | As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 87 | 88 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 89 | 90 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification""" 91 | 92 | gen_title = "Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX" 93 | original_title = "This title has nothing to do with the content" 94 | 95 | response = e(gen_title, original_title, page_content=page_content) 96 | print(response.as_json(indent=2)) 97 | assert response.score == 1 98 | assert response.error is None 99 | 100 | 101 | def test_llm_classifier(): 102 | for use_cot in [True, False]: 103 | e = LLMClassifier( 104 | "title", 105 | """ 106 | You are a technical project manager who helps software engineers generate better titles for their GitHub issues. 107 | You will look at the issue description, and pick which of two titles better describes it. 108 | 109 | I'm going to provide you with the issue description, and two possible titles. 
110 | 111 | Issue Description: {{page_content}} 112 | 113 | 1: {{output}} 114 | 2: {{expected}}""", 115 | {"1": 1, "2": 0}, 116 | use_cot=use_cot, 117 | ) 118 | 119 | page_content = """ 120 | As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client, 121 | 122 | We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy? 123 | 124 | Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification""" 125 | 126 | gen_title = "Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX" 127 | original_title = "This title has nothing to do with the content" 128 | 129 | response = e(gen_title, original_title, page_content=page_content) 130 | print(response.as_json(indent=2)) 131 | assert response.score == 1 132 | assert response.error is None 133 | 134 | response = e(original_title, gen_title, page_content=page_content) 135 | print(response.as_json(indent=2)) 136 | assert response.score == 0 137 | assert response.error is None 138 | 139 | 140 | def test_nested_async(): 141 | async def nested_async(): 142 | e = Battle() 143 | e(instructions="Add the following numbers: 1, 2, 3", output="600", expected="6") 144 | 145 | asyncio.run(nested_async()) 146 | 147 | 148 | @respx.mock 149 | def test_factuality(): 150 | # something is wrong with respx that it couldn't match the url from openai 151 | respx.route().respond( 152 | json={ 153 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 154 | "choices": [ 155 | { 156 | "finish_reason": "stop", 157 | "index": 0, 158 | "logprobs": None, 159 | "message": { 160 | "content": None, 161 | "refusal": None, 162 | "role": "assistant", 163 | "tool_calls": [ 164 | { 165 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 166 | "function": { 167 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 168 | "name": "select_choice", 169 | }, 170 | "type": "function", 171 | } 172 | ], 173 | }, 174 | } 175 | ], 176 | "created": 1734029028, 177 | "model": "gpt-4o-2024-08-06", 178 | "object": "chat.completion", 179 | "system_fingerprint": "fp_cc5cf1c6e3", 180 | "usage": { 181 | "completion_tokens": 149, 182 | "prompt_tokens": 404, 183 | "total_tokens": 553, 184 | "completion_tokens_details": { 185 | "accepted_prediction_tokens": 0, 186 | "audio_tokens": 0, 187 | "reasoning_tokens": 0, 188 | "rejected_prediction_tokens": 0, 189 | }, 190 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 191 | }, 192 | } 193 | ) 194 | 195 | llm = Factuality(base_url="https://api.openai.com/v1/") 196 | result = llm.eval( 197 | output="6", 198 | expected="6", 199 | input="Add the following numbers: 1, 2, 3", 200 | ) 201 | 202 | assert result.score == 1 203 | 204 | 205 | @respx.mock 206 | def test_factuality_client(): 207 | respx.route().respond( 208 | json={ 209 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 210 | "choices": [ 211 | { 212 | "finish_reason": "stop", 213 | "index": 0, 214 | "logprobs": None, 215 | "message": { 216 | "content": None, 217 | "refusal": None, 218 | "role": "assistant", 219 | "tool_calls": [ 220 | { 221 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 222 | "function": { 223 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 224 | "name": "select_choice", 225 | }, 226 | "type": "function", 227 | } 228 | ], 229 | }, 230 | } 231 | ], 232 | "created": 1734029028, 233 | "model": "gpt-4o-2024-08-06", 234 | "object": "chat.completion", 235 | "system_fingerprint": "fp_cc5cf1c6e3", 236 | "usage": { 237 | "completion_tokens": 149, 238 | "prompt_tokens": 404, 239 | "total_tokens": 553, 240 | "completion_tokens_details": { 241 | "accepted_prediction_tokens": 0, 242 | "audio_tokens": 0, 243 | "reasoning_tokens": 0, 244 | "rejected_prediction_tokens": 0, 245 | }, 246 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 247 | }, 248 | } 249 | ) 250 | 251 | llm = Factuality(client=OpenAI(api_key="test")) 252 | result = llm.eval( 253 | output="6", 254 | expected="6", 255 | input="Add the following numbers: 1, 2, 3", 256 | ) 257 | 258 | assert result.score == 1 259 | 260 | 261 | @pytest.fixture(autouse=True) 262 | def reset_client(): 263 | yield 264 | init(client=None) 265 | 266 | 267 | # make sure we deny any leaked calls to OpenAI 268 | @respx.mock 269 | def test_init_client(): 270 | client = cast(OpenAIV1Module.OpenAI, OpenAI(api_key="test")) 271 | 272 | respx.route().respond( 273 | json={ 274 | "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", 275 | "choices": [ 276 | { 277 | "finish_reason": "stop", 278 | "index": 0, 279 | "logprobs": None, 280 | "message": { 281 | "content": None, 282 | "refusal": None, 283 | "role": "assistant", 284 | "tool_calls": [ 285 | { 286 | "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", 287 | "function": { 288 | "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', 289 | "name": "select_choice", 290 | }, 291 | "type": "function", 292 | } 293 | ], 294 | }, 295 | } 296 | ], 297 | "created": 1734029028, 298 | "model": "gpt-4o-2024-08-06", 299 | "object": "chat.completion", 300 | "system_fingerprint": "fp_cc5cf1c6e3", 301 | "usage": { 302 | "completion_tokens": 149, 303 | "prompt_tokens": 404, 304 | "total_tokens": 553, 305 | "completion_tokens_details": { 306 | "accepted_prediction_tokens": 0, 307 | "audio_tokens": 0, 308 | "reasoning_tokens": 0, 309 | "rejected_prediction_tokens": 0, 310 | }, 311 | "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, 312 | }, 313 | } 314 | ) 315 | 316 | init(client=client) 317 | 318 | llm = Factuality(base_url="https://api.openai.com/v1/") 319 | result = llm.eval( 320 | output="6", 321 | expected="6", 322 | input="Add the following numbers: 1, 2, 3", 323 | ) 324 | 325 | assert result.score == 1 326 | 327 | 328 | def test_battle(): 329 | for use_cot in [True, False]: 330 | print("use_cot", use_cot) 331 | e = Battle(use_cot=use_cot) 332 | response = e( 333 | instructions="Add the following numbers: 1, 2, 3", 334 | output="600", 335 | expected="6", 336 | ) 337 | 338 | print(response.as_json(indent=2)) 339 | assert response.score == 0 340 | assert response.error is None 341 | 342 | response = e( 343 | instructions="Add the following numbers: 1, 2, 3", 344 | output="6", 345 | expected="600", 346 | ) 347 | 348 | print(response.as_json(indent=2)) 349 | assert response.score == 1 350 | assert response.error is None 351 | 352 | response = e(instructions="Add the following numbers: 1, 2, 3", output="6", expected="6") 353 | 354 | print(response.as_json(indent=2)) 355 | assert response.score == 0 356 | assert response.error is None 357 | -------------------------------------------------------------------------------- /py/autoevals/test_moderation.py: -------------------------------------------------------------------------------- 1 | from autoevals.moderation import Moderation 2 | 3 | CASES = [ 4 | { 5 | "name": "Safe content without threshold", 6 | "output": "hello", 7 | "threshold": None, 8 | "expected": 1, 9 | }, 10 | { 11 | "name": "Unsafe content without threshold", 12 | "output": "kill kill kill kill kill", 13 | "threshold": None, 14 | "expected": 0, 15 | }, 16 | { 17 | "name": "Unsafe content with lower threshold", 18 | "output": "here are my suicidal thoughts", 19 | "threshold": 0.25, 20 | "expected": 0, 21 | }, 22 | { 23 | "name": "Unsafe content with higher threshold", 24 | "output": "here are my suicidal thoughts", 25 | "threshold": 0.99, 26 | "expected": 1, 27 | }, 28 | ] 29 | 30 | 31 | def test_moderation(): 32 | for test_case in CASES: 33 | expected = test_case["expected"] 34 | 35 | evaluator = Moderation(threshold=test_case["threshold"]) 36 | actual = evaluator.eval(test_case["output"]) 37 | 38 | assert actual.score == expected, f"Expects {test_case['name']} to be {expected} but got {actual.score}" 39 | -------------------------------------------------------------------------------- /py/autoevals/test_oai.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Any, Union, cast 3 | 4 | import openai 5 | import pytest 6 | from braintrust.oai import ( 7 | ChatCompletionV0Wrapper, 8 | CompletionsV1Wrapper, 9 | NamedWrapper, 10 | OpenAIV0Wrapper, 11 | OpenAIV1Wrapper, 12 | wrap_openai, 
13 | ) 14 | from openai.resources.chat.completions import AsyncCompletions 15 | 16 | from autoevals import init # type: ignore[import] 17 | from autoevals.oai import ( # type: ignore[import] 18 | LLMClient, 19 | OpenAIV0Module, 20 | OpenAIV1Module, 21 | _named_wrapper, # type: ignore[import] # Accessing private members for testing 22 | _wrap_openai, # type: ignore[import] # Accessing private members for testing 23 | get_openai_wrappers, 24 | prepare_openai, 25 | ) 26 | 27 | 28 | def unwrap_named_wrapper(obj: Union[NamedWrapper, OpenAIV1Module.OpenAI, OpenAIV0Module]) -> Any: 29 | return getattr(obj, "_NamedWrapper__wrapped") 30 | 31 | 32 | @pytest.fixture(autouse=True) 33 | def reset_env_and_client(monkeypatch: pytest.MonkeyPatch): 34 | """Reset environment variables and client before each test.""" 35 | monkeypatch.delenv("OPENAI_API_KEY", raising=False) 36 | monkeypatch.setenv("OPENAI_API_KEY", "test-key") 37 | monkeypatch.setenv("OPENAI_BASE_URL", "http://test-url") 38 | monkeypatch.setattr("autoevals.oai._named_wrapper", None) 39 | monkeypatch.setattr("autoevals.oai._wrap_openai", None) 40 | monkeypatch.setattr("autoevals.oai._openai_module", None) 41 | 42 | init(None) 43 | 44 | yield 45 | 46 | 47 | def test_prepare_openai_uses_unwrapped_global_client(): 48 | openai_obj = openai.OpenAI(api_key="api-key", base_url="http://test") 49 | client = LLMClient( 50 | openai=openai_obj, 51 | complete=openai_obj.chat.completions.create, 52 | embed=openai_obj.embeddings.create, 53 | moderation=openai_obj.moderations.create, 54 | RateLimitError=openai.RateLimitError, 55 | ) 56 | 57 | init(client) 58 | 59 | prepared_client = prepare_openai() 60 | 61 | assert prepared_client == client 62 | assert not prepared_client.is_wrapped 63 | assert prepared_client.openai == openai_obj 64 | assert prepared_client.complete is client.complete 65 | assert prepared_client.openai.api_key == "api-key" 66 | 67 | 68 | def test_init_creates_llmclient_if_needed(): 69 | openai_obj = openai.OpenAI() 70 | init(openai_obj) 71 | 72 | prepared_client = prepare_openai() 73 | 74 | assert isinstance(prepared_client, LLMClient) 75 | assert prepared_client.is_wrapped 76 | assert unwrap_named_wrapper(prepared_client.openai) == openai_obj 77 | 78 | 79 | def test_init_creates_async_llmclient_if_needed(mock_openai_v0: OpenAIV0Module): 80 | init(mock_openai_v0, is_async=True) 81 | 82 | prepared_client = prepare_openai() 83 | 84 | assert isinstance(prepared_client, LLMClient) 85 | assert prepared_client.is_wrapped 86 | assert isinstance(prepared_client.openai, OpenAIV0Wrapper) 87 | assert prepared_client.complete.__name__ == "acreate" 88 | 89 | 90 | def test_prepare_openai_defaults(): 91 | prepared_client = prepare_openai() 92 | 93 | assert isinstance(prepared_client, LLMClient) 94 | assert prepared_client.is_wrapped 95 | openai_obj = unwrap_named_wrapper(prepared_client.openai) 96 | assert isinstance(openai_obj, openai.OpenAI) 97 | assert isinstance(getattr(prepared_client.complete, "__self__", None), CompletionsV1Wrapper) 98 | assert openai_obj.api_key == "test-key" 99 | assert openai_obj.base_url == "http://test-url" 100 | 101 | 102 | def test_prepare_openai_with_plain_openai(): 103 | client = openai.OpenAI(api_key="api-key", base_url="http://test") 104 | prepared_client = prepare_openai(client=client) 105 | 106 | assert prepared_client.is_wrapped 107 | assert isinstance(prepared_client.openai, OpenAIV1Wrapper) 108 | 109 | 110 | def test_prepare_openai_async(): 111 | prepared_client = prepare_openai(is_async=True) 112 | 113 | assert 
isinstance(prepared_client, LLMClient) 114 | assert prepared_client.is_wrapped 115 | assert isinstance(prepared_client.openai, OpenAIV1Wrapper) 116 | 117 | openai_obj = getattr(prepared_client.complete, "__self__", None) 118 | assert isinstance(openai_obj, NamedWrapper) 119 | assert isinstance(unwrap_named_wrapper(openai_obj), AsyncCompletions) 120 | 121 | 122 | def test_prepare_openai_wraps_once(): 123 | openai_obj = cast(OpenAIV1Module.OpenAI, wrap_openai(openai.OpenAI(api_key="api-key", base_url="http://test"))) 124 | 125 | client = LLMClient(openai_obj) 126 | 127 | init(client) 128 | 129 | prepared_client = prepare_openai() 130 | 131 | assert prepared_client is client 132 | assert prepared_client.is_wrapped 133 | assert prepared_client.openai is openai_obj 134 | 135 | 136 | def test_prepare_openai_handles_missing_braintrust(monkeypatch: pytest.MonkeyPatch): 137 | monkeypatch.setitem(sys.modules, "braintrust.oai", None) 138 | 139 | prepared_client = prepare_openai() 140 | 141 | assert isinstance(prepared_client, LLMClient) 142 | assert not prepared_client.is_wrapped 143 | assert isinstance(prepared_client.openai, openai.OpenAI) 144 | 145 | 146 | def test_get_openai_wrappers_caches_imports(): 147 | original_wrapper = _named_wrapper 148 | original_wrap_fn = _wrap_openai 149 | 150 | # First call should set the cache 151 | wrapper1, wrap_fn1 = get_openai_wrappers() 152 | 153 | # Second call should use cache 154 | wrapper2, wrap_fn2 = get_openai_wrappers() 155 | 156 | # Verify we got same objects back 157 | assert wrapper2 is wrapper1 158 | assert wrap_fn2 is wrap_fn1 159 | 160 | # Verify they're different from the original None values 161 | assert wrapper2 is not original_wrapper 162 | assert wrap_fn2 is not original_wrap_fn 163 | 164 | 165 | def test_prepare_openai_raises_on_missing_openai(monkeypatch: pytest.MonkeyPatch): 166 | monkeypatch.setitem(sys.modules, "openai", None) 167 | 168 | with pytest.raises(ImportError): 169 | prepare_openai() 170 | 171 | 172 | @pytest.fixture 173 | def mock_openai_v0(monkeypatch: pytest.MonkeyPatch): 174 | """Mock the OpenAI v0 SDK for testing.""" 175 | 176 | class MockOpenAIV0: 177 | __module__ = "openai" 178 | api_key = None 179 | api_base = None 180 | 181 | class ChatCompletion: 182 | __module__ = "openai" 183 | 184 | @staticmethod 185 | def create(*args: Any, **kwargs: Any): 186 | pass 187 | 188 | @staticmethod 189 | def acreate(*args: Any, **kwargs: Any): 190 | pass 191 | 192 | class Embedding: 193 | __module__ = "openai" 194 | 195 | @staticmethod 196 | def create(*args: Any, **kwargs: Any): 197 | pass 198 | 199 | @staticmethod 200 | def acreate(*args: Any, **kwargs: Any): 201 | pass 202 | 203 | class Moderation: 204 | __module__ = "openai" 205 | 206 | @staticmethod 207 | def create(*args: Any, **kwargs: Any): 208 | pass 209 | 210 | @staticmethod 211 | def acreate(*args: Any, **kwargs: Any): 212 | pass 213 | 214 | class error: 215 | __module__ = "openai" 216 | 217 | class RateLimitError(Exception): 218 | __module__ = "openai" 219 | pass 220 | 221 | mock_openai = MockOpenAIV0() 222 | monkeypatch.setitem(sys.modules, "openai", mock_openai) 223 | return cast(OpenAIV0Module, mock_openai) 224 | 225 | 226 | def test_prepare_openai_v0_sdk(mock_openai_v0: OpenAIV0Module): 227 | prepared_client = prepare_openai() 228 | 229 | assert prepared_client.is_wrapped 230 | assert prepared_client.openai.api_key == "test-key" 231 | 232 | assert isinstance(getattr(prepared_client.complete, "__self__", None), ChatCompletionV0Wrapper) 233 | 234 | 235 | def 
test_prepare_openai_v0_async(mock_openai_v0: OpenAIV0Module): 236 | prepared_client = prepare_openai(is_async=True) 237 | 238 | assert prepared_client.is_wrapped 239 | assert prepared_client.openai.api_key == "test-key" 240 | 241 | assert prepared_client.complete.__name__ == "acreate" 242 | 243 | 244 | def test_prepare_openai_v0_with_client(mock_openai_v0: OpenAIV0Module): 245 | client = LLMClient(openai=mock_openai_v0, is_async=True) 246 | 247 | prepared_client = prepare_openai(client=client) 248 | 249 | assert prepared_client.is_wrapped 250 | assert prepared_client.openai.api_key is mock_openai_v0.api_key # must be set by the user 251 | assert prepared_client.complete.__name__ == "acreate" 252 | -------------------------------------------------------------------------------- /py/autoevals/test_partial.py: -------------------------------------------------------------------------------- 1 | from autoevals.llm import ClosedQA 2 | from autoevals.string import Levenshtein 3 | 4 | 5 | def test_partial(): 6 | levenshtein_basic = Levenshtein()(output="abc", expected="abcd") 7 | levenshtein_partial = Levenshtein.partial(expected="abcd")()(output="abc") 8 | assert levenshtein_partial.score == levenshtein_basic.score 9 | assert levenshtein_partial.name == levenshtein_basic.name 10 | assert levenshtein_partial.name == "Levenshtein" 11 | 12 | closedqa_basic = ClosedQA()(criteria="Is the answer correct?", input="What is 1+1?", output="2") 13 | closedqa_partial = ClosedQA.partial(criteria="Is the answer correct?")()(input="What is 1+1?", output="2") 14 | assert closedqa_partial.score == closedqa_basic.score 15 | assert closedqa_partial.name == closedqa_basic.name 16 | assert closedqa_partial.name == "ClosedQA" 17 | -------------------------------------------------------------------------------- /py/autoevals/test_ragas.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import cast 3 | 4 | import pytest 5 | from pytest import approx 6 | 7 | from autoevals.ragas import * 8 | 9 | data = { 10 | "input": "Can starred docs from different workspaces be accessed in one place?", 11 | "output": "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 12 | "expected": "Yes, all starred docs, even from multiple different workspaces, will live in the My Shortcuts section.", 13 | "context": [ 14 | "Not all Coda docs are used in the same way. You'll inevitably have a few that you use every week, and some that you'll only use once. This is where starred docs can help you stay organized.\n\n\n\nStarring docs is a great way to mark docs of personal importance. After you star a doc, it will live in a section on your doc list called **[My Shortcuts](https://coda.io/shortcuts)**. All starred docs, even from multiple different workspaces, will live in this section.\n\n\n\nStarring docs only saves them to your personal My Shortcuts. It doesn\u2019t affect the view for others in your workspace. If you\u2019re wanting to shortcut docs not just for yourself but also for others in your team or workspace, you\u2019ll [use pinning](https://help.coda.io/en/articles/2865511-starred-pinned-docs) instead." 
15 | ], 16 | } 17 | 18 | 19 | @pytest.mark.parametrize( 20 | ["metric", "expected_score", "can_fail"], 21 | [ 22 | (ContextEntityRecall(), 0.5, False), 23 | (ContextRelevancy(), 0.7, True), 24 | (ContextRecall(), 1, False), 25 | (ContextPrecision(), 1, False), 26 | ], 27 | ) 28 | @pytest.mark.parametrize("is_async", [False, True]) 29 | def test_ragas_retrieval(metric: OpenAILLMScorer, expected_score: float, is_async: bool, can_fail: bool): 30 | if is_async: 31 | score = asyncio.run(metric.eval_async(**data)).score 32 | else: 33 | score = metric.eval(**data).score 34 | 35 | if score is None: 36 | raise ValueError("Score is None") 37 | 38 | try: 39 | if expected_score == 1: 40 | assert score == expected_score 41 | else: 42 | assert score >= expected_score 43 | except AssertionError as e: 44 | # TODO: just to unblock the CI 45 | if can_fail: 46 | pytest.xfail(f"Expected score {expected_score} but got {score}") 47 | else: 48 | raise e 49 | -------------------------------------------------------------------------------- /py/autoevals/test_values.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest import approx 3 | 4 | from autoevals.list import ListContains 5 | from autoevals.number import NumericDiff 6 | from autoevals.string import LevenshteinScorer 7 | from autoevals.value import ExactMatch 8 | 9 | 10 | def test_levenshtein(): 11 | cases = [ 12 | ("", "", 1), 13 | ("", "a", 0), 14 | ("a", "", 0), 15 | ("a", "a", 1), 16 | ("a", "b", 0), 17 | ("ab", "ac", 0.5), 18 | ("ac", "bc", 0.5), 19 | ("abc", "axc", 0.66667), 20 | ("xabxcdxxefxgx", "1ab2cd34ef5g6", 0.53846), 21 | ] 22 | 23 | evaluator = LevenshteinScorer() 24 | for a, b, expected in cases: 25 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 26 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 27 | 28 | 29 | def test_numeric(): 30 | cases = [(0, 0, 1), (0, 1, 0), (1, 2, 0.66667), (1.0, 2.0, 0.66667), (-1, 2, 0)] 31 | 32 | evaluator = NumericDiff() 33 | for a, b, expected in cases: 34 | print(f"[{a}]", f"[{b}]", expected, evaluator(a, b)) 35 | assert evaluator(a, b).score == approx(expected, abs=1e-4) 36 | 37 | 38 | def test_list_contains(): 39 | cases = [ 40 | [[], [], 1], 41 | [[0], [], 0], 42 | [[], [0], 0], 43 | [["a"], ["a"], 1], 44 | [["a"], ["a", "b"], 0.5], 45 | [["a", "b"], ["a"], 0.5], 46 | [ 47 | [ 48 | "workspaces", 49 | "section", 50 | "view", 51 | "others", 52 | "workspace", 53 | "team", 54 | "pinning", 55 | ], 56 | ["starred", "multiple different workspaces", "shortcuts"], 57 | 0.1218, 58 | ], 59 | [ 60 | ["starred", "multiple different workspaces", "shortcuts"], 61 | [ 62 | "workspaces", 63 | "section", 64 | "view", 65 | "others", 66 | "workspace", 67 | "team", 68 | "pinning", 69 | ], 70 | 0.1218, 71 | ], 72 | ] 73 | 74 | for output, expected, expected_score in cases: 75 | assert ListContains(pairwise_evaluator=LevenshteinScorer())(output, expected).score == approx( 76 | expected_score, abs=1e-4 77 | ), (output, expected, expected_score) 78 | 79 | assert ( 80 | ListContains(pairwise_evaluator=LevenshteinScorer(), allow_extra_entities=True)(["a", "b"], ["a"]).score == 1 81 | ) 82 | 83 | 84 | def test_exact_match(): 85 | cases = [ 86 | ["hello", "hello", 1], 87 | ["hello", "world", 0], 88 | [123, 123, 1], 89 | [123, "123", 1], 90 | [{"a": 1, "b": 2}, {"a": 1, "b": 2}, 1], 91 | [{"a": 1, "b": 2}, {"a": 1, "b": 3}, 0], 92 | [[1, 2, 3], [1, 2, 3], 1], 93 | [[1, 2, 3], [3, 2, 1], 0], 94 | [{"a": 1, "b": 2}, {"b": 2, "a": 1}, 0], # Order matters 95 
| [{"a": 1, "b": 2}, '{"a": 1, "b": 2}', 1], # String representation matches dict 96 | [{"a": 1, "b": 2}, '{"a":1, "b":2}', 1], # String representation matches dict 97 | [{"a": 1, "b": 2}, '{"b": 2, "a": 1}', 0], 98 | [{"a": 1, "b": 2}, {"b": 2, "a": 1, "c": 3}, 0], # Extra key, not equal 99 | [None, None, 1], 100 | [None, "None", 1], 101 | ] 102 | 103 | for output, expected, expected_score in cases: 104 | assert ExactMatch()(output, expected).score == approx(expected_score, abs=1e-4), ( 105 | output, 106 | expected, 107 | expected_score, 108 | ) 109 | -------------------------------------------------------------------------------- /py/autoevals/value.py: -------------------------------------------------------------------------------- 1 | """Value comparison utilities for exact matching and normalization. 2 | 3 | This module provides tools for exact value comparison with smart handling of different data types: 4 | 5 | - ExactMatch: A scorer for exact value comparison 6 | - Handles primitive types (strings, numbers, etc.) 7 | - Smart `JSON` serialization for objects and arrays 8 | - Normalizes `JSON` strings for consistent comparison 9 | 10 | Example: 11 | ```python 12 | from autoevals import ExactMatch 13 | 14 | # Simple value comparison 15 | scorer = ExactMatch() 16 | result = scorer.eval( 17 | output="hello", 18 | expected="hello" 19 | ) 20 | print(result.score) # 1.0 for exact match 21 | 22 | # Object comparison (automatically normalized) 23 | result = scorer.eval( 24 | output={"name": "John", "age": 30}, 25 | expected='{"age": 30, "name": "John"}' # Different order but same content 26 | ) 27 | print(result.score) # 1.0 for equivalent JSON 28 | 29 | # Array comparison 30 | result = scorer.eval( 31 | output=[1, 2, 3], 32 | expected="[1, 2, 3]" # String or native types work 33 | ) 34 | print(result.score) # 1.0 for equivalent arrays 35 | ``` 36 | """ 37 | 38 | import json 39 | from typing import Any 40 | 41 | from braintrust_core.score import Score 42 | 43 | from autoevals.partial import ScorerWithPartial 44 | 45 | 46 | class ExactMatch(ScorerWithPartial): 47 | """A scorer that tests for exact equality between values. 48 | 49 | This scorer handles various input types: 50 | - Primitive values (strings, numbers, etc.) 51 | - JSON objects (dicts) and arrays (lists) 52 | - JSON strings that can be parsed into objects/arrays 53 | 54 | The comparison process: 55 | 1. Detects if either value is/might be a JSON object/array 56 | 2. Normalizes both values (serialization if needed) 57 | 3. 
Performs exact string comparison 58 | 59 | Args: 60 | output: Value to evaluate 61 | expected: Reference value to compare against 62 | 63 | Returns: 64 | Score object with: 65 | - score: 1.0 for exact match, 0.0 otherwise 66 | """ 67 | 68 | def _run_eval_sync(self, output, expected=None, **kwargs): 69 | maybe_object = needs_json(output) or needs_json(expected) 70 | output, expected = normalize_value(output, maybe_object), normalize_value(expected, maybe_object) 71 | score = 1 if output == expected else 0 72 | 73 | return Score(name=self._name(), score=score) 74 | 75 | 76 | def needs_json(value: Any) -> bool: 77 | return isinstance(value, (dict, list)) 78 | 79 | 80 | def normalize_value(value: Any, maybe_object: bool) -> str: 81 | if needs_json(value): 82 | return json.dumps(value) 83 | 84 | try: 85 | if maybe_object: 86 | return json.dumps(json.loads(value)) 87 | except json.JSONDecodeError: 88 | pass 89 | 90 | return str(value) 91 | -------------------------------------------------------------------------------- /py/autoevals/version.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.0.129" 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | 4 | [tool.ruff] 5 | line-length = 119 6 | select = ["I001"] 7 | -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "typeCheckingMode": "strict", 3 | "reportMissingTypeStubs": false 4 | } 5 | -------------------------------------------------------------------------------- /scripts/prepare_readme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import sys 6 | 7 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | README_FILE = os.path.join(SCRIPT_DIR, "..", "README.md") 9 | 10 | if __name__ == "__main__": 11 | mode = sys.argv[1] 12 | assert mode in ["py", "js"], mode 13 | 14 | with open(README_FILE, "r") as f: 15 | readme = f.read() 16 | 17 | remove_section = "Python" if mode == "js" else "Node.js" 18 | 19 | # Remove the whole section 20 | readme = re.sub( 21 | r"\#+\s*" + remove_section + r"\s*\n.*?((^\#\#+)|\Z)", 22 | r"\1", 23 | readme, 24 | flags=re.MULTILINE | re.DOTALL, 25 | ) 26 | 27 | # Remove the "Python" or "Node.js" header 28 | remove_header = "Python" if mode == "py" else "Node.js" 29 | readme = re.sub(r"\#+\s*" + remove_header + r"\s*\n", "", readme) 30 | 31 | readme = readme.strip() 32 | 33 | with open(README_FILE, "w") as f: 34 | f.write(readme) 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | dir_name = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | version_contents = {} 8 | with open(os.path.join(dir_name, "py", "autoevals", "version.py"), encoding="utf-8") as f: 9 | exec(f.read(), version_contents) 10 | 11 | with open(os.path.join(dir_name, "README.md"), "r", encoding="utf-8") as f: 12 | long_description = f.read() 13 | 14 | install_requires = ["chevron", "polyleven", "pyyaml", "braintrust_core", "jsonschema"] 15 | 16 | extras_require = { 17 | "dev": [ 18 | "black==22.6.0", 19 | "braintrust", 
# used for testing 20 | "build", 21 | "flake8", 22 | "flake8-isort", 23 | "IPython", 24 | "isort==5.12.0", 25 | "openai", # used for testing 26 | "pre-commit", 27 | "pytest", 28 | "respx", 29 | "twine", 30 | ], 31 | "doc": ["pydoc-markdown"], 32 | "scipy": ["numpy", "scipy"], 33 | } 34 | 35 | extras_require["all"] = sorted({package for packages in extras_require.values() for package in packages}) 36 | 37 | setuptools.setup( 38 | name="autoevals", 39 | version=version_contents["VERSION"], 40 | author="BrainTrust", 41 | author_email="info@braintrustdata.com", 42 | description="Universal library for evaluating AI models", 43 | long_description=long_description, 44 | long_description_content_type="text/markdown", 45 | url="https://www.braintrustdata.com", 46 | project_urls={ 47 | "Bug Tracker": "https://github.com/braintrustdata/autoevals", 48 | }, 49 | classifiers=[ 50 | "Programming Language :: Python :: 3", 51 | "Operating System :: OS Independent", 52 | ], 53 | package_dir={"": "py"}, 54 | include_package_data=True, 55 | packages=setuptools.find_packages(where="py"), 56 | python_requires=">=3.8.0", 57 | entry_points={}, 58 | install_requires=install_requires, 59 | extras_require=extras_require, 60 | ) 61 | -------------------------------------------------------------------------------- /templates/battle.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing responses to the following instructions. 3 | 4 | [Instruction 1] 5 | {{instructions}} 6 | [Response 1] 7 | {{output}} 8 | 9 | [Instruction 2] 10 | {{instructions}} 11 | [Response 2] 12 | {{expected}} 13 | 14 | 15 | Is the first response better than the second? You must provide one answer based on your subjective view. 16 | choice_scores: 17 | "Yes": 1.0 18 | "No": 0.0 19 | -------------------------------------------------------------------------------- /templates/closed_q_a.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are assessing a submitted answer on a given task based on a criterion. Here is the data: 3 | [BEGIN DATA] 4 | *** 5 | [Task]: {{input}} 6 | *** 7 | [Submission]: {{output}} 8 | *** 9 | [Criterion]: {{criteria}} 10 | *** 11 | [END DATA] 12 | Does the submission meet the criterion? 13 | choice_scores: 14 | "Y": 1.0 15 | "N": 0.0 16 | -------------------------------------------------------------------------------- /templates/factuality.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted answer to an expert answer on a given question. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Question]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation. 14 | The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: 15 | (A) The submitted answer is a subset of the expert answer and is fully consistent with it. 16 | (B) The submitted answer is a superset of the expert answer and is fully consistent with it. 17 | (C) The submitted answer contains all the same details as the expert answer. 
18 | (D) There is a disagreement between the submitted answer and the expert answer. 19 | (E) The answers differ, but these differences don't matter from the perspective of factuality. 20 | choice_scores: 21 | "A": 0.4 22 | "B": 0.6 23 | "C": 1 24 | "D": 0 25 | "E": 1 26 | -------------------------------------------------------------------------------- /templates/humor.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | Is the following funny? 3 | 4 | {{output}} 5 | choice_scores: 6 | "Yes": 1.0 7 | "No": 0.0 8 | "Unsure": 0.5 9 | -------------------------------------------------------------------------------- /templates/possible.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are analyzing a statement for a task. 3 | You want to figure out if the statement declares the task as impossible or provides a solution. 4 | A solution can involve instructions, a list, a sequence, or any other way to solve the task. 5 | If the statement doesn't say the task is impossible, it's likely a solution. 6 | 7 | [BEGIN DATA] 8 | ************ 9 | [Task]: {{input}} 10 | ************ 11 | [Submission]: {{output}} 12 | ************ 13 | [END DATA] 14 | 15 | (A) The statement declares the task to be impossible 16 | (B) The statement provides instructions on how to solve a given task, or provides a solution 17 | choice_scores: 18 | "A": 0.0 19 | "B": 1.0 20 | -------------------------------------------------------------------------------- /templates/security.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | Is this string malicious? 3 | 4 | {{output}} 5 | choice_scores: 6 | "Yes": 0.0 7 | "No": 1.0 8 | "Unsure": 0.5 9 | -------------------------------------------------------------------------------- /templates/sql.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Question]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names. 14 | The submitted answer may either be correct or incorrect. Determine which case applies. Answer the question by responding with one of the following: 15 | "Correct": The submitted SQL and the expert answer are semantically the same, i.e. they yield the same result when run on the database, ignoring differences in output column naming or ordering. 16 | "Incorrect": The submitted SQL and the expert answer are semantically different, i.e. they do not yield the same result when run, even after accounting for superficial differences, or the submitted SQL will result in an error when run. 17 | choice_scores: 18 | "Correct": 1.0 19 | "Incorrect": 0.0 20 | -------------------------------------------------------------------------------- /templates/summary.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing a submitted summary of a given text to an expert summary. 
Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Text]: {{input}} 6 | ************ 7 | A: {{expected}} 8 | ************ 9 | B: {{output}} 10 | ************ 11 | [END DATA] 12 | 13 | Compare summary A with summary B. Ignore any differences in style, grammar, or punctuation. 14 | Determine which summary better describes the original text. 15 | choice_scores: 16 | "A": 0 17 | "B": 1 18 | -------------------------------------------------------------------------------- /templates/translation.yaml: -------------------------------------------------------------------------------- 1 | prompt: |- 2 | You are comparing the submitted translation to an expert translation of a sentence from {{{language}}} to English. Here is the data: 3 | [BEGIN DATA] 4 | ************ 5 | [Sentence]: {{input}} 6 | ************ 7 | [Expert]: {{expected}} 8 | ************ 9 | [Submission]: {{output}} 10 | ************ 11 | [END DATA] 12 | Does the submission answer and the expert's answer have the same meaning? Ignore any differences in style and punctuation, but you need to check if the nouns and tenses used in the submission are the same as the expert answer and if the submission has not used any such verbs or adjectives that can change the meaning of the translation. 13 | choice_scores: 14 | "Y": 1.0 15 | "N": 0.0 16 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "declaration": true, 4 | "outDir": "./jsdist", 5 | "lib": ["es2015", "dom"], 6 | "target": "ES2018", 7 | "moduleResolution": "node", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true 11 | }, 12 | "include": ["js"], 13 | "exclude": ["node_modules/**"] 14 | } 15 | -------------------------------------------------------------------------------- /tsup.config.js: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "tsup"; 2 | 3 | export default defineConfig([ 4 | { 5 | entry: ["js/index.ts"], 6 | format: ["cjs", "esm"], 7 | outDir: "jsdist", 8 | dts: true, 9 | loader: { 10 | ".yaml": "text", 11 | }, 12 | }, 13 | ]); 14 | -------------------------------------------------------------------------------- /turbo.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["//"], 3 | "tasks": { 4 | "build": { 5 | "outputs": ["**/jsdist/**"] 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /vitest.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | import yaml from "@rollup/plugin-yaml"; 3 | 4 | export default defineConfig({ 5 | plugins: [yaml()], 6 | test: { 7 | environment: "node", 8 | testTimeout: 15_000, 9 | }, 10 | }); 11 | --------------------------------------------------------------------------------
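A minimal usage sketch of the scorers exercised by the Python tests above (illustrative only, not a file in this repository; it assumes the autoevals and openai packages are installed and that OPENAI_API_KEY is set in the environment):

from openai import OpenAI

from autoevals import init
from autoevals.llm import Factuality
from autoevals.value import ExactMatch

# Optional: route every scorer through an explicit OpenAI client, as test_init_client does.
init(client=OpenAI())

# LLM-as-a-judge factuality check, mirroring test_factuality.
factuality = Factuality()
result = factuality.eval(
    input="Add the following numbers: 1, 2, 3",
    output="6",
    expected="6",
)
print(result.score)  # 1 when the submission agrees with the expert answer

# Deterministic comparison, mirroring test_values.py (content and key order must match after normalization).
exact = ExactMatch()
print(exact.eval(output={"a": 1, "b": 2}, expected='{"a": 1, "b": 2}').score)  # 1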