├── python ├── tests │ ├── __init__.py │ ├── graph_trajectory │ │ ├── test_graph_trajectory_strict.py │ │ ├── test_graph_trajectory_strict_async.py │ │ ├── test_graph_trajectory_llm.py │ │ ├── test_graph_trajectory_llm_async.py │ │ └── test_graph_trajectory_utils.py │ ├── test_trajectory_llm.py │ └── test_trajectory_llm_async.py ├── .python-version ├── agentevals │ ├── __init__.py │ ├── graph_trajectory │ │ ├── __init__.py │ │ ├── strict.py │ │ ├── utils.py │ │ └── llm.py │ ├── trajectory │ │ ├── __init__.py │ │ ├── unordered.py │ │ ├── subset.py │ │ ├── superset.py │ │ ├── utils.py │ │ ├── strict.py │ │ ├── match.py │ │ └── llm.py │ ├── utils.py │ └── types.py ├── pyproject.toml └── LICENSE ├── js ├── .yarnrc.yml ├── .gitignore ├── .gitattributes ├── vitest.config.ts ├── .editorconfig ├── tsconfig.cjs.json ├── .prettierrc ├── langchain.config.js ├── tsconfig.json ├── src │ ├── index.ts │ ├── graph_trajectory │ │ ├── strict.ts │ │ ├── tests │ │ │ ├── graph_trajectory_strict.test.ts │ │ │ ├── graph_trajectory_utils.test.ts │ │ │ └── graph_trajectory_llm.test.ts │ │ ├── utils.ts │ │ └── llm.ts │ ├── types.ts │ ├── trajectory │ │ ├── subset.ts │ │ ├── superset.ts │ │ ├── unordered.ts │ │ ├── match.ts │ │ ├── strict.ts │ │ ├── llm.ts │ │ ├── tests │ │ │ └── trajectory_llm.test.ts │ │ └── utils.ts │ └── utils.ts ├── LICENSE ├── package.json └── .eslintrc.cjs ├── static └── img │ ├── pytest_output.png │ └── langsmith_results.png ├── uv.lock ├── .gitignore ├── LICENSE ├── .github └── workflows │ ├── build.yml │ └── integration_tests.yml └── scripts └── generate_language_readmes.py /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /python/agentevals/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /js/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | yarnPath: .yarn/releases/yarn-3.5.1.cjs 2 | nodeLinker: node-modules -------------------------------------------------------------------------------- /js/.gitignore: -------------------------------------------------------------------------------- 1 | index.cjs 2 | index.js 3 | index.d.ts 4 | index.d.cts 5 | node_modules 6 | dist 7 | .yarn 8 | -------------------------------------------------------------------------------- /static/img/pytest_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agentevals/HEAD/static/img/pytest_output.png -------------------------------------------------------------------------------- /static/img/langsmith_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agentevals/HEAD/static/img/langsmith_results.png -------------------------------------------------------------------------------- /uv.lock: 
-------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.9, <4.0" 3 | 4 | [[package]] 5 | name = "openevals-monorepo" 6 | version = "0.0.1" 7 | source = { virtual = "." } 8 | -------------------------------------------------------------------------------- /js/.gitattributes: -------------------------------------------------------------------------------- 1 | /.yarn/** linguist-vendored 2 | /.yarn/releases/* binary 3 | /.yarn/plugins/**/* binary 4 | /.pnp.* binary linguist-generated 5 | -------------------------------------------------------------------------------- /js/vitest.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | 3 | export default defineConfig({ 4 | test: { 5 | setupFiles: ["dotenv/config"], 6 | }, 7 | }); 8 | -------------------------------------------------------------------------------- /js/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | 7 | [*.{js,json,yml}] 8 | charset = utf-8 9 | indent_style = space 10 | indent_size = 2 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # JS 13 | node_modules/ 14 | .env 15 | .eslintcache -------------------------------------------------------------------------------- /js/tsconfig.cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "module": "commonjs", 5 | "declaration": false 6 | }, 7 | "exclude": [ 8 | "node_modules", 9 | "dist", 10 | "docs", 11 | "**/tests" 12 | ] 13 | } -------------------------------------------------------------------------------- /python/agentevals/trajectory/__init__.py: -------------------------------------------------------------------------------- 1 | from .match import ( 2 | create_trajectory_match_evaluator, 3 | create_async_trajectory_match_evaluator, 4 | ) 5 | from .llm import create_trajectory_llm_as_judge, create_async_trajectory_llm_as_judge 6 | 7 | __all__ = [ 8 | "create_trajectory_match_evaluator", 9 | "create_async_trajectory_match_evaluator", 10 | "create_trajectory_llm_as_judge", 11 | "create_async_trajectory_llm_as_judge", 12 | ] 13 | -------------------------------------------------------------------------------- /js/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json.schemastore.org/prettierrc", 3 | "printWidth": 80, 4 | "tabWidth": 2, 5 | "useTabs": false, 6 | "semi": true, 7 | "singleQuote": false, 8 | "quoteProps": "as-needed", 9 | "jsxSingleQuote": false, 10 | "trailingComma": "es5", 11 | "bracketSpacing": true, 12 | "arrowParens": "always", 13 | "requirePragma": false, 14 | "insertPragma": false, 15 | "proseWrap": "preserve", 16 | "htmlWhitespaceSensitivity": "css", 17 | "vueIndentScriptAndStyle": false, 18 | "endOfLine": "lf" 19 | } 20 | -------------------------------------------------------------------------------- /js/langchain.config.js: -------------------------------------------------------------------------------- 1 | import { 
resolve, dirname } from "node:path"; 2 | import { fileURLToPath } from "node:url"; 3 | 4 | /** 5 | * @param {string} relativePath 6 | * @returns {string} 7 | */ 8 | function abs(relativePath) { 9 | return resolve(dirname(fileURLToPath(import.meta.url)), relativePath); 10 | } 11 | 12 | export const config = { 13 | internals: [ 14 | /node\:/, 15 | /js-tiktoken/, 16 | /langsmith/, 17 | /openevals\/llm/, 18 | /openevals\/types/, 19 | /@langchain\/core\/messages/, 20 | ], 21 | entrypoints: { 22 | index: "index", 23 | }, 24 | tsConfigPath: resolve("./tsconfig.json"), 25 | packageSuffix: "core", 26 | cjsSource: "./dist-cjs", 27 | cjsDestination: "./dist", 28 | abs, 29 | } -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "agentevals" 3 | version = "0.0.9" 4 | license = {text = "MIT"} 5 | description = "Open-source evaluators for LLM agents" 6 | readme = "README.md" 7 | requires-python = ">=3.9" 8 | dependencies = [ 9 | "openevals>=0.0.20" 10 | ] 11 | 12 | [dependency-groups] 13 | dev = [ 14 | "langgraph>=0.5.4", 15 | "mypy>=1.15.0", 16 | "openai>=1.61.1", 17 | "openevals>=0.1.0", 18 | "pytest>=8.3.4", 19 | "pytest-asyncio>=0.25.3", 20 | "pytest-dotenv>=0.5.2", 21 | "ruff>=0.9.5", 22 | ] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["agentevals*"] 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--import-mode=importlib", 30 | ] 31 | pythonpath = [ 32 | "." 33 | ] 34 | -------------------------------------------------------------------------------- /js/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended", 3 | "compilerOptions": { 4 | "target": "ES2021", 5 | "lib": [ 6 | "ES2021", 7 | "ES2022.Object", 8 | "DOM" 9 | ], 10 | "module": "ES2020", 11 | "moduleResolution": "nodenext", 12 | "esModuleInterop": true, 13 | "declaration": true, 14 | "noImplicitReturns": true, 15 | "noFallthroughCasesInSwitch": true, 16 | "noUnusedLocals": true, 17 | "noUnusedParameters": true, 18 | "useDefineForClassFields": true, 19 | "strictPropertyInitialization": false, 20 | "allowJs": true, 21 | "strict": true 22 | }, 23 | "include": [ 24 | "src/**/*", 25 | ], 26 | "exclude": [ 27 | "node_modules", 28 | "**/dist/", 29 | "docs", 30 | "dist/", 31 | ] 32 | } -------------------------------------------------------------------------------- /js/src/index.ts: -------------------------------------------------------------------------------- 1 | export { trajectoryStrictMatch } from "./trajectory/strict.js"; 2 | export { trajectorySubset } from "./trajectory/subset.js"; 3 | export { trajectorySuperset } from "./trajectory/superset.js"; 4 | export { trajectoryUnorderedMatch } from "./trajectory/unordered.js"; 5 | export { 6 | createTrajectoryMatchEvaluator, 7 | type TrajectoryMatchMode, 8 | } from "./trajectory/match.js"; 9 | export { 10 | createTrajectoryLLMAsJudge, 11 | TRAJECTORY_ACCURACY_PROMPT, 12 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 13 | } from "./trajectory/llm.js"; 14 | export { 15 | createGraphTrajectoryLLMAsJudge, 16 | GRAPH_TRAJECTORY_ACCURACY_PROMPT, 17 | } from "./graph_trajectory/llm.js"; 18 | 19 | export * from "./types.js"; 20 | export * from "./utils.js"; 21 | export * from "./graph_trajectory/utils.js"; 22 | -------------------------------------------------------------------------------- /python/agentevals/utils.py: 
-------------------------------------------------------------------------------- 1 | __all__ = ["_run_evaluator", "_arun_evaluator"] 2 | 3 | from openevals.types import EvaluatorResult 4 | from openevals.utils import ( 5 | _run_evaluator as _base_run_evaluator, 6 | _arun_evaluator as _base_arun_evaluator, 7 | ) 8 | 9 | from typing import Any, Callable 10 | 11 | 12 | def _run_evaluator( 13 | *, run_name: str, scorer: Callable, feedback_key: str, **kwargs: Any 14 | ) -> EvaluatorResult | list[EvaluatorResult]: 15 | return _base_run_evaluator( 16 | run_name=run_name, 17 | scorer=scorer, 18 | feedback_key=feedback_key, 19 | ls_framework="agentevals", 20 | **kwargs, 21 | ) 22 | 23 | 24 | async def _arun_evaluator( 25 | *, run_name: str, scorer: Callable, feedback_key: str, **kwargs: Any 26 | ) -> EvaluatorResult | list[EvaluatorResult]: 27 | return await _base_arun_evaluator( 28 | run_name=run_name, 29 | scorer=scorer, 30 | feedback_key=feedback_key, 31 | ls_framework="agentevals", 32 | **kwargs, 33 | ) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /js/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /python/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /python/agentevals/types.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Literal, Optional, Union 2 | from typing_extensions import TypedDict 3 | 4 | from openevals.types import ( 5 | ChatCompletionMessage, 6 | EvaluatorResult, 7 | FewShotExample, 8 | SimpleEvaluator, 9 | SimpleAsyncEvaluator, 10 | ) 11 | 12 | 13 | # Trajectory extracted from agent 14 | class GraphTrajectory(TypedDict): 15 | inputs: Optional[list[dict]] 16 | results: list[dict] 17 | steps: list[list[str]] 18 | 19 | 20 | # Trajectory extracted from a LangGraph thread 21 | class ExtractedLangGraphThreadTrajectory(TypedDict): 22 | inputs: list 23 | outputs: GraphTrajectory 24 | 25 | 26 | ToolArgsMatchMode = Literal["exact", "ignore", "subset", "superset"] 27 | 28 | ToolArgsMatchOverrides = dict[ 29 | str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]] 30 | ] 31 | 32 | __all__ = [ 33 | "GraphTrajectory", 34 | "ChatCompletionMessage", 35 | "EvaluatorResult", 36 | "SimpleEvaluator", 37 | "SimpleAsyncEvaluator", 38 | "FewShotExample", 39 | "ToolArgsMatchMode", 40 | "ToolArgsMatchOverrides", 41 | ] 42 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build Success 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | changed_files: 17 | runs-on: ubuntu-latest 18 | outputs: 19 | js_changed: ${{ steps.check-changes.outputs.js_changed }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Check for JS file changes 26 | id: check-changes 27 | run: | 28 | if git diff 
--name-only origin/main HEAD | grep -E "^js/.*\.(js|ts|jsx|tsx|json)$"; then 29 | echo "js_changed=true" >> $GITHUB_OUTPUT 30 | else 31 | echo "js_changed=false" >> $GITHUB_OUTPUT 32 | fi 33 | 34 | js_build_test: 35 | name: JS Build Test 36 | needs: changed_files 37 | if: > 38 | (github.event_name == 'push') || 39 | (github.event_name == 'pull_request' && needs.changed_files.outputs.js_changed == 'true') || 40 | (github.event_name == 'workflow_dispatch') 41 | runs-on: ubuntu-latest 42 | defaults: 43 | run: 44 | working-directory: js 45 | steps: 46 | - uses: actions/checkout@v3 47 | 48 | - name: Setup Node 49 | uses: actions/setup-node@v3 50 | with: 51 | node-version: 22.x 52 | cache: "yarn" 53 | cache-dependency-path: "js/yarn.lock" 54 | 55 | - name: Install dependencies 56 | run: yarn install --immutable 57 | 58 | - name: Build 59 | run: yarn build 60 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_strict.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | extract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.strict import graph_trajectory_strict_match 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | def test_trajectory_match(): 26 | checkpointer = MemorySaver() 27 | graph = create_react_agent( 28 | model="gpt-4o-mini", 29 | checkpointer=checkpointer, 30 | tools=[search], 31 | ) 32 | graph.invoke( 33 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 34 | config={"configurable": {"thread_id": "1"}}, 35 | ) 36 | graph.invoke( 37 | Command(resume="It is rainy and 70 degrees!"), 38 | config={"configurable": {"thread_id": "1"}}, 39 | ) 40 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 41 | graph, {"configurable": {"thread_id": "1"}} 42 | ) 43 | reference_trajectory = { 44 | "results": [], 45 | "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]], 46 | } 47 | res = graph_trajectory_strict_match( 48 | outputs=extracted_trajectory["outputs"], 49 | reference_outputs=reference_trajectory, 50 | ) 51 | assert res["score"] 52 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_strict_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | aextract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.strict import graph_trajectory_strict_match_async 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 
25 | @pytest.mark.asyncio 26 | async def test_trajectory_match(): 27 | checkpointer = MemorySaver() 28 | graph = create_react_agent( 29 | model="gpt-4o-mini", 30 | checkpointer=checkpointer, 31 | tools=[search], 32 | ) 33 | await graph.ainvoke( 34 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 35 | config={"configurable": {"thread_id": "1"}}, 36 | ) 37 | await graph.ainvoke( 38 | Command(resume="It is rainy and 70 degrees!"), 39 | config={"configurable": {"thread_id": "1"}}, 40 | ) 41 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 42 | graph, {"configurable": {"thread_id": "1"}} 43 | ) 44 | reference_trajectory = { 45 | "results": [], 46 | "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]], 47 | } 48 | res = await graph_trajectory_strict_match_async( 49 | outputs=extracted_trajectory["outputs"], 50 | reference_outputs=reference_trajectory, 51 | ) 52 | assert res["score"] 53 | -------------------------------------------------------------------------------- /scripts/generate_language_readmes.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def strip_language_details(content: str, target_language: str) -> str: 4 | """ 5 | Strip out
<details> tags for non-target language sections and remove the tags 6 | for the target language while preserving its content. 7 | 8 | Args: 9 | content: The README content 10 | target_language: Either 'Python' or 'TypeScript' 11 | """ 12 | # Define the opposite language to remove 13 | opposite_language = "TypeScript" if target_language == "Python" else "Python" 14 | 15 | # First remove the opposite language blocks completely 16 | pattern = rf'<details[^>]*>\s*<summary>{opposite_language}</summary>.*?</details>
' 17 | content = re.sub(pattern, '', content, flags=re.DOTALL) 18 | 19 | # Then remove just the detail/summary tags for target language, keeping content 20 | pattern = rf'<details[^>]*>\s*<summary>{target_language}</summary>(.*?)</details>' 21 | 22 | def replace_match(match): 23 | return match.group(1).strip() 24 | 25 | content = re.sub(pattern, replace_match, content, flags=re.DOTALL) 26 | 27 | # Clean up any double newlines created during the process 28 | content = re.sub(r'\n{3,}', '\n\n', content) 29 | 30 | return content 31 | 32 | def main(): 33 | # Read the README 34 | with open('README.md', 'r') as f: 35 | content = f.read() 36 | 37 | # Generate Python version 38 | python_content = strip_language_details(content, "Python") 39 | with open('./python/README.md', 'w') as f: 40 | f.write(python_content) 41 | 42 | # Generate TypeScript version 43 | ts_content = strip_language_details(content, "TypeScript") 44 | with open('./js/README.md', 'w') as f: 45 | f.write(ts_content) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/strict.ts: -------------------------------------------------------------------------------- 1 | import { GraphTrajectory } from "../types.js"; 2 | import { _runEvaluator } from "../utils.js"; 3 | 4 | const _scorer = (params: { 5 | outputs: GraphTrajectory; 6 | referenceOutputs: GraphTrajectory; 7 | }) => { 8 | const { outputs, referenceOutputs } = params; 9 | if (!outputs || !referenceOutputs) { 10 | throw new Error( 11 | "Strict trajectory match requires both outputs and referenceOutputs" 12 | ); 13 | } 14 | if (outputs.steps.length !== referenceOutputs.steps.length) { 15 | return false; 16 | } 17 | for (let i = 0; i < outputs.steps.length; i++) { 18 | if (outputs.steps[i].length !== referenceOutputs.steps[i].length) { 19 | return false; 20 | } 21 | for (let j = 0; j < outputs.steps[i].length; j++) { 22 | if (outputs.steps[i][j] !== referenceOutputs.steps[i][j]) { 23 | return false; 24 | } 25 | } 26 | } 27 | return true; 28 | }; 29 | 30 | /** 31 | * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 32 | * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
33 | * 34 | * @param params - The parameters object 35 | * @param params.outputs - Actual trajectory the agent followed 36 | * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed 37 | * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise 38 | */ 39 | export const graphTrajectoryStrictMatch = ({ 40 | outputs, 41 | referenceOutputs, 42 | }: { 43 | outputs: GraphTrajectory; 44 | referenceOutputs: GraphTrajectory; 45 | }) => { 46 | return _runEvaluator( 47 | "graph_trajectory_strict_match", 48 | _scorer, 49 | "graph_trajectory_strict_match", 50 | { 51 | outputs, 52 | referenceOutputs, 53 | } 54 | ); 55 | }; 56 | -------------------------------------------------------------------------------- /js/src/types.ts: -------------------------------------------------------------------------------- 1 | import { createLLMAsJudge } from "openevals/llm"; 2 | 3 | export * from "openevals/types"; 4 | 5 | // More tolerant version of ChatCompletionMessage that allows missing tool_call_id 6 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 7 | export type FlexibleChatCompletionMessage = Record<string, any> & 8 | ( 9 | | { 10 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 11 | content: any; 12 | role: "user" | "system" | "developer"; 13 | id?: string; 14 | } 15 | | { 16 | role: "assistant"; 17 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 18 | content: any; 19 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 20 | tool_calls?: any[]; 21 | id?: string; 22 | } 23 | | { 24 | role: "tool"; 25 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 26 | content: any; 27 | tool_call_id?: string; // Made optional for backward compatibility 28 | id?: string; 29 | } 30 | ); 31 | 32 | // Trajectory extracted from agent 33 | export type GraphTrajectory = { 34 | inputs?: (Record<string, unknown> | null)[]; 35 | results: Record<string, unknown>[]; 36 | steps: string[][]; 37 | }; 38 | 39 | // Trajectory extracted from a LangGraph thread 40 | export type ExtractedLangGraphThreadTrajectory = { 41 | inputs: (Record<string, unknown> | null)[][]; 42 | outputs: GraphTrajectory; 43 | }; 44 | 45 | export type TrajectoryLLMAsJudgeParams = Partial< 46 | Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> 47 | > & { 48 | prompt?: Parameters<typeof createLLMAsJudge>[0]["prompt"]; 49 | }; 50 | 51 | export type ToolArgsMatchMode = "exact" | "ignore" | "subset" | "superset"; 52 | 53 | export type ToolArgsMatcher = ( 54 | toolCall: Record<string, unknown>, 55 | referenceToolCall: Record<string, unknown> 56 | ) => boolean | Promise<boolean>; 57 | 58 | export type ToolArgsMatchOverrides = Record< 59 | string, 60 | ToolArgsMatchMode | string[] | ToolArgsMatcher 61 | >; 62 | -------------------------------------------------------------------------------- /js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "agentevals", 3 | "version": "0.0.6", 4 | "packageManager": "yarn@3.5.1", 5 | "type": "module", 6 | "scripts": { 7 | "build": "yarn lc_build --create-entrypoints --pre --tree-shaking", 8 | "lint:eslint": "NODE_OPTIONS=--max-old-space-size=4096 eslint --cache --ext .ts,.js src/", 9 | "lint:dpdm": "dpdm --exit-code circular:1 --no-warning --no-tree src/*.ts src/**/*.ts", 10 | "lint": "yarn lint:eslint && yarn lint:dpdm", 11 | "lint:fix": "yarn lint:eslint --fix && yarn lint:dpdm", 12 | "format": "prettier --config .prettierrc --write \"src\"", 13 | "format:check": "prettier --config .prettierrc --check \"src\"", 14 | "test": "vitest run" 15 | }, 16 | "dependencies": {
"@langchain/openai": ">=0.4.4", 18 | "langchain": ">=0.3.18", 19 | "langsmith": ">=0.3.11", 20 | "openevals": "^0.1.0" 21 | }, 22 | "peerDependencies": { 23 | "@langchain/core": ">=0.3.73", 24 | "@langchain/langgraph": ">=0.2.46" 25 | }, 26 | "devDependencies": { 27 | "@langchain/core": "^0.3.73", 28 | "@langchain/langgraph": "^0.4.9", 29 | "@langchain/scripts": "0.1.3", 30 | "@tsconfig/recommended": "^1.0.8", 31 | "@typescript-eslint/eslint-plugin": "^8.24.1", 32 | "@typescript-eslint/parser": "^8.24.1", 33 | "dotenv": "^16.4.7", 34 | "dpdm": "^3.14.0", 35 | "eslint": "^8.33.0", 36 | "eslint-config-airbnb-base": "^15.0.0", 37 | "eslint-config-prettier": "^8.6.0", 38 | "eslint-plugin-import": "^2.27.5", 39 | "eslint-plugin-jest": "^27.6.0", 40 | "eslint-plugin-no-instanceof": "^1.0.1", 41 | "eslint-plugin-prettier": "^4.2.1", 42 | "openai": "^4.85.1", 43 | "prettier": "^3.5.1", 44 | "typescript": "~5.1.6", 45 | "vitest": "^3.0.5", 46 | "zod": "^4.1.5" 47 | }, 48 | "files": [ 49 | "dist/", 50 | "index.cjs", 51 | "index.js", 52 | "index.d.ts", 53 | "index.d.cts" 54 | ], 55 | "exports": { 56 | ".": { 57 | "types": { 58 | "import": "./index.d.ts", 59 | "require": "./index.d.cts", 60 | "default": "./index.d.ts" 61 | }, 62 | "import": "./index.js", 63 | "require": "./index.cjs" 64 | }, 65 | "./package.json": "./package.json" 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_strict.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | import { 4 | MemorySaver, 5 | Command, 6 | interrupt, 7 | Annotation, 8 | StateGraph, 9 | } from "@langchain/langgraph"; 10 | 11 | import { graphTrajectoryStrictMatch } from "../strict.js"; 12 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 13 | 14 | ls.describe( 15 | "graph_trajectory_strict_match", 16 | () => { 17 | ls.test( 18 | "should match the reference trajectory", 19 | { 20 | inputs: {}, 21 | referenceOutputs: { 22 | results: [ 23 | {}, 24 | { 25 | myKey: "It is rainy and 70 degrees!", 26 | }, 27 | ], 28 | steps: [["__start__", "agent", "interrupt", "__interrupt__"], []], 29 | }, 30 | }, 31 | async ({ referenceOutputs }) => { 32 | const graph = new StateGraph( 33 | Annotation.Root({ 34 | myKey: Annotation, 35 | }) 36 | ) 37 | .addNode("agent", async () => { 38 | return { 39 | myKey: "hello", 40 | }; 41 | }) 42 | .addNode("interrupt", async () => { 43 | const res = interrupt("Tell me the answer to the question."); 44 | return { myKey: res }; 45 | }) 46 | .addEdge("__start__", "agent") 47 | .addEdge("agent", "interrupt") 48 | .compile({ checkpointer: new MemorySaver() }); 49 | const config = { 50 | configurable: { 51 | thread_id: "1", 52 | }, 53 | }; 54 | await graph.invoke( 55 | { 56 | myKey: "foo", 57 | }, 58 | config 59 | ); 60 | await graph.invoke( 61 | new Command({ resume: "It is rainy and 70 degrees!" 
}), 62 | config 63 | ); 64 | const trajectory = await extractLangGraphTrajectoryFromThread( 65 | graph, 66 | config 67 | ); 68 | const result = await graphTrajectoryStrictMatch({ 69 | outputs: trajectory.outputs, 70 | referenceOutputs: referenceOutputs!, 71 | }); 72 | expect(result.score).toBe(true); 73 | } 74 | ); 75 | }, 76 | { 77 | enableTestTracking: false, 78 | } 79 | ); 80 | -------------------------------------------------------------------------------- /js/.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: [ 3 | "airbnb-base", 4 | "eslint:recommended", 5 | "prettier", 6 | "plugin:@typescript-eslint/recommended", 7 | ], 8 | parserOptions: { 9 | ecmaVersion: 12, 10 | parser: "@typescript-eslint/parser", 11 | project: "./tsconfig.json", 12 | sourceType: "module", 13 | }, 14 | plugins: ["@typescript-eslint", "no-instanceof", "eslint-plugin-jest"], 15 | ignorePatterns: [ 16 | "src/utils/@cfworker", 17 | "src/utils/fast-json-patch", 18 | "src/utils/js-sha1", 19 | "src/utils/sax-js", 20 | ".eslintrc.cjs", 21 | "scripts", 22 | "node_modules", 23 | "dist", 24 | "dist-cjs", 25 | "*.js", 26 | "*.cjs", 27 | "*.d.ts", 28 | ], 29 | rules: { 30 | "no-process-env": 2, 31 | "no-instanceof/no-instanceof": 2, 32 | "@typescript-eslint/explicit-module-boundary-types": 0, 33 | "@typescript-eslint/no-empty-function": 0, 34 | "@typescript-eslint/no-shadow": 0, 35 | "@typescript-eslint/no-empty-interface": 0, 36 | "@typescript-eslint/no-use-before-define": ["error", "nofunc"], 37 | "@typescript-eslint/no-unused-vars": ["warn", { args: "none" }], 38 | "@typescript-eslint/no-floating-promises": "error", 39 | "@typescript-eslint/no-misused-promises": "error", 40 | "@typescript-eslint/no-this-alias": 0, 41 | camelcase: 0, 42 | "class-methods-use-this": 0, 43 | "import/extensions": [2, "ignorePackages"], 44 | "import/no-extraneous-dependencies": [ 45 | "error", 46 | { devDependencies: ["**/*.test.ts"] }, 47 | ], 48 | "import/no-unresolved": 0, 49 | "import/prefer-default-export": 0, 50 | "keyword-spacing": "error", 51 | "max-classes-per-file": 0, 52 | "max-len": 0, 53 | "no-await-in-loop": 0, 54 | "no-bitwise": 0, 55 | "no-console": 0, 56 | "no-empty-function": 0, 57 | "no-restricted-syntax": 0, 58 | "no-shadow": 0, 59 | "no-continue": 0, 60 | "no-void": 0, 61 | "no-underscore-dangle": 0, 62 | "no-use-before-define": 0, 63 | "no-useless-constructor": 0, 64 | "no-return-await": 0, 65 | "no-plusplus": 0, 66 | "consistent-return": 0, 67 | "no-else-return": 0, 68 | "func-names": 0, 69 | "no-lonely-if": 0, 70 | "prefer-rest-params": 0, 71 | "new-cap": ["error", { properties: false, capIsNew: false }], 72 | 'jest/no-focused-tests': 'error', 73 | "arrow-body-style": 0, 74 | "prefer-destructuring": 0, 75 | }, 76 | overrides: [ 77 | { 78 | files: ['**/*.test.ts'], 79 | rules: { 80 | '@typescript-eslint/no-unused-vars': 'off' 81 | } 82 | } 83 | ] 84 | }; -------------------------------------------------------------------------------- /js/src/trajectory/subset.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = 
async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isSubset = await _isTrajectorySuperset( 19 | params.referenceOutputs, 20 | params.outputs, 21 | params.toolArgsMatchMode, 22 | params.toolArgsMatchOverrides 23 | ); 24 | return isSubset; 25 | }; 26 | 27 | /** 28 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead. 29 | * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 30 | * This means the agent called a subset of the tools specified in the reference trajectory. 31 | * 32 | * @param params - The parameters for trajectory subset evaluation 33 | * @param params.outputs - Actual trajectory the agent followed. 34 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 35 | * a "messages" key with one of the above. 36 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 37 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 38 | * a "messages" key with one of the above. 39 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 40 | */ 41 | export async function trajectorySubset(params: { 42 | outputs: 43 | | FlexibleChatCompletionMessage[] 44 | | BaseMessage[] 45 | | { 46 | messages: ( 47 | | BaseMessage 48 | | ChatCompletionMessage 49 | | FlexibleChatCompletionMessage 50 | )[]; 51 | }; 52 | referenceOutputs: 53 | | FlexibleChatCompletionMessage[] 54 | | BaseMessage[] 55 | | { 56 | messages: ( 57 | | BaseMessage 58 | | ChatCompletionMessage 59 | | FlexibleChatCompletionMessage 60 | )[]; 61 | }; 62 | }): Promise<EvaluatorResult> { 63 | const { outputs, referenceOutputs } = params; 64 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 65 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 66 | 67 | return _runEvaluator("trajectory_subset", _scorer, "trajectory_subset", { 68 | ...params, 69 | outputs: outputsList, 70 | referenceOutputs: referenceOutputsList, 71 | toolArgsMatchMode: "ignore", 72 | }); 73 | } 74 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_utils.test.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-promise-executor-return */ 2 | import { expect, test } from "vitest"; 3 | import { Annotation, StateGraph, MemorySaver } from "@langchain/langgraph"; 4 | 5 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 6 | 7 | test("trajectory match", async () => { 8 | const checkpointer = new MemorySaver(); 9 | 10 | const inner = new StateGraph( 11 | Annotation.Root({ 12 | myKey: Annotation<string>({ 13 | reducer: (a, b) => a + b, 14 | default: () => "", 15 | }), 16 | myOtherKey: Annotation<string>, 17 | }) 18 | ) 19 | .addNode("inner1", async (state) => { 20 | await new Promise((resolve) => setTimeout(resolve, 100)); 21 | return { myKey: "got here", myOtherKey: state.myKey }; 22 | }) 23 | .addNode("inner2", (state) => ({ 24 | myKey: " and there", 25 | myOtherKey: state.myKey, 26 | })) 27 | .addEdge("inner1", "inner2") 28 | .addEdge("__start__", "inner1") 29 | .compile({ interruptBefore: ["inner2"] }); 30 | 31 | const app = new StateGraph(
Annotation.Root({ 33 | myKey: Annotation<string>({ 34 | reducer: (a, b) => a + b, 35 | default: () => "", 36 | }), 37 | }) 38 | ) 39 | .addNode("inner", (state, config) => inner.invoke(state, config), { 40 | subgraphs: [inner], 41 | }) 42 | .addNode("outer1", () => ({ myKey: " and parallel" })) 43 | .addNode("outer2", () => ({ myKey: " and back again" })) 44 | .addEdge("__start__", "inner") 45 | .addEdge("__start__", "outer1") 46 | .addEdge(["inner", "outer1"], "outer2") 47 | .compile({ checkpointer }); 48 | 49 | // test invoke w/ nested interrupt 50 | const config = { configurable: { thread_id: "1" } }; 51 | expect(await app.invoke({ myKey: "" }, config)).toEqual({ 52 | __interrupt__: [], 53 | myKey: " and parallel", 54 | }); 55 | 56 | expect(await app.invoke(null, config)).toEqual({ 57 | myKey: "got here and there and parallel and back again", 58 | }); 59 | 60 | const trajectory = await extractLangGraphTrajectoryFromThread(app, config); 61 | expect(trajectory).toEqual({ 62 | inputs: [ 63 | { 64 | __start__: { 65 | myKey: "", 66 | }, 67 | }, 68 | { 69 | __start__: { 70 | myKey: "", 71 | }, 72 | }, 73 | ], 74 | outputs: { 75 | results: [ 76 | { 77 | myKey: "got here and there", 78 | myOtherKey: "got here", 79 | }, 80 | { 81 | myKey: "got here and there and parallel and back again", 82 | }, 83 | ], 84 | steps: [ 85 | [ 86 | "__start__", 87 | "outer1", 88 | "inner", 89 | "inner:__start__", 90 | "inner:inner1", 91 | "inner:inner2", 92 | ], 93 | ["outer2"], 94 | ], 95 | }, 96 | }); 97 | }); 98 | -------------------------------------------------------------------------------- /js/src/trajectory/superset.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isSuperset = await _isTrajectorySuperset( 19 | params.outputs, 20 | params.referenceOutputs, 21 | params.toolArgsMatchMode, 22 | params.toolArgsMatchOverrides 23 | ); 24 | return isSuperset; 25 | }; 26 | 27 | /** 28 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead. 29 | * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 30 | * This means the agent called a superset of the tools specified in the reference trajectory. 31 | * 32 | * @param params - The parameters for trajectory superset evaluation 33 | * @param params.outputs - Actual trajectory the agent followed. 34 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 35 | * a "messages" key with one of the above. 36 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 37 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 38 | * a "messages" key with one of the above.
39 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 40 | */ 41 | export async function trajectorySuperset(params: { 42 | outputs: 43 | | FlexibleChatCompletionMessage[] 44 | | BaseMessage[] 45 | | { 46 | messages: ( 47 | | BaseMessage 48 | | ChatCompletionMessage 49 | | FlexibleChatCompletionMessage 50 | )[]; 51 | }; 52 | referenceOutputs: 53 | | FlexibleChatCompletionMessage[] 54 | | BaseMessage[] 55 | | { 56 | messages: ( 57 | | BaseMessage 58 | | ChatCompletionMessage 59 | | FlexibleChatCompletionMessage 60 | )[]; 61 | }; 62 | }): Promise<EvaluatorResult> { 63 | const { outputs, referenceOutputs } = params; 64 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 65 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 66 | 67 | return _runEvaluator("trajectory_superset", _scorer, "trajectory_superset", { 68 | ...params, 69 | outputs: outputsList, 70 | referenceOutputs: referenceOutputsList, 71 | toolArgsMatchMode: "ignore", 72 | }); 73 | } 74 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/strict.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from agentevals.types import EvaluatorResult, GraphTrajectory 4 | from agentevals.utils import _run_evaluator, _arun_evaluator 5 | 6 | from typing import Any 7 | 8 | 9 | def _scorer( 10 | *, 11 | outputs: GraphTrajectory, 12 | reference_outputs: GraphTrajectory, 13 | ) -> float: 14 | if outputs is None or reference_outputs is None: 15 | raise ValueError( 16 | "Strict trajectory match requires both outputs and reference_outputs" 17 | ) 18 | if len(outputs["steps"]) != len(reference_outputs["steps"]): 19 | return False 20 | exact_match = True 21 | for output, reference_output in zip(outputs["steps"], reference_outputs["steps"]): 22 | if output != reference_output: 23 | exact_match = False 24 | break 25 | return exact_match 26 | 27 | 28 | def graph_trajectory_strict_match( 29 | *, 30 | outputs: GraphTrajectory, 31 | reference_outputs: GraphTrajectory, 32 | **kwargs: Any, 33 | ) -> EvaluatorResult: 34 | """ 35 | Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 36 | This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory. 37 | 38 | Args: 39 | outputs (GraphTrajectory): Actual trajectory the agent followed. 40 | reference_outputs (GraphTrajectory): Ideal reference trajectory the agent should have followed. 41 | 42 | Returns: 43 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 44 | """ 45 | return _run_evaluator( 46 | run_name="graph_trajectory_strict_match", 47 | scorer=_scorer, 48 | feedback_key="graph_trajectory_strict_match", 49 | outputs=outputs, 50 | reference_outputs=reference_outputs, 51 | ) 52 | 53 | 54 | async def graph_trajectory_strict_match_async( 55 | *, 56 | outputs: GraphTrajectory, 57 | reference_outputs: GraphTrajectory, 58 | **kwargs: Any, 59 | ) -> EvaluatorResult: 60 | """ 61 | Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 62 | This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory. 63 | 64 | Args: 65 | outputs (GraphTrajectory): Actual trajectory the agent followed.
66 | reference_outputs (GraphTrajectory): Ideal reference trajectory the agent should have followed. 67 | 68 | Returns: 69 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 70 | """ 71 | 72 | async def async_wrapper(**kwargs: Any): 73 | return _scorer(**kwargs) 74 | 75 | return await _arun_evaluator( 76 | run_name="graph_trajectory_strict_match", 77 | scorer=async_wrapper, 78 | feedback_key="graph_trajectory_strict_match", 79 | outputs=outputs, 80 | reference_outputs=reference_outputs, 81 | ) 82 | -------------------------------------------------------------------------------- /js/src/trajectory/unordered.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isUnorderedMatch = 19 | (await _isTrajectorySuperset( 20 | params.outputs, 21 | params.referenceOutputs, 22 | params.toolArgsMatchMode, 23 | params.toolArgsMatchOverrides 24 | )) && 25 | (await _isTrajectorySuperset( 26 | params.referenceOutputs, 27 | params.outputs, 28 | params.toolArgsMatchMode, 29 | params.toolArgsMatchOverrides 30 | )); 31 | return isUnorderedMatch; 32 | }; 33 | 34 | /** 35 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead. 36 | * Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 37 | * This accounts for some differences in an LLM's reasoning process in a case-by-case basis. 38 | * 39 | * @param params - The parameters for trajectory unordered match evaluation 40 | * @param params.outputs - Actual trajectory the agent followed. 41 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 42 | * a "messages" key with one of the above. 43 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 44 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 45 | * a "messages" key with one of the above.
46 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 47 | */ 48 | export async function trajectoryUnorderedMatch(params: { 49 | outputs: 50 | | FlexibleChatCompletionMessage[] 51 | | BaseMessage[] 52 | | { 53 | messages: ( 54 | | BaseMessage 55 | | ChatCompletionMessage 56 | | FlexibleChatCompletionMessage 57 | )[]; 58 | }; 59 | referenceOutputs: 60 | | FlexibleChatCompletionMessage[] 61 | | BaseMessage[] 62 | | { 63 | messages: ( 64 | | BaseMessage 65 | | ChatCompletionMessage 66 | | FlexibleChatCompletionMessage 67 | )[]; 68 | }; 69 | }): Promise<EvaluatorResult> { 70 | const { outputs, referenceOutputs } = params; 71 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 72 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 73 | 74 | return _runEvaluator( 75 | "trajectory_unordered_match", 76 | _scorer, 77 | "trajectory_unordered_match", 78 | { 79 | ...params, 80 | outputs: outputsList, 81 | referenceOutputs: referenceOutputsList, 82 | toolArgsMatchMode: "ignore", 83 | } 84 | ); 85 | } 86 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_llm.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | extract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | def test_sensible_trajectory(): 26 | checkpointer = MemorySaver() 27 | graph = create_react_agent( 28 | model="gpt-4o-mini", 29 | checkpointer=checkpointer, 30 | tools=[search], 31 | ) 32 | graph.invoke( 33 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 34 | config={"configurable": {"thread_id": "1"}}, 35 | ) 36 | graph.invoke( 37 | Command(resume="It is rainy and 70 degrees!"), 38 | config={"configurable": {"thread_id": "1"}}, 39 | ) 40 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 41 | graph, {"configurable": {"thread_id": "1"}} 42 | ) 43 | evaluator = create_graph_trajectory_llm_as_judge( 44 | model="openai:o3-mini", 45 | ) 46 | res = evaluator( 47 | inputs=extracted_trajectory["inputs"], 48 | outputs=extracted_trajectory["outputs"], 49 | ) 50 | assert res["key"] == "graph_trajectory_accuracy" 51 | assert res["score"] 52 | 53 | 54 | @pytest.mark.langsmith 55 | def test_unsensible_trajectory(): 56 | checkpointer = MemorySaver() 57 | 58 | @tool 59 | def askjeeves(query: str): 60 | """Call to surf the web.""" 61 | return "foo" 62 | 63 | graph = create_react_agent( 64 | model="gpt-4o-mini", 65 | checkpointer=checkpointer, 66 | tools=[askjeeves], 67 | prompt="You are an evil assistant who is inefficient and calls more tools than necessary.", 68 | ) 69 | graph.invoke( 70 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 71 | config={"configurable": {"thread_id": "1"}}, 72 | ) 73 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 74 | graph,
{"configurable": {"thread_id": "1"}} 75 | ) 76 | evaluator = create_graph_trajectory_llm_as_judge( 77 | prompt="""You are an expert data labeler. 78 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries. 79 | 80 | 81 | An accurate trajectory: 82 | - Makes logical sense between steps 83 | - Shows clear progression 84 | - Is perfectly efficient, with no more than one tool call 85 | - Is semantically equivalent to the provided reference trajectory, if present 86 | 87 | 88 | 89 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient. 90 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 91 | interrupting to await additional data from another source ("human-in-the-loop"): 92 | 93 | 94 | 95 | {thread} 96 | 97 | 98 | {reference_outputs} 99 | """, 100 | model="openai:o3-mini", 101 | ) 102 | res = evaluator( 103 | inputs=extracted_trajectory["inputs"], 104 | outputs=extracted_trajectory["outputs"], 105 | ) 106 | assert res["key"] == "graph_trajectory_accuracy" 107 | assert not res["score"] 108 | -------------------------------------------------------------------------------- /js/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage, isBaseMessage } from "@langchain/core/messages"; 2 | import { _convertMessagesToOpenAIParams } from "@langchain/openai"; 3 | import { 4 | _runEvaluator as baseRunEvaluator, 5 | EvaluationResultType, 6 | } from "openevals/utils"; 7 | import { 8 | ChatCompletionMessage, 9 | FlexibleChatCompletionMessage, 10 | MultiResultScorerReturnType, 11 | SingleResultScorerReturnType, 12 | } from "./types.js"; 13 | 14 | export const _convertToOpenAIMessage = ( 15 | message: BaseMessage | ChatCompletionMessage 16 | ): ChatCompletionMessage => { 17 | if (isBaseMessage(message)) { 18 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 19 | return _convertMessagesToOpenAIParams([message])[0] as any; 20 | } else { 21 | return message; 22 | } 23 | }; 24 | 25 | export const _convertToChatCompletionMessage = ( 26 | message: BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage 27 | ): ChatCompletionMessage => { 28 | let converted: FlexibleChatCompletionMessage; 29 | 30 | if (isBaseMessage(message)) { 31 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 32 | converted = _convertMessagesToOpenAIParams([message])[0] as any; 33 | } else { 34 | converted = message as FlexibleChatCompletionMessage; 35 | } 36 | 37 | // For tool messages without tool_call_id, generate one for compatibility 38 | if (converted.role === "tool" && !converted.tool_call_id) { 39 | converted = { 40 | ...converted, 41 | tool_call_id: `generated-${Math.random().toString(36).substring(2)}`, 42 | }; 43 | } 44 | 45 | return converted as ChatCompletionMessage; 46 | }; 47 | 48 | export const _normalizeToOpenAIMessagesList = ( 49 | messages?: 50 | | (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[] 51 | | { 52 | messages: ( 53 | | BaseMessage 54 | | ChatCompletionMessage 55 | | FlexibleChatCompletionMessage 56 | )[]; 57 | } 58 | ): ChatCompletionMessage[] => { 59 | if (!messages) { 60 | return []; 61 | } 62 | let messagesList: ( 63 | | BaseMessage 64 | | ChatCompletionMessage 65 | | FlexibleChatCompletionMessage 66 | )[]; 67 | if (!Array.isArray(messages)) { 68 | if ("messages" in messages && Array.isArray(messages.messages)) { 69 | 
messagesList = messages.messages; 70 | } else { 71 | throw new Error( 72 | `If passing messages as an object, it must contain a "messages" key` 73 | ); 74 | } 75 | } else { 76 | messagesList = messages; 77 | } 78 | return messagesList.map(_convertToChatCompletionMessage); 79 | }; 80 | 81 | export const processScore = ( 82 | _: string, 83 | value: boolean | number | { score: boolean | number; reasoning?: string } 84 | ) => { 85 | if (typeof value === "object") { 86 | if (value != null && "score" in value) { 87 | return [ 88 | value.score, 89 | "reasoning" in value && typeof value.reasoning === "string" 90 | ? value.reasoning 91 | : undefined, 92 | ] as const; 93 | } else { 94 | throw new Error( 95 | `Expected a dictionary with a "score" key, but got "${JSON.stringify( 96 | value, 97 | null, 98 | 2 99 | )}"` 100 | ); 101 | } 102 | } 103 | return [value] as const; 104 | }; 105 | 106 | export const _runEvaluator = async < 107 | T extends Record<string, unknown>, 108 | O extends 109 | | SingleResultScorerReturnType 110 | | MultiResultScorerReturnType 111 | | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>, 112 | >( 113 | runName: string, 114 | scorer: (params: T) => O, 115 | feedbackKey: string, 116 | extra?: T 117 | ): Promise<EvaluationResultType<O>> => { 118 | return baseRunEvaluator(runName, scorer, feedbackKey, extra, "agentevals"); 119 | }; 120 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_llm_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | aextract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.llm import create_async_graph_trajectory_llm_as_judge 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | @pytest.mark.asyncio 26 | async def test_sensible_trajectory(): 27 | checkpointer = MemorySaver() 28 | graph = create_react_agent( 29 | model="gpt-4o-mini", 30 | checkpointer=checkpointer, 31 | tools=[search], 32 | ) 33 | await graph.ainvoke( 34 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 35 | config={"configurable": {"thread_id": "1"}}, 36 | ) 37 | await graph.ainvoke( 38 | Command(resume="It is rainy and 70 degrees!"), 39 | config={"configurable": {"thread_id": "1"}}, 40 | ) 41 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 42 | graph, {"configurable": {"thread_id": "1"}} 43 | ) 44 | evaluator = create_async_graph_trajectory_llm_as_judge( 45 | model="openai:o3-mini", 46 | ) 47 | res = await evaluator( 48 | inputs=extracted_trajectory["inputs"], 49 | outputs=extracted_trajectory["outputs"], 50 | ) 51 | assert res["key"] == "graph_trajectory_accuracy" 52 | assert res["score"] 53 | 54 | 55 | @pytest.mark.langsmith 56 | @pytest.mark.asyncio 57 | async def test_unsensible_trajectory(): 58 | checkpointer = MemorySaver() 59 | 60 | @tool 61 | def askjeeves(query: str): 62 | """Call to surf the web.""" 63 | return "foo" 64 | 65 | graph = create_react_agent( 66 | model="gpt-4o-mini", 67 | checkpointer=checkpointer, 68 | tools=[askjeeves], 69 | prompt="You are an evil assistant who is
inefficient and calls more tools than necessary.", 70 | ) 71 | await graph.ainvoke( 72 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 73 | config={"configurable": {"thread_id": "1"}}, 74 | ) 75 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 76 | graph, {"configurable": {"thread_id": "1"}} 77 | ) 78 | evaluator = create_async_graph_trajectory_llm_as_judge( 79 | prompt="""You are an expert data labeler. 80 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 81 | 82 | 83 | An accurate trajectory: 84 | - Makes logical sense between steps 85 | - Shows clear progression 86 | - Is perfectly efficient, with no more than one tool call 87 | - Is semantically equivalent to the provided reference trajectory, if present 88 | 89 | 90 | 91 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient. 92 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 93 | interrupting to await additional data from another source ("human-in-the-loop"): 94 | 95 | 96 | 97 | {thread} 98 | 99 | 100 | {reference_outputs} 101 | """, 102 | model="openai:o3-mini", 103 | ) 104 | res = await evaluator( 105 | inputs=extracted_trajectory["inputs"], 106 | outputs=extracted_trajectory["outputs"], 107 | ) 108 | assert res["key"] == "graph_trajectory_accuracy" 109 | assert not res["score"] 110 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | pull-requests: read 15 | 16 | jobs: 17 | changed_files: 18 | runs-on: ubuntu-latest 19 | outputs: 20 | python_changed: ${{ steps.check-changes.outputs.python_changed }} 21 | js_changed: ${{ steps.check-changes.outputs.js_changed }} 22 | steps: 23 | - uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 # Required for diff with main branch 26 | 27 | - name: Check for file changes 28 | id: check-changes 29 | run: | 30 | if git diff --name-only origin/main HEAD | grep -E "^python/.*\.py$"; then 31 | echo "python_changed=true" >> $GITHUB_OUTPUT 32 | else 33 | echo "python_changed=false" >> $GITHUB_OUTPUT 34 | fi 35 | 36 | if git diff --name-only origin/main HEAD | grep -E "^js/.*\.(js|ts|jsx|tsx)$"; then 37 | echo "js_changed=true" >> $GITHUB_OUTPUT 38 | else 39 | echo "js_changed=false" >> $GITHUB_OUTPUT 40 | fi 41 | 42 | python_integration_test: 43 | name: Python Integration Test (${{ matrix.python-version }}) 44 | needs: changed_files 45 | if: > 46 | (github.event_name == 'push') || 47 | (github.event_name == 'pull_request' && ( 48 | contains(github.event.pull_request.labels.*.name, 'release') || 49 | needs.changed_files.outputs.python_changed == 'true' 50 | )) || 51 | (github.event_name == 'workflow_dispatch' && github.event.inputs.run-python-tests == 'true') 52 | runs-on: ubuntu-latest 53 | strategy: 54 | matrix: 55 | python-version: ["3.9", "3.11"] 56 | defaults: 57 | run: 58 | working-directory: python 59 | steps: 60 | - uses: actions/checkout@v3 61 | 62 | - name: Install uv 63 | uses: astral-sh/setup-uv@v5 64 | with: 65 | version: "0.6.2" 66 | 67 | - name: Set up Python ${{ matrix.python-version }} 68 | uses:
actions/setup-python@v5 69 | with: 70 | python-version: ${{ matrix.python-version }} 71 | 72 | - name: Install dependencies 73 | run: | 74 | uv venv 75 | source .venv/bin/activate 76 | uv sync 77 | uv sync --group dev 78 | shell: bash 79 | working-directory: python 80 | 81 | - name: Run integration tests 82 | env: 83 | LANGSMITH_TRACING: "true" 84 | LANGSMITH_ENDPOINT: ${{ secrets.LANGSMITH_ENDPOINT }} 85 | LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} 86 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 87 | run: uv run pytest tests 88 | shell: bash 89 | working-directory: python 90 | 91 | js_integration_test: 92 | name: JS Integration Test 93 | needs: changed_files 94 | if: > 95 | (github.event_name == 'push') || 96 | (github.event_name == 'pull_request' && ( 97 | contains(github.event.pull_request.labels.*.name, 'release') || 98 | needs.changed_files.outputs.js_changed == 'true' 99 | )) || 100 | (github.event_name == 'workflow_dispatch' && github.event.inputs.run-js-tests == 'true') 101 | runs-on: ubuntu-latest 102 | defaults: 103 | run: 104 | working-directory: js 105 | steps: 106 | - uses: actions/checkout@v3 107 | 108 | - name: Setup Node 109 | uses: actions/setup-node@v3 110 | with: 111 | node-version: 22.x 112 | cache: "yarn" 113 | cache-dependency-path: "js/yarn.lock" 114 | 115 | - name: Install Yarn dependencies 116 | run: yarn install 117 | shell: bash 118 | working-directory: js 119 | 120 | 121 | - name: Run JS integration tests 122 | env: 123 | LANGSMITH_TRACING: "true" 124 | LANGSMITH_ENDPOINT: ${{ secrets.LANGSMITH_ENDPOINT }} 125 | LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} 126 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 127 | run: yarn test src/trajectory/tests src/graph_trajectory/tests 128 | shell: bash 129 | working-directory: js -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_llm.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | 4 | import { createReactAgent } from "@langchain/langgraph/prebuilt"; 5 | import { MemorySaver } from "@langchain/langgraph"; 6 | import { tool } from "@langchain/core/tools"; 7 | import { z } from "zod"; 8 | import { ChatOpenAI } from "@langchain/openai"; 9 | 10 | import { createGraphTrajectoryLLMAsJudge } from "../llm.js"; 11 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 12 | 13 | const search = tool( 14 | async () => { 15 | return "It's 80 degrees and sunny in San Francisco."; 16 | }, 17 | { 18 | name: "search", 19 | description: "Call to surf the web.", 20 | schema: z.object({ 21 | query: z.string(), 22 | }), 23 | } 24 | ); 25 | 26 | const tools = [search]; 27 | 28 | ls.describe("graph_trajectory_llm", () => { 29 | ls.test( 30 | "sensible_trajectory", 31 | { 32 | inputs: {}, 33 | referenceOutputs: {}, 34 | }, 35 | async () => { 36 | const checkpointer = new MemorySaver(); 37 | const graph = createReactAgent({ 38 | llm: new ChatOpenAI({ model: "gpt-4o-mini" }), 39 | checkpointer, 40 | tools, 41 | }); 42 | const config = { configurable: { thread_id: "1" } }; 43 | await graph.invoke( 44 | { messages: [{ role: "user", content: "what's the weather in sf?" 
}] }, 45 | config 46 | ); 47 | const trajectory = await extractLangGraphTrajectoryFromThread( 48 | graph, 49 | config 50 | ); 51 | const evaluator = createGraphTrajectoryLLMAsJudge({ 52 | model: "openai:o3-mini", 53 | }); 54 | const res = await evaluator({ 55 | inputs: trajectory.inputs, 56 | outputs: trajectory.outputs, 57 | }); 58 | expect(res.key).toBe("graph_trajectory_accuracy"); 59 | expect(res.score).toBe(true); 60 | } 61 | ); 62 | 63 | ls.test( 64 | "unsensible_trajectory", 65 | { 66 | inputs: {}, 67 | referenceOutputs: {}, 68 | }, 69 | async () => { 70 | const checkpointer = new MemorySaver(); 71 | const askjeeves = tool( 72 | async () => { 73 | return "foo"; 74 | }, 75 | { 76 | name: "askjeeves", 77 | description: "Call to surf the web.", 78 | schema: z.object({ query: z.string() }), 79 | } 80 | ); 81 | const graph = createReactAgent({ 82 | llm: new ChatOpenAI({ model: "gpt-4o-mini" }), 83 | checkpointer, 84 | prompt: 85 | "You are an evil assistant who is inefficient and calls more tools than necessary.", 86 | tools: [askjeeves], 87 | }); 88 | const config = { configurable: { thread_id: "1" } }; 89 | await graph.invoke( 90 | { messages: [{ role: "user", content: "what's the weather in sf?" }] }, 91 | config 92 | ); 93 | const trajectory = await extractLangGraphTrajectoryFromThread( 94 | graph, 95 | config 96 | ); 97 | const evaluator = createGraphTrajectoryLLMAsJudge({ 98 | model: "openai:o3-mini", 99 | prompt: `You are an expert data labeler. 100 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 101 | 102 | 103 | An accurate trajectory: 104 | - Makes logical sense between steps 105 | - Shows clear progression 106 | - Is perfectly efficient, with no more than one tool call 107 | - Is semantically equivalent to the provided reference trajectory, if present 108 | 109 | 110 | 111 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
112 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 113 | interrupting to await additional data from another source ("human-in-the-loop"): 114 | 115 | 116 | 117 | {thread} 118 | 119 | 120 | {reference_outputs}`, 121 | }); 122 | const res = await evaluator({ 123 | inputs: trajectory.inputs, 124 | outputs: trajectory.outputs, 125 | }); 126 | expect(res.key).toBe("graph_trajectory_accuracy"); 127 | expect(res.score).toBe(false); 128 | } 129 | ); 130 | }); 131 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/utils.ts: -------------------------------------------------------------------------------- 1 | import type { StateSnapshot, Pregel } from "@langchain/langgraph/web"; 2 | import { isBaseMessage } from "@langchain/core/messages"; 3 | import type { RunnableConfig } from "@langchain/core/runnables"; 4 | import { _convertMessagesToOpenAIParams } from "@langchain/openai"; 5 | 6 | import type { GraphTrajectory } from "../types.js"; 7 | 8 | export const extractLangGraphTrajectoryFromSnapshots = ( 9 | snapshots: StateSnapshot[] 10 | ) => { 11 | const inputs = []; 12 | const trajectory: GraphTrajectory = { 13 | results: [], 14 | steps: [], 15 | }; 16 | let isAccumulatingSteps = false; 17 | for (let i = 0; i < snapshots.length; i += 1) { 18 | const snapshot = snapshots[i]; 19 | const hasInterrupts = snapshot.tasks?.find((task) => { 20 | return task.interrupts?.length; 21 | }); 22 | if (!snapshot.next?.length || hasInterrupts) { 23 | isAccumulatingSteps = true; 24 | if (hasInterrupts) { 25 | trajectory.results.push({}); 26 | } else if ( 27 | snapshot.values != null && 28 | typeof snapshot.values === "object" && 29 | !Array.isArray(snapshot.values) && 30 | "messages" in snapshot.values && 31 | Array.isArray(snapshot.values.messages) 32 | ) { 33 | const lastMessage = snapshot.values.messages.at(-1); 34 | if (isBaseMessage(lastMessage)) { 35 | // Just append the last message in the output to the results to reduce context size 36 | trajectory.results.push({ 37 | messages: _convertMessagesToOpenAIParams([lastMessage]), 38 | }); 39 | } else { 40 | trajectory.results.push({ messages: [lastMessage] }); 41 | } 42 | } else { 43 | trajectory.results.push(snapshot.values); 44 | } 45 | trajectory.steps.push([]); 46 | } 47 | if (isAccumulatingSteps && snapshot.tasks?.length) { 48 | const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? 
""; 49 | let subgraphPath = ""; 50 | if (checkpointNs.split(":").length > 1) { 51 | subgraphPath = `${checkpointNs.split(":")[0]}:`; 52 | } 53 | for (const task of snapshot.tasks) { 54 | if (task.interrupts?.length) { 55 | trajectory.steps.at(-1)?.push("__interrupt__"); 56 | } 57 | trajectory.steps.at(-1)?.push(`${subgraphPath}${task.name}`); 58 | } 59 | } 60 | if (isAccumulatingSteps) { 61 | if (snapshot.metadata != null && snapshot.metadata.source === "input") { 62 | if ( 63 | "writes" in snapshot.metadata && 64 | snapshot.metadata.writes != null && 65 | typeof snapshot.metadata.writes === "object" 66 | ) { 67 | inputs.push(snapshot.metadata.writes as Record); 68 | } else { 69 | inputs.push( 70 | ...snapshot.tasks.map((task) => ({ [task.name]: task.result })) 71 | ); 72 | } 73 | } else if ( 74 | i + 1 < snapshots.length && 75 | snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0) 76 | ) { 77 | inputs.push("__resuming__"); 78 | } 79 | } 80 | } 81 | inputs.reverse(); 82 | trajectory.results.reverse(); 83 | trajectory.steps.reverse(); 84 | for (const stepList of trajectory.steps) { 85 | stepList.reverse(); 86 | } 87 | if (inputs.length !== trajectory.results.length) { 88 | console.warn( 89 | "Trajectory parsing may be incomplete: inputs and results have different lengths" 90 | ); 91 | } else if (inputs.length !== trajectory.steps.length) { 92 | console.warn( 93 | "Trajectory parsing may be incomplete: inputs and steps have different lengths" 94 | ); 95 | } 96 | return { inputs, outputs: trajectory }; 97 | }; 98 | 99 | export const _getLangGraphStateHistoryRecursive = async ( 100 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 101 | graph: Pregel, 102 | config: RunnableConfig 103 | ): Promise => { 104 | const stateHistory = []; 105 | for await (const history of graph.getStateHistory(config)) { 106 | if (history.tasks?.length) { 107 | for (const task of history.tasks) { 108 | if ((task.state as RunnableConfig)?.configurable?.checkpoint_ns) { 109 | stateHistory.push( 110 | ...(await _getLangGraphStateHistoryRecursive( 111 | graph, 112 | task.state as RunnableConfig 113 | )) 114 | ); 115 | } 116 | } 117 | } 118 | stateHistory.push(history); 119 | } 120 | return stateHistory; 121 | }; 122 | 123 | export const extractLangGraphTrajectoryFromThread = async ( 124 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 125 | graph: Pregel, 126 | config: RunnableConfig 127 | ) => { 128 | const history = await _getLangGraphStateHistoryRecursive(graph, config); 129 | return extractLangGraphTrajectoryFromSnapshots(history); 130 | }; 131 | -------------------------------------------------------------------------------- /js/src/trajectory/match.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | ToolArgsMatchMode, 6 | ToolArgsMatchOverrides, 7 | } from "../types.js"; 8 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 9 | import { _scorer as trajectoryStrictScorer } from "./strict.js"; 10 | import { _scorer as trajectoryUnorderedScorer } from "./unordered.js"; 11 | import { _scorer as trajectorySubsetScorer } from "./subset.js"; 12 | import { _scorer as trajectorySuperstScorer } from "./superset.js"; 13 | 14 | export type TrajectoryMatchMode = 15 | | "strict" 16 | | "unordered" 17 | | "subset" 18 | | "superset"; 19 | 20 | /** 21 | * Creates an evaluator that compares 
trajectories between model outputs and reference outputs. 22 | * 23 | * @param options - The configuration options 24 | * @param options.trajectoryMatchMode - The mode for matching trajectories: 25 | * - `"strict"`: Requires exact match in order and content 26 | * - `"unordered"`: Allows matching in any order 27 | * - `"subset"`: Accepts if output trajectory is a subset of reference 28 | * - `"superset"`: Accepts if output trajectory is a superset of reference 29 | * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore") 30 | * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching. 31 | * Each key should be a tool name, and each value should be either a match mode or a matcher function. 32 | * Matchers should be a function that takes two sets of tool call args and returns whether they are equal. 33 | * 34 | * @returns An async function that evaluates trajectory matches between outputs and references. 35 | * The returned evaluator accepts: 36 | * - outputs: List of messages or dict representing the model output trajectory 37 | * - referenceOutputs: List of messages or dict representing the reference trajectory 38 | * - Additional arguments passed to the underlying evaluator 39 | * 40 | * @example 41 | * ```typescript 42 | * const matcher = ( 43 | * outputToolCallArgs: Record, 44 | * referenceToolCallArgs: Record 45 | * ): boolean => { 46 | * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase(); 47 | * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase(); 48 | * return outputArgs === referenceArgs; 49 | * }; 50 | * 51 | * const evaluator = createAsyncTrajectoryMatchEvaluator({ 52 | * trajectoryMatchMode: "strict", 53 | * toolArgsMatchMode: "exact", 54 | * toolArgsMatchOverrides: { 55 | * myToolName: matcher, 56 | * }, 57 | * }); 58 | * 59 | * const result = await evaluator({ 60 | * outputs: [...], 61 | * referenceOutputs: [...], 62 | * }); 63 | * ``` 64 | */ 65 | export function createTrajectoryMatchEvaluator({ 66 | trajectoryMatchMode = "strict", 67 | toolArgsMatchMode = "exact", 68 | toolArgsMatchOverrides, 69 | }: { 70 | trajectoryMatchMode?: TrajectoryMatchMode; 71 | toolArgsMatchMode?: ToolArgsMatchMode; 72 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 73 | }) { 74 | let scorer: (params: { 75 | outputs: ChatCompletionMessage[]; 76 | referenceOutputs: ChatCompletionMessage[]; 77 | toolArgsMatchMode: ToolArgsMatchMode; 78 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 79 | }) => boolean | Promise; 80 | switch (trajectoryMatchMode) { 81 | case "strict": 82 | scorer = trajectoryStrictScorer; 83 | break; 84 | case "unordered": 85 | scorer = trajectoryUnorderedScorer; 86 | break; 87 | case "subset": 88 | scorer = trajectorySubsetScorer; 89 | break; 90 | case "superset": 91 | scorer = trajectorySuperstScorer; 92 | break; 93 | default: 94 | throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`); 95 | } 96 | 97 | return async function _wrappedEvaluator({ 98 | outputs, 99 | referenceOutputs, 100 | ...extra 101 | }: { 102 | outputs: 103 | | ChatCompletionMessage[] 104 | | FlexibleChatCompletionMessage[] 105 | | BaseMessage[] 106 | | { 107 | messages: ( 108 | | BaseMessage 109 | | ChatCompletionMessage 110 | | FlexibleChatCompletionMessage 111 | )[]; 112 | }; 113 | referenceOutputs: 114 | | ChatCompletionMessage[] 115 | | FlexibleChatCompletionMessage[] 116 | | BaseMessage[] 117 | | { 118 | messages: ( 119 | | BaseMessage 120 | | 
ChatCompletionMessage 121 | | FlexibleChatCompletionMessage 122 | )[]; 123 | }; 124 | [key: string]: unknown; 125 | }) { 126 | const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); 127 | const normalizedReferenceOutputs = 128 | _normalizeToOpenAIMessagesList(referenceOutputs); 129 | 130 | return _runEvaluator( 131 | `trajectory_${trajectoryMatchMode}_match`, 132 | scorer, 133 | `trajectory_${trajectoryMatchMode}_match`, 134 | { 135 | outputs: normalizedOutputs, 136 | referenceOutputs: normalizedReferenceOutputs, 137 | toolArgsMatchMode, 138 | toolArgsMatchOverrides, 139 | ...extra, 140 | } 141 | ); 142 | }; 143 | } 144 | -------------------------------------------------------------------------------- /js/src/trajectory/strict.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _getMatcherForToolName } from "./utils.js"; 11 | 12 | export async function _scorer(params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise { 18 | const { 19 | outputs, 20 | referenceOutputs, 21 | toolArgsMatchMode, 22 | toolArgsMatchOverrides, 23 | } = params; 24 | const normalizedOutputs = outputs; 25 | const normalizedReferenceOutputs = referenceOutputs; 26 | 27 | if (!normalizedOutputs || !normalizedReferenceOutputs) { 28 | throw new Error( 29 | "Strict trajectory match requires both outputs and reference_outputs" 30 | ); 31 | } 32 | 33 | if (normalizedOutputs.length !== normalizedReferenceOutputs.length) { 34 | return false; 35 | } 36 | 37 | for (let i = 0; i < normalizedOutputs.length; i++) { 38 | const output = normalizedOutputs[i]; 39 | const referenceOutput = normalizedReferenceOutputs[i]; 40 | 41 | if (output.role !== referenceOutput.role) { 42 | return false; 43 | } 44 | 45 | const outputHasToolCalls = output.tool_calls != null; 46 | const referenceHasToolCalls = referenceOutput.tool_calls != null; 47 | 48 | if (outputHasToolCalls !== referenceHasToolCalls) { 49 | return false; 50 | } 51 | 52 | if (outputHasToolCalls) { 53 | if (output.tool_calls!.length !== referenceOutput.tool_calls!.length) { 54 | return false; 55 | } 56 | const referenceCalls = referenceOutput.tool_calls ?? []; 57 | const seen = new Array(referenceCalls.length).fill(false); 58 | 59 | for (const outputCall of output.tool_calls ?? []) { 60 | let foundMatch = false; 61 | for (let i = 0; i < referenceCalls.length; i++) { 62 | const referenceCall = referenceCalls[i]; 63 | if ( 64 | !seen[i] && 65 | outputCall.function?.name === referenceCall.function?.name 66 | ) { 67 | const matcher = _getMatcherForToolName( 68 | outputCall.function?.name ?? "", 69 | toolArgsMatchMode, 70 | toolArgsMatchOverrides 71 | ); 72 | if ( 73 | await matcher( 74 | JSON.parse(outputCall.function?.arguments ?? "{}"), 75 | JSON.parse(referenceCall.function?.arguments ?? 
"{}") 76 | ) 77 | ) { 78 | foundMatch = true; 79 | seen[i] = true; 80 | break; 81 | } 82 | } 83 | } 84 | if (!foundMatch) { 85 | return false; 86 | } 87 | } 88 | } 89 | } 90 | 91 | return true; 92 | } 93 | 94 | /** 95 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead. 96 | * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 97 | * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 98 | * 99 | * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages, 100 | * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above. 101 | * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages, 102 | * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above. 103 | * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments 104 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 105 | */ 106 | export async function trajectoryStrictMatch(params: { 107 | outputs: 108 | | ChatCompletionMessage[] 109 | | FlexibleChatCompletionMessage[] 110 | | BaseMessage[] 111 | | { 112 | messages: ( 113 | | BaseMessage 114 | | ChatCompletionMessage 115 | | FlexibleChatCompletionMessage 116 | )[]; 117 | }; 118 | referenceOutputs: 119 | | ChatCompletionMessage[] 120 | | FlexibleChatCompletionMessage[] 121 | | BaseMessage[] 122 | | { 123 | messages: ( 124 | | BaseMessage 125 | | ChatCompletionMessage 126 | | FlexibleChatCompletionMessage 127 | )[]; 128 | }; 129 | toolCallArgsExactMatch: boolean; 130 | }): Promise { 131 | const normalizedOutputs = _normalizeToOpenAIMessagesList(params.outputs); 132 | const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList( 133 | params.referenceOutputs 134 | ); 135 | 136 | return _runEvaluator( 137 | "trajectory_strict_match", 138 | _scorer, 139 | "trajectory_strict_match", 140 | { 141 | outputs: normalizedOutputs, 142 | referenceOutputs: normalizedReferenceOutputs, 143 | toolArgsMatchMode: params.toolCallArgsExactMatch ? 
"exact" : "ignore", 144 | } 145 | ); 146 | } 147 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Iterable, TYPE_CHECKING 3 | import warnings 4 | 5 | from langchain_core.messages import BaseMessage 6 | from langchain_core.messages.utils import convert_to_openai_messages 7 | 8 | from agentevals.types import GraphTrajectory, ExtractedLangGraphThreadTrajectory 9 | 10 | from langchain_core.runnables import RunnableConfig 11 | 12 | if TYPE_CHECKING: 13 | from langgraph.pregel import Pregel 14 | from langgraph.pregel.types import StateSnapshot 15 | 16 | 17 | def extract_langgraph_trajectory_from_snapshots( 18 | snapshots: Iterable[StateSnapshot], 19 | ) -> ExtractedLangGraphThreadTrajectory: 20 | inputs = [] 21 | trajectory = GraphTrajectory( 22 | inputs=[], 23 | results=[], 24 | steps=[], 25 | ) 26 | is_acc_steps = False 27 | snapshot_list = list(snapshots) 28 | for i, snapshot in enumerate(snapshot_list): 29 | has_interrupts = any(t.interrupts for t in snapshot.tasks) 30 | if not snapshot.next or has_interrupts: 31 | is_acc_steps = True 32 | if has_interrupts: 33 | trajectory["results"].append({}) 34 | elif ( 35 | isinstance(snapshot.values, dict) 36 | and "messages" in snapshot.values 37 | and isinstance(snapshot.values["messages"], list) 38 | ): 39 | # Just append the last message in the output to the results to reduce context size 40 | last_message = snapshot.values["messages"][-1] 41 | if isinstance(last_message, BaseMessage): 42 | trajectory["results"].append( 43 | {"messages": convert_to_openai_messages([last_message])} 44 | ) 45 | else: 46 | trajectory["results"].append({"messages": [last_message]}) 47 | else: 48 | trajectory["results"].append(snapshot.values) 49 | trajectory["steps"].append([]) 50 | if is_acc_steps and snapshot.tasks: 51 | checkpoint_ns = snapshot.config.get("configurable", {}).get( 52 | "checkpoint_ns", "" 53 | ) 54 | subgraph_path = "" 55 | if checkpoint_ns and len(checkpoint_ns.split(":")) > 1: 56 | subgraph_path = f"{checkpoint_ns.split(':')[0]}:" 57 | for task in snapshot.tasks: 58 | if task.interrupts: 59 | trajectory["steps"][-1].append("__interrupt__") 60 | trajectory["steps"][-1].append(f"{subgraph_path}{task.name}") 61 | if is_acc_steps: 62 | if snapshot.metadata is not None and snapshot.metadata["source"] == "input": 63 | inputs.extend({task.name: task.result} for task in snapshot.tasks) 64 | elif i + 1 < len(snapshot_list) and any( 65 | t.interrupts for t in snapshot_list[i + 1].tasks 66 | ): 67 | inputs.append("__resuming__") # type: ignore 68 | inputs.reverse() 69 | trajectory["results"].reverse() 70 | trajectory["steps"].reverse() 71 | for ss in trajectory["steps"]: 72 | ss.reverse() 73 | if len(inputs) != len(trajectory["results"]): 74 | warnings.warn( 75 | "Trajectory parsing may be incomplete: inputs and results have different lengths" 76 | ) 77 | elif len(inputs) != len(trajectory["steps"]): 78 | warnings.warn( 79 | "Trajectory parsing may be incomplete: inputs and steps have different lengths" 80 | ) 81 | 82 | return {"inputs": inputs, "outputs": trajectory} 83 | 84 | 85 | def _get_langgraph_state_history_recursive(graph: Pregel, config: RunnableConfig): 86 | state_history = [] 87 | for history in graph.get_state_history(config=config): 88 | if history.tasks: 89 | for task in history.tasks: 90 | if task.state and 
task.state.get("configurable", {}).get( 91 | "checkpoint_ns", None 92 | ): 93 | state_history.extend( 94 | _get_langgraph_state_history_recursive(graph, task.state) 95 | ) 96 | state_history.append(history) 97 | return state_history 98 | 99 | 100 | async def _aget_langgraph_state_history_recursive( 101 | graph: Pregel, config: RunnableConfig 102 | ): 103 | state_history = [] 104 | async for history in graph.aget_state_history(config=config): 105 | if history.tasks: 106 | for task in history.tasks: 107 | if task.state and task.state.get("configurable", {}).get( 108 | "checkpoint_ns", None 109 | ): 110 | state_history.extend( 111 | await _aget_langgraph_state_history_recursive(graph, task.state) 112 | ) 113 | state_history.append(history) 114 | return state_history 115 | 116 | 117 | def extract_langgraph_trajectory_from_thread( 118 | graph: Pregel, config: RunnableConfig 119 | ) -> ExtractedLangGraphThreadTrajectory: 120 | return extract_langgraph_trajectory_from_snapshots( 121 | _get_langgraph_state_history_recursive(graph, config) 122 | ) 123 | 124 | 125 | async def aextract_langgraph_trajectory_from_thread( 126 | graph: Pregel, config: RunnableConfig 127 | ) -> ExtractedLangGraphThreadTrajectory: 128 | return extract_langgraph_trajectory_from_snapshots( 129 | await _aget_langgraph_state_history_recursive(graph, config) 130 | ) 131 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/unordered.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.types import ( 5 | ChatCompletionMessage, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | ) 9 | from agentevals.trajectory.utils import ( 10 | _is_trajectory_superset, 11 | _normalize_to_openai_messages_list, 12 | ) 13 | from agentevals.utils import _run_evaluator, _arun_evaluator 14 | 15 | from typing import Any, Optional, Union, TYPE_CHECKING 16 | 17 | if TYPE_CHECKING: 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | def _scorer( 22 | *, 23 | outputs: list[ChatCompletionMessage], 24 | reference_outputs: list[ChatCompletionMessage], 25 | tool_args_match_mode: ToolArgsMatchMode, 26 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 27 | **kwargs: Any, 28 | ): 29 | if outputs is None or reference_outputs is None: 30 | raise ValueError( 31 | "Trajectory unordered match requires both outputs and reference_outputs" 32 | ) 33 | unordered_match = _is_trajectory_superset( 34 | outputs, reference_outputs, tool_args_match_mode, tool_args_match_overrides 35 | ) and _is_trajectory_superset( 36 | reference_outputs, outputs, tool_args_match_mode, tool_args_match_overrides 37 | ) 38 | return unordered_match 39 | 40 | 41 | def trajectory_unordered_match( 42 | *, 43 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 44 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 45 | **kwargs: Any, 46 | ): 47 | """ 48 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 49 | ```python 50 | from agentevals.trajectory.match import create_trajectory_match_evaluator 51 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="unordered") 52 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 53 | ``` 54 | 55 | Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 
56 | This accounts for some differences in an LLM's reasoning process on a case-by-case basis. 57 | 58 | Args: 59 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 60 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 61 | a "messages" key with one of the above. 62 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 63 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 64 | a "messages" key with one of the above. 65 | 66 | Returns: 67 | EvaluatorResult: Contains a score of True if trajectory matches, False otherwise 68 | """ 69 | warn( 70 | "trajectory_unordered_match() is deprecated. Use create_trajectory_match_evaluator(trajectory_match_mode='unordered') instead.", 71 | DeprecationWarning, 72 | stacklevel=2, 73 | ) 74 | 75 | outputs = _normalize_to_openai_messages_list(outputs) 76 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 77 | 78 | return _run_evaluator( 79 | run_name="trajectory_unordered_match", 80 | scorer=_scorer, 81 | feedback_key="trajectory_unordered_match", 82 | outputs=outputs, 83 | reference_outputs=reference_outputs, 84 | tool_args_match_mode="ignore", 85 | **kwargs, 86 | ) 87 | 88 | 89 | async def trajectory_unordered_match_async( 90 | *, 91 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 92 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 93 | **kwargs: Any, 94 | ): 95 | """ 96 | Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 97 | This accounts for some differences in an LLM's reasoning process on a case-by-case basis. 98 | 99 | Args: 100 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 101 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 102 | a "messages" key with one of the above. 103 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 104 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 105 | a "messages" key with one of the above. 106 | 107 | Returns: 108 | EvaluatorResult: Contains a score of True if trajectory matches, False otherwise 109 | """ 110 | warn( 111 | "trajectory_unordered_match_async() is deprecated.
Use create_async_trajectory_match_evaluator(trajectory_match_mode='unordered') instead.", 112 | DeprecationWarning, 113 | stacklevel=2, 114 | ) 115 | 116 | outputs = _normalize_to_openai_messages_list(outputs) 117 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 118 | 119 | return await _arun_evaluator( 120 | run_name="trajectory_unordered_match", 121 | scorer=_scorer, 122 | feedback_key="trajectory_unordered_match", 123 | outputs=outputs, 124 | reference_outputs=reference_outputs, 125 | tool_args_match_mode="ignore", 126 | **kwargs, 127 | ) 128 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/subset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 5 | from agentevals.types import ChatCompletionMessage 6 | from agentevals.trajectory.utils import _is_trajectory_superset 7 | from agentevals.utils import _run_evaluator, _arun_evaluator 8 | from agentevals.types import ToolArgsMatchMode, ToolArgsMatchOverrides 9 | 10 | from typing import Any, Union, Optional, TYPE_CHECKING 11 | 12 | if TYPE_CHECKING: 13 | from langchain_core.messages import BaseMessage 14 | 15 | 16 | def _scorer( 17 | *, 18 | outputs: list[ChatCompletionMessage], 19 | reference_outputs: list[ChatCompletionMessage], 20 | tool_args_match_mode: ToolArgsMatchMode, 21 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 22 | **kwargs: Any, 23 | ): 24 | if outputs is None or reference_outputs is None: 25 | raise ValueError( 26 | "Trajectory subset match requires both outputs and reference_outputs" 27 | ) 28 | is_superset = _is_trajectory_superset( 29 | reference_outputs, outputs, tool_args_match_mode, tool_args_match_overrides 30 | ) 31 | return is_superset 32 | 33 | 34 | def trajectory_subset( 35 | *, 36 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 37 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 38 | **kwargs: Any, 39 | ): 40 | """ 41 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 42 | ```python 43 | from agentevals.trajectory.match import create_trajectory_match_evaluator 44 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="subset") 45 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 46 | ``` 47 | 48 | Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 49 | This means the agent called a subset of the tools specified in the reference trajectory. 50 | 51 | Args: 52 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 53 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 54 | a "messages" key with one of the above. 55 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 56 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 57 | a "messages" key with one of the above. 58 | 59 | Returns: 60 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 61 | """ 62 | warn( 63 | "trajectory_subset() is deprecated. 
Use create_trajectory_match_evaluator(trajectory_match_mode='subset') instead.", 64 | DeprecationWarning, 65 | stacklevel=2, 66 | ) 67 | 68 | outputs = _normalize_to_openai_messages_list(outputs) 69 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 70 | 71 | return _run_evaluator( 72 | run_name="trajectory_subset", 73 | scorer=_scorer, 74 | feedback_key="trajectory_subset", 75 | outputs=outputs, 76 | reference_outputs=reference_outputs, 77 | tool_args_match_mode="ignore", 78 | **kwargs, 79 | ) 80 | 81 | 82 | async def trajectory_subset_async( 83 | *, 84 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 85 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 86 | **kwargs: Any, 87 | ): 88 | """ 89 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 90 | ```python 91 | from agentevals.trajectory.match import create_trajectory_match_evaluator 92 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="subset") 93 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 94 | ``` 95 | 96 | Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 97 | This means the agent called a subset of the tools specified in the reference trajectory. 98 | 99 | Args: 100 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 101 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 102 | a "messages" key with one of the above. 103 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 104 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 105 | a "messages" key with one of the above. 106 | 107 | Returns: 108 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 109 | """ 110 | warn( 111 | "trajectory_subset_async() is deprecated. 
Use create_async_trajectory_match_evaluator(trajectory_match_mode='subset') instead.", 112 | DeprecationWarning, 113 | stacklevel=2, 114 | ) 115 | 116 | outputs = _normalize_to_openai_messages_list(outputs) 117 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 118 | 119 | return await _arun_evaluator( 120 | run_name="trajectory_subset", 121 | scorer=_scorer, 122 | feedback_key="trajectory_subset", 123 | outputs=outputs, 124 | reference_outputs=reference_outputs, 125 | tool_args_match_mode="ignore", 126 | **kwargs, 127 | ) 128 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/superset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.types import ( 5 | ChatCompletionMessage, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | ) 9 | from agentevals.trajectory.utils import ( 10 | _is_trajectory_superset, 11 | _normalize_to_openai_messages_list, 12 | ) 13 | from agentevals.utils import _run_evaluator, _arun_evaluator 14 | 15 | from typing import Any, Optional, Union, TYPE_CHECKING 16 | 17 | if TYPE_CHECKING: 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | def _scorer( 22 | *, 23 | outputs: list[ChatCompletionMessage], 24 | reference_outputs: list[ChatCompletionMessage], 25 | tool_args_match_mode: ToolArgsMatchMode, 26 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 27 | **kwargs: Any, 28 | ): 29 | if outputs is None or reference_outputs is None: 30 | raise ValueError( 31 | "Trajectory superset match requires both outputs and reference_outputs" 32 | ) 33 | is_superset = _is_trajectory_superset( 34 | outputs, reference_outputs, tool_args_match_mode, tool_args_match_overrides 35 | ) 36 | return is_superset 37 | 38 | 39 | def trajectory_superset( 40 | *, 41 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 42 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 43 | **kwargs: Any, 44 | ): 45 | """ 46 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 47 | ```python 48 | from agentevals.trajectory.match import create_trajectory_match_evaluator 49 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="superset") 50 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 51 | ``` 52 | 53 | Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 54 | This means the agent called a superset of the tools specified in the reference trajectory. 55 | 56 | Args: 57 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 58 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 59 | a "messages" key with one of the above. 60 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 61 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 62 | a "messages" key with one of the above. 63 | 64 | Returns: 65 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 66 | """ 67 | warn( 68 | "trajectory_superset() is deprecated. 
Use create_trajectory_match_evaluator(trajectory_match_mode='superset') instead.", 69 | DeprecationWarning, 70 | stacklevel=2, 71 | ) 72 | 73 | outputs = _normalize_to_openai_messages_list(outputs) 74 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 75 | 76 | return _run_evaluator( 77 | run_name="trajectory_superset", 78 | scorer=_scorer, 79 | feedback_key="trajectory_superset", 80 | outputs=outputs, 81 | reference_outputs=reference_outputs, 82 | tool_args_match_mode="ignore", 83 | **kwargs, 84 | ) 85 | 86 | 87 | async def trajectory_superset_async( 88 | *, 89 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 90 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 91 | **kwargs: Any, 92 | ): 93 | """ 94 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 95 | ```python 96 | from agentevals.trajectory.match import create_trajectory_match_evaluator 97 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="superset") 98 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 99 | ``` 100 | 101 | Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 102 | This means the agent called a superset of the tools specified in the reference trajectory. 103 | 104 | Args: 105 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 106 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 107 | a "messages" key with one of the above. 108 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 109 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 110 | a "messages" key with one of the above. 111 | 112 | Returns: 113 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 114 | """ 115 | warn( 116 | "trajectory_superset_async() is deprecated. 
Use create_async_trajectory_match_evaluator(trajectory_match_mode='superset') instead.", 117 | DeprecationWarning, 118 | stacklevel=2, 119 | ) 120 | 121 | outputs = _normalize_to_openai_messages_list(outputs) 122 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 123 | 124 | return await _arun_evaluator( 125 | run_name="trajectory_superset", 126 | scorer=_scorer, 127 | feedback_key="trajectory_superset", 128 | outputs=outputs, 129 | reference_outputs=reference_outputs, 130 | tool_args_match_mode="ignore", 131 | **kwargs, 132 | ) 133 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from agentevals.graph_trajectory.utils import ( 4 | aextract_langgraph_trajectory_from_thread, 5 | extract_langgraph_trajectory_from_thread, 6 | ) 7 | from openevals.exact import exact_match 8 | 9 | from typing import Annotated 10 | from typing_extensions import TypedDict 11 | import operator 12 | import time 13 | 14 | from langgraph.graph import StateGraph 15 | from langgraph.checkpoint.memory import MemorySaver 16 | 17 | 18 | @pytest.mark.langsmith 19 | def test_trajectory_match(): 20 | checkpointer = MemorySaver() 21 | 22 | class InnerState(TypedDict): 23 | my_key: Annotated[str, operator.add] 24 | my_other_key: str 25 | 26 | def inner_1(state: InnerState): 27 | time.sleep(0.1) 28 | return {"my_key": "got here", "my_other_key": state["my_key"]} 29 | 30 | def inner_2(state: InnerState): 31 | return { 32 | "my_key": " and there", 33 | "my_other_key": state["my_key"], 34 | } 35 | 36 | inner = StateGraph(InnerState) 37 | inner.add_node("inner_1", inner_1) 38 | inner.add_node("inner_2", inner_2) 39 | inner.add_edge("inner_1", "inner_2") 40 | inner.set_entry_point("inner_1") 41 | inner.set_finish_point("inner_2") 42 | 43 | class State(TypedDict): 44 | my_key: Annotated[str, operator.add] 45 | 46 | def outer_1(state: State): 47 | return {"my_key": " and parallel"} 48 | 49 | def outer_2(state: State): 50 | return {"my_key": " and back again"} 51 | 52 | graph = StateGraph(State) 53 | graph.add_node("inner", inner.compile(interrupt_before=["inner_2"])) 54 | graph.add_node("outer_1", outer_1) 55 | graph.add_node("outer_2", outer_2) 56 | 57 | graph.add_edge("__start__", "inner") 58 | graph.add_edge("__start__", "outer_1") 59 | graph.add_edge(["inner", "outer_1"], "outer_2") 60 | graph.set_finish_point("outer_2") 61 | 62 | app = graph.compile(checkpointer=checkpointer) 63 | 64 | # test invoke w/ nested interrupt 65 | config = {"configurable": {"thread_id": "1"}} 66 | app.invoke({"my_key": ""}, config) 67 | 68 | app.invoke(None, config) == { 69 | "my_key": "got here and there and parallel and back again", 70 | } 71 | assert exact_match( 72 | outputs=extract_langgraph_trajectory_from_thread( 73 | app, {"configurable": {"thread_id": "1"}} 74 | ), 75 | reference_outputs={ 76 | "inputs": [ 77 | {"__start__": {"my_key": ""}}, 78 | {"__start__": {"my_key": ""}}, 79 | ], 80 | "outputs": { 81 | "inputs": [], 82 | "results": [ 83 | {"my_key": "got here and there", "my_other_key": "got here"}, 84 | {"my_key": "got here and there and parallel and back again"}, 85 | ], 86 | "steps": [ 87 | [ 88 | "__start__", 89 | "outer_1", 90 | "inner", 91 | "inner:__start__", 92 | "inner:inner_1", 93 | "inner:inner_2", 94 | ], 95 | ["outer_2"], 96 | ], 97 | }, 98 | }, 99 | )["score"] 100 | 101 | 102 | @pytest.mark.asyncio 
103 | @pytest.mark.langsmith 104 | async def test_trajectory_match_async(): 105 | checkpointer = MemorySaver() 106 | 107 | class InnerState(TypedDict): 108 | my_key: Annotated[str, operator.add] 109 | my_other_key: str 110 | 111 | def inner_1(state: InnerState): 112 | time.sleep(0.1) 113 | return {"my_key": "got here", "my_other_key": state["my_key"]} 114 | 115 | def inner_2(state: InnerState): 116 | return { 117 | "my_key": " and there", 118 | "my_other_key": state["my_key"], 119 | } 120 | 121 | inner = StateGraph(InnerState) 122 | inner.add_node("inner_1", inner_1) 123 | inner.add_node("inner_2", inner_2) 124 | inner.add_edge("inner_1", "inner_2") 125 | inner.set_entry_point("inner_1") 126 | inner.set_finish_point("inner_2") 127 | 128 | class State(TypedDict): 129 | my_key: Annotated[str, operator.add] 130 | 131 | def outer_1(state: State): 132 | return {"my_key": " and parallel"} 133 | 134 | def outer_2(state: State): 135 | return {"my_key": " and back again"} 136 | 137 | graph = StateGraph(State) 138 | graph.add_node("inner", inner.compile(interrupt_before=["inner_2"])) 139 | graph.add_node("outer_1", outer_1) 140 | graph.add_node("outer_2", outer_2) 141 | 142 | graph.add_edge("__start__", "inner") 143 | graph.add_edge("__start__", "outer_1") 144 | graph.add_edge(["inner", "outer_1"], "outer_2") 145 | graph.set_finish_point("outer_2") 146 | 147 | app = graph.compile(checkpointer=checkpointer) 148 | 149 | # test invoke w/ nested interrupt 150 | config = {"configurable": {"thread_id": "1"}} 151 | await app.ainvoke({"my_key": ""}, config) 152 | 153 | await app.ainvoke(None, config) == { 154 | "my_key": "got here and there and parallel and back again", 155 | } 156 | assert exact_match( 157 | outputs=await aextract_langgraph_trajectory_from_thread( 158 | app, {"configurable": {"thread_id": "1"}} 159 | ), 160 | reference_outputs={ 161 | "inputs": [ 162 | {"__start__": {"my_key": ""}}, 163 | {"__start__": {"my_key": ""}}, 164 | ], 165 | "outputs": { 166 | "inputs": [], 167 | "results": [ 168 | {"my_key": "got here and there", "my_other_key": "got here"}, 169 | {"my_key": "got here and there and parallel and back again"}, 170 | ], 171 | "steps": [ 172 | [ 173 | "__start__", 174 | "outer_1", 175 | "inner", 176 | "inner:__start__", 177 | "inner:inner_1", 178 | "inner:inner_2", 179 | ], 180 | ["outer_2"], 181 | ], 182 | }, 183 | }, 184 | )["score"] 185 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/llm.ts: -------------------------------------------------------------------------------- 1 | import { _createLLMAsJudgeScorer } from "openevals/llm"; 2 | 3 | import { _runEvaluator } from "../utils.js"; 4 | import type { GraphTrajectory, TrajectoryLLMAsJudgeParams } from "../types.js"; 5 | 6 | export const GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler. 7 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 8 | 9 | 10 | An accurate trajectory: 11 | - Makes logical sense between steps 12 | - Shows clear progression 13 | - Is relatively efficient, though it does not need to be perfectly efficient 14 | - Is semantically equivalent to the provided reference trajectory, if present 15 | 16 | 17 | 18 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
19 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 20 | interrupting to await additional data from another source ("human-in-the-loop"). 21 | 22 | Steps containing a colon represent steps within subagents (e.g. "graph:step_name"). 23 | 24 | 25 | 26 | {thread} 27 | 28 | 29 | {reference_outputs} 30 | `; 31 | 32 | function _formatThread( 33 | inputs: (string | Record | null)[], 34 | outputs: GraphTrajectory 35 | ): string { 36 | let formattedThread = ""; 37 | const zippedData = inputs.map((input, i) => ({ 38 | input: JSON.stringify(input ?? ""), 39 | result: JSON.stringify(outputs.results[i]), 40 | step: JSON.stringify(outputs.steps[i]), 41 | })); 42 | 43 | for (const { input, result, step } of zippedData) { 44 | formattedThread += input ? `\n\n${input}\n\n` : ""; 45 | formattedThread += `\n\n${step}\n\n`; 46 | formattedThread += `\n\n${result}\n\n`; 47 | } 48 | return formattedThread; 49 | } 50 | 51 | function _formatInputs( 52 | inputs: 53 | | (string | Record | null)[] 54 | | { inputs: (string | Record | null)[] }, 55 | outputs: GraphTrajectory, 56 | referenceOutputs?: GraphTrajectory 57 | ) { 58 | let processedInputs: (string | Record | null)[]; 59 | 60 | if (Array.isArray(inputs)) { 61 | processedInputs = inputs; 62 | } else { 63 | if (!("inputs" in inputs)) { 64 | throw new Error( 65 | "inputs must be an array or an object with an 'inputs' key" 66 | ); 67 | } 68 | processedInputs = inputs.inputs; 69 | } 70 | 71 | if (processedInputs.length !== outputs.results.length) { 72 | throw new Error( 73 | "Provided `inputs` and `results` within provided `outputs` must have the same length" 74 | ); 75 | } 76 | if (processedInputs.length !== outputs.steps.length) { 77 | throw new Error( 78 | "Provided `inputs` and `steps` within provided `outputs` must have the same length" 79 | ); 80 | } 81 | 82 | const formattedThread = _formatThread(processedInputs, outputs); 83 | const formattedReferenceOutputs = referenceOutputs 84 | ? `\nUse the following trajectory as an example reference when grading:\n\n${_formatThread(referenceOutputs.inputs ?? [], referenceOutputs)}\n\n` 85 | : ""; 86 | 87 | return { 88 | formattedThread, 89 | formattedReferenceOutputs, 90 | }; 91 | } 92 | 93 | /** 94 | * Creates an evaluator that uses an LLM to judge agent trajectories. 95 | * @param options Configuration options 96 | * @param [options.prompt] - The evaluation prompt. Can be a string template, 97 | * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric 98 | * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters. 99 | * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result 100 | * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client 101 | * or a LangChain chat model. If an OpenAI client, must specify "model" as well. 102 | * If omitted, "model" will be used to instantiate a LangChain model instance by model string. 103 | * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client, 104 | * this argument should be a model name directly. If "judge" is omitted, must be a valid 105 | * LangChain model identifier. See `init_chat_model` docs for more details: 106 | * https://python.langchain.com/docs/how_to/chat_models_universal_init/ 107 | * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean. 
108 | * @param [options.choices] - Optional list of specific float values the score must be chosen from 109 | * @param [options.useReasoning=true] - If true, includes explanation for the score in the output 110 | * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt 111 | * @returns A function that evaluates agent trajectories using the configured LLM judge 112 | */ 113 | export const createGraphTrajectoryLLMAsJudge = ({ 114 | prompt = GRAPH_TRAJECTORY_ACCURACY_PROMPT, 115 | model, 116 | feedbackKey = "graph_trajectory_accuracy", 117 | judge, 118 | continuous = false, 119 | choices, 120 | useReasoning = true, 121 | fewShotExamples, 122 | }: TrajectoryLLMAsJudgeParams) => { 123 | const scorer = _createLLMAsJudgeScorer({ 124 | prompt, 125 | judge, 126 | model, 127 | continuous, 128 | choices, 129 | useReasoning, 130 | fewShotExamples, 131 | }); 132 | 133 | const _wrappedEvaluator = async ({ 134 | inputs, 135 | outputs, 136 | referenceOutputs, 137 | ...extra 138 | }: { 139 | inputs: 140 | | (string | Record | null)[] 141 | | { inputs: (string | Record | null)[] }; 142 | outputs: GraphTrajectory; 143 | referenceOutputs?: GraphTrajectory; 144 | [key: string]: unknown; 145 | }) => { 146 | const { formattedThread, formattedReferenceOutputs } = _formatInputs( 147 | inputs, 148 | outputs, 149 | referenceOutputs 150 | ); 151 | return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, { 152 | outputs, 153 | inputs, 154 | thread: formattedThread, 155 | referenceOutputs: formattedReferenceOutputs, 156 | ...extra, 157 | }); 158 | }; 159 | return _wrappedEvaluator; 160 | }; 161 | -------------------------------------------------------------------------------- /js/src/trajectory/llm.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { _createLLMAsJudgeScorer } from "openevals/llm"; 3 | 4 | import { _runEvaluator, _normalizeToOpenAIMessagesList } from "../utils.js"; 5 | import { _chatCompletionMessagesToString } from "./utils.js"; 6 | import { 7 | ChatCompletionMessage, 8 | FlexibleChatCompletionMessage, 9 | EvaluatorResult, 10 | TrajectoryLLMAsJudgeParams, 11 | } from "../types.js"; 12 | 13 | export const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = `You are an expert data labeler. 14 | Your task is to grade the accuracy of an AI agent's internal trajectory. 15 | 16 | 17 | An accurate trajectory: 18 | - Makes logical sense between steps 19 | - Shows clear progression 20 | - Is relatively efficient, though it does not need to be perfectly efficient 21 | - Is semantically equivalent to the provided reference trajectory 22 | 23 | 24 | Based on the following reference trajectory: 25 | 26 | 27 | {reference_outputs} 28 | 29 | 30 | Grade this actual trajectory: 31 | 32 | 33 | {outputs} 34 | 35 | `; 36 | 37 | export const TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler. 38 | Your task is to grade the accuracy of an AI agent's internal trajectory. 39 | 40 | 41 | An accurate trajectory: 42 | - Makes logical sense between steps 43 | - Shows clear progression 44 | - Is relatively efficient, though it does not need to be perfectly efficient 45 | 46 | 47 | First, try to understand the goal of the trajectory by looking at the input 48 | (if the input is not present try to infer it from the content of the first message), 49 | as well as the output of the final message. 
Once you understand the goal, grade the trajectory 50 | as it relates to achieving that goal. 51 | 52 | Grade the following trajectory: 53 | 54 | 55 | {outputs} 56 | `; 57 | 58 | function _formatInputs(params: { 59 | outputs: 60 | | ChatCompletionMessage[] 61 | | FlexibleChatCompletionMessage[] 62 | | BaseMessage[] 63 | | { 64 | messages: ( 65 | | BaseMessage 66 | | ChatCompletionMessage 67 | | FlexibleChatCompletionMessage 68 | )[]; 69 | }; 70 | referenceOutputs?: 71 | | ChatCompletionMessage[] 72 | | FlexibleChatCompletionMessage[] 73 | | BaseMessage[] 74 | | { 75 | messages: ( 76 | | BaseMessage 77 | | ChatCompletionMessage 78 | | FlexibleChatCompletionMessage 79 | )[]; 80 | }; 81 | }): [string, string] { 82 | const { outputs, referenceOutputs } = params; 83 | const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); 84 | const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList( 85 | referenceOutputs ?? [] 86 | ); 87 | 88 | const formattedReferenceOutputs = normalizedReferenceOutputs 89 | ? _chatCompletionMessagesToString(normalizedReferenceOutputs) 90 | : ""; 91 | 92 | const formattedOutputs = _chatCompletionMessagesToString(normalizedOutputs); 93 | 94 | return [formattedOutputs, formattedReferenceOutputs]; 95 | } 96 | 97 | /** 98 | * Creates an evaluator that uses an LLM to judge agent trajectories. 99 | * 100 | * @param options - Configuration options 101 | * @param options.prompt - The evaluation prompt. Can be a string template, LangChain prompt template, 102 | * or callable that returns a list of chat messages. 103 | * @param options.feedbackKey - Key used to store the evaluation result. Defaults to "trajectory_accuracy". 104 | * @param options.model - Model identifier to use. If judge is an OpenAI client, 105 | * this should be a model name directly. If judge is omitted, must be a valid 106 | * LangChain model identifier. 107 | * @param options.system - Optional system message to prepend to the prompt. 108 | * @param options.judge - The LLM used for evaluation. Can be an OpenAI client or a LangChainLikeModel. 109 | * If an OpenAI client, must specify "model" as well. If omitted, "model" will be 110 | * used to instantiate a LangChain model instance by model string. 111 | * @param options.continuous - If true, score will be a float between 0 and 1. If false, score will be boolean. 112 | * Defaults to false. 113 | * @param options.choices - Optional list of specific float values the score must be chosen from. 114 | * @param options.useReasoning - If true, includes explanation for the score in the output. Defaults to true. 115 | * @param options.fewShotExamples - Optional list of example evaluations to append to the prompt. 116 | * @returns A function that evaluates agent trajectories using the configured LLM judge. 
117 | */ 118 | export const createTrajectoryLLMAsJudge = ({ 119 | prompt = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 120 | feedbackKey = "trajectory_accuracy", 121 | model, 122 | system, 123 | judge, 124 | continuous = false, 125 | choices, 126 | useReasoning = true, 127 | fewShotExamples, 128 | }: TrajectoryLLMAsJudgeParams) => { 129 | const scorer = _createLLMAsJudgeScorer({ 130 | prompt, 131 | judge, 132 | model, 133 | system, 134 | continuous, 135 | choices, 136 | useReasoning, 137 | fewShotExamples, 138 | }); 139 | 140 | const wrappedEvaluator = async ({ 141 | inputs, 142 | outputs, 143 | referenceOutputs, 144 | ...extra 145 | }: { 146 | outputs: 147 | | ChatCompletionMessage[] 148 | | FlexibleChatCompletionMessage[] 149 | | BaseMessage[] 150 | | { 151 | messages: ( 152 | | BaseMessage 153 | | ChatCompletionMessage 154 | | FlexibleChatCompletionMessage 155 | )[]; 156 | }; 157 | referenceOutputs?: 158 | | ChatCompletionMessage[] 159 | | FlexibleChatCompletionMessage[] 160 | | BaseMessage[] 161 | | { 162 | messages: ( 163 | | BaseMessage 164 | | ChatCompletionMessage 165 | | FlexibleChatCompletionMessage 166 | )[]; 167 | }; 168 | [key: string]: unknown; 169 | }): Promise => { 170 | const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({ 171 | outputs, 172 | referenceOutputs, 173 | }); 174 | 175 | return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, { 176 | inputs, 177 | outputs: formattedOutputs, 178 | referenceOutputs: formattedReferenceOutputs, 179 | ...extra, 180 | }); 181 | }; 182 | return wrappedEvaluator; 183 | }; 184 | -------------------------------------------------------------------------------- /python/tests/test_trajectory_llm.py: -------------------------------------------------------------------------------- 1 | from agentevals.trajectory.llm import ( 2 | create_trajectory_llm_as_judge, 3 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 4 | TRAJECTORY_ACCURACY_PROMPT, 5 | ) 6 | 7 | from agentevals.types import ChatCompletionMessage 8 | 9 | import pytest 10 | import json 11 | 12 | 13 | @pytest.mark.langsmith 14 | def test_trajectory_match(): 15 | evaluator = create_trajectory_llm_as_judge( 16 | prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, model="openai:o3-mini" 17 | ) 18 | outputs = [ 19 | {"role": "user", "content": "What is the weather in SF?"}, 20 | { 21 | "role": "assistant", 22 | "tool_calls": [ 23 | { 24 | "function": { 25 | "name": "get_weather", 26 | "arguments": json.dumps({"city": "SF"}), 27 | } 28 | } 29 | ], 30 | }, 31 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 32 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 33 | ] 34 | reference_outputs = [ 35 | {"role": "user", "content": "What is the weather in SF?"}, 36 | { 37 | "role": "assistant", 38 | "tool_calls": [ 39 | { 40 | "function": { 41 | "name": "get_weather", 42 | "arguments": json.dumps({"city": "San Francisco"}), 43 | } 44 | } 45 | ], 46 | }, 47 | {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."}, 48 | {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."}, 49 | ] 50 | eval_result = evaluator( 51 | outputs=outputs, 52 | reference_outputs=reference_outputs, 53 | ) 54 | assert eval_result["key"] == "trajectory_accuracy" 55 | assert eval_result["score"] 56 | 57 | 58 | @pytest.mark.langsmith 59 | def test_trajectory_no_ref(): 60 | evaluator = create_trajectory_llm_as_judge( 61 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 62 | ) 63 | outputs = [ 64 | {"role": 
"user", "content": "What is the weather in SF?"}, 65 | { 66 | "role": "assistant", 67 | "tool_calls": [ 68 | { 69 | "function": { 70 | "name": "get_weather", 71 | "arguments": json.dumps({"city": "SF"}), 72 | } 73 | } 74 | ], 75 | }, 76 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 77 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 78 | ] 79 | eval_result = evaluator( 80 | outputs=outputs, 81 | ) 82 | assert eval_result["key"] == "trajectory_accuracy" 83 | assert eval_result["score"] 84 | 85 | 86 | @pytest.mark.langsmith 87 | def test_trajectory_no_ref_bad_trajectory(): 88 | evaluator = create_trajectory_llm_as_judge( 89 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 90 | ) 91 | outputs = [ 92 | {"role": "user", "content": "What are some good restaurants in SF?"}, 93 | { 94 | "role": "assistant", 95 | "tool_calls": [ 96 | { 97 | "function": { 98 | "name": "get_weather", 99 | "arguments": json.dumps({"city": "SF"}), 100 | } 101 | } 102 | ], 103 | }, 104 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 105 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 106 | ] 107 | eval_result = evaluator( 108 | outputs=outputs, 109 | ) 110 | assert eval_result["key"] == "trajectory_accuracy" 111 | assert not eval_result["score"] 112 | 113 | 114 | @pytest.mark.langsmith 115 | def test_trajectory_match_with_inverse_rubric(): 116 | REVERSE_PROMPT = """You are an expert data labeler. 117 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 118 | 119 | 120 | An inaccurate trajectory: 121 | - Makes no logical sense between steps 122 | - Shows no clear progression 123 | - Is not relatively efficient, though it does not need to be perfectly inefficient 124 | - Is not semantically equivalent to the provided reference trajectory, if present 125 | 126 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 127 | 128 | 129 | Grade the following trajectory: 130 | 131 | 132 | {outputs} 133 | 134 | 135 | 136 | {inputs} 137 | 138 | 139 | According to this reference trajectory: 140 | 141 | 142 | {reference_outputs} 143 | 144 | """ 145 | evaluator = create_trajectory_llm_as_judge( 146 | prompt=REVERSE_PROMPT, model="openai:o3-mini" 147 | ) 148 | outputs = [ 149 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 150 | ChatCompletionMessage( 151 | role="assistant", 152 | tool_calls=[ 153 | { 154 | "function": { 155 | "name": "get_weather", 156 | "arguments": json.dumps({"city": "SF"}), 157 | } 158 | } 159 | ], 160 | ), 161 | ChatCompletionMessage(role="tool", content="It's 80 degrees and sunny in SF."), 162 | ChatCompletionMessage( 163 | role="assistant", content="The weather in SF is 80 degrees and sunny." 164 | ), 165 | ] 166 | reference_outputs = [ 167 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 168 | ChatCompletionMessage( 169 | role="assistant", 170 | tool_calls=[ 171 | { 172 | "function": { 173 | "name": "get_weather", 174 | "arguments": json.dumps({"city": "San Francisco"}), 175 | } 176 | } 177 | ], 178 | ), 179 | ChatCompletionMessage( 180 | role="tool", content="It's 80 degrees and sunny in San Francisco." 181 | ), 182 | ChatCompletionMessage( 183 | role="assistant", content="The weather in SF is 80˚ and sunny." 
184 | ), 185 | ] 186 | eval_result = evaluator( 187 | inputs="What is the weather in SF?", 188 | outputs=outputs, 189 | reference_outputs=reference_outputs, 190 | ) 191 | assert eval_result["key"] == "trajectory_accuracy" 192 | assert not eval_result["score"] 193 | -------------------------------------------------------------------------------- /python/tests/test_trajectory_llm_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.trajectory.llm import ( 2 | create_async_trajectory_llm_as_judge, 3 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 4 | TRAJECTORY_ACCURACY_PROMPT, 5 | ) 6 | 7 | from agentevals.types import ChatCompletionMessage 8 | 9 | import pytest 10 | import json 11 | 12 | 13 | @pytest.mark.langsmith 14 | @pytest.mark.asyncio 15 | async def test_trajectory_match(): 16 | evaluator = create_async_trajectory_llm_as_judge( 17 | prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, model="openai:o3-mini" 18 | ) 19 | inputs = {} 20 | outputs = [ 21 | {"role": "user", "content": "What is the weather in SF?"}, 22 | { 23 | "role": "assistant", 24 | "tool_calls": [ 25 | { 26 | "function": { 27 | "name": "get_weather", 28 | "arguments": json.dumps({"city": "SF"}), 29 | } 30 | } 31 | ], 32 | }, 33 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 34 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 35 | ] 36 | reference_outputs = [ 37 | {"role": "user", "content": "What is the weather in SF?"}, 38 | { 39 | "role": "assistant", 40 | "tool_calls": [ 41 | { 42 | "function": { 43 | "name": "get_weather", 44 | "arguments": json.dumps({"city": "San Francisco"}), 45 | } 46 | } 47 | ], 48 | }, 49 | {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."}, 50 | {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."}, 51 | ] 52 | eval_result = await evaluator( 53 | inputs=inputs, 54 | outputs=outputs, 55 | reference_outputs=reference_outputs, 56 | ) 57 | assert eval_result["key"] == "trajectory_accuracy" 58 | assert eval_result["score"] 59 | 60 | 61 | @pytest.mark.langsmith 62 | @pytest.mark.asyncio 63 | async def test_trajectory_no_ref(): 64 | evaluator = create_async_trajectory_llm_as_judge( 65 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 66 | ) 67 | outputs = [ 68 | {"role": "user", "content": "What is the weather in SF?"}, 69 | { 70 | "role": "assistant", 71 | "tool_calls": [ 72 | { 73 | "function": { 74 | "name": "get_weather", 75 | "arguments": json.dumps({"city": "SF"}), 76 | } 77 | } 78 | ], 79 | }, 80 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 81 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 82 | ] 83 | eval_result = await evaluator( 84 | outputs=outputs, 85 | ) 86 | assert eval_result["key"] == "trajectory_accuracy" 87 | assert eval_result["score"] 88 | 89 | 90 | @pytest.mark.langsmith 91 | @pytest.mark.asyncio 92 | async def test_trajectory_no_ref_bad_trajectory(): 93 | evaluator = create_async_trajectory_llm_as_judge( 94 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 95 | ) 96 | outputs = [ 97 | {"role": "user", "content": "What are some good restaurants in SF?"}, 98 | { 99 | "role": "assistant", 100 | "tool_calls": [ 101 | { 102 | "function": { 103 | "name": "get_weather", 104 | "arguments": json.dumps({"city": "SF"}), 105 | } 106 | } 107 | ], 108 | }, 109 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 110 | {"role": "assistant", "content": "The 
weather in SF is 80 degrees and sunny."}, 111 | ] 112 | eval_result = await evaluator( 113 | outputs=outputs, 114 | ) 115 | assert eval_result["key"] == "trajectory_accuracy" 116 | assert not eval_result["score"] 117 | 118 | 119 | @pytest.mark.langsmith 120 | @pytest.mark.asyncio 121 | async def test_trajectory_match_with_inverse_rubric(): 122 | REVERSE_PROMPT = """You are an expert data labeler. 123 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 124 | 125 | 126 | An inaccurate trajectory: 127 | - Makes no logical sense between steps 128 | - Shows no clear progression 129 | - Is not relatively efficient, though it does not need to be perfectly inefficient 130 | - Is not semantically equivalent to the provided reference trajectory, if present 131 | 132 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 133 | 134 | 135 | Grade the following trajectory: 136 | 137 | 138 | {outputs} 139 | 140 | {inputs} 141 | {reference_outputs} 142 | """ 143 | evaluator = create_async_trajectory_llm_as_judge( 144 | prompt=REVERSE_PROMPT, model="openai:o3-mini" 145 | ) 146 | inputs = "What is the weather in SF?" 147 | outputs = [ 148 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 149 | ChatCompletionMessage( 150 | role="assistant", 151 | tool_calls=[ 152 | { 153 | "function": { 154 | "name": "get_weather", 155 | "arguments": json.dumps({"city": "SF"}), 156 | } 157 | } 158 | ], 159 | ), 160 | ChatCompletionMessage(role="tool", content="It's 80 degrees and sunny in SF."), 161 | ChatCompletionMessage( 162 | role="assistant", content="The weather in SF is 80 degrees and sunny." 163 | ), 164 | ] 165 | reference_outputs = [ 166 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 167 | ChatCompletionMessage( 168 | role="assistant", 169 | tool_calls=[ 170 | { 171 | "function": { 172 | "name": "get_weather", 173 | "arguments": json.dumps({"city": "San Francisco"}), 174 | } 175 | } 176 | ], 177 | ), 178 | ChatCompletionMessage( 179 | role="tool", content="It's 80 degrees and sunny in San Francisco." 180 | ), 181 | ChatCompletionMessage( 182 | role="assistant", content="The weather in SF is 80˚ and sunny." 183 | ), 184 | ] 185 | eval_result = await evaluator( 186 | inputs=inputs, 187 | outputs=outputs, 188 | reference_outputs=reference_outputs, 189 | ) 190 | assert eval_result["key"] == "trajectory_accuracy" 191 | assert not eval_result["score"] 192 | -------------------------------------------------------------------------------- /js/src/trajectory/tests/trajectory_llm.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | 4 | import { 5 | createTrajectoryLLMAsJudge, 6 | TRAJECTORY_ACCURACY_PROMPT, 7 | } from "../llm.js"; 8 | import { FlexibleChatCompletionMessage } from "../../types.js"; 9 | 10 | ls.describe("Trajectory LLM", () => { 11 | ls.test( 12 | "should match trajectories", 13 | { 14 | inputs: {}, 15 | }, 16 | async () => { 17 | const evaluator = createTrajectoryLLMAsJudge({ 18 | model: "openai:o3-mini", 19 | }); 20 | const inputs = {}; 21 | const outputs = [ 22 | { role: "user", content: "What is the weather in SF?" 
}, 23 | { 24 | role: "assistant", 25 | content: "", 26 | tool_calls: [ 27 | { 28 | function: { 29 | name: "get_weather", 30 | arguments: JSON.stringify({ city: "SF" }), 31 | }, 32 | }, 33 | ], 34 | }, 35 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 36 | { 37 | role: "assistant", 38 | content: "The weather in SF is 80 degrees and sunny.", 39 | }, 40 | ] satisfies FlexibleChatCompletionMessage[]; 41 | 42 | const referenceOutputs = [ 43 | { role: "user", content: "What is the weather in SF?" }, 44 | { 45 | role: "assistant", 46 | content: "", 47 | tool_calls: [ 48 | { 49 | function: { 50 | name: "get_weather", 51 | arguments: JSON.stringify({ city: "San Francisco" }), 52 | }, 53 | }, 54 | ], 55 | }, 56 | { 57 | role: "tool", 58 | content: "It's 80 degrees and sunny in San Francisco.", 59 | }, 60 | { role: "assistant", content: "The weather in SF is 80˚ and sunny." }, 61 | ] satisfies FlexibleChatCompletionMessage[]; 62 | 63 | const evalResult = await evaluator({ 64 | inputs, 65 | outputs, 66 | referenceOutputs, 67 | }); 68 | 69 | expect(evalResult.key).toBe("trajectory_accuracy"); 70 | expect(evalResult.score).toBe(true); 71 | } 72 | ); 73 | 74 | ls.test("trajectory no ref", { inputs: {} }, async () => { 75 | const evaluator = createTrajectoryLLMAsJudge({ 76 | prompt: TRAJECTORY_ACCURACY_PROMPT, 77 | model: "openai:o3-mini", 78 | }); 79 | const evalResult = await evaluator({ 80 | outputs: [ 81 | { role: "user", content: "What is the weather in SF?" }, 82 | { 83 | role: "assistant", 84 | content: "", 85 | tool_calls: [ 86 | { 87 | function: { 88 | name: "get_weather", 89 | arguments: JSON.stringify({ city: "SF" }), 90 | }, 91 | }, 92 | ], 93 | }, 94 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 95 | { 96 | role: "assistant", 97 | content: "The weather in SF is 80 degrees and sunny.", 98 | }, 99 | ], 100 | }); 101 | 102 | expect(evalResult.key).toBe("trajectory_accuracy"); 103 | expect(evalResult.score).toBe(true); 104 | }); 105 | 106 | ls.test("trajectory no ref bad trajectory", { inputs: {} }, async () => { 107 | const evaluator = createTrajectoryLLMAsJudge({ 108 | prompt: TRAJECTORY_ACCURACY_PROMPT, 109 | model: "openai:o3-mini", 110 | }); 111 | const outputs = [ 112 | { role: "user", content: "What are some good restaurants in SF?" }, 113 | { 114 | content: "", 115 | role: "assistant", 116 | tool_calls: [ 117 | { 118 | function: { 119 | name: "get_weather", 120 | arguments: JSON.stringify({ city: "SF" }), 121 | }, 122 | }, 123 | ], 124 | }, 125 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 126 | { 127 | role: "assistant", 128 | content: "The weather in SF is 80 degrees and sunny.", 129 | }, 130 | ] satisfies FlexibleChatCompletionMessage[]; 131 | const evalResult = await evaluator({ 132 | outputs, 133 | }); 134 | 135 | expect(evalResult.key).toBe("trajectory_accuracy"); 136 | expect(evalResult.score).toBe(false); 137 | }); 138 | 139 | ls.test( 140 | "should match trajectories with inverse rubric", 141 | { inputs: {} }, 142 | async () => { 143 | const REVERSE_PROMPT = `You are an expert data labeler. 144 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 
145 | 146 | 147 | An inaccurate trajectory: 148 | - Makes no logical sense between steps 149 | - Shows no clear progression 150 | - Is not relatively efficient, though it does not need to be perfectly inefficient 151 | - Is not semantically equivalent to the provided reference trajectory, if present 152 | 153 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 154 | 155 | 156 | Grade the following trajectory: 157 | 158 | 159 | {outputs} 160 | 161 | 162 | 163 | {inputs} 164 | 165 | 166 | According to this reference trajectory: 167 | 168 | 169 | {reference_outputs} 170 | 171 | `; 172 | 173 | const evaluator = createTrajectoryLLMAsJudge({ 174 | model: "openai:o3-mini", 175 | prompt: REVERSE_PROMPT, 176 | }); 177 | const inputs = {}; 178 | const outputs = [ 179 | { role: "user", content: "What is the weather in SF?" }, 180 | { 181 | role: "assistant", 182 | content: "", 183 | tool_calls: [ 184 | { 185 | function: { 186 | name: "get_weather", 187 | arguments: JSON.stringify({ city: "SF" }), 188 | }, 189 | }, 190 | ], 191 | }, 192 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 193 | { 194 | role: "assistant", 195 | content: "The weather in SF is 80 degrees and sunny.", 196 | }, 197 | ] satisfies FlexibleChatCompletionMessage[]; 198 | 199 | const referenceOutputs = [ 200 | { role: "user", content: "What is the weather in SF?" }, 201 | { 202 | role: "assistant", 203 | content: "", 204 | tool_calls: [ 205 | { 206 | function: { 207 | name: "get_weather", 208 | arguments: JSON.stringify({ city: "San Francisco" }), 209 | }, 210 | }, 211 | ], 212 | }, 213 | { 214 | role: "tool", 215 | content: "It's 80 degrees and sunny in San Francisco.", 216 | }, 217 | { role: "assistant", content: "The weather in SF is 80˚ and sunny." 
}, 218 | ] satisfies FlexibleChatCompletionMessage[]; 219 | 220 | const evalResult = await evaluator({ 221 | inputs, 222 | outputs, 223 | referenceOutputs, 224 | }); 225 | 226 | expect(evalResult.key).toBe("trajectory_accuracy"); 227 | expect(evalResult.score).toBe(false); 228 | } 229 | ); 230 | }); 231 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/utils.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "_is_trajectory_superset", 3 | "_extract_tool_calls", 4 | "_get_matcher_for_tool_name", 5 | "_normalize_to_openai_messages_list", 6 | "_convert_to_openai_message", 7 | ] 8 | 9 | import json 10 | 11 | from agentevals.types import ( 12 | ChatCompletionMessage, 13 | ToolArgsMatchMode, 14 | ToolArgsMatchOverrides, 15 | ) 16 | from langchain_core.messages import BaseMessage 17 | from langchain_core.messages.utils import convert_to_openai_messages 18 | from typing import Callable, Optional, Union 19 | 20 | 21 | # More flexible version of converting to OpenAI messages for trajectories 22 | def _convert_to_openai_message( 23 | message: Union[ChatCompletionMessage, BaseMessage, dict], 24 | ) -> ChatCompletionMessage: 25 | if not isinstance(message, BaseMessage): 26 | if not isinstance(message, dict): 27 | message = dict(message) 28 | if message.get("role") in ["ai", "assistant"] and message.get("tool_calls"): 29 | message["tool_calls"] = [ 30 | {**tool_call, "id": tool_call.get("id", "")} 31 | for tool_call in message["tool_calls"] 32 | ] 33 | if message.get("role") == "tool" and message.get("tool_call_id") is None: 34 | message["tool_call_id"] = "" 35 | if message.get("content") is None: 36 | message["content"] = "" 37 | converted = convert_to_openai_messages([message])[0] # type: ignore 38 | if isinstance(message, BaseMessage): 39 | if message.id is not None and converted.get("id") is None: 40 | converted["id"] = message.id 41 | else: 42 | if message.get("id") is not None and converted.get("id") is None: 43 | converted["id"] = message.get("id") 44 | return converted # type: ignore 45 | 46 | 47 | def _normalize_to_openai_messages_list( 48 | messages: Optional[ 49 | Union[ 50 | list[ChatCompletionMessage], list[BaseMessage], ChatCompletionMessage, dict 51 | ] 52 | ], 53 | ) -> list[ChatCompletionMessage]: 54 | if messages is None: 55 | return [] 56 | if isinstance(messages, dict): 57 | if "role" in messages: 58 | messages = [messages] # type: ignore 59 | elif "messages" in messages: 60 | messages = messages["messages"] # type: ignore 61 | else: 62 | raise ValueError("if messages is a dict, it must contain a 'messages' key") 63 | if not isinstance(messages, list): 64 | messages = [messages] # type: ignore 65 | return [_convert_to_openai_message(message) for message in messages] # type: ignore 66 | 67 | 68 | def _normalize_tool_call(tool_call: dict) -> dict: 69 | if "function" in tool_call: 70 | return { 71 | "name": tool_call["function"]["name"], 72 | "args": json.loads(tool_call["function"]["arguments"]), 73 | } 74 | else: 75 | return tool_call 76 | 77 | 78 | def _extract_tool_calls(messages: list[ChatCompletionMessage]) -> list[dict]: 79 | tool_calls: list[dict] = [] 80 | for message in messages: 81 | if "tool_calls" in message: 82 | normalized_tool_calls = [ 83 | _normalize_tool_call(tool_call) 84 | for tool_call in message["tool_calls"] or [] 85 | ] 86 | tool_calls.extend(normalized_tool_calls) 87 | return tool_calls 88 | 89 | 90 | def _is_trajectory_superset( 91 | outputs: 
list[ChatCompletionMessage], 92 | reference_outputs: list[ChatCompletionMessage], 93 | tool_args_match_mode: ToolArgsMatchMode, 94 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 95 | ): 96 | output_tool_calls = _extract_tool_calls(outputs) 97 | reference_tool_calls = _extract_tool_calls(reference_outputs) 98 | 99 | # Keep track of which reference tool calls have been matched 100 | matched_reference_calls = set() 101 | 102 | # For each reference tool call, find a matching output tool call 103 | for ref_call in reference_tool_calls: 104 | ref_name = ref_call["name"] 105 | ref_args = ref_call["args"] 106 | 107 | found_match = False 108 | for out_idx, out_call in enumerate(output_tool_calls): 109 | out_name = out_call["name"] 110 | 111 | # Names must match 112 | if ref_name != out_name: 113 | continue 114 | 115 | # If we're already using this output call for a different match, skip 116 | if out_idx in matched_reference_calls: 117 | continue 118 | 119 | # Check tool args according to match mode 120 | matcher = _get_matcher_for_tool_name( 121 | ref_name, tool_args_match_mode, tool_args_match_overrides 122 | ) 123 | 124 | out_args = out_call["args"] 125 | if matcher(out_args, ref_args): 126 | matched_reference_calls.add(out_idx) 127 | found_match = True 128 | break 129 | 130 | # If we didn't find a match for this reference call, we're not a superset 131 | if not found_match: 132 | return False 133 | 134 | return True 135 | 136 | 137 | def _exact_match(tool_call: dict, reference_tool_call: dict) -> bool: 138 | return tool_call == reference_tool_call 139 | 140 | 141 | def _subset_match(tool_call: dict, reference_tool_call: dict) -> bool: 142 | # Every key-value pair in tool_call must exist in reference_tool_call 143 | return all( 144 | key in reference_tool_call and reference_tool_call[key] == value 145 | for key, value in tool_call.items() 146 | ) 147 | 148 | 149 | def _superset_match(tool_call: dict, reference_tool_call: dict) -> bool: 150 | # Every key-value pair in reference_tool_call must exist in tool_call 151 | return all( 152 | key in tool_call and tool_call[key] == value 153 | for key, value in reference_tool_call.items() 154 | ) 155 | 156 | 157 | def _ignore_match(tool_call: dict, reference_tool_call: dict) -> bool: 158 | return True 159 | 160 | 161 | def _get_matcher_for_comparison_mode( 162 | mode: ToolArgsMatchMode, 163 | ) -> Callable[[dict, dict], bool]: 164 | if mode == "exact": 165 | return _exact_match 166 | elif mode == "subset": 167 | return _subset_match 168 | elif mode == "superset": 169 | return _superset_match 170 | else: 171 | return _ignore_match 172 | 173 | 174 | def _get_partial_matcher_on_keys(keys: list[str]) -> Callable[[dict, dict], bool]: 175 | def get_nested_value(d: dict, key_path: str): 176 | current = d 177 | for part in key_path.split("."): 178 | if not isinstance(current, dict): 179 | return None 180 | current = current.get(part) # type: ignore 181 | if current is None: 182 | return None 183 | return current 184 | 185 | def matcher(output_call: dict, reference_call: dict) -> bool: 186 | return all( 187 | get_nested_value(output_call, key) == get_nested_value(reference_call, key) 188 | for key in keys 189 | ) 190 | 191 | return matcher 192 | 193 | 194 | def _get_matcher_for_tool_name( 195 | tool_call_name: str, 196 | tool_args_match_mode: ToolArgsMatchMode, 197 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides], 198 | ) -> Callable[[dict, dict], bool]: 199 | matcher = _get_matcher_for_comparison_mode(tool_args_match_mode) 200 | 
if tool_args_match_overrides is not None and tool_args_match_overrides.get( 201 | tool_call_name, False 202 | ): 203 | override = tool_args_match_overrides.get(tool_call_name) 204 | if isinstance(override, str): 205 | matcher = _get_matcher_for_comparison_mode(override) 206 | elif callable(override): 207 | matcher = override 208 | elif isinstance(override, list): 209 | matcher = _get_partial_matcher_on_keys(override) 210 | else: 211 | raise ValueError(f"Invalid tool args match override: {override}") 212 | return matcher 213 | -------------------------------------------------------------------------------- /js/src/trajectory/utils.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChatCompletionMessage, 3 | ToolArgsMatchMode, 4 | ToolArgsMatchOverrides, 5 | ToolArgsMatcher, 6 | } from "../types.js"; 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 9 | function _normalizeToolCall(toolCall: Record): { 10 | name: string; 11 | args: Record; 12 | } { 13 | if ( 14 | "function" in toolCall && 15 | toolCall.function != null && 16 | typeof toolCall.function === "object" && 17 | typeof toolCall.function.arguments === "string" 18 | ) { 19 | return { 20 | name: toolCall.function.name, 21 | args: JSON.parse(toolCall.function.arguments), 22 | }; 23 | } 24 | return toolCall as { name: string; args: Record }; 25 | } 26 | 27 | function _extractToolCalls( 28 | messages: ChatCompletionMessage[] 29 | ): { name: string; args: Record }[] { 30 | const toolCalls: { name: string; args: Record }[] = []; 31 | for (const message of messages) { 32 | if (message.tool_calls) { 33 | toolCalls.push(...message.tool_calls.map(_normalizeToolCall)); 34 | } 35 | } 36 | return toolCalls; 37 | } 38 | 39 | export async function _isTrajectorySuperset( 40 | outputs: ChatCompletionMessage[], 41 | referenceOutputs: ChatCompletionMessage[], 42 | toolArgsMatchMode: ToolArgsMatchMode, 43 | toolArgsMatchOverrides?: ToolArgsMatchOverrides 44 | ): Promise { 45 | const outputToolCalls = _extractToolCalls(outputs); 46 | const referenceToolCalls = _extractToolCalls(referenceOutputs); 47 | 48 | // Keep track of which reference tool calls have been matched 49 | const matchedReferenceCalls = new Set(); 50 | 51 | // For each reference tool call, find a matching output tool call 52 | for (const refCall of referenceToolCalls) { 53 | const refName = refCall.name; 54 | const refArgs = refCall.args; 55 | 56 | let foundMatch = false; 57 | for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) { 58 | const outCall = outputToolCalls[outIdx]; 59 | const outName = outCall.name; 60 | 61 | // Names must match 62 | if (refName !== outName) { 63 | continue; 64 | } 65 | 66 | // If we're already using this output call for a different match, skip 67 | if (matchedReferenceCalls.has(outIdx)) { 68 | continue; 69 | } 70 | 71 | // Check tool args according to match mode 72 | const matcher = _getMatcherForToolName( 73 | refName, 74 | toolArgsMatchMode, 75 | toolArgsMatchOverrides 76 | ); 77 | 78 | const outArgs = outCall.args; 79 | if (await matcher(outArgs, refArgs)) { 80 | matchedReferenceCalls.add(outIdx); 81 | foundMatch = true; 82 | break; 83 | } 84 | } 85 | 86 | // If we didn't find a match for this reference call, we're not a superset 87 | if (!foundMatch) { 88 | return false; 89 | } 90 | } 91 | 92 | return true; 93 | } 94 | 95 | // Deep equality check function 96 | function _deepEqual(a: unknown, b: unknown): boolean { 97 | if (a == null && b == null) return true; 98 | if (a === b) 
return true; 99 | if (typeof a !== "object" || typeof b !== "object" || !a || !b) return false; 100 | 101 | if (Array.isArray(a) && Array.isArray(b)) { 102 | if (a.length !== b.length) return false; 103 | return a.every((val, index) => _deepEqual(val, b[index])); 104 | } 105 | 106 | const keysA = Object.keys(a); 107 | const keysB = Object.keys(b); 108 | 109 | if (keysA.length !== keysB.length) return false; 110 | 111 | return ( 112 | keysA.every((key) => keysB.includes(key)) && 113 | keysB.every((key) => keysA.includes(key)) && 114 | keysA.every((key) => 115 | _deepEqual( 116 | (a as Record)[key], 117 | (b as Record)[key] 118 | ) 119 | ) 120 | ); 121 | } 122 | 123 | function _exactMatch( 124 | toolCall: Record, 125 | referenceToolCall: Record 126 | ): boolean { 127 | return _deepEqual(toolCall, referenceToolCall); 128 | } 129 | 130 | function _ignoreMatch( 131 | _toolCall: Record, 132 | _referenceToolCall: Record 133 | ): boolean { 134 | return true; 135 | } 136 | 137 | function _subsetMatch( 138 | toolCall: Record, 139 | referenceToolCall: Record 140 | ): boolean { 141 | // Every key-value pair in toolCall must exist in referenceToolCall with the same value 142 | return Object.entries(toolCall).every( 143 | ([key, value]) => 144 | key in referenceToolCall && _deepEqual(referenceToolCall[key], value) 145 | ); 146 | } 147 | 148 | function _supersetMatch( 149 | toolCall: Record, 150 | referenceToolCall: Record 151 | ): boolean { 152 | // Every key-value pair in referenceToolCall must exist in toolCall with the same value 153 | return Object.entries(referenceToolCall).every( 154 | ([key, value]) => key in toolCall && _deepEqual(toolCall[key], value) 155 | ); 156 | } 157 | 158 | function _getMatcherForComparisonMode( 159 | mode: ToolArgsMatchMode 160 | ): ToolArgsMatcher { 161 | if (mode === "exact") { 162 | return _exactMatch; 163 | } else if (mode === "subset") { 164 | return _subsetMatch; 165 | } else if (mode === "superset") { 166 | return _supersetMatch; 167 | } else { 168 | return _ignoreMatch; 169 | } 170 | } 171 | 172 | function _getPartialMatcherOnKeys(keys: string[]): ToolArgsMatcher { 173 | const getNestedValue = ( 174 | d: Record, 175 | keyPath: string 176 | ): unknown => { 177 | let current: unknown = d; 178 | for (const part of keyPath.split(".")) { 179 | if (current && typeof current === "object" && part in current) { 180 | current = current[part as keyof typeof current]; 181 | } else { 182 | return undefined; 183 | } 184 | } 185 | return current; 186 | }; 187 | 188 | return ( 189 | outputCall: Record, 190 | referenceCall: Record 191 | ): boolean => { 192 | return keys.every((key) => { 193 | const nestedOutputValue = getNestedValue(outputCall, key); 194 | const nestedReferenceValue = getNestedValue(referenceCall, key); 195 | return _deepEqual(nestedOutputValue, nestedReferenceValue); 196 | }); 197 | }; 198 | } 199 | 200 | export function _getMatcherForToolName( 201 | toolCallName: string, 202 | toolArgsMatchMode: ToolArgsMatchMode, 203 | toolArgsMatchOverrides?: ToolArgsMatchOverrides 204 | ): ToolArgsMatcher { 205 | let matcher = _getMatcherForComparisonMode(toolArgsMatchMode); 206 | 207 | if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) { 208 | const override = toolArgsMatchOverrides[toolCallName]; 209 | 210 | if (typeof override === "string") { 211 | matcher = _getMatcherForComparisonMode(override); 212 | } else if (typeof override === "function") { 213 | matcher = override; 214 | } else if (Array.isArray(override)) { 215 | matcher = 
_getPartialMatcherOnKeys(override); 216 | } 217 | } 218 | 219 | return matcher; 220 | } 221 | 222 | export function _chatCompletionMessagesToString( 223 | messages: ChatCompletionMessage[] 224 | ): string { 225 | function formatMessage(message: ChatCompletionMessage): string { 226 | let content = message.content ?? ""; 227 | 228 | // Handle tool/function calls 229 | if (message.tool_calls) { 230 | const toolCallsStr = message.tool_calls 231 | .map((call: { function: { name: string; arguments: string } }) => { 232 | const func = call.function ?? {}; 233 | return `\n${func.name ?? ""}\n${func.arguments ?? ""}\n`; 234 | }) 235 | .join("\n"); 236 | 237 | content = content ? `${content}\n${toolCallsStr}` : toolCallsStr; 238 | } 239 | 240 | // Handle tool call results 241 | if (message.tool_call_id) { 242 | content = `\n${message.tool_call_id}\n${content}\n`; 243 | } 244 | 245 | return `<${message.role ?? ""}>\n${content}\n`; 246 | } 247 | 248 | return messages.map(formatMessage).join("\n\n"); 249 | } 250 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/strict.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | import json 4 | 5 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 6 | from agentevals.types import ( 7 | ChatCompletionMessage, 8 | ToolArgsMatchMode, 9 | ToolArgsMatchOverrides, 10 | ) 11 | from agentevals.utils import _run_evaluator, _arun_evaluator 12 | from agentevals.trajectory.utils import _get_matcher_for_tool_name 13 | 14 | from typing import Any, Union, Optional, TYPE_CHECKING 15 | 16 | if TYPE_CHECKING: 17 | from langchain_core.messages import BaseMessage 18 | 19 | 20 | def _scorer( 21 | *, 22 | outputs: list[ChatCompletionMessage], 23 | reference_outputs: list[ChatCompletionMessage], 24 | tool_args_match_mode: ToolArgsMatchMode, 25 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 26 | **kwargs: Any, 27 | ): 28 | outputs = _normalize_to_openai_messages_list(outputs) 29 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 30 | if outputs is None or reference_outputs is None: 31 | raise ValueError( 32 | "Strict trajectory match requires both outputs and reference_outputs" 33 | ) 34 | if len(outputs) != len(reference_outputs): 35 | return False 36 | for output, reference_output in zip(outputs, reference_outputs): 37 | if output["role"] != reference_output["role"]: 38 | return False 39 | elif ("tool_calls" in output and output["tool_calls"] is not None) != ( 40 | "tool_calls" in reference_output 41 | and reference_output["tool_calls"] is not None 42 | ): 43 | # One has tool calls while the other doesn't 44 | return False 45 | elif "tool_calls" in output and output["tool_calls"] is not None: 46 | # Both have tool calls, compare them 47 | if not isinstance(output["tool_calls"], list) or not isinstance( 48 | reference_output["tool_calls"], list 49 | ): 50 | return False 51 | if len(output["tool_calls"]) != len(reference_output["tool_calls"]): 52 | return False 53 | # Create a copy of reference tool calls to track matches 54 | seen = [False] * len(reference_output["tool_calls"]) 55 | for output_call in output["tool_calls"]: 56 | found_match = False 57 | for i, reference_call in enumerate(reference_output["tool_calls"]): 58 | if not seen[i] and ( 59 | output_call["function"]["name"] 60 | == reference_call["function"]["name"] 61 | ): 62 | matcher = 
_get_matcher_for_tool_name( 63 | output_call["function"]["name"], 64 | tool_args_match_mode, 65 | tool_args_match_overrides, 66 | ) 67 | if matcher( 68 | json.loads(output_call["function"]["arguments"]), 69 | json.loads(reference_call["function"]["arguments"]), 70 | ): 71 | found_match = True 72 | seen[i] = True 73 | break 74 | if not found_match: 75 | return False 76 | return True 77 | 78 | 79 | def trajectory_strict_match( 80 | *, 81 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 82 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 83 | tool_call_args_exact_match: bool = True, 84 | **kwargs: Any, 85 | ): 86 | """ 87 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 88 | ```python 89 | from agentevals.trajectory.match import create_trajectory_match_evaluator 90 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="strict") 91 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 92 | ``` 93 | 94 | Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 95 | This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 96 | 97 | Args: 98 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 99 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 100 | a "messages" key with one of the above. 101 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 102 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 103 | a "messages" key with one of the above. 104 | tool_call_args_exact_match (bool): Whether to require exact matches for tool call arguments 105 | 106 | Returns: 107 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 108 | """ 109 | warn( 110 | "trajectory_strict_match() is deprecated. Use create_trajectory_match_evaluator() instead.", 111 | DeprecationWarning, 112 | stacklevel=2, 113 | ) 114 | 115 | def wrapper(**kwargs: Any): 116 | return _scorer( 117 | tool_args_match_mode="exact" if tool_call_args_exact_match else "ignore", 118 | **kwargs, 119 | ) 120 | 121 | return _run_evaluator( 122 | run_name="trajectory_strict_match", 123 | scorer=wrapper, 124 | feedback_key="trajectory_strict_match", 125 | outputs=outputs, 126 | reference_outputs=reference_outputs, 127 | **kwargs, 128 | ) 129 | 130 | 131 | async def trajectory_strict_match_async( 132 | *, 133 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 134 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 135 | tool_call_args_exact_match: bool = True, 136 | **kwargs: Any, 137 | ): 138 | """ 139 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 140 | ```python 141 | from agentevals.trajectory.match import create_trajectory_match_evaluator 142 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="subset") 143 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 144 | ``` 145 | 146 | Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 147 | This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 
148 | 149 | Args: 150 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 151 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 152 | a "messages" key with one of the above. 153 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 154 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 155 | a "messages" key with one of the above. 156 | tool_call_args_exact_match (bool): Whether to require exact matches for tool call arguments 157 | 158 | Returns: 159 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 160 | """ 161 | warn( 162 | "trajectory_strict_match_async() is deprecated. Use create_async_trajectory_match_evaluator() instead.", 163 | DeprecationWarning, 164 | stacklevel=2, 165 | ) 166 | 167 | def wrapper(**kwargs: Any): 168 | return _scorer( 169 | tool_args_match_mode="exact" if tool_call_args_exact_match else "ignore", 170 | **kwargs, 171 | ) 172 | 173 | return await _arun_evaluator( 174 | run_name="trajectory_strict_match", 175 | scorer=wrapper, 176 | feedback_key="trajectory_strict_match", 177 | outputs=outputs, 178 | reference_outputs=reference_outputs, 179 | tool_call_args_exact_match=tool_call_args_exact_match, 180 | **kwargs, 181 | ) 182 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/match.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional, Union 2 | 3 | from agentevals.trajectory.strict import _scorer as trajectory_strict_scorer 4 | from agentevals.trajectory.unordered import _scorer as trajectory_unordered_scorer 5 | from agentevals.trajectory.subset import _scorer as trajectory_subset_scorer 6 | from agentevals.trajectory.superset import _scorer as trajectory_superset_scorer 7 | from agentevals.types import ( 8 | ChatCompletionMessage, 9 | SimpleEvaluator, 10 | SimpleAsyncEvaluator, 11 | ToolArgsMatchMode, 12 | ToolArgsMatchOverrides, 13 | ) 14 | from agentevals.utils import _run_evaluator, _arun_evaluator 15 | 16 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 17 | 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | TrajectoryMatchMode = Literal["strict", "unordered", "subset", "superset"] 22 | 23 | 24 | def create_trajectory_match_evaluator( 25 | *, 26 | trajectory_match_mode: TrajectoryMatchMode = "strict", 27 | tool_args_match_mode: ToolArgsMatchMode = "exact", 28 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 29 | ) -> SimpleEvaluator: 30 | """Creates an evaluator that compares trajectories between model outputs and reference outputs. 31 | 32 | Args: 33 | trajectory_match_mode (TrajectoryMatchMode): The mode for matching trajectories: 34 | - "strict": Requires exact match in order and content 35 | - "unordered": Allows matching in any order 36 | - "subset": Accepts if output trajectory is a subset of reference 37 | - "superset": Accepts if output trajectory is a superset of reference 38 | tool_args_match_mode (ToolArgsMatchMode): Mode for matching tool arguments ("exact" by default, can be "ignore") 39 | tool_args_match_overrides (Optional[ToolArgsMatchOverrides]): Dict containing custom overrides for 40 | tool argument matching. 
Each key should be a tool name, and each value should be either a 41 | match mode or a matcher. Matchers should be a Callable that takes two sets of tool call args 42 | and returns whether they are equal. 43 | 44 | Returns: 45 | SimpleEvaluator: A function that evaluates trajectory matches between outputs and references 46 | 47 | The returned evaluator accepts: 48 | - outputs: List of messages or dict representing the model output trajectory 49 | - reference_outputs: List of messages or dict representing the reference trajectory 50 | - **kwargs: Additional arguments passed to the underlying evaluator 51 | 52 | Example: 53 | ```python 54 | def matcher(output_tool_call_args: dict, reference_tool_call_args: dict) -> bool: 55 | output_args = output_tool_call_args.get("query", "").lower() 56 | reference_args = reference_tool_call_args.get("query", "").lower() 57 | return output_args == reference_args 58 | 59 | evaluator = create_trajectory_match_evaluator( 60 | trajectory_match_mode="strict", 61 | tool_args_match_mode="exact", 62 | tool_args_match_overrides={ 63 | "my_tool_name": matcher, 64 | }, 65 | ) 66 | result = evaluator( 67 | outputs=..., 68 | reference_outputs=..., 69 | ) 70 | ``` 71 | """ 72 | if trajectory_match_mode == "strict": 73 | scorer = trajectory_strict_scorer 74 | elif trajectory_match_mode == "unordered": 75 | scorer = trajectory_unordered_scorer 76 | elif trajectory_match_mode == "subset": 77 | scorer = trajectory_subset_scorer 78 | elif trajectory_match_mode == "superset": 79 | scorer = trajectory_superset_scorer 80 | else: 81 | raise ValueError( 82 | f"Invalid trajectory match type: `{trajectory_match_mode}`. Must be one of `strict`, `unordered`, `subset`, or `superset`." 83 | ) 84 | 85 | if tool_args_match_mode not in ["exact", "ignore", "subset", "superset"]: 86 | raise ValueError( 87 | f"Invalid tool args match mode: `{tool_args_match_mode}`. Must be either `exact`, `ignore`, `subset`, or `superset`." 88 | ) 89 | 90 | def _wrapped_evaluator( 91 | *, 92 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 93 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 94 | **kwargs, 95 | ): 96 | outputs = _normalize_to_openai_messages_list(outputs) 97 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 98 | return _run_evaluator( 99 | run_name=f"trajectory_{trajectory_match_mode}_match", 100 | scorer=scorer, 101 | feedback_key=f"trajectory_{trajectory_match_mode}_match", 102 | outputs=outputs, 103 | reference_outputs=reference_outputs, 104 | tool_args_match_mode=tool_args_match_mode, 105 | tool_args_match_overrides=tool_args_match_overrides, 106 | **kwargs, 107 | ) 108 | 109 | return _wrapped_evaluator 110 | 111 | 112 | def create_async_trajectory_match_evaluator( 113 | *, 114 | trajectory_match_mode: TrajectoryMatchMode = "strict", 115 | tool_args_match_mode: ToolArgsMatchMode = "exact", 116 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 117 | ) -> SimpleAsyncEvaluator: 118 | """Creates an async evaluator that compares trajectories between model outputs and reference outputs. 
119 | 120 | Args: 121 | trajectory_match_mode (TrajectoryMatchMode): The mode for matching trajectories: 122 | - "strict": Requires exact match in order and content 123 | - "unordered": Allows matching in any order 124 | - "subset": Accepts if output trajectory is a subset of reference 125 | - "superset": Accepts if output trajectory is a superset of reference 126 | tool_args_match_mode (ToolArgsMatchMode): Mode for matching tool arguments ("exact" by default, can be "ignore") 127 | tool_args_match_overrides (Optional[ToolArgsMatchOverrides]): Dict containing custom overrides for 128 | tool argument matching. Each key should be a tool name, and each value should be either a 129 | match mode or a matcher. Matchers should be a Callable that takes two sets of tool call args 130 | and returns whether they are equal. 131 | 132 | Returns: 133 | SimpleAsyncEvaluator: An async function that evaluates trajectory matches between outputs and references 134 | 135 | The returned evaluator accepts: 136 | - outputs: List of messages or dict representing the model output trajectory 137 | - reference_outputs: List of messages or dict representing the reference trajectory 138 | - **kwargs: Additional arguments passed to the underlying evaluator 139 | 140 | Example: 141 | ```python 142 | def matcher(output_tool_call_args: dict, reference_tool_call_args: dict) -> bool: 143 | output_args = output_tool_call_args.get("query", "").lower() 144 | reference_args = reference_tool_call_args.get("query", "").lower() 145 | return output_args == reference_args 146 | 147 | evaluator = create_async_trajectory_match_evaluator( 148 | trajectory_match_mode="strict", 149 | tool_args_match_mode="exact", 150 | tool_args_match_overrides={ 151 | "my_tool_name": matcher, 152 | }, 153 | ) 154 | result = await evaluator( 155 | outputs=..., 156 | reference_outputs=..., 157 | ) 158 | ``` 159 | """ 160 | if trajectory_match_mode == "strict": 161 | scorer = trajectory_strict_scorer 162 | elif trajectory_match_mode == "unordered": 163 | scorer = trajectory_unordered_scorer 164 | elif trajectory_match_mode == "subset": 165 | scorer = trajectory_subset_scorer 166 | elif trajectory_match_mode == "superset": 167 | scorer = trajectory_superset_scorer 168 | else: 169 | raise ValueError( 170 | f"Invalid trajectory match type: `{trajectory_match_mode}`. Must be one of `strict`, `unordered`, `subset`, or `superset`." 171 | ) 172 | 173 | if tool_args_match_mode not in ["exact", "ignore", "subset", "superset"]: 174 | raise ValueError( 175 | f"Invalid tool args match mode: `{tool_args_match_mode}`. Must be either `exact`, `ignore`, `subset`, or `superset`." 
176 | ) 177 | 178 | async def _wrapped_evaluator( 179 | *, 180 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 181 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 182 | **kwargs, 183 | ): 184 | outputs = _normalize_to_openai_messages_list(outputs) 185 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 186 | return await _arun_evaluator( 187 | run_name=f"trajectory_{trajectory_match_mode}_match", 188 | scorer=scorer, 189 | feedback_key=f"trajectory_{trajectory_match_mode}_match", 190 | outputs=outputs, 191 | reference_outputs=reference_outputs, 192 | tool_args_match_mode=tool_args_match_mode, 193 | tool_args_match_overrides=tool_args_match_overrides, 194 | **kwargs, 195 | ) 196 | 197 | return _wrapped_evaluator 198 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/llm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from openevals.llm import ( 4 | _create_llm_as_judge_scorer, 5 | _create_async_llm_as_judge_scorer, 6 | ChatCompletionMessage, 7 | ModelClient, 8 | SimpleEvaluator, 9 | SimpleAsyncEvaluator, 10 | Callable, 11 | Optional, 12 | Union, 13 | ) 14 | from openevals.utils import ( 15 | _chat_completion_messages_to_string, 16 | ) 17 | from agentevals.types import FewShotExample 18 | from agentevals.utils import _run_evaluator, _arun_evaluator 19 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 20 | 21 | from langchain_core.language_models.chat_models import BaseChatModel 22 | from langchain_core.runnables import Runnable 23 | 24 | from typing import TYPE_CHECKING 25 | 26 | 27 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = """You are an expert data labeler. 28 | Your task is to grade the accuracy of an AI agent's internal trajectory. 29 | 30 | 31 | An accurate trajectory: 32 | - Makes logical sense between steps 33 | - Shows clear progression 34 | - Is relatively efficient, though it does not need to be perfectly efficient 35 | - Is semantically equivalent to the provided reference trajectory 36 | 37 | 38 | Based on the following reference trajectory: 39 | 40 | 41 | {reference_outputs} 42 | 43 | 44 | Grade this actual trajectory: 45 | 46 | 47 | {outputs} 48 | 49 | """ 50 | 51 | TRAJECTORY_ACCURACY_PROMPT = """You are an expert data labeler. 52 | Your task is to grade the accuracy of an AI agent's internal trajectory. 53 | 54 | 55 | An accurate trajectory: 56 | - Makes logical sense between steps 57 | - Shows clear progression 58 | - Is relatively efficient, though it does not need to be perfectly efficient 59 | 60 | 61 | First, try to understand the goal of the trajectory by looking at the input 62 | (if the input is not present try to infer it from the content of the first message), 63 | as well as the output of the final message. Once you understand the goal, grade the trajectory 64 | as it relates to achieving that goal. 
65 | 66 | Grade the following trajectory: 67 | 68 | 69 | {outputs} 70 | 71 | """ 72 | 73 | if TYPE_CHECKING: 74 | from langchain_core.messages import BaseMessage 75 | 76 | 77 | def _format_inputs( 78 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 79 | reference_outputs: Optional[ 80 | Union[list[ChatCompletionMessage], list[BaseMessage], dict] 81 | ], 82 | ) -> tuple[str, str]: 83 | outputs = _normalize_to_openai_messages_list(outputs) 84 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 85 | if reference_outputs: 86 | formatted_reference_outputs = _chat_completion_messages_to_string( 87 | reference_outputs 88 | ) 89 | else: 90 | formatted_reference_outputs = "" 91 | formatted_outputs = _chat_completion_messages_to_string(outputs) 92 | return ( 93 | formatted_outputs, 94 | formatted_reference_outputs, 95 | ) 96 | 97 | 98 | def create_trajectory_llm_as_judge( 99 | *, 100 | prompt: str 101 | | Runnable 102 | | Callable[ 103 | ..., list[ChatCompletionMessage] 104 | ] = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 105 | model: Optional[str] = None, 106 | feedback_key: str = "trajectory_accuracy", 107 | judge: Optional[ 108 | Union[ 109 | ModelClient, 110 | BaseChatModel, 111 | ] 112 | ] = None, 113 | continuous: bool = False, 114 | choices: Optional[list[float]] = None, 115 | use_reasoning: bool = True, 116 | few_shot_examples: Optional[list[FewShotExample]] = None, 117 | ) -> SimpleEvaluator: 118 | """Creates an evaluator that uses an LLM to judge agent trajectories. 119 | 120 | Args: 121 | prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable 122 | that returns a list of chat messages. Note that the default prompt allows a rubric 123 | in addition to the typical "inputs", "outputs", and "reference_outputs" parameters. 124 | feedback_key: Key used to store the evaluation result, defaults to "trajectory_accuracy". 125 | judge: The LLM used for evaluation. Can be an OpenAI client 126 | or a LangChain chat model. If an OpenAI client, must specify "model" as well. 127 | If omitted, "model" will be used to instantiate a LangChain model instance 128 | by model string. 129 | model: Model identifier to use. If "judge" is an OpenAI client, 130 | this argument should be a model name directly. If "judge" is omitted, must be a valid 131 | LangChain model identifier. See `init_chat_model` docs for more details: 132 | https://python.langchain.com/docs/how_to/chat_models_universal_init/. 133 | system: Optional system message to prepend to the prompt. 134 | continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False. 135 | choices: Optional list of specific float values the score must be chosen from. 136 | use_reasoning: If True, includes explanation for the score in the output. Defaults to True. 137 | few_shot_examples: Optional list of example evaluations to append to the prompt. 138 | 139 | Returns: 140 | SimpleEvaluator: A function that evaluates agent trajectories using the configured LLM judge. 
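    Example:
        A minimal usage sketch (mirroring tests/test_trajectory_llm.py; the model string and the
        trajectory variables are illustrative):

        ```python
        from agentevals.trajectory.llm import create_trajectory_llm_as_judge

        evaluator = create_trajectory_llm_as_judge(model="openai:o3-mini")
        result = evaluator(outputs=outputs, reference_outputs=reference_outputs)
        ```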
141 |     """
142 |     scorer = _create_llm_as_judge_scorer(
143 |         prompt=prompt,
144 |         judge=judge,
145 |         model=model,
146 |         continuous=continuous,
147 |         choices=choices,
148 |         use_reasoning=use_reasoning,
149 |         few_shot_examples=few_shot_examples,
150 |     )
151 |
152 |     def _wrapped_evaluator(
153 |         *,
154 |         outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict],
155 |         reference_outputs: Optional[
156 |             Union[list[ChatCompletionMessage], list[BaseMessage], dict]
157 |         ] = None,
158 |         **kwargs,
159 |     ):
160 |         (
161 |             formatted_outputs,
162 |             formatted_reference_outputs,
163 |         ) = _format_inputs(outputs, reference_outputs)
164 |         return _run_evaluator(
165 |             run_name=f"llm_as_{feedback_key}_judge",
166 |             scorer=scorer,
167 |             feedback_key=feedback_key,
168 |             outputs=formatted_outputs,
169 |             reference_outputs=formatted_reference_outputs,
170 |             **kwargs,
171 |         )
172 |
173 |     return _wrapped_evaluator
174 |
175 |
176 | def create_async_trajectory_llm_as_judge(
177 |     *,
178 |     prompt: str
179 |     | Runnable
180 |     | Callable[
181 |         ..., list[ChatCompletionMessage]
182 |     ] = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
183 |     model: Optional[str] = None,
184 |     feedback_key: str = "trajectory_accuracy",
185 |     judge: Optional[
186 |         Union[
187 |             ModelClient,
188 |             BaseChatModel,
189 |         ]
190 |     ] = None,
191 |     continuous: bool = False,
192 |     choices: Optional[list[float]] = None,
193 |     use_reasoning: bool = True,
194 |     few_shot_examples: Optional[list[FewShotExample]] = None,
195 | ) -> SimpleAsyncEvaluator:
196 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
197 |
198 |     Args:
199 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
200 |             that returns a list of chat messages. Note that the default prompt allows a rubric
201 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
202 |         feedback_key: Key used to store the evaluation result, defaults to "trajectory_accuracy".
203 |         judge: The LLM used for evaluation. Can be an OpenAI client
204 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
205 |             If omitted, "model" will be used to instantiate a LangChain model instance
206 |             by model string.
207 |         model: Model identifier to use. If "judge" is an OpenAI client,
208 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
209 |             LangChain model identifier. See `init_chat_model` docs for more details:
210 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
211 |         system: Optional system message to prepend to the prompt.
212 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
213 |         choices: Optional list of specific float values the score must be chosen from.
214 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
215 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
216 |
217 |     Returns:
218 |         SimpleAsyncEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
219 |     """
220 |     scorer = _create_async_llm_as_judge_scorer(
221 |         prompt=prompt,
222 |         judge=judge,
223 |         model=model,
224 |         continuous=continuous,
225 |         choices=choices,
226 |         use_reasoning=use_reasoning,
227 |         few_shot_examples=few_shot_examples,
228 |     )
229 |
230 |     async def _wrapped_evaluator(
231 |         *,
232 |         outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict],
233 |         reference_outputs: Optional[
234 |             Union[list[ChatCompletionMessage], list[BaseMessage], dict]
235 |         ] = None,
236 |         **kwargs,
237 |     ):
238 |         (
239 |             formatted_outputs,
240 |             formatted_reference_outputs,
241 |         ) = _format_inputs(outputs, reference_outputs)
242 |         return await _arun_evaluator(
243 |             run_name=f"llm_as_{feedback_key}_judge",
244 |             scorer=scorer,
245 |             feedback_key=feedback_key,
246 |             outputs=formatted_outputs,
247 |             reference_outputs=formatted_reference_outputs,
248 |             **kwargs,
249 |         )
250 |
251 |     return _wrapped_evaluator
252 |
--------------------------------------------------------------------------------
/python/agentevals/graph_trajectory/llm.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from openevals.llm import (
3 |     _create_llm_as_judge_scorer,
4 |     _create_async_llm_as_judge_scorer,
5 |     ChatCompletionMessage,
6 |     ModelClient,
7 |     SimpleEvaluator,
8 |     SimpleAsyncEvaluator,
9 |     Callable,
10 |     Optional,
11 |     Union,
12 | )
13 | from langchain_core.runnables import Runnable
14 |
15 | from agentevals.types import EvaluatorResult, FewShotExample, GraphTrajectory
16 | from agentevals.utils import _run_evaluator, _arun_evaluator
17 |
18 | from langchain_core.language_models.chat_models import BaseChatModel
19 |
20 | DEFAULT_REF_COMPARE_PROMPT = """You are an expert data labeler.
21 | Your task is to grade the accuracy of an AI agent's internal steps in resolving user queries.
22 |
23 |
24 | An accurate trajectory:
25 | - Makes logical sense between steps
26 | - Shows clear progression
27 | - Is relatively efficient, though it does not need to be perfectly efficient
28 | - Is semantically equivalent to the provided reference trajectory, if present
29 |
30 |
31 |
32 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
33 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
34 | interrupting to await additional data from another source ("human-in-the-loop"):
35 |
36 |
37 |
38 | {thread}
39 |
40 |
41 | {reference_outputs}
42 | """
43 |
44 |
45 | def _format_thread(
46 |     inputs: list,
47 |     outputs: GraphTrajectory,
48 | ) -> str:
49 |     formatted_thread = ""
50 |     for input, result, step in zip(inputs, outputs["results"], outputs["steps"]):
51 |         formatted_thread += f"\n\n{input}\n\n" if input else ""
52 |         formatted_thread += f"\n\n{step}\n\n"
53 |         formatted_thread += f"\n\n{result}\n\n"
54 |     return formatted_thread
55 |
56 |
57 | def _format_inputs(
58 |     inputs: Optional[Union[list, dict]],
59 |     outputs: GraphTrajectory,
60 |     reference_outputs: Optional[GraphTrajectory],
61 | ) -> tuple[str, str]:
62 |     if isinstance(inputs, dict):
63 |         if "inputs" not in inputs:
64 |             raise ValueError("inputs must be a list or a dict with an 'inputs' key")
65 |         inputs = inputs["inputs"]
66 |     if len(inputs) != len(outputs["results"]):
67 |         raise ValueError(
68 |             "Provided `inputs` and `results` within provided `outputs` must have the same length"
69 |         )
70 |     if inputs is not None and len(inputs) != len(outputs["steps"]):
71 |         raise ValueError(
72 |             "Provided `inputs` and `steps` within provided `outputs` must have the same length"
73 |         )
74 |     formatted_thread = _format_thread(inputs, outputs)  # type: ignore
75 |     if reference_outputs:
76 |         formatted_reference_outputs = f"\nUse the following trajectory as an example reference when grading:\n\n{_format_thread(reference_outputs['inputs'], reference_outputs)}\n\n"
77 |     else:
78 |         formatted_reference_outputs = ""
79 |     return (
80 |         formatted_thread,
81 |         formatted_reference_outputs,
82 |     )
83 |
84 |
85 | def create_graph_trajectory_llm_as_judge(
86 |     *,
87 |     prompt: str
88 |     | Runnable
89 |     | Callable[..., list[ChatCompletionMessage]] = DEFAULT_REF_COMPARE_PROMPT,
90 |     model: Optional[str] = None,
91 |     feedback_key: str = "graph_trajectory_accuracy",
92 |     judge: Optional[
93 |         Union[
94 |             ModelClient,
95 |             BaseChatModel,
96 |         ]
97 |     ] = None,
98 |     continuous: bool = False,
99 |     choices: Optional[list[float]] = None,
100 |     use_reasoning: bool = True,
101 |     few_shot_examples: Optional[list[FewShotExample]] = None,
102 | ) -> SimpleEvaluator:
103 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
104 |
105 |     Args:
106 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
107 |             that returns a list of chat messages. Note that the default prompt allows a rubric
108 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
109 |         feedback_key: Key used to store the evaluation result, defaults to "graph_trajectory_accuracy".
110 |         judge: The LLM used for evaluation. Can be an OpenAI client
111 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
112 |             If omitted, "model" will be used to instantiate a LangChain model instance
113 |             by model string.
114 |         model: Model identifier to use. If "judge" is an OpenAI client,
115 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
116 |             LangChain model identifier. See `init_chat_model` docs for more details:
117 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
118 |         system: Optional system message to prepend to the prompt.
119 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
120 |         choices: Optional list of specific float values the score must be chosen from.
121 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
122 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
123 |
124 |     Returns:
125 |         SimpleEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
126 |     """
127 |     scorer = _create_llm_as_judge_scorer(
128 |         prompt=prompt,
129 |         judge=judge,
130 |         model=model,
131 |         continuous=continuous,
132 |         choices=choices,
133 |         use_reasoning=use_reasoning,
134 |         few_shot_examples=few_shot_examples,
135 |     )
136 |
137 |     def _wrapped_evaluator(
138 |         *,
139 |         inputs: Optional[Union[dict, list]] = None,
140 |         outputs: GraphTrajectory,
141 |         reference_outputs: Optional[GraphTrajectory] = None,
142 |         **kwargs,
143 |     ) -> EvaluatorResult:
144 |         (
145 |             formatted_thread,
146 |             formatted_reference_outputs,
147 |         ) = _format_inputs(inputs, outputs, reference_outputs)
148 |         return _run_evaluator(
149 |             run_name=f"llm_as_{feedback_key}_judge",
150 |             scorer=scorer,
151 |             feedback_key=feedback_key,
152 |             inputs=inputs,
153 |             outputs=outputs,
154 |             thread=formatted_thread,
155 |             reference_outputs=formatted_reference_outputs,
156 |             **kwargs,
157 |         )
158 |
159 |     return _wrapped_evaluator
160 |
161 |
162 | def create_async_graph_trajectory_llm_as_judge(
163 |     *,
164 |     prompt: str
165 |     | Runnable
166 |     | Callable[..., list[ChatCompletionMessage]] = DEFAULT_REF_COMPARE_PROMPT,
167 |     model: Optional[str] = None,
168 |     feedback_key: str = "graph_trajectory_accuracy",
169 |     judge: Optional[
170 |         Union[
171 |             ModelClient,
172 |             BaseChatModel,
173 |         ]
174 |     ] = None,
175 |     continuous: bool = False,
176 |     choices: Optional[list[float]] = None,
177 |     use_reasoning: bool = True,
178 |     few_shot_examples: Optional[list[FewShotExample]] = None,
179 | ) -> SimpleAsyncEvaluator:
180 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
181 |
182 |     Args:
183 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
184 |             that returns a list of chat messages. Note that the default prompt allows a rubric
185 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
186 |         feedback_key: Key used to store the evaluation result, defaults to "graph_trajectory_accuracy".
187 |         judge: The LLM used for evaluation. Can be an OpenAI client
188 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
189 |             If omitted, "model" will be used to instantiate a LangChain model instance
190 |             by model string.
191 |         model: Model identifier to use. If "judge" is an OpenAI client,
192 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
193 |             LangChain model identifier. See `init_chat_model` docs for more details:
194 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
195 |         system: Optional system message to prepend to the prompt.
196 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
197 |         choices: Optional list of specific float values the score must be chosen from.
198 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
199 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
200 |
201 |     Returns:
202 |         SimpleAsyncEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
203 |     """
204 |     scorer = _create_async_llm_as_judge_scorer(
205 |         prompt=prompt,
206 |         judge=judge,
207 |         model=model,
208 |         continuous=continuous,
209 |         choices=choices,
210 |         use_reasoning=use_reasoning,
211 |         few_shot_examples=few_shot_examples,
212 |     )
213 |
214 |     async def _wrapped_evaluator(
215 |         *,
216 |         inputs: Optional[Union[dict, list]] = None,
217 |         outputs: GraphTrajectory,
218 |         reference_outputs: Optional[GraphTrajectory] = None,
219 |         **kwargs,
220 |     ) -> EvaluatorResult:
221 |         (
222 |             formatted_thread,
223 |             formatted_reference_outputs,
224 |         ) = _format_inputs(inputs, outputs, reference_outputs)
225 |         return await _arun_evaluator(
226 |             run_name=f"llm_as_{feedback_key}_judge",
227 |             scorer=scorer,
228 |             feedback_key=feedback_key,
229 |             inputs=inputs,
230 |             outputs=outputs,
231 |             thread=formatted_thread,
232 |             reference_outputs=formatted_reference_outputs,
233 |             **kwargs,
234 |         )
235 |
236 |     return _wrapped_evaluator
237 |
--------------------------------------------------------------------------------
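
# Illustrative usage sketch for the trajectory LLM-as-judge factories defined in
# /python/agentevals/trajectory/llm.py above. This is a minimal example, not part of the
# package itself: the model identifier, message contents, and tool name are hypothetical,
# and the trajectory is assumed to be a list of OpenAI-style chat completion message dicts.

import json

from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_trajectory_llm_as_judge,
)

# No reference trajectory is available here, so use the reference-free prompt.
evaluator = create_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:gpt-4o-mini",  # assumed identifier; see the init_chat_model docs
    continuous=True,  # score as a float between 0 and 1 instead of a boolean
)

# An agent trajectory as a list of OpenAI-style chat messages (hypothetical content).
outputs = [
    {"role": "user", "content": "What is the weather in San Francisco?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "function": {
                    "name": "get_weather",
                    "arguments": json.dumps({"city": "San Francisco"}),
                }
            }
        ],
    },
    {"role": "tool", "content": "It's 75 degrees and sunny in San Francisco."},
    {"role": "assistant", "content": "It is 75 degrees and sunny in San Francisco."},
]

result = evaluator(outputs=outputs)
print(result)  # feedback keyed as "trajectory_accuracy", with a score and reasoning comment

# The async factory is used the same way:
#     evaluator = create_async_trajectory_llm_as_judge(...)
#     result = await evaluator(outputs=outputs)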
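
# Illustrative usage sketch for the graph trajectory LLM-as-judge factory defined in
# /python/agentevals/graph_trajectory/llm.py above, again a minimal example rather than
# library code: the inputs, results, and step/node names below are made up; they only
# need to have matching lengths, as enforced by _format_inputs.

from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge

graph_evaluator = create_graph_trajectory_llm_as_judge(
    model="openai:gpt-4o-mini",  # assumed identifier; see the init_chat_model docs
)

# One thread with two turns: the agent interrupts to ask the user for
# confirmation ("__interrupt__"), then resumes and answers.
inputs = [
    {"role": "user", "content": "What's the weather in San Francisco?"},
    {"role": "user", "content": "Yes, please look it up."},
]
outputs = {
    "results": [
        "",
        "The weather in San Francisco is 75 degrees and sunny.",
    ],
    "steps": [
        ["__start__", "agent", "__interrupt__"],
        ["agent", "tools", "agent"],
    ],
}

result = graph_evaluator(inputs=inputs, outputs=outputs)
print(result)  # feedback keyed as "graph_trajectory_accuracy"

# In practice, a graph trajectory would typically be extracted from a LangGraph thread
# with the helpers in /python/agentevals/graph_trajectory/utils.py rather than written
# out by hand as above.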