├── python ├── tests │ ├── __init__.py │ ├── graph_trajectory │ │ ├── test_graph_trajectory_strict.py │ │ ├── test_graph_trajectory_strict_async.py │ │ ├── test_graph_trajectory_llm.py │ │ ├── test_graph_trajectory_llm_async.py │ │ └── test_graph_trajectory_utils.py │ ├── test_trajectory_llm.py │ └── test_trajectory_llm_async.py ├── .python-version ├── agentevals │ ├── __init__.py │ ├── graph_trajectory │ │ ├── __init__.py │ │ ├── strict.py │ │ ├── utils.py │ │ └── llm.py │ ├── trajectory │ │ ├── __init__.py │ │ ├── unordered.py │ │ ├── subset.py │ │ ├── superset.py │ │ ├── utils.py │ │ ├── strict.py │ │ ├── match.py │ │ └── llm.py │ ├── utils.py │ └── types.py ├── pyproject.toml └── LICENSE ├── js ├── .yarnrc.yml ├── .gitignore ├── .gitattributes ├── vitest.config.ts ├── .editorconfig ├── tsconfig.cjs.json ├── .prettierrc ├── langchain.config.js ├── tsconfig.json ├── src │ ├── index.ts │ ├── graph_trajectory │ │ ├── strict.ts │ │ ├── tests │ │ │ ├── graph_trajectory_strict.test.ts │ │ │ ├── graph_trajectory_utils.test.ts │ │ │ └── graph_trajectory_llm.test.ts │ │ ├── utils.ts │ │ └── llm.ts │ ├── types.ts │ ├── trajectory │ │ ├── subset.ts │ │ ├── superset.ts │ │ ├── unordered.ts │ │ ├── match.ts │ │ ├── strict.ts │ │ ├── llm.ts │ │ ├── tests │ │ │ └── trajectory_llm.test.ts │ │ └── utils.ts │ └── utils.ts ├── LICENSE ├── package.json └── .eslintrc.cjs ├── static └── img │ ├── pytest_output.png │ └── langsmith_results.png ├── uv.lock ├── .gitignore ├── LICENSE ├── .github └── workflows │ ├── build.yml │ └── integration_tests.yml └── scripts └── generate_language_readmes.py /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /python/agentevals/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /js/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | yarnPath: .yarn/releases/yarn-3.5.1.cjs 2 | nodeLinker: node-modules -------------------------------------------------------------------------------- /js/.gitignore: -------------------------------------------------------------------------------- 1 | index.cjs 2 | index.js 3 | index.d.ts 4 | index.d.cts 5 | node_modules 6 | dist 7 | .yarn 8 | -------------------------------------------------------------------------------- /static/img/pytest_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agentevals/HEAD/static/img/pytest_output.png -------------------------------------------------------------------------------- /static/img/langsmith_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agentevals/HEAD/static/img/langsmith_results.png -------------------------------------------------------------------------------- /uv.lock: 
-------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.9, <4.0" 3 | 4 | [[package]] 5 | name = "openevals-monorepo" 6 | version = "0.0.1" 7 | source = { virtual = "." } 8 | -------------------------------------------------------------------------------- /js/.gitattributes: -------------------------------------------------------------------------------- 1 | /.yarn/** linguist-vendored 2 | /.yarn/releases/* binary 3 | /.yarn/plugins/**/* binary 4 | /.pnp.* binary linguist-generated 5 | -------------------------------------------------------------------------------- /js/vitest.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | 3 | export default defineConfig({ 4 | test: { 5 | setupFiles: ["dotenv/config"], 6 | }, 7 | }); 8 | -------------------------------------------------------------------------------- /js/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | 7 | [*.{js,json,yml}] 8 | charset = utf-8 9 | indent_style = space 10 | indent_size = 2 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # JS 13 | node_modules/ 14 | .env 15 | .eslintcache -------------------------------------------------------------------------------- /js/tsconfig.cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "module": "commonjs", 5 | "declaration": false 6 | }, 7 | "exclude": [ 8 | "node_modules", 9 | "dist", 10 | "docs", 11 | "**/tests" 12 | ] 13 | } -------------------------------------------------------------------------------- /python/agentevals/trajectory/__init__.py: -------------------------------------------------------------------------------- 1 | from .match import ( 2 | create_trajectory_match_evaluator, 3 | create_async_trajectory_match_evaluator, 4 | ) 5 | from .llm import create_trajectory_llm_as_judge, create_async_trajectory_llm_as_judge 6 | 7 | __all__ = [ 8 | "create_trajectory_match_evaluator", 9 | "create_async_trajectory_match_evaluator", 10 | "create_trajectory_llm_as_judge", 11 | "create_async_trajectory_llm_as_judge", 12 | ] 13 | -------------------------------------------------------------------------------- /js/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json.schemastore.org/prettierrc", 3 | "printWidth": 80, 4 | "tabWidth": 2, 5 | "useTabs": false, 6 | "semi": true, 7 | "singleQuote": false, 8 | "quoteProps": "as-needed", 9 | "jsxSingleQuote": false, 10 | "trailingComma": "es5", 11 | "bracketSpacing": true, 12 | "arrowParens": "always", 13 | "requirePragma": false, 14 | "insertPragma": false, 15 | "proseWrap": "preserve", 16 | "htmlWhitespaceSensitivity": "css", 17 | "vueIndentScriptAndStyle": false, 18 | "endOfLine": "lf" 19 | } 20 | -------------------------------------------------------------------------------- /js/langchain.config.js: -------------------------------------------------------------------------------- 1 | import { 
resolve, dirname } from "node:path"; 2 | import { fileURLToPath } from "node:url"; 3 | 4 | /** 5 | * @param {string} relativePath 6 | * @returns {string} 7 | */ 8 | function abs(relativePath) { 9 | return resolve(dirname(fileURLToPath(import.meta.url)), relativePath); 10 | } 11 | 12 | export const config = { 13 | internals: [ 14 | /node\:/, 15 | /js-tiktoken/, 16 | /langsmith/, 17 | /openevals\/llm/, 18 | /openevals\/types/, 19 | /@langchain\/core\/messages/, 20 | ], 21 | entrypoints: { 22 | index: "index", 23 | }, 24 | tsConfigPath: resolve("./tsconfig.json"), 25 | packageSuffix: "core", 26 | cjsSource: "./dist-cjs", 27 | cjsDestination: "./dist", 28 | abs, 29 | } -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "agentevals" 3 | version = "0.0.9" 4 | license = {text = "MIT"} 5 | description = "Open-source evaluators for LLM agents" 6 | readme = "README.md" 7 | requires-python = ">=3.9" 8 | dependencies = [ 9 | "openevals>=0.0.20" 10 | ] 11 | 12 | [dependency-groups] 13 | dev = [ 14 | "langgraph>=0.5.4", 15 | "mypy>=1.15.0", 16 | "openai>=1.61.1", 17 | "openevals>=0.1.0", 18 | "pytest>=8.3.4", 19 | "pytest-asyncio>=0.25.3", 20 | "pytest-dotenv>=0.5.2", 21 | "ruff>=0.9.5", 22 | ] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["agentevals*"] 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--import-mode=importlib", 30 | ] 31 | pythonpath = [ 32 | "." 33 | ] 34 | -------------------------------------------------------------------------------- /js/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended", 3 | "compilerOptions": { 4 | "target": "ES2021", 5 | "lib": [ 6 | "ES2021", 7 | "ES2022.Object", 8 | "DOM" 9 | ], 10 | "module": "ES2020", 11 | "moduleResolution": "nodenext", 12 | "esModuleInterop": true, 13 | "declaration": true, 14 | "noImplicitReturns": true, 15 | "noFallthroughCasesInSwitch": true, 16 | "noUnusedLocals": true, 17 | "noUnusedParameters": true, 18 | "useDefineForClassFields": true, 19 | "strictPropertyInitialization": false, 20 | "allowJs": true, 21 | "strict": true 22 | }, 23 | "include": [ 24 | "src/**/*", 25 | ], 26 | "exclude": [ 27 | "node_modules", 28 | "**/dist/", 29 | "docs", 30 | "dist/", 31 | ] 32 | } -------------------------------------------------------------------------------- /js/src/index.ts: -------------------------------------------------------------------------------- 1 | export { trajectoryStrictMatch } from "./trajectory/strict.js"; 2 | export { trajectorySubset } from "./trajectory/subset.js"; 3 | export { trajectorySuperset } from "./trajectory/superset.js"; 4 | export { trajectoryUnorderedMatch } from "./trajectory/unordered.js"; 5 | export { 6 | createTrajectoryMatchEvaluator, 7 | type TrajectoryMatchMode, 8 | } from "./trajectory/match.js"; 9 | export { 10 | createTrajectoryLLMAsJudge, 11 | TRAJECTORY_ACCURACY_PROMPT, 12 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 13 | } from "./trajectory/llm.js"; 14 | export { 15 | createGraphTrajectoryLLMAsJudge, 16 | GRAPH_TRAJECTORY_ACCURACY_PROMPT, 17 | } from "./graph_trajectory/llm.js"; 18 | 19 | export * from "./types.js"; 20 | export * from "./utils.js"; 21 | export * from "./graph_trajectory/utils.js"; 22 | -------------------------------------------------------------------------------- /python/agentevals/utils.py: 
-------------------------------------------------------------------------------- 1 | __all__ = ["_run_evaluator", "_arun_evaluator"] 2 | 3 | from openevals.types import EvaluatorResult 4 | from openevals.utils import ( 5 | _run_evaluator as _base_run_evaluator, 6 | _arun_evaluator as _base_arun_evaluator, 7 | ) 8 | 9 | from typing import Any, Callable 10 | 11 | 12 | def _run_evaluator( 13 | *, run_name: str, scorer: Callable, feedback_key: str, **kwargs: Any 14 | ) -> EvaluatorResult | list[EvaluatorResult]: 15 | return _base_run_evaluator( 16 | run_name=run_name, 17 | scorer=scorer, 18 | feedback_key=feedback_key, 19 | ls_framework="agentevals", 20 | **kwargs, 21 | ) 22 | 23 | 24 | async def _arun_evaluator( 25 | *, run_name: str, scorer: Callable, feedback_key: str, **kwargs: Any 26 | ) -> EvaluatorResult | list[EvaluatorResult]: 27 | return await _base_arun_evaluator( 28 | run_name=run_name, 29 | scorer=scorer, 30 | feedback_key=feedback_key, 31 | ls_framework="agentevals", 32 | **kwargs, 33 | ) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /js/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /python/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 LangChain, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /python/agentevals/types.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Literal, Optional, Union 2 | from typing_extensions import TypedDict 3 | 4 | from openevals.types import ( 5 | ChatCompletionMessage, 6 | EvaluatorResult, 7 | FewShotExample, 8 | SimpleEvaluator, 9 | SimpleAsyncEvaluator, 10 | ) 11 | 12 | 13 | # Trajectory extracted from agent 14 | class GraphTrajectory(TypedDict): 15 | inputs: Optional[list[dict]] 16 | results: list[dict] 17 | steps: list[list[str]] 18 | 19 | 20 | # Trajectory extracted from a LangGraph thread 21 | class ExtractedLangGraphThreadTrajectory(TypedDict): 22 | inputs: list 23 | outputs: GraphTrajectory 24 | 25 | 26 | ToolArgsMatchMode = Literal["exact", "ignore", "subset", "superset"] 27 | 28 | ToolArgsMatchOverrides = dict[ 29 | str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]] 30 | ] 31 | 32 | __all__ = [ 33 | "GraphTrajectory", 34 | "ChatCompletionMessage", 35 | "EvaluatorResult", 36 | "SimpleEvaluator", 37 | "SimpleAsyncEvaluator", 38 | "FewShotExample", 39 | "ToolArgsMatchMode", 40 | "ToolArgsMatchOverrides", 41 | ] 42 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build Success 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | changed_files: 17 | runs-on: ubuntu-latest 18 | outputs: 19 | js_changed: ${{ steps.check-changes.outputs.js_changed }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Check for JS file changes 26 | id: check-changes 27 | run: | 28 | if git diff 
--name-only origin/main HEAD | grep -E "^js/.*\.(js|ts|jsx|tsx|json)$"; then 29 | echo "js_changed=true" >> $GITHUB_OUTPUT 30 | else 31 | echo "js_changed=false" >> $GITHUB_OUTPUT 32 | fi 33 | 34 | js_build_test: 35 | name: JS Build Test 36 | needs: changed_files 37 | if: > 38 | (github.event_name == 'push') || 39 | (github.event_name == 'pull_request' && needs.changed_files.outputs.js_changed == 'true') || 40 | (github.event_name == 'workflow_dispatch') 41 | runs-on: ubuntu-latest 42 | defaults: 43 | run: 44 | working-directory: js 45 | steps: 46 | - uses: actions/checkout@v3 47 | 48 | - name: Setup Node 49 | uses: actions/setup-node@v3 50 | with: 51 | node-version: 22.x 52 | cache: "yarn" 53 | cache-dependency-path: "js/yarn.lock" 54 | 55 | - name: Install dependencies 56 | run: yarn install --immutable 57 | 58 | - name: Build 59 | run: yarn build 60 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_strict.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | extract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.strict import graph_trajectory_strict_match 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | def test_trajectory_match(): 26 | checkpointer = MemorySaver() 27 | graph = create_react_agent( 28 | model="gpt-4o-mini", 29 | checkpointer=checkpointer, 30 | tools=[search], 31 | ) 32 | graph.invoke( 33 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 34 | config={"configurable": {"thread_id": "1"}}, 35 | ) 36 | graph.invoke( 37 | Command(resume="It is rainy and 70 degrees!"), 38 | config={"configurable": {"thread_id": "1"}}, 39 | ) 40 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 41 | graph, {"configurable": {"thread_id": "1"}} 42 | ) 43 | reference_trajectory = { 44 | "results": [], 45 | "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]], 46 | } 47 | res = graph_trajectory_strict_match( 48 | outputs=extracted_trajectory["outputs"], 49 | reference_outputs=reference_trajectory, 50 | ) 51 | assert res["score"] 52 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_strict_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | aextract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.strict import graph_trajectory_strict_match_async 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 
25 | @pytest.mark.asyncio 26 | async def test_trajectory_match(): 27 | checkpointer = MemorySaver() 28 | graph = create_react_agent( 29 | model="gpt-4o-mini", 30 | checkpointer=checkpointer, 31 | tools=[search], 32 | ) 33 | await graph.ainvoke( 34 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 35 | config={"configurable": {"thread_id": "1"}}, 36 | ) 37 | await graph.ainvoke( 38 | Command(resume="It is rainy and 70 degrees!"), 39 | config={"configurable": {"thread_id": "1"}}, 40 | ) 41 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 42 | graph, {"configurable": {"thread_id": "1"}} 43 | ) 44 | reference_trajectory = { 45 | "results": [], 46 | "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]], 47 | } 48 | res = await graph_trajectory_strict_match_async( 49 | outputs=extracted_trajectory["outputs"], 50 | reference_outputs=reference_trajectory, 51 | ) 52 | assert res["score"] 53 | -------------------------------------------------------------------------------- /scripts/generate_language_readmes.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def strip_language_details(content: str, target_language: str) -> str: 4 | """ 5 | Strip out
<details> tags for non-target language sections and remove the tags 6 | for the target language while preserving its content. 7 | 8 | Args: 9 | content: The README content 10 | target_language: Either 'Python' or 'TypeScript' 11 | """ 12 | # Define the opposite language to remove 13 | opposite_language = "TypeScript" if target_language == "Python" else "Python" 14 | 15 | # First remove the opposite language blocks completely 16 | pattern = rf'<details[^>]*>\s*<summary>{opposite_language}</summary>.*?</details>
' 17 | content = re.sub(pattern, '', content, flags=re.DOTALL) 18 | 19 | # Then remove just the detail/summary tags for target language, keeping content 20 | pattern = rf'<details[^>]*>\s*<summary>{target_language}</summary>(.*?)</details>' 21 | 22 | def replace_match(match): 23 | return match.group(1).strip() 24 | 25 | content = re.sub(pattern, replace_match, content, flags=re.DOTALL) 26 | 27 | # Clean up any double newlines created during the process 28 | content = re.sub(r'\n{3,}', '\n\n', content) 29 | 30 | return content 31 | 32 | def main(): 33 | # Read the README 34 | with open('README.md', 'r') as f: 35 | content = f.read() 36 | 37 | # Generate Python version 38 | python_content = strip_language_details(content, "Python") 39 | with open('./python/README.md', 'w') as f: 40 | f.write(python_content) 41 | 42 | # Generate TypeScript version 43 | ts_content = strip_language_details(content, "TypeScript") 44 | with open('./js/README.md', 'w') as f: 45 | f.write(ts_content) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/strict.ts: -------------------------------------------------------------------------------- 1 | import { GraphTrajectory } from "../types.js"; 2 | import { _runEvaluator } from "../utils.js"; 3 | 4 | const _scorer = (params: { 5 | outputs: GraphTrajectory; 6 | referenceOutputs: GraphTrajectory; 7 | }) => { 8 | const { outputs, referenceOutputs } = params; 9 | if (!outputs || !referenceOutputs) { 10 | throw new Error( 11 | "Strict trajectory match requires both outputs and referenceOutputs" 12 | ); 13 | } 14 | if (outputs.steps.length !== referenceOutputs.steps.length) { 15 | return false; 16 | } 17 | for (let i = 0; i < outputs.steps.length; i++) { 18 | if (outputs.steps[i].length !== referenceOutputs.steps[i].length) { 19 | return false; 20 | } 21 | for (let j = 0; j < outputs.steps[i].length; j++) { 22 | if (outputs.steps[i][j] !== referenceOutputs.steps[i][j]) { 23 | return false; 24 | } 25 | } 26 | } 27 | return true; 28 | }; 29 | 30 | /** 31 | * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 32 | * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
33 | * 34 | * @param params - The parameters object 35 | * @param params.outputs - Actual trajectory the agent followed 36 | * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed 37 | * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise 38 | */ 39 | export const graphTrajectoryStrictMatch = ({ 40 | outputs, 41 | referenceOutputs, 42 | }: { 43 | outputs: GraphTrajectory; 44 | referenceOutputs: GraphTrajectory; 45 | }) => { 46 | return _runEvaluator( 47 | "graph_trajectory_strict_match", 48 | _scorer, 49 | "graph_trajectory_strict_match", 50 | { 51 | outputs, 52 | referenceOutputs, 53 | } 54 | ); 55 | }; 56 | -------------------------------------------------------------------------------- /js/src/types.ts: -------------------------------------------------------------------------------- 1 | import { createLLMAsJudge } from "openevals/llm"; 2 | 3 | export * from "openevals/types"; 4 | 5 | // More tolerant version of ChatCompletionMessage that allows missing tool_call_id 6 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 7 | export type FlexibleChatCompletionMessage = Record<string, any> & 8 | ( 9 | | { 10 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 11 | content: any; 12 | role: "user" | "system" | "developer"; 13 | id?: string; 14 | } 15 | | { 16 | role: "assistant"; 17 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 18 | content: any; 19 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 20 | tool_calls?: any[]; 21 | id?: string; 22 | } 23 | | { 24 | role: "tool"; 25 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 26 | content: any; 27 | tool_call_id?: string; // Made optional for backward compatibility 28 | id?: string; 29 | } 30 | ); 31 | 32 | // Trajectory extracted from agent 33 | export type GraphTrajectory = { 34 | inputs?: (Record<string, unknown> | null)[]; 35 | results: Record<string, unknown>[]; 36 | steps: string[][]; 37 | }; 38 | 39 | // Trajectory extracted from a LangGraph thread 40 | export type ExtractedLangGraphThreadTrajectory = { 41 | inputs: (Record<string, unknown> | null)[][]; 42 | outputs: GraphTrajectory; 43 | }; 44 | 45 | export type TrajectoryLLMAsJudgeParams = Partial< 46 | Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> 47 | > & { 48 | prompt?: Parameters<typeof createLLMAsJudge>[0]["prompt"]; 49 | }; 50 | 51 | export type ToolArgsMatchMode = "exact" | "ignore" | "subset" | "superset"; 52 | 53 | export type ToolArgsMatcher = ( 54 | toolCall: Record<string, unknown>, 55 | referenceToolCall: Record<string, unknown> 56 | ) => boolean | Promise<boolean>; 57 | 58 | export type ToolArgsMatchOverrides = Record< 59 | string, 60 | ToolArgsMatchMode | string[] | ToolArgsMatcher 61 | >; 62 | -------------------------------------------------------------------------------- /js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "agentevals", 3 | "version": "0.0.6", 4 | "packageManager": "yarn@3.5.1", 5 | "type": "module", 6 | "scripts": { 7 | "build": "yarn lc_build --create-entrypoints --pre --tree-shaking", 8 | "lint:eslint": "NODE_OPTIONS=--max-old-space-size=4096 eslint --cache --ext .ts,.js src/", 9 | "lint:dpdm": "dpdm --exit-code circular:1 --no-warning --no-tree src/*.ts src/**/*.ts", 10 | "lint": "yarn lint:eslint && yarn lint:dpdm", 11 | "lint:fix": "yarn lint:eslint --fix && yarn lint:dpdm", 12 | "format": "prettier --config .prettierrc --write \"src\"", 13 | "format:check": "prettier --config .prettierrc --check \"src\"", 14 | "test": "vitest run" 15 | }, 16 | "dependencies": {
"@langchain/openai": ">=0.4.4", 18 | "langchain": ">=0.3.18", 19 | "langsmith": ">=0.3.11", 20 | "openevals": "^0.1.0" 21 | }, 22 | "peerDependencies": { 23 | "@langchain/core": ">=0.3.73", 24 | "@langchain/langgraph": ">=0.2.46" 25 | }, 26 | "devDependencies": { 27 | "@langchain/core": "^0.3.73", 28 | "@langchain/langgraph": "^0.4.9", 29 | "@langchain/scripts": "0.1.3", 30 | "@tsconfig/recommended": "^1.0.8", 31 | "@typescript-eslint/eslint-plugin": "^8.24.1", 32 | "@typescript-eslint/parser": "^8.24.1", 33 | "dotenv": "^16.4.7", 34 | "dpdm": "^3.14.0", 35 | "eslint": "^8.33.0", 36 | "eslint-config-airbnb-base": "^15.0.0", 37 | "eslint-config-prettier": "^8.6.0", 38 | "eslint-plugin-import": "^2.27.5", 39 | "eslint-plugin-jest": "^27.6.0", 40 | "eslint-plugin-no-instanceof": "^1.0.1", 41 | "eslint-plugin-prettier": "^4.2.1", 42 | "openai": "^4.85.1", 43 | "prettier": "^3.5.1", 44 | "typescript": "~5.1.6", 45 | "vitest": "^3.0.5", 46 | "zod": "^4.1.5" 47 | }, 48 | "files": [ 49 | "dist/", 50 | "index.cjs", 51 | "index.js", 52 | "index.d.ts", 53 | "index.d.cts" 54 | ], 55 | "exports": { 56 | ".": { 57 | "types": { 58 | "import": "./index.d.ts", 59 | "require": "./index.d.cts", 60 | "default": "./index.d.ts" 61 | }, 62 | "import": "./index.js", 63 | "require": "./index.cjs" 64 | }, 65 | "./package.json": "./package.json" 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_strict.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | import { 4 | MemorySaver, 5 | Command, 6 | interrupt, 7 | Annotation, 8 | StateGraph, 9 | } from "@langchain/langgraph"; 10 | 11 | import { graphTrajectoryStrictMatch } from "../strict.js"; 12 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 13 | 14 | ls.describe( 15 | "graph_trajectory_strict_match", 16 | () => { 17 | ls.test( 18 | "should match the reference trajectory", 19 | { 20 | inputs: {}, 21 | referenceOutputs: { 22 | results: [ 23 | {}, 24 | { 25 | myKey: "It is rainy and 70 degrees!", 26 | }, 27 | ], 28 | steps: [["__start__", "agent", "interrupt", "__interrupt__"], []], 29 | }, 30 | }, 31 | async ({ referenceOutputs }) => { 32 | const graph = new StateGraph( 33 | Annotation.Root({ 34 | myKey: Annotation, 35 | }) 36 | ) 37 | .addNode("agent", async () => { 38 | return { 39 | myKey: "hello", 40 | }; 41 | }) 42 | .addNode("interrupt", async () => { 43 | const res = interrupt("Tell me the answer to the question."); 44 | return { myKey: res }; 45 | }) 46 | .addEdge("__start__", "agent") 47 | .addEdge("agent", "interrupt") 48 | .compile({ checkpointer: new MemorySaver() }); 49 | const config = { 50 | configurable: { 51 | thread_id: "1", 52 | }, 53 | }; 54 | await graph.invoke( 55 | { 56 | myKey: "foo", 57 | }, 58 | config 59 | ); 60 | await graph.invoke( 61 | new Command({ resume: "It is rainy and 70 degrees!" 
}), 62 | config 63 | ); 64 | const trajectory = await extractLangGraphTrajectoryFromThread( 65 | graph, 66 | config 67 | ); 68 | const result = await graphTrajectoryStrictMatch({ 69 | outputs: trajectory.outputs, 70 | referenceOutputs: referenceOutputs!, 71 | }); 72 | expect(result.score).toBe(true); 73 | } 74 | ); 75 | }, 76 | { 77 | enableTestTracking: false, 78 | } 79 | ); 80 | -------------------------------------------------------------------------------- /js/.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: [ 3 | "airbnb-base", 4 | "eslint:recommended", 5 | "prettier", 6 | "plugin:@typescript-eslint/recommended", 7 | ], 8 | parserOptions: { 9 | ecmaVersion: 12, 10 | parser: "@typescript-eslint/parser", 11 | project: "./tsconfig.json", 12 | sourceType: "module", 13 | }, 14 | plugins: ["@typescript-eslint", "no-instanceof", "eslint-plugin-jest"], 15 | ignorePatterns: [ 16 | "src/utils/@cfworker", 17 | "src/utils/fast-json-patch", 18 | "src/utils/js-sha1", 19 | "src/utils/sax-js", 20 | ".eslintrc.cjs", 21 | "scripts", 22 | "node_modules", 23 | "dist", 24 | "dist-cjs", 25 | "*.js", 26 | "*.cjs", 27 | "*.d.ts", 28 | ], 29 | rules: { 30 | "no-process-env": 2, 31 | "no-instanceof/no-instanceof": 2, 32 | "@typescript-eslint/explicit-module-boundary-types": 0, 33 | "@typescript-eslint/no-empty-function": 0, 34 | "@typescript-eslint/no-shadow": 0, 35 | "@typescript-eslint/no-empty-interface": 0, 36 | "@typescript-eslint/no-use-before-define": ["error", "nofunc"], 37 | "@typescript-eslint/no-unused-vars": ["warn", { args: "none" }], 38 | "@typescript-eslint/no-floating-promises": "error", 39 | "@typescript-eslint/no-misused-promises": "error", 40 | "@typescript-eslint/no-this-alias": 0, 41 | camelcase: 0, 42 | "class-methods-use-this": 0, 43 | "import/extensions": [2, "ignorePackages"], 44 | "import/no-extraneous-dependencies": [ 45 | "error", 46 | { devDependencies: ["**/*.test.ts"] }, 47 | ], 48 | "import/no-unresolved": 0, 49 | "import/prefer-default-export": 0, 50 | "keyword-spacing": "error", 51 | "max-classes-per-file": 0, 52 | "max-len": 0, 53 | "no-await-in-loop": 0, 54 | "no-bitwise": 0, 55 | "no-console": 0, 56 | "no-empty-function": 0, 57 | "no-restricted-syntax": 0, 58 | "no-shadow": 0, 59 | "no-continue": 0, 60 | "no-void": 0, 61 | "no-underscore-dangle": 0, 62 | "no-use-before-define": 0, 63 | "no-useless-constructor": 0, 64 | "no-return-await": 0, 65 | "no-plusplus": 0, 66 | "consistent-return": 0, 67 | "no-else-return": 0, 68 | "func-names": 0, 69 | "no-lonely-if": 0, 70 | "prefer-rest-params": 0, 71 | "new-cap": ["error", { properties: false, capIsNew: false }], 72 | 'jest/no-focused-tests': 'error', 73 | "arrow-body-style": 0, 74 | "prefer-destructuring": 0, 75 | }, 76 | overrides: [ 77 | { 78 | files: ['**/*.test.ts'], 79 | rules: { 80 | '@typescript-eslint/no-unused-vars': 'off' 81 | } 82 | } 83 | ] 84 | }; -------------------------------------------------------------------------------- /js/src/trajectory/subset.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = 
async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isSubset = await _isTrajectorySuperset( 19 | params.referenceOutputs, 20 | params.outputs, 21 | params.toolArgsMatchMode, 22 | params.toolArgsMatchOverrides 23 | ); 24 | return isSubset; 25 | }; 26 | 27 | /** 28 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead. 29 | * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 30 | * This means the agent called a subset of the tools specified in the reference trajectory. 31 | * 32 | * @param params - The parameters for trajectory subset evaluation 33 | * @param params.outputs - Actual trajectory the agent followed. 34 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 35 | * a "messages" key with one of the above. 36 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 37 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 38 | * a "messages" key with one of the above. 39 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 40 | */ 41 | export async function trajectorySubset(params: { 42 | outputs: 43 | | FlexibleChatCompletionMessage[] 44 | | BaseMessage[] 45 | | { 46 | messages: ( 47 | | BaseMessage 48 | | ChatCompletionMessage 49 | | FlexibleChatCompletionMessage 50 | )[]; 51 | }; 52 | referenceOutputs: 53 | | FlexibleChatCompletionMessage[] 54 | | BaseMessage[] 55 | | { 56 | messages: ( 57 | | BaseMessage 58 | | ChatCompletionMessage 59 | | FlexibleChatCompletionMessage 60 | )[]; 61 | }; 62 | }): Promise<EvaluatorResult> { 63 | const { outputs, referenceOutputs } = params; 64 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 65 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 66 | 67 | return _runEvaluator("trajectory_subset", _scorer, "trajectory_subset", { 68 | ...params, 69 | outputs: outputsList, 70 | referenceOutputs: referenceOutputsList, 71 | toolArgsMatchMode: "ignore", 72 | }); 73 | } 74 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_utils.test.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-promise-executor-return */ 2 | import { expect, test } from "vitest"; 3 | import { Annotation, StateGraph, MemorySaver } from "@langchain/langgraph"; 4 | 5 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 6 | 7 | test("trajectory match", async () => { 8 | const checkpointer = new MemorySaver(); 9 | 10 | const inner = new StateGraph( 11 | Annotation.Root({ 12 | myKey: Annotation<string>({ 13 | reducer: (a, b) => a + b, 14 | default: () => "", 15 | }), 16 | myOtherKey: Annotation<string>, 17 | }) 18 | ) 19 | .addNode("inner1", async (state) => { 20 | await new Promise((resolve) => setTimeout(resolve, 100)); 21 | return { myKey: "got here", myOtherKey: state.myKey }; 22 | }) 23 | .addNode("inner2", (state) => ({ 24 | myKey: " and there", 25 | myOtherKey: state.myKey, 26 | })) 27 | .addEdge("inner1", "inner2") 28 | .addEdge("__start__", "inner1") 29 | .compile({ interruptBefore: ["inner2"] }); 30 | 31 | const app = new StateGraph(
Annotation.Root({ 33 | myKey: Annotation<string>({ 34 | reducer: (a, b) => a + b, 35 | default: () => "", 36 | }), 37 | }) 38 | ) 39 | .addNode("inner", (state, config) => inner.invoke(state, config), { 40 | subgraphs: [inner], 41 | }) 42 | .addNode("outer1", () => ({ myKey: " and parallel" })) 43 | .addNode("outer2", () => ({ myKey: " and back again" })) 44 | .addEdge("__start__", "inner") 45 | .addEdge("__start__", "outer1") 46 | .addEdge(["inner", "outer1"], "outer2") 47 | .compile({ checkpointer }); 48 | 49 | // test invoke w/ nested interrupt 50 | const config = { configurable: { thread_id: "1" } }; 51 | expect(await app.invoke({ myKey: "" }, config)).toEqual({ 52 | __interrupt__: [], 53 | myKey: " and parallel", 54 | }); 55 | 56 | expect(await app.invoke(null, config)).toEqual({ 57 | myKey: "got here and there and parallel and back again", 58 | }); 59 | 60 | const trajectory = await extractLangGraphTrajectoryFromThread(app, config); 61 | expect(trajectory).toEqual({ 62 | inputs: [ 63 | { 64 | __start__: { 65 | myKey: "", 66 | }, 67 | }, 68 | { 69 | __start__: { 70 | myKey: "", 71 | }, 72 | }, 73 | ], 74 | outputs: { 75 | results: [ 76 | { 77 | myKey: "got here and there", 78 | myOtherKey: "got here", 79 | }, 80 | { 81 | myKey: "got here and there and parallel and back again", 82 | }, 83 | ], 84 | steps: [ 85 | [ 86 | "__start__", 87 | "outer1", 88 | "inner", 89 | "inner:__start__", 90 | "inner:inner1", 91 | "inner:inner2", 92 | ], 93 | ["outer2"], 94 | ], 95 | }, 96 | }); 97 | }); 98 | -------------------------------------------------------------------------------- /js/src/trajectory/superset.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isSuperset = await _isTrajectorySuperset( 19 | params.outputs, 20 | params.referenceOutputs, 21 | params.toolArgsMatchMode, 22 | params.toolArgsMatchOverrides 23 | ); 24 | return isSuperset; 25 | }; 26 | 27 | /** 28 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead. 29 | * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 30 | * This means the agent called a superset of the tools specified in the reference trajectory. 31 | * 32 | * @param params - The parameters for trajectory superset evaluation 33 | * @param params.outputs - Actual trajectory the agent followed. 34 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 35 | * a "messages" key with one of the above. 36 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 37 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 38 | * a "messages" key with one of the above.
39 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 40 | */ 41 | export async function trajectorySuperset(params: { 42 | outputs: 43 | | FlexibleChatCompletionMessage[] 44 | | BaseMessage[] 45 | | { 46 | messages: ( 47 | | BaseMessage 48 | | ChatCompletionMessage 49 | | FlexibleChatCompletionMessage 50 | )[]; 51 | }; 52 | referenceOutputs: 53 | | FlexibleChatCompletionMessage[] 54 | | BaseMessage[] 55 | | { 56 | messages: ( 57 | | BaseMessage 58 | | ChatCompletionMessage 59 | | FlexibleChatCompletionMessage 60 | )[]; 61 | }; 62 | }): Promise<EvaluatorResult> { 63 | const { outputs, referenceOutputs } = params; 64 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 65 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 66 | 67 | return _runEvaluator("trajectory_superset", _scorer, "trajectory_superset", { 68 | ...params, 69 | outputs: outputsList, 70 | referenceOutputs: referenceOutputsList, 71 | toolArgsMatchMode: "ignore", 72 | }); 73 | } 74 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/strict.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from agentevals.types import EvaluatorResult, GraphTrajectory 4 | from agentevals.utils import _run_evaluator, _arun_evaluator 5 | 6 | from typing import Any 7 | 8 | 9 | def _scorer( 10 | *, 11 | outputs: GraphTrajectory, 12 | reference_outputs: GraphTrajectory, 13 | ) -> float: 14 | if outputs is None or reference_outputs is None: 15 | raise ValueError( 16 | "Strict trajectory match requires both outputs and reference_outputs" 17 | ) 18 | if len(outputs["steps"]) != len(reference_outputs["steps"]): 19 | return False 20 | exact_match = True 21 | for output, reference_output in zip(outputs["steps"], reference_outputs["steps"]): 22 | if output != reference_output: 23 | exact_match = False 24 | break 25 | return exact_match 26 | 27 | 28 | def graph_trajectory_strict_match( 29 | *, 30 | outputs: GraphTrajectory, 31 | reference_outputs: GraphTrajectory, 32 | **kwargs: Any, 33 | ) -> EvaluatorResult: 34 | """ 35 | Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 36 | This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory. 37 | 38 | Args: 39 | outputs (GraphTrajectory): Actual trajectory the agent followed. 40 | reference_outputs (GraphTrajectory): Ideal reference trajectory the agent should have followed. 41 | 42 | Returns: 43 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 44 | """ 45 | return _run_evaluator( 46 | run_name="graph_trajectory_strict_match", 47 | scorer=_scorer, 48 | feedback_key="graph_trajectory_strict_match", 49 | outputs=outputs, 50 | reference_outputs=reference_outputs, 51 | ) 52 | 53 | 54 | async def graph_trajectory_strict_match_async( 55 | *, 56 | outputs: GraphTrajectory, 57 | reference_outputs: GraphTrajectory, 58 | **kwargs: Any, 59 | ) -> EvaluatorResult: 60 | """ 61 | Evaluate whether an input graph trajectory strictly matches a reference graph trajectory. 62 | This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory. 63 | 64 | Args: 65 | outputs (GraphTrajectory): Actual trajectory the agent followed.
66 | reference_outputs (GraphTrajectory): Ideal reference trajectory the agent should have followed. 67 | 68 | Returns: 69 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 70 | """ 71 | 72 | async def async_wrapper(**kwargs: Any): 73 | return _scorer(**kwargs) 74 | 75 | return await _arun_evaluator( 76 | run_name="graph_trajectory_strict_match", 77 | scorer=async_wrapper, 78 | feedback_key="graph_trajectory_strict_match", 79 | outputs=outputs, 80 | reference_outputs=reference_outputs, 81 | ) 82 | -------------------------------------------------------------------------------- /js/src/trajectory/unordered.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _isTrajectorySuperset } from "./utils.js"; 11 | 12 | export const _scorer = async (params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise<boolean> => { 18 | const isUnorderedMatch = 19 | (await _isTrajectorySuperset( 20 | params.outputs, 21 | params.referenceOutputs, 22 | params.toolArgsMatchMode, 23 | params.toolArgsMatchOverrides 24 | )) && 25 | (await _isTrajectorySuperset( 26 | params.referenceOutputs, 27 | params.outputs, 28 | params.toolArgsMatchMode, 29 | params.toolArgsMatchOverrides 30 | )); 31 | return isUnorderedMatch; 32 | }; 33 | 34 | /** 35 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead. 36 | * Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 37 | * This accounts for some differences in an LLM's reasoning process in a case-by-case basis. 38 | * 39 | * @param params - The parameters for trajectory unordered match evaluation 40 | * @param params.outputs - Actual trajectory the agent followed. 41 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 42 | * a "messages" key with one of the above. 43 | * @param params.reference_outputs - Ideal reference trajectory the agent should have followed. 44 | * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 45 | * a "messages" key with one of the above.
46 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 47 | */ 48 | export async function trajectoryUnorderedMatch(params: { 49 | outputs: 50 | | FlexibleChatCompletionMessage[] 51 | | BaseMessage[] 52 | | { 53 | messages: ( 54 | | BaseMessage 55 | | ChatCompletionMessage 56 | | FlexibleChatCompletionMessage 57 | )[]; 58 | }; 59 | referenceOutputs: 60 | | FlexibleChatCompletionMessage[] 61 | | BaseMessage[] 62 | | { 63 | messages: ( 64 | | BaseMessage 65 | | ChatCompletionMessage 66 | | FlexibleChatCompletionMessage 67 | )[]; 68 | }; 69 | }): Promise<EvaluatorResult> { 70 | const { outputs, referenceOutputs } = params; 71 | const outputsList = _normalizeToOpenAIMessagesList(outputs); 72 | const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs); 73 | 74 | return _runEvaluator( 75 | "trajectory_unordered_match", 76 | _scorer, 77 | "trajectory_unordered_match", 78 | { 79 | ...params, 80 | outputs: outputsList, 81 | referenceOutputs: referenceOutputsList, 82 | toolArgsMatchMode: "ignore", 83 | } 84 | ); 85 | } 86 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_llm.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | extract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | def test_sensible_trajectory(): 26 | checkpointer = MemorySaver() 27 | graph = create_react_agent( 28 | model="gpt-4o-mini", 29 | checkpointer=checkpointer, 30 | tools=[search], 31 | ) 32 | graph.invoke( 33 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 34 | config={"configurable": {"thread_id": "1"}}, 35 | ) 36 | graph.invoke( 37 | Command(resume="It is rainy and 70 degrees!"), 38 | config={"configurable": {"thread_id": "1"}}, 39 | ) 40 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 41 | graph, {"configurable": {"thread_id": "1"}} 42 | ) 43 | evaluator = create_graph_trajectory_llm_as_judge( 44 | model="openai:o3-mini", 45 | ) 46 | res = evaluator( 47 | inputs=extracted_trajectory["inputs"], 48 | outputs=extracted_trajectory["outputs"], 49 | ) 50 | assert res["key"] == "graph_trajectory_accuracy" 51 | assert res["score"] 52 | 53 | 54 | @pytest.mark.langsmith 55 | def test_unsensible_trajectory(): 56 | checkpointer = MemorySaver() 57 | 58 | @tool 59 | def askjeeves(query: str): 60 | """Call to surf the web.""" 61 | return "foo" 62 | 63 | graph = create_react_agent( 64 | model="gpt-4o-mini", 65 | checkpointer=checkpointer, 66 | tools=[askjeeves], 67 | prompt="You are an evil assistant who is inefficient and calls more tools than necessary.", 68 | ) 69 | graph.invoke( 70 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 71 | config={"configurable": {"thread_id": "1"}}, 72 | ) 73 | extracted_trajectory = extract_langgraph_trajectory_from_thread( 74 | graph,
{"configurable": {"thread_id": "1"}} 75 | ) 76 | evaluator = create_graph_trajectory_llm_as_judge( 77 | prompt="""You are an expert data labeler. 78 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries. 79 | 80 | 81 | An accurate trajectory: 82 | - Makes logical sense between steps 83 | - Shows clear progression 84 | - Is perfectly efficient, with no more than one tool call 85 | - Is semantically equivalent to the provided reference trajectory, if present 86 | 87 | 88 | 89 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient. 90 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 91 | interrupting to await additional data from another source ("human-in-the-loop"): 92 | 93 | 94 | 95 | {thread} 96 | 97 | 98 | {reference_outputs} 99 | """, 100 | model="openai:o3-mini", 101 | ) 102 | res = evaluator( 103 | inputs=extracted_trajectory["inputs"], 104 | outputs=extracted_trajectory["outputs"], 105 | ) 106 | assert res["key"] == "graph_trajectory_accuracy" 107 | assert not res["score"] 108 | -------------------------------------------------------------------------------- /js/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage, isBaseMessage } from "@langchain/core/messages"; 2 | import { _convertMessagesToOpenAIParams } from "@langchain/openai"; 3 | import { 4 | _runEvaluator as baseRunEvaluator, 5 | EvaluationResultType, 6 | } from "openevals/utils"; 7 | import { 8 | ChatCompletionMessage, 9 | FlexibleChatCompletionMessage, 10 | MultiResultScorerReturnType, 11 | SingleResultScorerReturnType, 12 | } from "./types.js"; 13 | 14 | export const _convertToOpenAIMessage = ( 15 | message: BaseMessage | ChatCompletionMessage 16 | ): ChatCompletionMessage => { 17 | if (isBaseMessage(message)) { 18 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 19 | return _convertMessagesToOpenAIParams([message])[0] as any; 20 | } else { 21 | return message; 22 | } 23 | }; 24 | 25 | export const _convertToChatCompletionMessage = ( 26 | message: BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage 27 | ): ChatCompletionMessage => { 28 | let converted: FlexibleChatCompletionMessage; 29 | 30 | if (isBaseMessage(message)) { 31 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 32 | converted = _convertMessagesToOpenAIParams([message])[0] as any; 33 | } else { 34 | converted = message as FlexibleChatCompletionMessage; 35 | } 36 | 37 | // For tool messages without tool_call_id, generate one for compatibility 38 | if (converted.role === "tool" && !converted.tool_call_id) { 39 | converted = { 40 | ...converted, 41 | tool_call_id: `generated-${Math.random().toString(36).substring(2)}`, 42 | }; 43 | } 44 | 45 | return converted as ChatCompletionMessage; 46 | }; 47 | 48 | export const _normalizeToOpenAIMessagesList = ( 49 | messages?: 50 | | (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[] 51 | | { 52 | messages: ( 53 | | BaseMessage 54 | | ChatCompletionMessage 55 | | FlexibleChatCompletionMessage 56 | )[]; 57 | } 58 | ): ChatCompletionMessage[] => { 59 | if (!messages) { 60 | return []; 61 | } 62 | let messagesList: ( 63 | | BaseMessage 64 | | ChatCompletionMessage 65 | | FlexibleChatCompletionMessage 66 | )[]; 67 | if (!Array.isArray(messages)) { 68 | if ("messages" in messages && Array.isArray(messages.messages)) { 69 | 
messagesList = messages.messages; 70 | } else { 71 | throw new Error( 72 | `If passing messages as an object, it must contain a "messages" key` 73 | ); 74 | } 75 | } else { 76 | messagesList = messages; 77 | } 78 | return messagesList.map(_convertToChatCompletionMessage); 79 | }; 80 | 81 | export const processScore = ( 82 | _: string, 83 | value: boolean | number | { score: boolean | number; reasoning?: string } 84 | ) => { 85 | if (typeof value === "object") { 86 | if (value != null && "score" in value) { 87 | return [ 88 | value.score, 89 | "reasoning" in value && typeof value.reasoning === "string" 90 | ? value.reasoning 91 | : undefined, 92 | ] as const; 93 | } else { 94 | throw new Error( 95 | `Expected a dictionary with a "score" key, but got "${JSON.stringify( 96 | value, 97 | null, 98 | 2 99 | )}"` 100 | ); 101 | } 102 | } 103 | return [value] as const; 104 | }; 105 | 106 | export const _runEvaluator = async < 107 | T extends Record<string, unknown>, 108 | O extends 109 | | SingleResultScorerReturnType 110 | | MultiResultScorerReturnType 111 | | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>, 112 | >( 113 | runName: string, 114 | scorer: (params: T) => O, 115 | feedbackKey: string, 116 | extra?: T 117 | ): Promise<EvaluationResultType<O>> => { 118 | return baseRunEvaluator(runName, scorer, feedbackKey, extra, "agentevals"); 119 | }; 120 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_llm_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.graph_trajectory.utils import ( 2 | aextract_langgraph_trajectory_from_thread, 3 | ) 4 | from agentevals.graph_trajectory.llm import create_async_graph_trajectory_llm_as_judge 5 | 6 | from langgraph.prebuilt import create_react_agent 7 | from langgraph.checkpoint.memory import MemorySaver 8 | from langgraph.types import Command, interrupt 9 | from langchain_core.tools import tool 10 | 11 | import pytest 12 | 13 | 14 | @tool 15 | def search(query: str): 16 | """Call to surf the web.""" 17 | user_answer = interrupt("Tell me the answer to the question.") 18 | return user_answer 19 | 20 | 21 | tools = [search] 22 | 23 | 24 | @pytest.mark.langsmith 25 | @pytest.mark.asyncio 26 | async def test_sensible_trajectory(): 27 | checkpointer = MemorySaver() 28 | graph = create_react_agent( 29 | model="gpt-4o-mini", 30 | checkpointer=checkpointer, 31 | tools=[search], 32 | ) 33 | await graph.ainvoke( 34 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 35 | config={"configurable": {"thread_id": "1"}}, 36 | ) 37 | await graph.ainvoke( 38 | Command(resume="It is rainy and 70 degrees!"), 39 | config={"configurable": {"thread_id": "1"}}, 40 | ) 41 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 42 | graph, {"configurable": {"thread_id": "1"}} 43 | ) 44 | evaluator = create_async_graph_trajectory_llm_as_judge( 45 | model="openai:o3-mini", 46 | ) 47 | res = await evaluator( 48 | inputs=extracted_trajectory["inputs"], 49 | outputs=extracted_trajectory["outputs"], 50 | ) 51 | assert res["key"] == "graph_trajectory_accuracy" 52 | assert res["score"] 53 | 54 | 55 | @pytest.mark.langsmith 56 | @pytest.mark.asyncio 57 | async def test_unsensible_trajectory(): 58 | checkpointer = MemorySaver() 59 | 60 | @tool 61 | def askjeeves(query: str): 62 | """Call to surf the web.""" 63 | return "foo" 64 | 65 | graph = create_react_agent( 66 | model="gpt-4o-mini", 67 | checkpointer=checkpointer, 68 | tools=[askjeeves], 69 | prompt="You are an evil assistant who is
inefficient and calls more tools than necessary.", 70 | ) 71 | await graph.ainvoke( 72 | {"messages": [{"role": "user", "content": "what's the weather in sf?"}]}, 73 | config={"configurable": {"thread_id": "1"}}, 74 | ) 75 | extracted_trajectory = await aextract_langgraph_trajectory_from_thread( 76 | graph, {"configurable": {"thread_id": "1"}} 77 | ) 78 | evaluator = create_async_graph_trajectory_llm_as_judge( 79 | prompt="""You are an expert data labeler. 80 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 81 | 82 | 83 | An accurate trajectory: 84 | - Makes logical sense between steps 85 | - Shows clear progression 86 | - Is perfectly efficient, with no more than one tool call 87 | - Is semantically equivalent to the provided reference trajectory, if present 88 | 89 | 90 | 91 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient. 92 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 93 | interrupting to await additional data from another source ("human-in-the-loop"): 94 | 95 | 96 | 97 | {thread} 98 | 99 | 100 | {reference_outputs} 101 | """, 102 | model="openai:o3-mini", 103 | ) 104 | res = await evaluator( 105 | inputs=extracted_trajectory["inputs"], 106 | outputs=extracted_trajectory["outputs"], 107 | ) 108 | assert res["key"] == "graph_trajectory_accuracy" 109 | assert not res["score"] 110 | -------------------------------------------------------------------------------- /.github/workflows/integration_tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | pull-requests: read 15 | 16 | jobs: 17 | changed_files: 18 | runs-on: ubuntu-latest 19 | outputs: 20 | python_changed: ${{ steps.check-changes.outputs.python_changed }} 21 | js_changed: ${{ steps.check-changes.outputs.js_changed }} 22 | steps: 23 | - uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 # Required for diff with main branch 26 | 27 | - name: Check for file changes 28 | id: check-changes 29 | run: | 30 | if git diff --name-only origin/main HEAD | grep -E "^python/.*\.py$"; then 31 | echo "python_changed=true" >> $GITHUB_OUTPUT 32 | else 33 | echo "python_changed=false" >> $GITHUB_OUTPUT 34 | fi 35 | 36 | if git diff --name-only origin/main HEAD | grep -E "^js/.*\.(js|ts|jsx|tsx)$"; then 37 | echo "js_changed=true" >> $GITHUB_OUTPUT 38 | else 39 | echo "js_changed=false" >> $GITHUB_OUTPUT 40 | fi 41 | 42 | python_integration_test: 43 | name: Python Integration Test (${{ matrix.python-version }}) 44 | needs: changed_files 45 | if: > 46 | (github.event_name == 'push') || 47 | (github.event_name == 'pull_request' && ( 48 | contains(github.event.pull_request.labels.*.name, 'release') || 49 | needs.changed_files.outputs.python_changed == 'true' 50 | )) || 51 | (github.event_name == 'workflow_dispatch' && github.event.inputs.run-python-tests == 'true') 52 | runs-on: ubuntu-latest 53 | strategy: 54 | matrix: 55 | python-version: ["3.9", "3.11"] 56 | defaults: 57 | run: 58 | working-directory: python 59 | steps: 60 | - uses: actions/checkout@v3 61 | 62 | - name: Install uv 63 | uses: astral-sh/setup-uv@v5 64 | with: 65 | version: "0.6.2" 66 | 67 | - name: Set up Python ${{ matrix.python-version }} 68 | uses:
actions/setup-python@v5 69 | with: 70 | python-version: ${{ matrix.python-version }} 71 | 72 | - name: Install dependencies 73 | run: | 74 | uv venv 75 | source .venv/bin/activate 76 | uv sync 77 | uv sync --group dev 78 | shell: bash 79 | working-directory: python 80 | 81 | - name: Run integration tests 82 | env: 83 | LANGSMITH_TRACING: "true" 84 | LANGSMITH_ENDPOINT: ${{ secrets.LANGSMITH_ENDPOINT }} 85 | LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} 86 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 87 | run: uv run pytest tests 88 | shell: bash 89 | working-directory: python 90 | 91 | js_integration_test: 92 | name: JS Integration Test 93 | needs: changed_files 94 | if: > 95 | (github.event_name == 'push') || 96 | (github.event_name == 'pull_request' && ( 97 | contains(github.event.pull_request.labels.*.name, 'release') || 98 | needs.changed_files.outputs.js_changed == 'true' 99 | )) || 100 | (github.event_name == 'workflow_dispatch' && github.event.inputs.run-js-tests == 'true') 101 | runs-on: ubuntu-latest 102 | defaults: 103 | run: 104 | working-directory: js 105 | steps: 106 | - uses: actions/checkout@v3 107 | 108 | - name: Setup Node 109 | uses: actions/setup-node@v3 110 | with: 111 | node-version: 22.x 112 | cache: "yarn" 113 | cache-dependency-path: "js/yarn.lock" 114 | 115 | - name: Install Yarn dependencies 116 | run: yarn install 117 | shell: bash 118 | working-directory: js 119 | 120 | 121 | - name: Run JS integration tests 122 | env: 123 | LANGSMITH_TRACING: "true" 124 | LANGSMITH_ENDPOINT: ${{ secrets.LANGSMITH_ENDPOINT }} 125 | LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} 126 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 127 | run: yarn test src/trajectory/tests src/graph_trajectory/tests 128 | shell: bash 129 | working-directory: js -------------------------------------------------------------------------------- /js/src/graph_trajectory/tests/graph_trajectory_llm.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | 4 | import { createReactAgent } from "@langchain/langgraph/prebuilt"; 5 | import { MemorySaver } from "@langchain/langgraph"; 6 | import { tool } from "@langchain/core/tools"; 7 | import { z } from "zod"; 8 | import { ChatOpenAI } from "@langchain/openai"; 9 | 10 | import { createGraphTrajectoryLLMAsJudge } from "../llm.js"; 11 | import { extractLangGraphTrajectoryFromThread } from "../utils.js"; 12 | 13 | const search = tool( 14 | async () => { 15 | return "It's 80 degrees and sunny in San Francisco."; 16 | }, 17 | { 18 | name: "search", 19 | description: "Call to surf the web.", 20 | schema: z.object({ 21 | query: z.string(), 22 | }), 23 | } 24 | ); 25 | 26 | const tools = [search]; 27 | 28 | ls.describe("graph_trajectory_llm", () => { 29 | ls.test( 30 | "sensible_trajectory", 31 | { 32 | inputs: {}, 33 | referenceOutputs: {}, 34 | }, 35 | async () => { 36 | const checkpointer = new MemorySaver(); 37 | const graph = createReactAgent({ 38 | llm: new ChatOpenAI({ model: "gpt-4o-mini" }), 39 | checkpointer, 40 | tools, 41 | }); 42 | const config = { configurable: { thread_id: "1" } }; 43 | await graph.invoke( 44 | { messages: [{ role: "user", content: "what's the weather in sf?" 
}] }, 45 | config 46 | ); 47 | const trajectory = await extractLangGraphTrajectoryFromThread( 48 | graph, 49 | config 50 | ); 51 | const evaluator = createGraphTrajectoryLLMAsJudge({ 52 | model: "openai:o3-mini", 53 | }); 54 | const res = await evaluator({ 55 | inputs: trajectory.inputs, 56 | outputs: trajectory.outputs, 57 | }); 58 | expect(res.key).toBe("graph_trajectory_accuracy"); 59 | expect(res.score).toBe(true); 60 | } 61 | ); 62 | 63 | ls.test( 64 | "unsensible_trajectory", 65 | { 66 | inputs: {}, 67 | referenceOutputs: {}, 68 | }, 69 | async () => { 70 | const checkpointer = new MemorySaver(); 71 | const askjeeves = tool( 72 | async () => { 73 | return "foo"; 74 | }, 75 | { 76 | name: "askjeeves", 77 | description: "Call to surf the web.", 78 | schema: z.object({ query: z.string() }), 79 | } 80 | ); 81 | const graph = createReactAgent({ 82 | llm: new ChatOpenAI({ model: "gpt-4o-mini" }), 83 | checkpointer, 84 | prompt: 85 | "You are an evil assistant who is inefficient and calls more tools than necessary.", 86 | tools: [askjeeves], 87 | }); 88 | const config = { configurable: { thread_id: "1" } }; 89 | await graph.invoke( 90 | { messages: [{ role: "user", content: "what's the weather in sf?" }] }, 91 | config 92 | ); 93 | const trajectory = await extractLangGraphTrajectoryFromThread( 94 | graph, 95 | config 96 | ); 97 | const evaluator = createGraphTrajectoryLLMAsJudge({ 98 | model: "openai:o3-mini", 99 | prompt: `You are an expert data labeler. 100 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 101 | 102 | 103 | An accurate trajectory: 104 | - Makes logical sense between steps 105 | - Shows clear progression 106 | - Is perfectly efficient, with no more than one tool call 107 | - Is semantically equivalent to the provided reference trajectory, if present 108 | 109 | 110 | 111 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
112 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 113 | interrupting to await additional data from another source ("human-in-the-loop"): 114 | 115 | 116 | 117 | {thread} 118 | 119 | 120 | {reference_outputs}`, 121 | }); 122 | const res = await evaluator({ 123 | inputs: trajectory.inputs, 124 | outputs: trajectory.outputs, 125 | }); 126 | expect(res.key).toBe("graph_trajectory_accuracy"); 127 | expect(res.score).toBe(false); 128 | } 129 | ); 130 | }); 131 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/utils.ts: -------------------------------------------------------------------------------- 1 | import type { StateSnapshot, Pregel } from "@langchain/langgraph/web"; 2 | import { isBaseMessage } from "@langchain/core/messages"; 3 | import type { RunnableConfig } from "@langchain/core/runnables"; 4 | import { _convertMessagesToOpenAIParams } from "@langchain/openai"; 5 | 6 | import type { GraphTrajectory } from "../types.js"; 7 | 8 | export const extractLangGraphTrajectoryFromSnapshots = ( 9 | snapshots: StateSnapshot[] 10 | ) => { 11 | const inputs = []; 12 | const trajectory: GraphTrajectory = { 13 | results: [], 14 | steps: [], 15 | }; 16 | let isAccumulatingSteps = false; 17 | for (let i = 0; i < snapshots.length; i += 1) { 18 | const snapshot = snapshots[i]; 19 | const hasInterrupts = snapshot.tasks?.find((task) => { 20 | return task.interrupts?.length; 21 | }); 22 | if (!snapshot.next?.length || hasInterrupts) { 23 | isAccumulatingSteps = true; 24 | if (hasInterrupts) { 25 | trajectory.results.push({}); 26 | } else if ( 27 | snapshot.values != null && 28 | typeof snapshot.values === "object" && 29 | !Array.isArray(snapshot.values) && 30 | "messages" in snapshot.values && 31 | Array.isArray(snapshot.values.messages) 32 | ) { 33 | const lastMessage = snapshot.values.messages.at(-1); 34 | if (isBaseMessage(lastMessage)) { 35 | // Just append the last message in the output to the results to reduce context size 36 | trajectory.results.push({ 37 | messages: _convertMessagesToOpenAIParams([lastMessage]), 38 | }); 39 | } else { 40 | trajectory.results.push({ messages: [lastMessage] }); 41 | } 42 | } else { 43 | trajectory.results.push(snapshot.values); 44 | } 45 | trajectory.steps.push([]); 46 | } 47 | if (isAccumulatingSteps && snapshot.tasks?.length) { 48 | const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? 
""; 49 | let subgraphPath = ""; 50 | if (checkpointNs.split(":").length > 1) { 51 | subgraphPath = `${checkpointNs.split(":")[0]}:`; 52 | } 53 | for (const task of snapshot.tasks) { 54 | if (task.interrupts?.length) { 55 | trajectory.steps.at(-1)?.push("__interrupt__"); 56 | } 57 | trajectory.steps.at(-1)?.push(`${subgraphPath}${task.name}`); 58 | } 59 | } 60 | if (isAccumulatingSteps) { 61 | if (snapshot.metadata != null && snapshot.metadata.source === "input") { 62 | if ( 63 | "writes" in snapshot.metadata && 64 | snapshot.metadata.writes != null && 65 | typeof snapshot.metadata.writes === "object" 66 | ) { 67 | inputs.push(snapshot.metadata.writes as Record); 68 | } else { 69 | inputs.push( 70 | ...snapshot.tasks.map((task) => ({ [task.name]: task.result })) 71 | ); 72 | } 73 | } else if ( 74 | i + 1 < snapshots.length && 75 | snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0) 76 | ) { 77 | inputs.push("__resuming__"); 78 | } 79 | } 80 | } 81 | inputs.reverse(); 82 | trajectory.results.reverse(); 83 | trajectory.steps.reverse(); 84 | for (const stepList of trajectory.steps) { 85 | stepList.reverse(); 86 | } 87 | if (inputs.length !== trajectory.results.length) { 88 | console.warn( 89 | "Trajectory parsing may be incomplete: inputs and results have different lengths" 90 | ); 91 | } else if (inputs.length !== trajectory.steps.length) { 92 | console.warn( 93 | "Trajectory parsing may be incomplete: inputs and steps have different lengths" 94 | ); 95 | } 96 | return { inputs, outputs: trajectory }; 97 | }; 98 | 99 | export const _getLangGraphStateHistoryRecursive = async ( 100 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 101 | graph: Pregel, 102 | config: RunnableConfig 103 | ): Promise => { 104 | const stateHistory = []; 105 | for await (const history of graph.getStateHistory(config)) { 106 | if (history.tasks?.length) { 107 | for (const task of history.tasks) { 108 | if ((task.state as RunnableConfig)?.configurable?.checkpoint_ns) { 109 | stateHistory.push( 110 | ...(await _getLangGraphStateHistoryRecursive( 111 | graph, 112 | task.state as RunnableConfig 113 | )) 114 | ); 115 | } 116 | } 117 | } 118 | stateHistory.push(history); 119 | } 120 | return stateHistory; 121 | }; 122 | 123 | export const extractLangGraphTrajectoryFromThread = async ( 124 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 125 | graph: Pregel, 126 | config: RunnableConfig 127 | ) => { 128 | const history = await _getLangGraphStateHistoryRecursive(graph, config); 129 | return extractLangGraphTrajectoryFromSnapshots(history); 130 | }; 131 | -------------------------------------------------------------------------------- /js/src/trajectory/match.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | ToolArgsMatchMode, 6 | ToolArgsMatchOverrides, 7 | } from "../types.js"; 8 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 9 | import { _scorer as trajectoryStrictScorer } from "./strict.js"; 10 | import { _scorer as trajectoryUnorderedScorer } from "./unordered.js"; 11 | import { _scorer as trajectorySubsetScorer } from "./subset.js"; 12 | import { _scorer as trajectorySuperstScorer } from "./superset.js"; 13 | 14 | export type TrajectoryMatchMode = 15 | | "strict" 16 | | "unordered" 17 | | "subset" 18 | | "superset"; 19 | 20 | /** 21 | * Creates an evaluator that compares 
trajectories between model outputs and reference outputs. 22 | * 23 | * @param options - The configuration options 24 | * @param options.trajectoryMatchMode - The mode for matching trajectories: 25 | * - `"strict"`: Requires exact match in order and content 26 | * - `"unordered"`: Allows matching in any order 27 | * - `"subset"`: Accepts if output trajectory is a subset of reference 28 | * - `"superset"`: Accepts if output trajectory is a superset of reference 29 | * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore") 30 | * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching. 31 | * Each key should be a tool name, and each value should be either a match mode or a matcher function. 32 | * Matchers should be a function that takes two sets of tool call args and returns whether they are equal. 33 | * 34 | * @returns An async function that evaluates trajectory matches between outputs and references. 35 | * The returned evaluator accepts: 36 | * - outputs: List of messages or dict representing the model output trajectory 37 | * - referenceOutputs: List of messages or dict representing the reference trajectory 38 | * - Additional arguments passed to the underlying evaluator 39 | * 40 | * @example 41 | * ```typescript 42 | * const matcher = ( 43 | * outputToolCallArgs: Record, 44 | * referenceToolCallArgs: Record 45 | * ): boolean => { 46 | * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase(); 47 | * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase(); 48 | * return outputArgs === referenceArgs; 49 | * }; 50 | * 51 | * const evaluator = createAsyncTrajectoryMatchEvaluator({ 52 | * trajectoryMatchMode: "strict", 53 | * toolArgsMatchMode: "exact", 54 | * toolArgsMatchOverrides: { 55 | * myToolName: matcher, 56 | * }, 57 | * }); 58 | * 59 | * const result = await evaluator({ 60 | * outputs: [...], 61 | * referenceOutputs: [...], 62 | * }); 63 | * ``` 64 | */ 65 | export function createTrajectoryMatchEvaluator({ 66 | trajectoryMatchMode = "strict", 67 | toolArgsMatchMode = "exact", 68 | toolArgsMatchOverrides, 69 | }: { 70 | trajectoryMatchMode?: TrajectoryMatchMode; 71 | toolArgsMatchMode?: ToolArgsMatchMode; 72 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 73 | }) { 74 | let scorer: (params: { 75 | outputs: ChatCompletionMessage[]; 76 | referenceOutputs: ChatCompletionMessage[]; 77 | toolArgsMatchMode: ToolArgsMatchMode; 78 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 79 | }) => boolean | Promise; 80 | switch (trajectoryMatchMode) { 81 | case "strict": 82 | scorer = trajectoryStrictScorer; 83 | break; 84 | case "unordered": 85 | scorer = trajectoryUnorderedScorer; 86 | break; 87 | case "subset": 88 | scorer = trajectorySubsetScorer; 89 | break; 90 | case "superset": 91 | scorer = trajectorySuperstScorer; 92 | break; 93 | default: 94 | throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`); 95 | } 96 | 97 | return async function _wrappedEvaluator({ 98 | outputs, 99 | referenceOutputs, 100 | ...extra 101 | }: { 102 | outputs: 103 | | ChatCompletionMessage[] 104 | | FlexibleChatCompletionMessage[] 105 | | BaseMessage[] 106 | | { 107 | messages: ( 108 | | BaseMessage 109 | | ChatCompletionMessage 110 | | FlexibleChatCompletionMessage 111 | )[]; 112 | }; 113 | referenceOutputs: 114 | | ChatCompletionMessage[] 115 | | FlexibleChatCompletionMessage[] 116 | | BaseMessage[] 117 | | { 118 | messages: ( 119 | | BaseMessage 120 | | 
ChatCompletionMessage 121 | | FlexibleChatCompletionMessage 122 | )[]; 123 | }; 124 | [key: string]: unknown; 125 | }) { 126 | const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); 127 | const normalizedReferenceOutputs = 128 | _normalizeToOpenAIMessagesList(referenceOutputs); 129 | 130 | return _runEvaluator( 131 | `trajectory_${trajectoryMatchMode}_match`, 132 | scorer, 133 | `trajectory_${trajectoryMatchMode}_match`, 134 | { 135 | outputs: normalizedOutputs, 136 | referenceOutputs: normalizedReferenceOutputs, 137 | toolArgsMatchMode, 138 | toolArgsMatchOverrides, 139 | ...extra, 140 | } 141 | ); 142 | }; 143 | } 144 | -------------------------------------------------------------------------------- /js/src/trajectory/strict.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { 3 | ChatCompletionMessage, 4 | FlexibleChatCompletionMessage, 5 | EvaluatorResult, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | } from "../types.js"; 9 | import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; 10 | import { _getMatcherForToolName } from "./utils.js"; 11 | 12 | export async function _scorer(params: { 13 | outputs: ChatCompletionMessage[]; 14 | referenceOutputs: ChatCompletionMessage[]; 15 | toolArgsMatchMode: ToolArgsMatchMode; 16 | toolArgsMatchOverrides?: ToolArgsMatchOverrides; 17 | }): Promise { 18 | const { 19 | outputs, 20 | referenceOutputs, 21 | toolArgsMatchMode, 22 | toolArgsMatchOverrides, 23 | } = params; 24 | const normalizedOutputs = outputs; 25 | const normalizedReferenceOutputs = referenceOutputs; 26 | 27 | if (!normalizedOutputs || !normalizedReferenceOutputs) { 28 | throw new Error( 29 | "Strict trajectory match requires both outputs and reference_outputs" 30 | ); 31 | } 32 | 33 | if (normalizedOutputs.length !== normalizedReferenceOutputs.length) { 34 | return false; 35 | } 36 | 37 | for (let i = 0; i < normalizedOutputs.length; i++) { 38 | const output = normalizedOutputs[i]; 39 | const referenceOutput = normalizedReferenceOutputs[i]; 40 | 41 | if (output.role !== referenceOutput.role) { 42 | return false; 43 | } 44 | 45 | const outputHasToolCalls = output.tool_calls != null; 46 | const referenceHasToolCalls = referenceOutput.tool_calls != null; 47 | 48 | if (outputHasToolCalls !== referenceHasToolCalls) { 49 | return false; 50 | } 51 | 52 | if (outputHasToolCalls) { 53 | if (output.tool_calls!.length !== referenceOutput.tool_calls!.length) { 54 | return false; 55 | } 56 | const referenceCalls = referenceOutput.tool_calls ?? []; 57 | const seen = new Array(referenceCalls.length).fill(false); 58 | 59 | for (const outputCall of output.tool_calls ?? []) { 60 | let foundMatch = false; 61 | for (let i = 0; i < referenceCalls.length; i++) { 62 | const referenceCall = referenceCalls[i]; 63 | if ( 64 | !seen[i] && 65 | outputCall.function?.name === referenceCall.function?.name 66 | ) { 67 | const matcher = _getMatcherForToolName( 68 | outputCall.function?.name ?? "", 69 | toolArgsMatchMode, 70 | toolArgsMatchOverrides 71 | ); 72 | if ( 73 | await matcher( 74 | JSON.parse(outputCall.function?.arguments ?? "{}"), 75 | JSON.parse(referenceCall.function?.arguments ?? 
"{}") 76 | ) 77 | ) { 78 | foundMatch = true; 79 | seen[i] = true; 80 | break; 81 | } 82 | } 83 | } 84 | if (!foundMatch) { 85 | return false; 86 | } 87 | } 88 | } 89 | } 90 | 91 | return true; 92 | } 93 | 94 | /** 95 | * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead. 96 | * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 97 | * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 98 | * 99 | * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages, 100 | * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above. 101 | * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages, 102 | * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above. 103 | * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments 104 | * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise 105 | */ 106 | export async function trajectoryStrictMatch(params: { 107 | outputs: 108 | | ChatCompletionMessage[] 109 | | FlexibleChatCompletionMessage[] 110 | | BaseMessage[] 111 | | { 112 | messages: ( 113 | | BaseMessage 114 | | ChatCompletionMessage 115 | | FlexibleChatCompletionMessage 116 | )[]; 117 | }; 118 | referenceOutputs: 119 | | ChatCompletionMessage[] 120 | | FlexibleChatCompletionMessage[] 121 | | BaseMessage[] 122 | | { 123 | messages: ( 124 | | BaseMessage 125 | | ChatCompletionMessage 126 | | FlexibleChatCompletionMessage 127 | )[]; 128 | }; 129 | toolCallArgsExactMatch: boolean; 130 | }): Promise { 131 | const normalizedOutputs = _normalizeToOpenAIMessagesList(params.outputs); 132 | const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList( 133 | params.referenceOutputs 134 | ); 135 | 136 | return _runEvaluator( 137 | "trajectory_strict_match", 138 | _scorer, 139 | "trajectory_strict_match", 140 | { 141 | outputs: normalizedOutputs, 142 | referenceOutputs: normalizedReferenceOutputs, 143 | toolArgsMatchMode: params.toolCallArgsExactMatch ? 
"exact" : "ignore", 144 | } 145 | ); 146 | } 147 | -------------------------------------------------------------------------------- /python/agentevals/graph_trajectory/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Iterable, TYPE_CHECKING 3 | import warnings 4 | 5 | from langchain_core.messages import BaseMessage 6 | from langchain_core.messages.utils import convert_to_openai_messages 7 | 8 | from agentevals.types import GraphTrajectory, ExtractedLangGraphThreadTrajectory 9 | 10 | from langchain_core.runnables import RunnableConfig 11 | 12 | if TYPE_CHECKING: 13 | from langgraph.pregel import Pregel 14 | from langgraph.pregel.types import StateSnapshot 15 | 16 | 17 | def extract_langgraph_trajectory_from_snapshots( 18 | snapshots: Iterable[StateSnapshot], 19 | ) -> ExtractedLangGraphThreadTrajectory: 20 | inputs = [] 21 | trajectory = GraphTrajectory( 22 | inputs=[], 23 | results=[], 24 | steps=[], 25 | ) 26 | is_acc_steps = False 27 | snapshot_list = list(snapshots) 28 | for i, snapshot in enumerate(snapshot_list): 29 | has_interrupts = any(t.interrupts for t in snapshot.tasks) 30 | if not snapshot.next or has_interrupts: 31 | is_acc_steps = True 32 | if has_interrupts: 33 | trajectory["results"].append({}) 34 | elif ( 35 | isinstance(snapshot.values, dict) 36 | and "messages" in snapshot.values 37 | and isinstance(snapshot.values["messages"], list) 38 | ): 39 | # Just append the last message in the output to the results to reduce context size 40 | last_message = snapshot.values["messages"][-1] 41 | if isinstance(last_message, BaseMessage): 42 | trajectory["results"].append( 43 | {"messages": convert_to_openai_messages([last_message])} 44 | ) 45 | else: 46 | trajectory["results"].append({"messages": [last_message]}) 47 | else: 48 | trajectory["results"].append(snapshot.values) 49 | trajectory["steps"].append([]) 50 | if is_acc_steps and snapshot.tasks: 51 | checkpoint_ns = snapshot.config.get("configurable", {}).get( 52 | "checkpoint_ns", "" 53 | ) 54 | subgraph_path = "" 55 | if checkpoint_ns and len(checkpoint_ns.split(":")) > 1: 56 | subgraph_path = f"{checkpoint_ns.split(':')[0]}:" 57 | for task in snapshot.tasks: 58 | if task.interrupts: 59 | trajectory["steps"][-1].append("__interrupt__") 60 | trajectory["steps"][-1].append(f"{subgraph_path}{task.name}") 61 | if is_acc_steps: 62 | if snapshot.metadata is not None and snapshot.metadata["source"] == "input": 63 | inputs.extend({task.name: task.result} for task in snapshot.tasks) 64 | elif i + 1 < len(snapshot_list) and any( 65 | t.interrupts for t in snapshot_list[i + 1].tasks 66 | ): 67 | inputs.append("__resuming__") # type: ignore 68 | inputs.reverse() 69 | trajectory["results"].reverse() 70 | trajectory["steps"].reverse() 71 | for ss in trajectory["steps"]: 72 | ss.reverse() 73 | if len(inputs) != len(trajectory["results"]): 74 | warnings.warn( 75 | "Trajectory parsing may be incomplete: inputs and results have different lengths" 76 | ) 77 | elif len(inputs) != len(trajectory["steps"]): 78 | warnings.warn( 79 | "Trajectory parsing may be incomplete: inputs and steps have different lengths" 80 | ) 81 | 82 | return {"inputs": inputs, "outputs": trajectory} 83 | 84 | 85 | def _get_langgraph_state_history_recursive(graph: Pregel, config: RunnableConfig): 86 | state_history = [] 87 | for history in graph.get_state_history(config=config): 88 | if history.tasks: 89 | for task in history.tasks: 90 | if task.state and 
task.state.get("configurable", {}).get( 91 | "checkpoint_ns", None 92 | ): 93 | state_history.extend( 94 | _get_langgraph_state_history_recursive(graph, task.state) 95 | ) 96 | state_history.append(history) 97 | return state_history 98 | 99 | 100 | async def _aget_langgraph_state_history_recursive( 101 | graph: Pregel, config: RunnableConfig 102 | ): 103 | state_history = [] 104 | async for history in graph.aget_state_history(config=config): 105 | if history.tasks: 106 | for task in history.tasks: 107 | if task.state and task.state.get("configurable", {}).get( 108 | "checkpoint_ns", None 109 | ): 110 | state_history.extend( 111 | await _aget_langgraph_state_history_recursive(graph, task.state) 112 | ) 113 | state_history.append(history) 114 | return state_history 115 | 116 | 117 | def extract_langgraph_trajectory_from_thread( 118 | graph: Pregel, config: RunnableConfig 119 | ) -> ExtractedLangGraphThreadTrajectory: 120 | return extract_langgraph_trajectory_from_snapshots( 121 | _get_langgraph_state_history_recursive(graph, config) 122 | ) 123 | 124 | 125 | async def aextract_langgraph_trajectory_from_thread( 126 | graph: Pregel, config: RunnableConfig 127 | ) -> ExtractedLangGraphThreadTrajectory: 128 | return extract_langgraph_trajectory_from_snapshots( 129 | await _aget_langgraph_state_history_recursive(graph, config) 130 | ) 131 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/unordered.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.types import ( 5 | ChatCompletionMessage, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | ) 9 | from agentevals.trajectory.utils import ( 10 | _is_trajectory_superset, 11 | _normalize_to_openai_messages_list, 12 | ) 13 | from agentevals.utils import _run_evaluator, _arun_evaluator 14 | 15 | from typing import Any, Optional, Union, TYPE_CHECKING 16 | 17 | if TYPE_CHECKING: 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | def _scorer( 22 | *, 23 | outputs: list[ChatCompletionMessage], 24 | reference_outputs: list[ChatCompletionMessage], 25 | tool_args_match_mode: ToolArgsMatchMode, 26 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 27 | **kwargs: Any, 28 | ): 29 | if outputs is None or reference_outputs is None: 30 | raise ValueError( 31 | "Trajectory unordered match requires both outputs and reference_outputs" 32 | ) 33 | unordered_match = _is_trajectory_superset( 34 | outputs, reference_outputs, tool_args_match_mode, tool_args_match_overrides 35 | ) and _is_trajectory_superset( 36 | reference_outputs, outputs, tool_args_match_mode, tool_args_match_overrides 37 | ) 38 | return unordered_match 39 | 40 | 41 | def trajectory_unordered_match( 42 | *, 43 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 44 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 45 | **kwargs: Any, 46 | ): 47 | """ 48 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 49 | ```python 50 | from agentevals.trajectory.match import create_trajectory_match_evaluator 51 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="unordered") 52 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 53 | ``` 54 | 55 | Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 
56 | This accounts for some differences in an LLM's reasoning process on a case-by-case basis. 57 | 58 | Args: 59 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 60 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 61 | a "messages" key with one of the above. 62 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 63 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 64 | a "messages" key with one of the above. 65 | 66 | Returns: 67 | EvaluatorResult: Contains a score of True if trajectory matches, False otherwise 68 | """ 69 | warn( 70 | "trajectory_unordered_match() is deprecated. Use create_trajectory_match_evaluator(trajectory_match_mode='unordered') instead.", 71 | DeprecationWarning, 72 | stacklevel=2, 73 | ) 74 | 75 | outputs = _normalize_to_openai_messages_list(outputs) 76 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 77 | 78 | return _run_evaluator( 79 | run_name="trajectory_unordered_match", 80 | scorer=_scorer, 81 | feedback_key="trajectory_unordered_match", 82 | outputs=outputs, 83 | reference_outputs=reference_outputs, 84 | tool_args_match_mode="ignore", 85 | **kwargs, 86 | ) 87 | 88 | 89 | async def trajectory_unordered_match_async( 90 | *, 91 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 92 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 93 | **kwargs: Any, 94 | ): 95 | """ 96 | Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory. 97 | This accounts for some differences in an LLM's reasoning process on a case-by-case basis. 98 | 99 | Args: 100 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 101 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 102 | a "messages" key with one of the above. 103 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 104 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 105 | a "messages" key with one of the above. 106 | 107 | Returns: 108 | EvaluatorResult: Contains a score of True if trajectory matches, False otherwise 109 | """ 110 | warn( 111 | "trajectory_unordered_match_async() is deprecated.
Use create_async_trajectory_match_evaluator(trajectory_match_mode='unordered') instead.", 112 | DeprecationWarning, 113 | stacklevel=2, 114 | ) 115 | 116 | outputs = _normalize_to_openai_messages_list(outputs) 117 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 118 | 119 | return await _arun_evaluator( 120 | run_name="trajectory_unordered_match", 121 | scorer=_scorer, 122 | feedback_key="trajectory_unordered_match", 123 | outputs=outputs, 124 | reference_outputs=reference_outputs, 125 | tool_args_match_mode="ignore", 126 | **kwargs, 127 | ) 128 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/subset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 5 | from agentevals.types import ChatCompletionMessage 6 | from agentevals.trajectory.utils import _is_trajectory_superset 7 | from agentevals.utils import _run_evaluator, _arun_evaluator 8 | from agentevals.types import ToolArgsMatchMode, ToolArgsMatchOverrides 9 | 10 | from typing import Any, Union, Optional, TYPE_CHECKING 11 | 12 | if TYPE_CHECKING: 13 | from langchain_core.messages import BaseMessage 14 | 15 | 16 | def _scorer( 17 | *, 18 | outputs: list[ChatCompletionMessage], 19 | reference_outputs: list[ChatCompletionMessage], 20 | tool_args_match_mode: ToolArgsMatchMode, 21 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 22 | **kwargs: Any, 23 | ): 24 | if outputs is None or reference_outputs is None: 25 | raise ValueError( 26 | "Trajectory subset match requires both outputs and reference_outputs" 27 | ) 28 | is_superset = _is_trajectory_superset( 29 | reference_outputs, outputs, tool_args_match_mode, tool_args_match_overrides 30 | ) 31 | return is_superset 32 | 33 | 34 | def trajectory_subset( 35 | *, 36 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 37 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 38 | **kwargs: Any, 39 | ): 40 | """ 41 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 42 | ```python 43 | from agentevals.trajectory.match import create_trajectory_match_evaluator 44 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="subset") 45 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 46 | ``` 47 | 48 | Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 49 | This means the agent called a subset of the tools specified in the reference trajectory. 50 | 51 | Args: 52 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 53 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 54 | a "messages" key with one of the above. 55 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 56 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 57 | a "messages" key with one of the above. 58 | 59 | Returns: 60 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 61 | """ 62 | warn( 63 | "trajectory_subset() is deprecated. 
Use create_trajectory_match_evaluator(trajectory_match_mode='subset') instead.", 64 | DeprecationWarning, 65 | stacklevel=2, 66 | ) 67 | 68 | outputs = _normalize_to_openai_messages_list(outputs) 69 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 70 | 71 | return _run_evaluator( 72 | run_name="trajectory_subset", 73 | scorer=_scorer, 74 | feedback_key="trajectory_subset", 75 | outputs=outputs, 76 | reference_outputs=reference_outputs, 77 | tool_args_match_mode="ignore", 78 | **kwargs, 79 | ) 80 | 81 | 82 | async def trajectory_subset_async( 83 | *, 84 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 85 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 86 | **kwargs: Any, 87 | ): 88 | """ 89 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 90 | ```python 91 | from agentevals.trajectory.match import create_trajectory_match_evaluator 92 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="subset") 93 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 94 | ``` 95 | 96 | Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools. 97 | This means the agent called a subset of the tools specified in the reference trajectory. 98 | 99 | Args: 100 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 101 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 102 | a "messages" key with one of the above. 103 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 104 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 105 | a "messages" key with one of the above. 106 | 107 | Returns: 108 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 109 | """ 110 | warn( 111 | "trajectory_subset_async() is deprecated. 
Use create_async_trajectory_match_evaluator(trajectory_match_mode='subset') instead.", 112 | DeprecationWarning, 113 | stacklevel=2, 114 | ) 115 | 116 | outputs = _normalize_to_openai_messages_list(outputs) 117 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 118 | 119 | return await _arun_evaluator( 120 | run_name="trajectory_subset", 121 | scorer=_scorer, 122 | feedback_key="trajectory_subset", 123 | outputs=outputs, 124 | reference_outputs=reference_outputs, 125 | tool_args_match_mode="ignore", 126 | **kwargs, 127 | ) 128 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/superset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | 4 | from agentevals.types import ( 5 | ChatCompletionMessage, 6 | ToolArgsMatchMode, 7 | ToolArgsMatchOverrides, 8 | ) 9 | from agentevals.trajectory.utils import ( 10 | _is_trajectory_superset, 11 | _normalize_to_openai_messages_list, 12 | ) 13 | from agentevals.utils import _run_evaluator, _arun_evaluator 14 | 15 | from typing import Any, Optional, Union, TYPE_CHECKING 16 | 17 | if TYPE_CHECKING: 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | def _scorer( 22 | *, 23 | outputs: list[ChatCompletionMessage], 24 | reference_outputs: list[ChatCompletionMessage], 25 | tool_args_match_mode: ToolArgsMatchMode, 26 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 27 | **kwargs: Any, 28 | ): 29 | if outputs is None or reference_outputs is None: 30 | raise ValueError( 31 | "Trajectory superset match requires both outputs and reference_outputs" 32 | ) 33 | is_superset = _is_trajectory_superset( 34 | outputs, reference_outputs, tool_args_match_mode, tool_args_match_overrides 35 | ) 36 | return is_superset 37 | 38 | 39 | def trajectory_superset( 40 | *, 41 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 42 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 43 | **kwargs: Any, 44 | ): 45 | """ 46 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 47 | ```python 48 | from agentevals.trajectory.match import create_trajectory_match_evaluator 49 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="superset") 50 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 51 | ``` 52 | 53 | Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 54 | This means the agent called a superset of the tools specified in the reference trajectory. 55 | 56 | Args: 57 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 58 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 59 | a "messages" key with one of the above. 60 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 61 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 62 | a "messages" key with one of the above. 63 | 64 | Returns: 65 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 66 | """ 67 | warn( 68 | "trajectory_superset() is deprecated. 
Use create_trajectory_match_evaluator(trajectory_match_mode='superset') instead.", 69 | DeprecationWarning, 70 | stacklevel=2, 71 | ) 72 | 73 | outputs = _normalize_to_openai_messages_list(outputs) 74 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 75 | 76 | return _run_evaluator( 77 | run_name="trajectory_superset", 78 | scorer=_scorer, 79 | feedback_key="trajectory_superset", 80 | outputs=outputs, 81 | reference_outputs=reference_outputs, 82 | tool_args_match_mode="ignore", 83 | **kwargs, 84 | ) 85 | 86 | 87 | async def trajectory_superset_async( 88 | *, 89 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 90 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 91 | **kwargs: Any, 92 | ): 93 | """ 94 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 95 | ```python 96 | from agentevals.trajectory.match import create_trajectory_match_evaluator 97 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="superset") 98 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 99 | ``` 100 | 101 | Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools. 102 | This means the agent called a superset of the tools specified in the reference trajectory. 103 | 104 | Args: 105 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 106 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 107 | a "messages" key with one of the above. 108 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 109 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 110 | a "messages" key with one of the above. 111 | 112 | Returns: 113 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 114 | """ 115 | warn( 116 | "trajectory_superset_async() is deprecated. 
Use create_async_trajectory_match_evaluator(trajectory_match_mode='superset') instead.", 117 | DeprecationWarning, 118 | stacklevel=2, 119 | ) 120 | 121 | outputs = _normalize_to_openai_messages_list(outputs) 122 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 123 | 124 | return await _arun_evaluator( 125 | run_name="trajectory_superset", 126 | scorer=_scorer, 127 | feedback_key="trajectory_superset", 128 | outputs=outputs, 129 | reference_outputs=reference_outputs, 130 | tool_args_match_mode="ignore", 131 | **kwargs, 132 | ) 133 | -------------------------------------------------------------------------------- /python/tests/graph_trajectory/test_graph_trajectory_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from agentevals.graph_trajectory.utils import ( 4 | aextract_langgraph_trajectory_from_thread, 5 | extract_langgraph_trajectory_from_thread, 6 | ) 7 | from openevals.exact import exact_match 8 | 9 | from typing import Annotated 10 | from typing_extensions import TypedDict 11 | import operator 12 | import time 13 | 14 | from langgraph.graph import StateGraph 15 | from langgraph.checkpoint.memory import MemorySaver 16 | 17 | 18 | @pytest.mark.langsmith 19 | def test_trajectory_match(): 20 | checkpointer = MemorySaver() 21 | 22 | class InnerState(TypedDict): 23 | my_key: Annotated[str, operator.add] 24 | my_other_key: str 25 | 26 | def inner_1(state: InnerState): 27 | time.sleep(0.1) 28 | return {"my_key": "got here", "my_other_key": state["my_key"]} 29 | 30 | def inner_2(state: InnerState): 31 | return { 32 | "my_key": " and there", 33 | "my_other_key": state["my_key"], 34 | } 35 | 36 | inner = StateGraph(InnerState) 37 | inner.add_node("inner_1", inner_1) 38 | inner.add_node("inner_2", inner_2) 39 | inner.add_edge("inner_1", "inner_2") 40 | inner.set_entry_point("inner_1") 41 | inner.set_finish_point("inner_2") 42 | 43 | class State(TypedDict): 44 | my_key: Annotated[str, operator.add] 45 | 46 | def outer_1(state: State): 47 | return {"my_key": " and parallel"} 48 | 49 | def outer_2(state: State): 50 | return {"my_key": " and back again"} 51 | 52 | graph = StateGraph(State) 53 | graph.add_node("inner", inner.compile(interrupt_before=["inner_2"])) 54 | graph.add_node("outer_1", outer_1) 55 | graph.add_node("outer_2", outer_2) 56 | 57 | graph.add_edge("__start__", "inner") 58 | graph.add_edge("__start__", "outer_1") 59 | graph.add_edge(["inner", "outer_1"], "outer_2") 60 | graph.set_finish_point("outer_2") 61 | 62 | app = graph.compile(checkpointer=checkpointer) 63 | 64 | # test invoke w/ nested interrupt 65 | config = {"configurable": {"thread_id": "1"}} 66 | app.invoke({"my_key": ""}, config) 67 | 68 | app.invoke(None, config) == { 69 | "my_key": "got here and there and parallel and back again", 70 | } 71 | assert exact_match( 72 | outputs=extract_langgraph_trajectory_from_thread( 73 | app, {"configurable": {"thread_id": "1"}} 74 | ), 75 | reference_outputs={ 76 | "inputs": [ 77 | {"__start__": {"my_key": ""}}, 78 | {"__start__": {"my_key": ""}}, 79 | ], 80 | "outputs": { 81 | "inputs": [], 82 | "results": [ 83 | {"my_key": "got here and there", "my_other_key": "got here"}, 84 | {"my_key": "got here and there and parallel and back again"}, 85 | ], 86 | "steps": [ 87 | [ 88 | "__start__", 89 | "outer_1", 90 | "inner", 91 | "inner:__start__", 92 | "inner:inner_1", 93 | "inner:inner_2", 94 | ], 95 | ["outer_2"], 96 | ], 97 | }, 98 | }, 99 | )["score"] 100 | 101 | 102 | @pytest.mark.asyncio 
103 | @pytest.mark.langsmith 104 | async def test_trajectory_match_async(): 105 | checkpointer = MemorySaver() 106 | 107 | class InnerState(TypedDict): 108 | my_key: Annotated[str, operator.add] 109 | my_other_key: str 110 | 111 | def inner_1(state: InnerState): 112 | time.sleep(0.1) 113 | return {"my_key": "got here", "my_other_key": state["my_key"]} 114 | 115 | def inner_2(state: InnerState): 116 | return { 117 | "my_key": " and there", 118 | "my_other_key": state["my_key"], 119 | } 120 | 121 | inner = StateGraph(InnerState) 122 | inner.add_node("inner_1", inner_1) 123 | inner.add_node("inner_2", inner_2) 124 | inner.add_edge("inner_1", "inner_2") 125 | inner.set_entry_point("inner_1") 126 | inner.set_finish_point("inner_2") 127 | 128 | class State(TypedDict): 129 | my_key: Annotated[str, operator.add] 130 | 131 | def outer_1(state: State): 132 | return {"my_key": " and parallel"} 133 | 134 | def outer_2(state: State): 135 | return {"my_key": " and back again"} 136 | 137 | graph = StateGraph(State) 138 | graph.add_node("inner", inner.compile(interrupt_before=["inner_2"])) 139 | graph.add_node("outer_1", outer_1) 140 | graph.add_node("outer_2", outer_2) 141 | 142 | graph.add_edge("__start__", "inner") 143 | graph.add_edge("__start__", "outer_1") 144 | graph.add_edge(["inner", "outer_1"], "outer_2") 145 | graph.set_finish_point("outer_2") 146 | 147 | app = graph.compile(checkpointer=checkpointer) 148 | 149 | # test invoke w/ nested interrupt 150 | config = {"configurable": {"thread_id": "1"}} 151 | await app.ainvoke({"my_key": ""}, config) 152 | 153 | await app.ainvoke(None, config) == { 154 | "my_key": "got here and there and parallel and back again", 155 | } 156 | assert exact_match( 157 | outputs=await aextract_langgraph_trajectory_from_thread( 158 | app, {"configurable": {"thread_id": "1"}} 159 | ), 160 | reference_outputs={ 161 | "inputs": [ 162 | {"__start__": {"my_key": ""}}, 163 | {"__start__": {"my_key": ""}}, 164 | ], 165 | "outputs": { 166 | "inputs": [], 167 | "results": [ 168 | {"my_key": "got here and there", "my_other_key": "got here"}, 169 | {"my_key": "got here and there and parallel and back again"}, 170 | ], 171 | "steps": [ 172 | [ 173 | "__start__", 174 | "outer_1", 175 | "inner", 176 | "inner:__start__", 177 | "inner:inner_1", 178 | "inner:inner_2", 179 | ], 180 | ["outer_2"], 181 | ], 182 | }, 183 | }, 184 | )["score"] 185 | -------------------------------------------------------------------------------- /js/src/graph_trajectory/llm.ts: -------------------------------------------------------------------------------- 1 | import { _createLLMAsJudgeScorer } from "openevals/llm"; 2 | 3 | import { _runEvaluator } from "../utils.js"; 4 | import type { GraphTrajectory, TrajectoryLLMAsJudgeParams } from "../types.js"; 5 | 6 | export const GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler. 7 | Your task is to grade the accuracy of an AI agent's internal steps in resolving a user query. 8 | 9 | 10 | An accurate trajectory: 11 | - Makes logical sense between steps 12 | - Shows clear progression 13 | - Is relatively efficient, though it does not need to be perfectly efficient 14 | - Is semantically equivalent to the provided reference trajectory, if present 15 | 16 | 17 | 18 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
19 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent 20 | interrupting to await additional data from another source ("human-in-the-loop"). 21 | 22 | Steps containing a colon represent steps within subagents (e.g. "graph:step_name"). 23 | 24 | 25 | 26 | {thread} 27 | 28 | 29 | {reference_outputs} 30 | `; 31 | 32 | function _formatThread( 33 | inputs: (string | Record | null)[], 34 | outputs: GraphTrajectory 35 | ): string { 36 | let formattedThread = ""; 37 | const zippedData = inputs.map((input, i) => ({ 38 | input: JSON.stringify(input ?? ""), 39 | result: JSON.stringify(outputs.results[i]), 40 | step: JSON.stringify(outputs.steps[i]), 41 | })); 42 | 43 | for (const { input, result, step } of zippedData) { 44 | formattedThread += input ? `\n\n${input}\n\n` : ""; 45 | formattedThread += `\n\n${step}\n\n`; 46 | formattedThread += `\n\n${result}\n\n`; 47 | } 48 | return formattedThread; 49 | } 50 | 51 | function _formatInputs( 52 | inputs: 53 | | (string | Record | null)[] 54 | | { inputs: (string | Record | null)[] }, 55 | outputs: GraphTrajectory, 56 | referenceOutputs?: GraphTrajectory 57 | ) { 58 | let processedInputs: (string | Record | null)[]; 59 | 60 | if (Array.isArray(inputs)) { 61 | processedInputs = inputs; 62 | } else { 63 | if (!("inputs" in inputs)) { 64 | throw new Error( 65 | "inputs must be an array or an object with an 'inputs' key" 66 | ); 67 | } 68 | processedInputs = inputs.inputs; 69 | } 70 | 71 | if (processedInputs.length !== outputs.results.length) { 72 | throw new Error( 73 | "Provided `inputs` and `results` within provided `outputs` must have the same length" 74 | ); 75 | } 76 | if (processedInputs.length !== outputs.steps.length) { 77 | throw new Error( 78 | "Provided `inputs` and `steps` within provided `outputs` must have the same length" 79 | ); 80 | } 81 | 82 | const formattedThread = _formatThread(processedInputs, outputs); 83 | const formattedReferenceOutputs = referenceOutputs 84 | ? `\nUse the following trajectory as an example reference when grading:\n\n${_formatThread(referenceOutputs.inputs ?? [], referenceOutputs)}\n\n` 85 | : ""; 86 | 87 | return { 88 | formattedThread, 89 | formattedReferenceOutputs, 90 | }; 91 | } 92 | 93 | /** 94 | * Creates an evaluator that uses an LLM to judge agent trajectories. 95 | * @param options Configuration options 96 | * @param [options.prompt] - The evaluation prompt. Can be a string template, 97 | * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric 98 | * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters. 99 | * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result 100 | * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client 101 | * or a LangChain chat model. If an OpenAI client, must specify "model" as well. 102 | * If omitted, "model" will be used to instantiate a LangChain model instance by model string. 103 | * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client, 104 | * this argument should be a model name directly. If "judge" is omitted, must be a valid 105 | * LangChain model identifier. See `init_chat_model` docs for more details: 106 | * https://python.langchain.com/docs/how_to/chat_models_universal_init/ 107 | * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean. 
108 | * @param [options.choices] - Optional list of specific float values the score must be chosen from 109 | * @param [options.useReasoning=true] - If true, includes explanation for the score in the output 110 | * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt 111 | * @returns A function that evaluates agent trajectories using the configured LLM judge 112 | */ 113 | export const createGraphTrajectoryLLMAsJudge = ({ 114 | prompt = GRAPH_TRAJECTORY_ACCURACY_PROMPT, 115 | model, 116 | feedbackKey = "graph_trajectory_accuracy", 117 | judge, 118 | continuous = false, 119 | choices, 120 | useReasoning = true, 121 | fewShotExamples, 122 | }: TrajectoryLLMAsJudgeParams) => { 123 | const scorer = _createLLMAsJudgeScorer({ 124 | prompt, 125 | judge, 126 | model, 127 | continuous, 128 | choices, 129 | useReasoning, 130 | fewShotExamples, 131 | }); 132 | 133 | const _wrappedEvaluator = async ({ 134 | inputs, 135 | outputs, 136 | referenceOutputs, 137 | ...extra 138 | }: { 139 | inputs: 140 | | (string | Record | null)[] 141 | | { inputs: (string | Record | null)[] }; 142 | outputs: GraphTrajectory; 143 | referenceOutputs?: GraphTrajectory; 144 | [key: string]: unknown; 145 | }) => { 146 | const { formattedThread, formattedReferenceOutputs } = _formatInputs( 147 | inputs, 148 | outputs, 149 | referenceOutputs 150 | ); 151 | return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, { 152 | outputs, 153 | inputs, 154 | thread: formattedThread, 155 | referenceOutputs: formattedReferenceOutputs, 156 | ...extra, 157 | }); 158 | }; 159 | return _wrappedEvaluator; 160 | }; 161 | -------------------------------------------------------------------------------- /js/src/trajectory/llm.ts: -------------------------------------------------------------------------------- 1 | import { BaseMessage } from "@langchain/core/messages"; 2 | import { _createLLMAsJudgeScorer } from "openevals/llm"; 3 | 4 | import { _runEvaluator, _normalizeToOpenAIMessagesList } from "../utils.js"; 5 | import { _chatCompletionMessagesToString } from "./utils.js"; 6 | import { 7 | ChatCompletionMessage, 8 | FlexibleChatCompletionMessage, 9 | EvaluatorResult, 10 | TrajectoryLLMAsJudgeParams, 11 | } from "../types.js"; 12 | 13 | export const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = `You are an expert data labeler. 14 | Your task is to grade the accuracy of an AI agent's internal trajectory. 15 | 16 | 17 | An accurate trajectory: 18 | - Makes logical sense between steps 19 | - Shows clear progression 20 | - Is relatively efficient, though it does not need to be perfectly efficient 21 | - Is semantically equivalent to the provided reference trajectory 22 | 23 | 24 | Based on the following reference trajectory: 25 | 26 | 27 | {reference_outputs} 28 | 29 | 30 | Grade this actual trajectory: 31 | 32 | 33 | {outputs} 34 | 35 | `; 36 | 37 | export const TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler. 38 | Your task is to grade the accuracy of an AI agent's internal trajectory. 39 | 40 | 41 | An accurate trajectory: 42 | - Makes logical sense between steps 43 | - Shows clear progression 44 | - Is relatively efficient, though it does not need to be perfectly efficient 45 | 46 | 47 | First, try to understand the goal of the trajectory by looking at the input 48 | (if the input is not present try to infer it from the content of the first message), 49 | as well as the output of the final message. 
Once you understand the goal, grade the trajectory 50 | as it relates to achieving that goal. 51 | 52 | Grade the following trajectory: 53 | 54 | 55 | {outputs} 56 | `; 57 | 58 | function _formatInputs(params: { 59 | outputs: 60 | | ChatCompletionMessage[] 61 | | FlexibleChatCompletionMessage[] 62 | | BaseMessage[] 63 | | { 64 | messages: ( 65 | | BaseMessage 66 | | ChatCompletionMessage 67 | | FlexibleChatCompletionMessage 68 | )[]; 69 | }; 70 | referenceOutputs?: 71 | | ChatCompletionMessage[] 72 | | FlexibleChatCompletionMessage[] 73 | | BaseMessage[] 74 | | { 75 | messages: ( 76 | | BaseMessage 77 | | ChatCompletionMessage 78 | | FlexibleChatCompletionMessage 79 | )[]; 80 | }; 81 | }): [string, string] { 82 | const { outputs, referenceOutputs } = params; 83 | const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); 84 | const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList( 85 | referenceOutputs ?? [] 86 | ); 87 | 88 | const formattedReferenceOutputs = normalizedReferenceOutputs 89 | ? _chatCompletionMessagesToString(normalizedReferenceOutputs) 90 | : ""; 91 | 92 | const formattedOutputs = _chatCompletionMessagesToString(normalizedOutputs); 93 | 94 | return [formattedOutputs, formattedReferenceOutputs]; 95 | } 96 | 97 | /** 98 | * Creates an evaluator that uses an LLM to judge agent trajectories. 99 | * 100 | * @param options - Configuration options 101 | * @param options.prompt - The evaluation prompt. Can be a string template, LangChain prompt template, 102 | * or callable that returns a list of chat messages. 103 | * @param options.feedbackKey - Key used to store the evaluation result. Defaults to "trajectory_accuracy". 104 | * @param options.model - Model identifier to use. If judge is an OpenAI client, 105 | * this should be a model name directly. If judge is omitted, must be a valid 106 | * LangChain model identifier. 107 | * @param options.system - Optional system message to prepend to the prompt. 108 | * @param options.judge - The LLM used for evaluation. Can be an OpenAI client or a LangChainLikeModel. 109 | * If an OpenAI client, must specify "model" as well. If omitted, "model" will be 110 | * used to instantiate a LangChain model instance by model string. 111 | * @param options.continuous - If true, score will be a float between 0 and 1. If false, score will be boolean. 112 | * Defaults to false. 113 | * @param options.choices - Optional list of specific float values the score must be chosen from. 114 | * @param options.useReasoning - If true, includes explanation for the score in the output. Defaults to true. 115 | * @param options.fewShotExamples - Optional list of example evaluations to append to the prompt. 116 | * @returns A function that evaluates agent trajectories using the configured LLM judge. 
117 | */ 118 | export const createTrajectoryLLMAsJudge = ({ 119 | prompt = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 120 | feedbackKey = "trajectory_accuracy", 121 | model, 122 | system, 123 | judge, 124 | continuous = false, 125 | choices, 126 | useReasoning = true, 127 | fewShotExamples, 128 | }: TrajectoryLLMAsJudgeParams) => { 129 | const scorer = _createLLMAsJudgeScorer({ 130 | prompt, 131 | judge, 132 | model, 133 | system, 134 | continuous, 135 | choices, 136 | useReasoning, 137 | fewShotExamples, 138 | }); 139 | 140 | const wrappedEvaluator = async ({ 141 | inputs, 142 | outputs, 143 | referenceOutputs, 144 | ...extra 145 | }: { 146 | outputs: 147 | | ChatCompletionMessage[] 148 | | FlexibleChatCompletionMessage[] 149 | | BaseMessage[] 150 | | { 151 | messages: ( 152 | | BaseMessage 153 | | ChatCompletionMessage 154 | | FlexibleChatCompletionMessage 155 | )[]; 156 | }; 157 | referenceOutputs?: 158 | | ChatCompletionMessage[] 159 | | FlexibleChatCompletionMessage[] 160 | | BaseMessage[] 161 | | { 162 | messages: ( 163 | | BaseMessage 164 | | ChatCompletionMessage 165 | | FlexibleChatCompletionMessage 166 | )[]; 167 | }; 168 | [key: string]: unknown; 169 | }): Promise => { 170 | const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({ 171 | outputs, 172 | referenceOutputs, 173 | }); 174 | 175 | return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, { 176 | inputs, 177 | outputs: formattedOutputs, 178 | referenceOutputs: formattedReferenceOutputs, 179 | ...extra, 180 | }); 181 | }; 182 | return wrappedEvaluator; 183 | }; 184 | -------------------------------------------------------------------------------- /python/tests/test_trajectory_llm.py: -------------------------------------------------------------------------------- 1 | from agentevals.trajectory.llm import ( 2 | create_trajectory_llm_as_judge, 3 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 4 | TRAJECTORY_ACCURACY_PROMPT, 5 | ) 6 | 7 | from agentevals.types import ChatCompletionMessage 8 | 9 | import pytest 10 | import json 11 | 12 | 13 | @pytest.mark.langsmith 14 | def test_trajectory_match(): 15 | evaluator = create_trajectory_llm_as_judge( 16 | prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, model="openai:o3-mini" 17 | ) 18 | outputs = [ 19 | {"role": "user", "content": "What is the weather in SF?"}, 20 | { 21 | "role": "assistant", 22 | "tool_calls": [ 23 | { 24 | "function": { 25 | "name": "get_weather", 26 | "arguments": json.dumps({"city": "SF"}), 27 | } 28 | } 29 | ], 30 | }, 31 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 32 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 33 | ] 34 | reference_outputs = [ 35 | {"role": "user", "content": "What is the weather in SF?"}, 36 | { 37 | "role": "assistant", 38 | "tool_calls": [ 39 | { 40 | "function": { 41 | "name": "get_weather", 42 | "arguments": json.dumps({"city": "San Francisco"}), 43 | } 44 | } 45 | ], 46 | }, 47 | {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."}, 48 | {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."}, 49 | ] 50 | eval_result = evaluator( 51 | outputs=outputs, 52 | reference_outputs=reference_outputs, 53 | ) 54 | assert eval_result["key"] == "trajectory_accuracy" 55 | assert eval_result["score"] 56 | 57 | 58 | @pytest.mark.langsmith 59 | def test_trajectory_no_ref(): 60 | evaluator = create_trajectory_llm_as_judge( 61 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 62 | ) 63 | outputs = [ 64 | {"role": 
"user", "content": "What is the weather in SF?"}, 65 | { 66 | "role": "assistant", 67 | "tool_calls": [ 68 | { 69 | "function": { 70 | "name": "get_weather", 71 | "arguments": json.dumps({"city": "SF"}), 72 | } 73 | } 74 | ], 75 | }, 76 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 77 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 78 | ] 79 | eval_result = evaluator( 80 | outputs=outputs, 81 | ) 82 | assert eval_result["key"] == "trajectory_accuracy" 83 | assert eval_result["score"] 84 | 85 | 86 | @pytest.mark.langsmith 87 | def test_trajectory_no_ref_bad_trajectory(): 88 | evaluator = create_trajectory_llm_as_judge( 89 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 90 | ) 91 | outputs = [ 92 | {"role": "user", "content": "What are some good restaurants in SF?"}, 93 | { 94 | "role": "assistant", 95 | "tool_calls": [ 96 | { 97 | "function": { 98 | "name": "get_weather", 99 | "arguments": json.dumps({"city": "SF"}), 100 | } 101 | } 102 | ], 103 | }, 104 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 105 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 106 | ] 107 | eval_result = evaluator( 108 | outputs=outputs, 109 | ) 110 | assert eval_result["key"] == "trajectory_accuracy" 111 | assert not eval_result["score"] 112 | 113 | 114 | @pytest.mark.langsmith 115 | def test_trajectory_match_with_inverse_rubric(): 116 | REVERSE_PROMPT = """You are an expert data labeler. 117 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 118 | 119 | 120 | An inaccurate trajectory: 121 | - Makes no logical sense between steps 122 | - Shows no clear progression 123 | - Is not relatively efficient, though it does not need to be perfectly inefficient 124 | - Is not semantically equivalent to the provided reference trajectory, if present 125 | 126 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 127 | 128 | 129 | Grade the following trajectory: 130 | 131 | 132 | {outputs} 133 | 134 | 135 | 136 | {inputs} 137 | 138 | 139 | According to this reference trajectory: 140 | 141 | 142 | {reference_outputs} 143 | 144 | """ 145 | evaluator = create_trajectory_llm_as_judge( 146 | prompt=REVERSE_PROMPT, model="openai:o3-mini" 147 | ) 148 | outputs = [ 149 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 150 | ChatCompletionMessage( 151 | role="assistant", 152 | tool_calls=[ 153 | { 154 | "function": { 155 | "name": "get_weather", 156 | "arguments": json.dumps({"city": "SF"}), 157 | } 158 | } 159 | ], 160 | ), 161 | ChatCompletionMessage(role="tool", content="It's 80 degrees and sunny in SF."), 162 | ChatCompletionMessage( 163 | role="assistant", content="The weather in SF is 80 degrees and sunny." 164 | ), 165 | ] 166 | reference_outputs = [ 167 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 168 | ChatCompletionMessage( 169 | role="assistant", 170 | tool_calls=[ 171 | { 172 | "function": { 173 | "name": "get_weather", 174 | "arguments": json.dumps({"city": "San Francisco"}), 175 | } 176 | } 177 | ], 178 | ), 179 | ChatCompletionMessage( 180 | role="tool", content="It's 80 degrees and sunny in San Francisco." 181 | ), 182 | ChatCompletionMessage( 183 | role="assistant", content="The weather in SF is 80˚ and sunny." 
184 | ), 185 | ] 186 | eval_result = evaluator( 187 | inputs="What is the weather in SF?", 188 | outputs=outputs, 189 | reference_outputs=reference_outputs, 190 | ) 191 | assert eval_result["key"] == "trajectory_accuracy" 192 | assert not eval_result["score"] 193 | -------------------------------------------------------------------------------- /python/tests/test_trajectory_llm_async.py: -------------------------------------------------------------------------------- 1 | from agentevals.trajectory.llm import ( 2 | create_async_trajectory_llm_as_judge, 3 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 4 | TRAJECTORY_ACCURACY_PROMPT, 5 | ) 6 | 7 | from agentevals.types import ChatCompletionMessage 8 | 9 | import pytest 10 | import json 11 | 12 | 13 | @pytest.mark.langsmith 14 | @pytest.mark.asyncio 15 | async def test_trajectory_match(): 16 | evaluator = create_async_trajectory_llm_as_judge( 17 | prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, model="openai:o3-mini" 18 | ) 19 | inputs = {} 20 | outputs = [ 21 | {"role": "user", "content": "What is the weather in SF?"}, 22 | { 23 | "role": "assistant", 24 | "tool_calls": [ 25 | { 26 | "function": { 27 | "name": "get_weather", 28 | "arguments": json.dumps({"city": "SF"}), 29 | } 30 | } 31 | ], 32 | }, 33 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 34 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 35 | ] 36 | reference_outputs = [ 37 | {"role": "user", "content": "What is the weather in SF?"}, 38 | { 39 | "role": "assistant", 40 | "tool_calls": [ 41 | { 42 | "function": { 43 | "name": "get_weather", 44 | "arguments": json.dumps({"city": "San Francisco"}), 45 | } 46 | } 47 | ], 48 | }, 49 | {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."}, 50 | {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."}, 51 | ] 52 | eval_result = await evaluator( 53 | inputs=inputs, 54 | outputs=outputs, 55 | reference_outputs=reference_outputs, 56 | ) 57 | assert eval_result["key"] == "trajectory_accuracy" 58 | assert eval_result["score"] 59 | 60 | 61 | @pytest.mark.langsmith 62 | @pytest.mark.asyncio 63 | async def test_trajectory_no_ref(): 64 | evaluator = create_async_trajectory_llm_as_judge( 65 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 66 | ) 67 | outputs = [ 68 | {"role": "user", "content": "What is the weather in SF?"}, 69 | { 70 | "role": "assistant", 71 | "tool_calls": [ 72 | { 73 | "function": { 74 | "name": "get_weather", 75 | "arguments": json.dumps({"city": "SF"}), 76 | } 77 | } 78 | ], 79 | }, 80 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 81 | {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."}, 82 | ] 83 | eval_result = await evaluator( 84 | outputs=outputs, 85 | ) 86 | assert eval_result["key"] == "trajectory_accuracy" 87 | assert eval_result["score"] 88 | 89 | 90 | @pytest.mark.langsmith 91 | @pytest.mark.asyncio 92 | async def test_trajectory_no_ref_bad_trajectory(): 93 | evaluator = create_async_trajectory_llm_as_judge( 94 | prompt=TRAJECTORY_ACCURACY_PROMPT, model="openai:o3-mini" 95 | ) 96 | outputs = [ 97 | {"role": "user", "content": "What are some good restaurants in SF?"}, 98 | { 99 | "role": "assistant", 100 | "tool_calls": [ 101 | { 102 | "function": { 103 | "name": "get_weather", 104 | "arguments": json.dumps({"city": "SF"}), 105 | } 106 | } 107 | ], 108 | }, 109 | {"role": "tool", "content": "It's 80 degrees and sunny in SF."}, 110 | {"role": "assistant", "content": "The 
weather in SF is 80 degrees and sunny."}, 111 | ] 112 | eval_result = await evaluator( 113 | outputs=outputs, 114 | ) 115 | assert eval_result["key"] == "trajectory_accuracy" 116 | assert not eval_result["score"] 117 | 118 | 119 | @pytest.mark.langsmith 120 | @pytest.mark.asyncio 121 | async def test_trajectory_match_with_inverse_rubric(): 122 | REVERSE_PROMPT = """You are an expert data labeler. 123 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 124 | 125 | 126 | An inaccurate trajectory: 127 | - Makes no logical sense between steps 128 | - Shows no clear progression 129 | - Is not relatively efficient, though it does not need to be perfectly inefficient 130 | - Is not semantically equivalent to the provided reference trajectory, if present 131 | 132 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 133 | 134 | 135 | Grade the following trajectory: 136 | 137 | 138 | {outputs} 139 | 140 | {inputs} 141 | {reference_outputs} 142 | """ 143 | evaluator = create_async_trajectory_llm_as_judge( 144 | prompt=REVERSE_PROMPT, model="openai:o3-mini" 145 | ) 146 | inputs = "What is the weather in SF?" 147 | outputs = [ 148 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 149 | ChatCompletionMessage( 150 | role="assistant", 151 | tool_calls=[ 152 | { 153 | "function": { 154 | "name": "get_weather", 155 | "arguments": json.dumps({"city": "SF"}), 156 | } 157 | } 158 | ], 159 | ), 160 | ChatCompletionMessage(role="tool", content="It's 80 degrees and sunny in SF."), 161 | ChatCompletionMessage( 162 | role="assistant", content="The weather in SF is 80 degrees and sunny." 163 | ), 164 | ] 165 | reference_outputs = [ 166 | ChatCompletionMessage(role="user", content="What is the weather in SF?"), 167 | ChatCompletionMessage( 168 | role="assistant", 169 | tool_calls=[ 170 | { 171 | "function": { 172 | "name": "get_weather", 173 | "arguments": json.dumps({"city": "San Francisco"}), 174 | } 175 | } 176 | ], 177 | ), 178 | ChatCompletionMessage( 179 | role="tool", content="It's 80 degrees and sunny in San Francisco." 180 | ), 181 | ChatCompletionMessage( 182 | role="assistant", content="The weather in SF is 80˚ and sunny." 183 | ), 184 | ] 185 | eval_result = await evaluator( 186 | inputs=inputs, 187 | outputs=outputs, 188 | reference_outputs=reference_outputs, 189 | ) 190 | assert eval_result["key"] == "trajectory_accuracy" 191 | assert not eval_result["score"] 192 | -------------------------------------------------------------------------------- /js/src/trajectory/tests/trajectory_llm.test.ts: -------------------------------------------------------------------------------- 1 | import * as ls from "langsmith/vitest"; 2 | import { expect } from "vitest"; 3 | 4 | import { 5 | createTrajectoryLLMAsJudge, 6 | TRAJECTORY_ACCURACY_PROMPT, 7 | } from "../llm.js"; 8 | import { FlexibleChatCompletionMessage } from "../../types.js"; 9 | 10 | ls.describe("Trajectory LLM", () => { 11 | ls.test( 12 | "should match trajectories", 13 | { 14 | inputs: {}, 15 | }, 16 | async () => { 17 | const evaluator = createTrajectoryLLMAsJudge({ 18 | model: "openai:o3-mini", 19 | }); 20 | const inputs = {}; 21 | const outputs = [ 22 | { role: "user", content: "What is the weather in SF?" 
}, 23 | { 24 | role: "assistant", 25 | content: "", 26 | tool_calls: [ 27 | { 28 | function: { 29 | name: "get_weather", 30 | arguments: JSON.stringify({ city: "SF" }), 31 | }, 32 | }, 33 | ], 34 | }, 35 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 36 | { 37 | role: "assistant", 38 | content: "The weather in SF is 80 degrees and sunny.", 39 | }, 40 | ] satisfies FlexibleChatCompletionMessage[]; 41 | 42 | const referenceOutputs = [ 43 | { role: "user", content: "What is the weather in SF?" }, 44 | { 45 | role: "assistant", 46 | content: "", 47 | tool_calls: [ 48 | { 49 | function: { 50 | name: "get_weather", 51 | arguments: JSON.stringify({ city: "San Francisco" }), 52 | }, 53 | }, 54 | ], 55 | }, 56 | { 57 | role: "tool", 58 | content: "It's 80 degrees and sunny in San Francisco.", 59 | }, 60 | { role: "assistant", content: "The weather in SF is 80˚ and sunny." }, 61 | ] satisfies FlexibleChatCompletionMessage[]; 62 | 63 | const evalResult = await evaluator({ 64 | inputs, 65 | outputs, 66 | referenceOutputs, 67 | }); 68 | 69 | expect(evalResult.key).toBe("trajectory_accuracy"); 70 | expect(evalResult.score).toBe(true); 71 | } 72 | ); 73 | 74 | ls.test("trajectory no ref", { inputs: {} }, async () => { 75 | const evaluator = createTrajectoryLLMAsJudge({ 76 | prompt: TRAJECTORY_ACCURACY_PROMPT, 77 | model: "openai:o3-mini", 78 | }); 79 | const evalResult = await evaluator({ 80 | outputs: [ 81 | { role: "user", content: "What is the weather in SF?" }, 82 | { 83 | role: "assistant", 84 | content: "", 85 | tool_calls: [ 86 | { 87 | function: { 88 | name: "get_weather", 89 | arguments: JSON.stringify({ city: "SF" }), 90 | }, 91 | }, 92 | ], 93 | }, 94 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 95 | { 96 | role: "assistant", 97 | content: "The weather in SF is 80 degrees and sunny.", 98 | }, 99 | ], 100 | }); 101 | 102 | expect(evalResult.key).toBe("trajectory_accuracy"); 103 | expect(evalResult.score).toBe(true); 104 | }); 105 | 106 | ls.test("trajectory no ref bad trajectory", { inputs: {} }, async () => { 107 | const evaluator = createTrajectoryLLMAsJudge({ 108 | prompt: TRAJECTORY_ACCURACY_PROMPT, 109 | model: "openai:o3-mini", 110 | }); 111 | const outputs = [ 112 | { role: "user", content: "What are some good restaurants in SF?" }, 113 | { 114 | content: "", 115 | role: "assistant", 116 | tool_calls: [ 117 | { 118 | function: { 119 | name: "get_weather", 120 | arguments: JSON.stringify({ city: "SF" }), 121 | }, 122 | }, 123 | ], 124 | }, 125 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 126 | { 127 | role: "assistant", 128 | content: "The weather in SF is 80 degrees and sunny.", 129 | }, 130 | ] satisfies FlexibleChatCompletionMessage[]; 131 | const evalResult = await evaluator({ 132 | outputs, 133 | }); 134 | 135 | expect(evalResult.key).toBe("trajectory_accuracy"); 136 | expect(evalResult.score).toBe(false); 137 | }); 138 | 139 | ls.test( 140 | "should match trajectories with inverse rubric", 141 | { inputs: {} }, 142 | async () => { 143 | const REVERSE_PROMPT = `You are an expert data labeler. 144 | Your task is to grade the inaccuracy of an AI agent's internal trajectory. 
145 | 146 | 147 | An inaccurate trajectory: 148 | - Makes no logical sense between steps 149 | - Shows no clear progression 150 | - Is not relatively efficient, though it does not need to be perfectly inefficient 151 | - Is not semantically equivalent to the provided reference trajectory, if present 152 | 153 | We are looking for bad trajectories, so score should be 0 if the trajectory contains reasonable steps for the agent to answer the input, and 1 if not. 154 | 155 | 156 | Grade the following trajectory: 157 | 158 | 159 | {outputs} 160 | 161 | 162 | 163 | {inputs} 164 | 165 | 166 | According to this reference trajectory: 167 | 168 | 169 | {reference_outputs} 170 | 171 | `; 172 | 173 | const evaluator = createTrajectoryLLMAsJudge({ 174 | model: "openai:o3-mini", 175 | prompt: REVERSE_PROMPT, 176 | }); 177 | const inputs = {}; 178 | const outputs = [ 179 | { role: "user", content: "What is the weather in SF?" }, 180 | { 181 | role: "assistant", 182 | content: "", 183 | tool_calls: [ 184 | { 185 | function: { 186 | name: "get_weather", 187 | arguments: JSON.stringify({ city: "SF" }), 188 | }, 189 | }, 190 | ], 191 | }, 192 | { role: "tool", content: "It's 80 degrees and sunny in SF." }, 193 | { 194 | role: "assistant", 195 | content: "The weather in SF is 80 degrees and sunny.", 196 | }, 197 | ] satisfies FlexibleChatCompletionMessage[]; 198 | 199 | const referenceOutputs = [ 200 | { role: "user", content: "What is the weather in SF?" }, 201 | { 202 | role: "assistant", 203 | content: "", 204 | tool_calls: [ 205 | { 206 | function: { 207 | name: "get_weather", 208 | arguments: JSON.stringify({ city: "San Francisco" }), 209 | }, 210 | }, 211 | ], 212 | }, 213 | { 214 | role: "tool", 215 | content: "It's 80 degrees and sunny in San Francisco.", 216 | }, 217 | { role: "assistant", content: "The weather in SF is 80˚ and sunny." 
}, 218 | ] satisfies FlexibleChatCompletionMessage[]; 219 | 220 | const evalResult = await evaluator({ 221 | inputs, 222 | outputs, 223 | referenceOutputs, 224 | }); 225 | 226 | expect(evalResult.key).toBe("trajectory_accuracy"); 227 | expect(evalResult.score).toBe(false); 228 | } 229 | ); 230 | }); 231 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/utils.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "_is_trajectory_superset", 3 | "_extract_tool_calls", 4 | "_get_matcher_for_tool_name", 5 | "_normalize_to_openai_messages_list", 6 | "_convert_to_openai_message", 7 | ] 8 | 9 | import json 10 | 11 | from agentevals.types import ( 12 | ChatCompletionMessage, 13 | ToolArgsMatchMode, 14 | ToolArgsMatchOverrides, 15 | ) 16 | from langchain_core.messages import BaseMessage 17 | from langchain_core.messages.utils import convert_to_openai_messages 18 | from typing import Callable, Optional, Union 19 | 20 | 21 | # More flexible version of converting to OpenAI messages for trajectories 22 | def _convert_to_openai_message( 23 | message: Union[ChatCompletionMessage, BaseMessage, dict], 24 | ) -> ChatCompletionMessage: 25 | if not isinstance(message, BaseMessage): 26 | if not isinstance(message, dict): 27 | message = dict(message) 28 | if message.get("role") in ["ai", "assistant"] and message.get("tool_calls"): 29 | message["tool_calls"] = [ 30 | {**tool_call, "id": tool_call.get("id", "")} 31 | for tool_call in message["tool_calls"] 32 | ] 33 | if message.get("role") == "tool" and message.get("tool_call_id") is None: 34 | message["tool_call_id"] = "" 35 | if message.get("content") is None: 36 | message["content"] = "" 37 | converted = convert_to_openai_messages([message])[0] # type: ignore 38 | if isinstance(message, BaseMessage): 39 | if message.id is not None and converted.get("id") is None: 40 | converted["id"] = message.id 41 | else: 42 | if message.get("id") is not None and converted.get("id") is None: 43 | converted["id"] = message.get("id") 44 | return converted # type: ignore 45 | 46 | 47 | def _normalize_to_openai_messages_list( 48 | messages: Optional[ 49 | Union[ 50 | list[ChatCompletionMessage], list[BaseMessage], ChatCompletionMessage, dict 51 | ] 52 | ], 53 | ) -> list[ChatCompletionMessage]: 54 | if messages is None: 55 | return [] 56 | if isinstance(messages, dict): 57 | if "role" in messages: 58 | messages = [messages] # type: ignore 59 | elif "messages" in messages: 60 | messages = messages["messages"] # type: ignore 61 | else: 62 | raise ValueError("if messages is a dict, it must contain a 'messages' key") 63 | if not isinstance(messages, list): 64 | messages = [messages] # type: ignore 65 | return [_convert_to_openai_message(message) for message in messages] # type: ignore 66 | 67 | 68 | def _normalize_tool_call(tool_call: dict) -> dict: 69 | if "function" in tool_call: 70 | return { 71 | "name": tool_call["function"]["name"], 72 | "args": json.loads(tool_call["function"]["arguments"]), 73 | } 74 | else: 75 | return tool_call 76 | 77 | 78 | def _extract_tool_calls(messages: list[ChatCompletionMessage]) -> list[dict]: 79 | tool_calls: list[dict] = [] 80 | for message in messages: 81 | if "tool_calls" in message: 82 | normalized_tool_calls = [ 83 | _normalize_tool_call(tool_call) 84 | for tool_call in message["tool_calls"] or [] 85 | ] 86 | tool_calls.extend(normalized_tool_calls) 87 | return tool_calls 88 | 89 | 90 | def _is_trajectory_superset( 91 | outputs: 
list[ChatCompletionMessage], 92 | reference_outputs: list[ChatCompletionMessage], 93 | tool_args_match_mode: ToolArgsMatchMode, 94 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 95 | ): 96 | output_tool_calls = _extract_tool_calls(outputs) 97 | reference_tool_calls = _extract_tool_calls(reference_outputs) 98 | 99 | # Keep track of which reference tool calls have been matched 100 | matched_reference_calls = set() 101 | 102 | # For each reference tool call, find a matching output tool call 103 | for ref_call in reference_tool_calls: 104 | ref_name = ref_call["name"] 105 | ref_args = ref_call["args"] 106 | 107 | found_match = False 108 | for out_idx, out_call in enumerate(output_tool_calls): 109 | out_name = out_call["name"] 110 | 111 | # Names must match 112 | if ref_name != out_name: 113 | continue 114 | 115 | # If we're already using this output call for a different match, skip 116 | if out_idx in matched_reference_calls: 117 | continue 118 | 119 | # Check tool args according to match mode 120 | matcher = _get_matcher_for_tool_name( 121 | ref_name, tool_args_match_mode, tool_args_match_overrides 122 | ) 123 | 124 | out_args = out_call["args"] 125 | if matcher(out_args, ref_args): 126 | matched_reference_calls.add(out_idx) 127 | found_match = True 128 | break 129 | 130 | # If we didn't find a match for this reference call, we're not a superset 131 | if not found_match: 132 | return False 133 | 134 | return True 135 | 136 | 137 | def _exact_match(tool_call: dict, reference_tool_call: dict) -> bool: 138 | return tool_call == reference_tool_call 139 | 140 | 141 | def _subset_match(tool_call: dict, reference_tool_call: dict) -> bool: 142 | # Every key-value pair in tool_call must exist in reference_tool_call 143 | return all( 144 | key in reference_tool_call and reference_tool_call[key] == value 145 | for key, value in tool_call.items() 146 | ) 147 | 148 | 149 | def _superset_match(tool_call: dict, reference_tool_call: dict) -> bool: 150 | # Every key-value pair in reference_tool_call must exist in tool_call 151 | return all( 152 | key in tool_call and tool_call[key] == value 153 | for key, value in reference_tool_call.items() 154 | ) 155 | 156 | 157 | def _ignore_match(tool_call: dict, reference_tool_call: dict) -> bool: 158 | return True 159 | 160 | 161 | def _get_matcher_for_comparison_mode( 162 | mode: ToolArgsMatchMode, 163 | ) -> Callable[[dict, dict], bool]: 164 | if mode == "exact": 165 | return _exact_match 166 | elif mode == "subset": 167 | return _subset_match 168 | elif mode == "superset": 169 | return _superset_match 170 | else: 171 | return _ignore_match 172 | 173 | 174 | def _get_partial_matcher_on_keys(keys: list[str]) -> Callable[[dict, dict], bool]: 175 | def get_nested_value(d: dict, key_path: str): 176 | current = d 177 | for part in key_path.split("."): 178 | if not isinstance(current, dict): 179 | return None 180 | current = current.get(part) # type: ignore 181 | if current is None: 182 | return None 183 | return current 184 | 185 | def matcher(output_call: dict, reference_call: dict) -> bool: 186 | return all( 187 | get_nested_value(output_call, key) == get_nested_value(reference_call, key) 188 | for key in keys 189 | ) 190 | 191 | return matcher 192 | 193 | 194 | def _get_matcher_for_tool_name( 195 | tool_call_name: str, 196 | tool_args_match_mode: ToolArgsMatchMode, 197 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides], 198 | ) -> Callable[[dict, dict], bool]: 199 | matcher = _get_matcher_for_comparison_mode(tool_args_match_mode) 200 | 
if tool_args_match_overrides is not None and tool_args_match_overrides.get( 201 | tool_call_name, False 202 | ): 203 | override = tool_args_match_overrides.get(tool_call_name) 204 | if isinstance(override, str): 205 | matcher = _get_matcher_for_comparison_mode(override) 206 | elif callable(override): 207 | matcher = override 208 | elif isinstance(override, list): 209 | matcher = _get_partial_matcher_on_keys(override) 210 | else: 211 | raise ValueError(f"Invalid tool args match override: {override}") 212 | return matcher 213 | -------------------------------------------------------------------------------- /js/src/trajectory/utils.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChatCompletionMessage, 3 | ToolArgsMatchMode, 4 | ToolArgsMatchOverrides, 5 | ToolArgsMatcher, 6 | } from "../types.js"; 7 | 8 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 9 | function _normalizeToolCall(toolCall: Record): { 10 | name: string; 11 | args: Record; 12 | } { 13 | if ( 14 | "function" in toolCall && 15 | toolCall.function != null && 16 | typeof toolCall.function === "object" && 17 | typeof toolCall.function.arguments === "string" 18 | ) { 19 | return { 20 | name: toolCall.function.name, 21 | args: JSON.parse(toolCall.function.arguments), 22 | }; 23 | } 24 | return toolCall as { name: string; args: Record }; 25 | } 26 | 27 | function _extractToolCalls( 28 | messages: ChatCompletionMessage[] 29 | ): { name: string; args: Record }[] { 30 | const toolCalls: { name: string; args: Record }[] = []; 31 | for (const message of messages) { 32 | if (message.tool_calls) { 33 | toolCalls.push(...message.tool_calls.map(_normalizeToolCall)); 34 | } 35 | } 36 | return toolCalls; 37 | } 38 | 39 | export async function _isTrajectorySuperset( 40 | outputs: ChatCompletionMessage[], 41 | referenceOutputs: ChatCompletionMessage[], 42 | toolArgsMatchMode: ToolArgsMatchMode, 43 | toolArgsMatchOverrides?: ToolArgsMatchOverrides 44 | ): Promise { 45 | const outputToolCalls = _extractToolCalls(outputs); 46 | const referenceToolCalls = _extractToolCalls(referenceOutputs); 47 | 48 | // Keep track of which reference tool calls have been matched 49 | const matchedReferenceCalls = new Set(); 50 | 51 | // For each reference tool call, find a matching output tool call 52 | for (const refCall of referenceToolCalls) { 53 | const refName = refCall.name; 54 | const refArgs = refCall.args; 55 | 56 | let foundMatch = false; 57 | for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) { 58 | const outCall = outputToolCalls[outIdx]; 59 | const outName = outCall.name; 60 | 61 | // Names must match 62 | if (refName !== outName) { 63 | continue; 64 | } 65 | 66 | // If we're already using this output call for a different match, skip 67 | if (matchedReferenceCalls.has(outIdx)) { 68 | continue; 69 | } 70 | 71 | // Check tool args according to match mode 72 | const matcher = _getMatcherForToolName( 73 | refName, 74 | toolArgsMatchMode, 75 | toolArgsMatchOverrides 76 | ); 77 | 78 | const outArgs = outCall.args; 79 | if (await matcher(outArgs, refArgs)) { 80 | matchedReferenceCalls.add(outIdx); 81 | foundMatch = true; 82 | break; 83 | } 84 | } 85 | 86 | // If we didn't find a match for this reference call, we're not a superset 87 | if (!foundMatch) { 88 | return false; 89 | } 90 | } 91 | 92 | return true; 93 | } 94 | 95 | // Deep equality check function 96 | function _deepEqual(a: unknown, b: unknown): boolean { 97 | if (a == null && b == null) return true; 98 | if (a === b) 
return true; 99 | if (typeof a !== "object" || typeof b !== "object" || !a || !b) return false; 100 | 101 | if (Array.isArray(a) && Array.isArray(b)) { 102 | if (a.length !== b.length) return false; 103 | return a.every((val, index) => _deepEqual(val, b[index])); 104 | } 105 | 106 | const keysA = Object.keys(a); 107 | const keysB = Object.keys(b); 108 | 109 | if (keysA.length !== keysB.length) return false; 110 | 111 | return ( 112 | keysA.every((key) => keysB.includes(key)) && 113 | keysB.every((key) => keysA.includes(key)) && 114 | keysA.every((key) => 115 | _deepEqual( 116 | (a as Record)[key], 117 | (b as Record)[key] 118 | ) 119 | ) 120 | ); 121 | } 122 | 123 | function _exactMatch( 124 | toolCall: Record, 125 | referenceToolCall: Record 126 | ): boolean { 127 | return _deepEqual(toolCall, referenceToolCall); 128 | } 129 | 130 | function _ignoreMatch( 131 | _toolCall: Record, 132 | _referenceToolCall: Record 133 | ): boolean { 134 | return true; 135 | } 136 | 137 | function _subsetMatch( 138 | toolCall: Record, 139 | referenceToolCall: Record 140 | ): boolean { 141 | // Every key-value pair in toolCall must exist in referenceToolCall with the same value 142 | return Object.entries(toolCall).every( 143 | ([key, value]) => 144 | key in referenceToolCall && _deepEqual(referenceToolCall[key], value) 145 | ); 146 | } 147 | 148 | function _supersetMatch( 149 | toolCall: Record, 150 | referenceToolCall: Record 151 | ): boolean { 152 | // Every key-value pair in referenceToolCall must exist in toolCall with the same value 153 | return Object.entries(referenceToolCall).every( 154 | ([key, value]) => key in toolCall && _deepEqual(toolCall[key], value) 155 | ); 156 | } 157 | 158 | function _getMatcherForComparisonMode( 159 | mode: ToolArgsMatchMode 160 | ): ToolArgsMatcher { 161 | if (mode === "exact") { 162 | return _exactMatch; 163 | } else if (mode === "subset") { 164 | return _subsetMatch; 165 | } else if (mode === "superset") { 166 | return _supersetMatch; 167 | } else { 168 | return _ignoreMatch; 169 | } 170 | } 171 | 172 | function _getPartialMatcherOnKeys(keys: string[]): ToolArgsMatcher { 173 | const getNestedValue = ( 174 | d: Record, 175 | keyPath: string 176 | ): unknown => { 177 | let current: unknown = d; 178 | for (const part of keyPath.split(".")) { 179 | if (current && typeof current === "object" && part in current) { 180 | current = current[part as keyof typeof current]; 181 | } else { 182 | return undefined; 183 | } 184 | } 185 | return current; 186 | }; 187 | 188 | return ( 189 | outputCall: Record, 190 | referenceCall: Record 191 | ): boolean => { 192 | return keys.every((key) => { 193 | const nestedOutputValue = getNestedValue(outputCall, key); 194 | const nestedReferenceValue = getNestedValue(referenceCall, key); 195 | return _deepEqual(nestedOutputValue, nestedReferenceValue); 196 | }); 197 | }; 198 | } 199 | 200 | export function _getMatcherForToolName( 201 | toolCallName: string, 202 | toolArgsMatchMode: ToolArgsMatchMode, 203 | toolArgsMatchOverrides?: ToolArgsMatchOverrides 204 | ): ToolArgsMatcher { 205 | let matcher = _getMatcherForComparisonMode(toolArgsMatchMode); 206 | 207 | if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) { 208 | const override = toolArgsMatchOverrides[toolCallName]; 209 | 210 | if (typeof override === "string") { 211 | matcher = _getMatcherForComparisonMode(override); 212 | } else if (typeof override === "function") { 213 | matcher = override; 214 | } else if (Array.isArray(override)) { 215 | matcher = 
_getPartialMatcherOnKeys(override); 216 | } 217 | } 218 | 219 | return matcher; 220 | } 221 | 222 | export function _chatCompletionMessagesToString( 223 | messages: ChatCompletionMessage[] 224 | ): string { 225 | function formatMessage(message: ChatCompletionMessage): string { 226 | let content = message.content ?? ""; 227 | 228 | // Handle tool/function calls 229 | if (message.tool_calls) { 230 | const toolCallsStr = message.tool_calls 231 | .map((call: { function: { name: string; arguments: string } }) => { 232 | const func = call.function ?? {}; 233 | return `\n${func.name ?? ""}\n${func.arguments ?? ""}\n`; 234 | }) 235 | .join("\n"); 236 | 237 | content = content ? `${content}\n${toolCallsStr}` : toolCallsStr; 238 | } 239 | 240 | // Handle tool call results 241 | if (message.tool_call_id) { 242 | content = `\n${message.tool_call_id}\n${content}\n`; 243 | } 244 | 245 | return `<${message.role ?? ""}>\n${content}\n`; 246 | } 247 | 248 | return messages.map(formatMessage).join("\n\n"); 249 | } 250 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/strict.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from warnings import warn 3 | import json 4 | 5 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 6 | from agentevals.types import ( 7 | ChatCompletionMessage, 8 | ToolArgsMatchMode, 9 | ToolArgsMatchOverrides, 10 | ) 11 | from agentevals.utils import _run_evaluator, _arun_evaluator 12 | from agentevals.trajectory.utils import _get_matcher_for_tool_name 13 | 14 | from typing import Any, Union, Optional, TYPE_CHECKING 15 | 16 | if TYPE_CHECKING: 17 | from langchain_core.messages import BaseMessage 18 | 19 | 20 | def _scorer( 21 | *, 22 | outputs: list[ChatCompletionMessage], 23 | reference_outputs: list[ChatCompletionMessage], 24 | tool_args_match_mode: ToolArgsMatchMode, 25 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 26 | **kwargs: Any, 27 | ): 28 | outputs = _normalize_to_openai_messages_list(outputs) 29 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 30 | if outputs is None or reference_outputs is None: 31 | raise ValueError( 32 | "Strict trajectory match requires both outputs and reference_outputs" 33 | ) 34 | if len(outputs) != len(reference_outputs): 35 | return False 36 | for output, reference_output in zip(outputs, reference_outputs): 37 | if output["role"] != reference_output["role"]: 38 | return False 39 | elif ("tool_calls" in output and output["tool_calls"] is not None) != ( 40 | "tool_calls" in reference_output 41 | and reference_output["tool_calls"] is not None 42 | ): 43 | # One has tool calls while the other doesn't 44 | return False 45 | elif "tool_calls" in output and output["tool_calls"] is not None: 46 | # Both have tool calls, compare them 47 | if not isinstance(output["tool_calls"], list) or not isinstance( 48 | reference_output["tool_calls"], list 49 | ): 50 | return False 51 | if len(output["tool_calls"]) != len(reference_output["tool_calls"]): 52 | return False 53 | # Create a copy of reference tool calls to track matches 54 | seen = [False] * len(reference_output["tool_calls"]) 55 | for output_call in output["tool_calls"]: 56 | found_match = False 57 | for i, reference_call in enumerate(reference_output["tool_calls"]): 58 | if not seen[i] and ( 59 | output_call["function"]["name"] 60 | == reference_call["function"]["name"] 61 | ): 62 | matcher = 
_get_matcher_for_tool_name( 63 | output_call["function"]["name"], 64 | tool_args_match_mode, 65 | tool_args_match_overrides, 66 | ) 67 | if matcher( 68 | json.loads(output_call["function"]["arguments"]), 69 | json.loads(reference_call["function"]["arguments"]), 70 | ): 71 | found_match = True 72 | seen[i] = True 73 | break 74 | if not found_match: 75 | return False 76 | return True 77 | 78 | 79 | def trajectory_strict_match( 80 | *, 81 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 82 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 83 | tool_call_args_exact_match: bool = True, 84 | **kwargs: Any, 85 | ): 86 | """ 87 | DEPRECATED: Use create_trajectory_match_evaluator() instead: 88 | ```python 89 | from agentevals.trajectory.match import create_trajectory_match_evaluator 90 | evaluator = create_trajectory_match_evaluator(trajectory_match_mode="strict") 91 | evaluator(outputs=outputs, reference_outputs=reference_outputs) 92 | ``` 93 | 94 | Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 95 | This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 96 | 97 | Args: 98 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 99 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 100 | a "messages" key with one of the above. 101 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 102 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 103 | a "messages" key with one of the above. 104 | tool_call_args_exact_match (bool): Whether to require exact matches for tool call arguments 105 | 106 | Returns: 107 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 108 | """ 109 | warn( 110 | "trajectory_strict_match() is deprecated. Use create_trajectory_match_evaluator() instead.", 111 | DeprecationWarning, 112 | stacklevel=2, 113 | ) 114 | 115 | def wrapper(**kwargs: Any): 116 | return _scorer( 117 | tool_args_match_mode="exact" if tool_call_args_exact_match else "ignore", 118 | **kwargs, 119 | ) 120 | 121 | return _run_evaluator( 122 | run_name="trajectory_strict_match", 123 | scorer=wrapper, 124 | feedback_key="trajectory_strict_match", 125 | outputs=outputs, 126 | reference_outputs=reference_outputs, 127 | **kwargs, 128 | ) 129 | 130 | 131 | async def trajectory_strict_match_async( 132 | *, 133 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 134 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 135 | tool_call_args_exact_match: bool = True, 136 | **kwargs: Any, 137 | ): 138 | """ 139 | DEPRECATED: Use create_async_trajectory_match_evaluator() instead: 140 | ```python 141 | from agentevals.trajectory.match import create_trajectory_match_evaluator 142 | evaluator = create_async_trajectory_match_evaluator(trajectory_match_mode="subset") 143 | await evaluator(outputs=outputs, reference_outputs=reference_outputs) 144 | ``` 145 | 146 | Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory. 147 | This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory. 
148 | 149 | Args: 150 | outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Actual trajectory the agent followed. 151 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 152 | a "messages" key with one of the above. 153 | reference_outputs (Union[list[ChatCompletionMessage], list[BaseMessage], dict]): Ideal reference trajectory the agent should have followed. 154 | May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing 155 | a "messages" key with one of the above. 156 | tool_call_args_exact_match (bool): Whether to require exact matches for tool call arguments 157 | 158 | Returns: 159 | EvaluatorResult: Contains a score of True if trajectory (including called tools) matches, False otherwise 160 | """ 161 | warn( 162 | "trajectory_strict_match_async() is deprecated. Use create_async_trajectory_match_evaluator() instead.", 163 | DeprecationWarning, 164 | stacklevel=2, 165 | ) 166 | 167 | def wrapper(**kwargs: Any): 168 | return _scorer( 169 | tool_args_match_mode="exact" if tool_call_args_exact_match else "ignore", 170 | **kwargs, 171 | ) 172 | 173 | return await _arun_evaluator( 174 | run_name="trajectory_strict_match", 175 | scorer=wrapper, 176 | feedback_key="trajectory_strict_match", 177 | outputs=outputs, 178 | reference_outputs=reference_outputs, 179 | tool_call_args_exact_match=tool_call_args_exact_match, 180 | **kwargs, 181 | ) 182 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/match.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional, Union 2 | 3 | from agentevals.trajectory.strict import _scorer as trajectory_strict_scorer 4 | from agentevals.trajectory.unordered import _scorer as trajectory_unordered_scorer 5 | from agentevals.trajectory.subset import _scorer as trajectory_subset_scorer 6 | from agentevals.trajectory.superset import _scorer as trajectory_superset_scorer 7 | from agentevals.types import ( 8 | ChatCompletionMessage, 9 | SimpleEvaluator, 10 | SimpleAsyncEvaluator, 11 | ToolArgsMatchMode, 12 | ToolArgsMatchOverrides, 13 | ) 14 | from agentevals.utils import _run_evaluator, _arun_evaluator 15 | 16 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 17 | 18 | from langchain_core.messages import BaseMessage 19 | 20 | 21 | TrajectoryMatchMode = Literal["strict", "unordered", "subset", "superset"] 22 | 23 | 24 | def create_trajectory_match_evaluator( 25 | *, 26 | trajectory_match_mode: TrajectoryMatchMode = "strict", 27 | tool_args_match_mode: ToolArgsMatchMode = "exact", 28 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 29 | ) -> SimpleEvaluator: 30 | """Creates an evaluator that compares trajectories between model outputs and reference outputs. 31 | 32 | Args: 33 | trajectory_match_mode (TrajectoryMatchMode): The mode for matching trajectories: 34 | - "strict": Requires exact match in order and content 35 | - "unordered": Allows matching in any order 36 | - "subset": Accepts if output trajectory is a subset of reference 37 | - "superset": Accepts if output trajectory is a superset of reference 38 | tool_args_match_mode (ToolArgsMatchMode): Mode for matching tool arguments ("exact" by default, can be "ignore") 39 | tool_args_match_overrides (Optional[ToolArgsMatchOverrides]): Dict containing custom overrides for 40 | tool argument matching. 
Each key should be a tool name, and each value should be either a 41 | match mode or a matcher. Matchers should be a Callable that takes two sets of tool call args 42 | and returns whether they are equal. 43 | 44 | Returns: 45 | SimpleEvaluator: A function that evaluates trajectory matches between outputs and references 46 | 47 | The returned evaluator accepts: 48 | - outputs: List of messages or dict representing the model output trajectory 49 | - reference_outputs: List of messages or dict representing the reference trajectory 50 | - **kwargs: Additional arguments passed to the underlying evaluator 51 | 52 | Example: 53 | ```python 54 | def matcher(output_tool_call_args: dict, reference_tool_call_args: dict) -> bool: 55 | output_args = output_tool_call_args.get("query", "").lower() 56 | reference_args = reference_tool_call_args.get("query", "").lower() 57 | return output_args == reference_args 58 | 59 | evaluator = create_trajectory_match_evaluator( 60 | trajectory_match_mode="strict", 61 | tool_args_match_mode="exact", 62 | tool_args_match_overrides={ 63 | "my_tool_name": matcher, 64 | }, 65 | ) 66 | result = evaluator( 67 | outputs=..., 68 | reference_outputs=..., 69 | ) 70 | ``` 71 | """ 72 | if trajectory_match_mode == "strict": 73 | scorer = trajectory_strict_scorer 74 | elif trajectory_match_mode == "unordered": 75 | scorer = trajectory_unordered_scorer 76 | elif trajectory_match_mode == "subset": 77 | scorer = trajectory_subset_scorer 78 | elif trajectory_match_mode == "superset": 79 | scorer = trajectory_superset_scorer 80 | else: 81 | raise ValueError( 82 | f"Invalid trajectory match type: `{trajectory_match_mode}`. Must be one of `strict`, `unordered`, `subset`, or `superset`." 83 | ) 84 | 85 | if tool_args_match_mode not in ["exact", "ignore", "subset", "superset"]: 86 | raise ValueError( 87 | f"Invalid tool args match mode: `{tool_args_match_mode}`. Must be either `exact`, `ignore`, `subset`, or `superset`." 88 | ) 89 | 90 | def _wrapped_evaluator( 91 | *, 92 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 93 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 94 | **kwargs, 95 | ): 96 | outputs = _normalize_to_openai_messages_list(outputs) 97 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 98 | return _run_evaluator( 99 | run_name=f"trajectory_{trajectory_match_mode}_match", 100 | scorer=scorer, 101 | feedback_key=f"trajectory_{trajectory_match_mode}_match", 102 | outputs=outputs, 103 | reference_outputs=reference_outputs, 104 | tool_args_match_mode=tool_args_match_mode, 105 | tool_args_match_overrides=tool_args_match_overrides, 106 | **kwargs, 107 | ) 108 | 109 | return _wrapped_evaluator 110 | 111 | 112 | def create_async_trajectory_match_evaluator( 113 | *, 114 | trajectory_match_mode: TrajectoryMatchMode = "strict", 115 | tool_args_match_mode: ToolArgsMatchMode = "exact", 116 | tool_args_match_overrides: Optional[ToolArgsMatchOverrides] = None, 117 | ) -> SimpleAsyncEvaluator: 118 | """Creates an async evaluator that compares trajectories between model outputs and reference outputs. 
119 | 120 | Args: 121 | trajectory_match_mode (TrajectoryMatchMode): The mode for matching trajectories: 122 | - "strict": Requires exact match in order and content 123 | - "unordered": Allows matching in any order 124 | - "subset": Accepts if output trajectory is a subset of reference 125 | - "superset": Accepts if output trajectory is a superset of reference 126 | tool_args_match_mode (ToolArgsMatchMode): Mode for matching tool arguments ("exact" by default, can be "ignore") 127 | tool_args_match_overrides (Optional[ToolArgsMatchOverrides]): Dict containing custom overrides for 128 | tool argument matching. Each key should be a tool name, and each value should be either a 129 | match mode or a matcher. Matchers should be a Callable that takes two sets of tool call args 130 | and returns whether they are equal. 131 | 132 | Returns: 133 | SimpleAsyncEvaluator: An async function that evaluates trajectory matches between outputs and references 134 | 135 | The returned evaluator accepts: 136 | - outputs: List of messages or dict representing the model output trajectory 137 | - reference_outputs: List of messages or dict representing the reference trajectory 138 | - **kwargs: Additional arguments passed to the underlying evaluator 139 | 140 | Example: 141 | ```python 142 | def matcher(output_tool_call_args: dict, reference_tool_call_args: dict) -> bool: 143 | output_args = output_tool_call_args.get("query", "").lower() 144 | reference_args = reference_tool_call_args.get("query", "").lower() 145 | return output_args == reference_args 146 | 147 | evaluator = create_async_trajectory_match_evaluator( 148 | trajectory_match_mode="strict", 149 | tool_args_match_mode="exact", 150 | tool_args_match_overrides={ 151 | "my_tool_name": matcher, 152 | }, 153 | ) 154 | result = await evaluator( 155 | outputs=..., 156 | reference_outputs=..., 157 | ) 158 | ``` 159 | """ 160 | if trajectory_match_mode == "strict": 161 | scorer = trajectory_strict_scorer 162 | elif trajectory_match_mode == "unordered": 163 | scorer = trajectory_unordered_scorer 164 | elif trajectory_match_mode == "subset": 165 | scorer = trajectory_subset_scorer 166 | elif trajectory_match_mode == "superset": 167 | scorer = trajectory_superset_scorer 168 | else: 169 | raise ValueError( 170 | f"Invalid trajectory match type: `{trajectory_match_mode}`. Must be one of `strict`, `unordered`, `subset`, or `superset`." 171 | ) 172 | 173 | if tool_args_match_mode not in ["exact", "ignore", "subset", "superset"]: 174 | raise ValueError( 175 | f"Invalid tool args match mode: `{tool_args_match_mode}`. Must be either `exact`, `ignore`, `subset`, or `superset`." 
176 | ) 177 | 178 | async def _wrapped_evaluator( 179 | *, 180 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 181 | reference_outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 182 | **kwargs, 183 | ): 184 | outputs = _normalize_to_openai_messages_list(outputs) 185 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 186 | return await _arun_evaluator( 187 | run_name=f"trajectory_{trajectory_match_mode}_match", 188 | scorer=scorer, 189 | feedback_key=f"trajectory_{trajectory_match_mode}_match", 190 | outputs=outputs, 191 | reference_outputs=reference_outputs, 192 | tool_args_match_mode=tool_args_match_mode, 193 | tool_args_match_overrides=tool_args_match_overrides, 194 | **kwargs, 195 | ) 196 | 197 | return _wrapped_evaluator 198 | -------------------------------------------------------------------------------- /python/agentevals/trajectory/llm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from openevals.llm import ( 4 | _create_llm_as_judge_scorer, 5 | _create_async_llm_as_judge_scorer, 6 | ChatCompletionMessage, 7 | ModelClient, 8 | SimpleEvaluator, 9 | SimpleAsyncEvaluator, 10 | Callable, 11 | Optional, 12 | Union, 13 | ) 14 | from openevals.utils import ( 15 | _chat_completion_messages_to_string, 16 | ) 17 | from agentevals.types import FewShotExample 18 | from agentevals.utils import _run_evaluator, _arun_evaluator 19 | from agentevals.trajectory.utils import _normalize_to_openai_messages_list 20 | 21 | from langchain_core.language_models.chat_models import BaseChatModel 22 | from langchain_core.runnables import Runnable 23 | 24 | from typing import TYPE_CHECKING 25 | 26 | 27 | TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = """You are an expert data labeler. 28 | Your task is to grade the accuracy of an AI agent's internal trajectory. 29 | 30 | 31 | An accurate trajectory: 32 | - Makes logical sense between steps 33 | - Shows clear progression 34 | - Is relatively efficient, though it does not need to be perfectly efficient 35 | - Is semantically equivalent to the provided reference trajectory 36 | 37 | 38 | Based on the following reference trajectory: 39 | 40 | 41 | {reference_outputs} 42 | 43 | 44 | Grade this actual trajectory: 45 | 46 | 47 | {outputs} 48 | 49 | """ 50 | 51 | TRAJECTORY_ACCURACY_PROMPT = """You are an expert data labeler. 52 | Your task is to grade the accuracy of an AI agent's internal trajectory. 53 | 54 | 55 | An accurate trajectory: 56 | - Makes logical sense between steps 57 | - Shows clear progression 58 | - Is relatively efficient, though it does not need to be perfectly efficient 59 | 60 | 61 | First, try to understand the goal of the trajectory by looking at the input 62 | (if the input is not present try to infer it from the content of the first message), 63 | as well as the output of the final message. Once you understand the goal, grade the trajectory 64 | as it relates to achieving that goal. 
65 | 66 | Grade the following trajectory: 67 | 68 | 69 | {outputs} 70 | 71 | """ 72 | 73 | if TYPE_CHECKING: 74 | from langchain_core.messages import BaseMessage 75 | 76 | 77 | def _format_inputs( 78 | outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict], 79 | reference_outputs: Optional[ 80 | Union[list[ChatCompletionMessage], list[BaseMessage], dict] 81 | ], 82 | ) -> tuple[str, str]: 83 | outputs = _normalize_to_openai_messages_list(outputs) 84 | reference_outputs = _normalize_to_openai_messages_list(reference_outputs) 85 | if reference_outputs: 86 | formatted_reference_outputs = _chat_completion_messages_to_string( 87 | reference_outputs 88 | ) 89 | else: 90 | formatted_reference_outputs = "" 91 | formatted_outputs = _chat_completion_messages_to_string(outputs) 92 | return ( 93 | formatted_outputs, 94 | formatted_reference_outputs, 95 | ) 96 | 97 | 98 | def create_trajectory_llm_as_judge( 99 | *, 100 | prompt: str 101 | | Runnable 102 | | Callable[ 103 | ..., list[ChatCompletionMessage] 104 | ] = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, 105 | model: Optional[str] = None, 106 | feedback_key: str = "trajectory_accuracy", 107 | judge: Optional[ 108 | Union[ 109 | ModelClient, 110 | BaseChatModel, 111 | ] 112 | ] = None, 113 | continuous: bool = False, 114 | choices: Optional[list[float]] = None, 115 | use_reasoning: bool = True, 116 | few_shot_examples: Optional[list[FewShotExample]] = None, 117 | ) -> SimpleEvaluator: 118 | """Creates an evaluator that uses an LLM to judge agent trajectories. 119 | 120 | Args: 121 | prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable 122 | that returns a list of chat messages. Note that the default prompt allows a rubric 123 | in addition to the typical "inputs", "outputs", and "reference_outputs" parameters. 124 | feedback_key: Key used to store the evaluation result, defaults to "trajectory_accuracy". 125 | judge: The LLM used for evaluation. Can be an OpenAI client 126 | or a LangChain chat model. If an OpenAI client, must specify "model" as well. 127 | If omitted, "model" will be used to instantiate a LangChain model instance 128 | by model string. 129 | model: Model identifier to use. If "judge" is an OpenAI client, 130 | this argument should be a model name directly. If "judge" is omitted, must be a valid 131 | LangChain model identifier. See `init_chat_model` docs for more details: 132 | https://python.langchain.com/docs/how_to/chat_models_universal_init/. 133 | system: Optional system message to prepend to the prompt. 134 | continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False. 135 | choices: Optional list of specific float values the score must be chosen from. 136 | use_reasoning: If True, includes explanation for the score in the output. Defaults to True. 137 | few_shot_examples: Optional list of example evaluations to append to the prompt. 138 | 139 | Returns: 140 | SimpleEvaluator: A function that evaluates agent trajectories using the configured LLM judge. 
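    Example:
        A minimal usage sketch (mirroring tests/test_trajectory_llm.py; the model string and the
        trajectory variables are illustrative):

        ```python
        from agentevals.trajectory.llm import create_trajectory_llm_as_judge

        evaluator = create_trajectory_llm_as_judge(model="openai:o3-mini")
        result = evaluator(outputs=outputs, reference_outputs=reference_outputs)
        ```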
141 |     """
142 |     scorer = _create_llm_as_judge_scorer(
143 |         prompt=prompt,
144 |         judge=judge,
145 |         model=model,
146 |         continuous=continuous,
147 |         choices=choices,
148 |         use_reasoning=use_reasoning,
149 |         few_shot_examples=few_shot_examples,
150 |     )
151 |
152 |     def _wrapped_evaluator(
153 |         *,
154 |         outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict],
155 |         reference_outputs: Optional[
156 |             Union[list[ChatCompletionMessage], list[BaseMessage], dict]
157 |         ] = None,
158 |         **kwargs,
159 |     ):
160 |         (
161 |             formatted_outputs,
162 |             formatted_reference_outputs,
163 |         ) = _format_inputs(outputs, reference_outputs)
164 |         return _run_evaluator(
165 |             run_name=f"llm_as_{feedback_key}_judge",
166 |             scorer=scorer,
167 |             feedback_key=feedback_key,
168 |             outputs=formatted_outputs,
169 |             reference_outputs=formatted_reference_outputs,
170 |             **kwargs,
171 |         )
172 |
173 |     return _wrapped_evaluator
174 |
175 |
176 | def create_async_trajectory_llm_as_judge(
177 |     *,
178 |     prompt: str
179 |     | Runnable
180 |     | Callable[
181 |         ..., list[ChatCompletionMessage]
182 |     ] = TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
183 |     model: Optional[str] = None,
184 |     feedback_key: str = "trajectory_accuracy",
185 |     judge: Optional[
186 |         Union[
187 |             ModelClient,
188 |             BaseChatModel,
189 |         ]
190 |     ] = None,
191 |     continuous: bool = False,
192 |     choices: Optional[list[float]] = None,
193 |     use_reasoning: bool = True,
194 |     few_shot_examples: Optional[list[FewShotExample]] = None,
195 | ) -> SimpleAsyncEvaluator:
196 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
197 |
198 |     Args:
199 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
200 |             that returns a list of chat messages. Note that the default prompt allows a rubric
201 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
202 |         feedback_key: Key used to store the evaluation result, defaults to "trajectory_accuracy".
203 |         judge: The LLM used for evaluation. Can be an OpenAI client
204 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
205 |             If omitted, "model" will be used to instantiate a LangChain model instance
206 |             by model string.
207 |         model: Model identifier to use. If "judge" is an OpenAI client,
208 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
209 |             LangChain model identifier. See `init_chat_model` docs for more details:
210 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
211 |         system: Optional system message to prepend to the prompt.
212 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
213 |         choices: Optional list of specific float values the score must be chosen from.
214 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
215 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
216 |
217 |     Returns:
218 |         SimpleAsyncEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
219 |     """
220 |     scorer = _create_async_llm_as_judge_scorer(
221 |         prompt=prompt,
222 |         judge=judge,
223 |         model=model,
224 |         continuous=continuous,
225 |         choices=choices,
226 |         use_reasoning=use_reasoning,
227 |         few_shot_examples=few_shot_examples,
228 |     )
229 |
230 |     async def _wrapped_evaluator(
231 |         *,
232 |         outputs: Union[list[ChatCompletionMessage], list[BaseMessage], dict],
233 |         reference_outputs: Optional[
234 |             Union[list[ChatCompletionMessage], list[BaseMessage], dict]
235 |         ] = None,
236 |         **kwargs,
237 |     ):
238 |         (
239 |             formatted_outputs,
240 |             formatted_reference_outputs,
241 |         ) = _format_inputs(outputs, reference_outputs)
242 |         return await _arun_evaluator(
243 |             run_name=f"llm_as_{feedback_key}_judge",
244 |             scorer=scorer,
245 |             feedback_key=feedback_key,
246 |             outputs=formatted_outputs,
247 |             reference_outputs=formatted_reference_outputs,
248 |             **kwargs,
249 |         )
250 |
251 |     return _wrapped_evaluator
252 |
--------------------------------------------------------------------------------
/python/agentevals/graph_trajectory/llm.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from openevals.llm import (
3 |     _create_llm_as_judge_scorer,
4 |     _create_async_llm_as_judge_scorer,
5 |     ChatCompletionMessage,
6 |     ModelClient,
7 |     SimpleEvaluator,
8 |     SimpleAsyncEvaluator,
9 |     Callable,
10 |     Optional,
11 |     Union,
12 | )
13 | from langchain_core.runnables import Runnable
14 |
15 | from agentevals.types import EvaluatorResult, FewShotExample, GraphTrajectory
16 | from agentevals.utils import _run_evaluator, _arun_evaluator
17 |
18 | from langchain_core.language_models.chat_models import BaseChatModel
19 |
20 | DEFAULT_REF_COMPARE_PROMPT = """You are an expert data labeler.
21 | Your task is to grade the accuracy of an AI agent's internal steps in resolving user queries.
22 |
23 |
24 | An accurate trajectory:
25 | - Makes logical sense between steps
26 | - Shows clear progression
27 | - Is relatively efficient, though it does not need to be perfectly efficient
28 | - Is semantically equivalent to the provided reference trajectory, if present
29 |
30 |
31 |
32 | Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
33 | For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
34 | interrupting to await additional data from another source ("human-in-the-loop"):
35 |
36 |
37 |
38 | {thread}
39 |
40 |
41 | {reference_outputs}
42 | """
43 |
44 |
45 | def _format_thread(
46 |     inputs: list,
47 |     outputs: GraphTrajectory,
48 | ) -> str:
49 |     formatted_thread = ""
50 |     for input, result, step in zip(inputs, outputs["results"], outputs["steps"]):
51 |         formatted_thread += f"\n\n{input}\n\n" if input else ""
52 |         formatted_thread += f"\n\n{step}\n\n"
53 |         formatted_thread += f"\n\n{result}\n\n"
54 |     return formatted_thread
55 |
56 |
57 | def _format_inputs(
58 |     inputs: Optional[Union[list, dict]],
59 |     outputs: GraphTrajectory,
60 |     reference_outputs: Optional[GraphTrajectory],
61 | ) -> tuple[str, str]:
62 |     if isinstance(inputs, dict):
63 |         if "inputs" not in inputs:
64 |             raise ValueError("inputs must be a list or a dict with an 'inputs' key")
65 |         inputs = inputs["inputs"]
66 |     if len(inputs) != len(outputs["results"]):
67 |         raise ValueError(
68 |             "Provided `inputs` and `results` within provided `outputs` must have the same length"
69 |         )
70 |     if inputs is not None and len(inputs) != len(outputs["steps"]):
71 |         raise ValueError(
72 |             "Provided `inputs` and `steps` within provided `outputs` must have the same length"
73 |         )
74 |     formatted_thread = _format_thread(inputs, outputs)  # type: ignore
75 |     if reference_outputs:
76 |         formatted_reference_outputs = f"\nUse the following trajectory as an example reference when grading:\n\n{_format_thread(reference_outputs['inputs'], reference_outputs)}\n\n"
77 |     else:
78 |         formatted_reference_outputs = ""
79 |     return (
80 |         formatted_thread,
81 |         formatted_reference_outputs,
82 |     )
83 |
84 |
85 | def create_graph_trajectory_llm_as_judge(
86 |     *,
87 |     prompt: str
88 |     | Runnable
89 |     | Callable[..., list[ChatCompletionMessage]] = DEFAULT_REF_COMPARE_PROMPT,
90 |     model: Optional[str] = None,
91 |     feedback_key: str = "graph_trajectory_accuracy",
92 |     judge: Optional[
93 |         Union[
94 |             ModelClient,
95 |             BaseChatModel,
96 |         ]
97 |     ] = None,
98 |     continuous: bool = False,
99 |     choices: Optional[list[float]] = None,
100 |     use_reasoning: bool = True,
101 |     few_shot_examples: Optional[list[FewShotExample]] = None,
102 | ) -> SimpleEvaluator:
103 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
104 |
105 |     Args:
106 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
107 |             that returns a list of chat messages. Note that the default prompt allows a rubric
108 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
109 |         feedback_key: Key used to store the evaluation result, defaults to "graph_trajectory_accuracy".
110 |         judge: The LLM used for evaluation. Can be an OpenAI client
111 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
112 |             If omitted, "model" will be used to instantiate a LangChain model instance
113 |             by model string.
114 |         model: Model identifier to use. If "judge" is an OpenAI client,
115 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
116 |             LangChain model identifier. See `init_chat_model` docs for more details:
117 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
118 |         system: Optional system message to prepend to the prompt.
119 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
120 |         choices: Optional list of specific float values the score must be chosen from.
121 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
122 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
123 |
124 |     Returns:
125 |         SimpleEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
126 |     """
127 |     scorer = _create_llm_as_judge_scorer(
128 |         prompt=prompt,
129 |         judge=judge,
130 |         model=model,
131 |         continuous=continuous,
132 |         choices=choices,
133 |         use_reasoning=use_reasoning,
134 |         few_shot_examples=few_shot_examples,
135 |     )
136 |
137 |     def _wrapped_evaluator(
138 |         *,
139 |         inputs: Optional[Union[dict, list]] = None,
140 |         outputs: GraphTrajectory,
141 |         reference_outputs: Optional[GraphTrajectory] = None,
142 |         **kwargs,
143 |     ) -> EvaluatorResult:
144 |         (
145 |             formatted_thread,
146 |             formatted_reference_outputs,
147 |         ) = _format_inputs(inputs, outputs, reference_outputs)
148 |         return _run_evaluator(
149 |             run_name=f"llm_as_{feedback_key}_judge",
150 |             scorer=scorer,
151 |             feedback_key=feedback_key,
152 |             inputs=inputs,
153 |             outputs=outputs,
154 |             thread=formatted_thread,
155 |             reference_outputs=formatted_reference_outputs,
156 |             **kwargs,
157 |         )
158 |
159 |     return _wrapped_evaluator
160 |
161 |
162 | def create_async_graph_trajectory_llm_as_judge(
163 |     *,
164 |     prompt: str
165 |     | Runnable
166 |     | Callable[..., list[ChatCompletionMessage]] = DEFAULT_REF_COMPARE_PROMPT,
167 |     model: Optional[str] = None,
168 |     feedback_key: str = "graph_trajectory_accuracy",
169 |     judge: Optional[
170 |         Union[
171 |             ModelClient,
172 |             BaseChatModel,
173 |         ]
174 |     ] = None,
175 |     continuous: bool = False,
176 |     choices: Optional[list[float]] = None,
177 |     use_reasoning: bool = True,
178 |     few_shot_examples: Optional[list[FewShotExample]] = None,
179 | ) -> SimpleAsyncEvaluator:
180 |     """Creates an evaluator that uses an LLM to judge agent trajectories.
181 |
182 |     Args:
183 |         prompt: The evaluation prompt, can be a string template, LangChain prompt template, or callable
184 |             that returns a list of chat messages. Note that the default prompt allows a rubric
185 |             in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
186 |         feedback_key: Key used to store the evaluation result, defaults to "graph_trajectory_accuracy".
187 |         judge: The LLM used for evaluation. Can be an OpenAI client
188 |             or a LangChain chat model. If an OpenAI client, must specify "model" as well.
189 |             If omitted, "model" will be used to instantiate a LangChain model instance
190 |             by model string.
191 |         model: Model identifier to use. If "judge" is an OpenAI client,
192 |             this argument should be a model name directly. If "judge" is omitted, must be a valid
193 |             LangChain model identifier. See `init_chat_model` docs for more details:
194 |             https://python.langchain.com/docs/how_to/chat_models_universal_init/.
195 |         system: Optional system message to prepend to the prompt.
196 |         continuous: If True, score will be a float between 0 and 1. If False, score will be boolean. Defaults to False.
197 |         choices: Optional list of specific float values the score must be chosen from.
198 |         use_reasoning: If True, includes explanation for the score in the output. Defaults to True.
199 |         few_shot_examples: Optional list of example evaluations to append to the prompt.
200 |
201 |     Returns:
202 |         SimpleAsyncEvaluator: A function that evaluates agent trajectories using the configured LLM judge.
203 |     """
204 |     scorer = _create_async_llm_as_judge_scorer(
205 |         prompt=prompt,
206 |         judge=judge,
207 |         model=model,
208 |         continuous=continuous,
209 |         choices=choices,
210 |         use_reasoning=use_reasoning,
211 |         few_shot_examples=few_shot_examples,
212 |     )
213 |
214 |     async def _wrapped_evaluator(
215 |         *,
216 |         inputs: Optional[Union[dict, list]] = None,
217 |         outputs: GraphTrajectory,
218 |         reference_outputs: Optional[GraphTrajectory] = None,
219 |         **kwargs,
220 |     ) -> EvaluatorResult:
221 |         (
222 |             formatted_thread,
223 |             formatted_reference_outputs,
224 |         ) = _format_inputs(inputs, outputs, reference_outputs)
225 |         return await _arun_evaluator(
226 |             run_name=f"llm_as_{feedback_key}_judge",
227 |             scorer=scorer,
228 |             feedback_key=feedback_key,
229 |             inputs=inputs,
230 |             outputs=outputs,
231 |             thread=formatted_thread,
232 |             reference_outputs=formatted_reference_outputs,
233 |             **kwargs,
234 |         )
235 |
236 |     return _wrapped_evaluator
237 |
--------------------------------------------------------------------------------
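
# Illustrative usage sketch for the trajectory LLM-as-judge factories defined in
# /python/agentevals/trajectory/llm.py above. This is a minimal example, not part of the
# package itself: the model identifier, message contents, and tool name are hypothetical,
# and the trajectory is assumed to be a list of OpenAI-style chat completion message dicts.

import json

from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_trajectory_llm_as_judge,
)

# No reference trajectory is available here, so use the reference-free prompt.
evaluator = create_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:gpt-4o-mini",  # assumed identifier; see the init_chat_model docs
    continuous=True,  # score as a float between 0 and 1 instead of a boolean
)

# An agent trajectory as a list of OpenAI-style chat messages (hypothetical content).
outputs = [
    {"role": "user", "content": "What is the weather in San Francisco?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "function": {
                    "name": "get_weather",
                    "arguments": json.dumps({"city": "San Francisco"}),
                }
            }
        ],
    },
    {"role": "tool", "content": "It's 75 degrees and sunny in San Francisco."},
    {"role": "assistant", "content": "It is 75 degrees and sunny in San Francisco."},
]

result = evaluator(outputs=outputs)
print(result)  # feedback keyed as "trajectory_accuracy", with a score and reasoning comment

# The async factory is used the same way:
#     evaluator = create_async_trajectory_llm_as_judge(...)
#     result = await evaluator(outputs=outputs)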
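
# Illustrative usage sketch for the graph trajectory LLM-as-judge factory defined in
# /python/agentevals/graph_trajectory/llm.py above, again a minimal example rather than
# library code: the inputs, results, and step/node names below are made up; they only
# need to have matching lengths, as enforced by _format_inputs.

from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge

graph_evaluator = create_graph_trajectory_llm_as_judge(
    model="openai:gpt-4o-mini",  # assumed identifier; see the init_chat_model docs
)

# One thread with two turns: the agent interrupts to ask the user for
# confirmation ("__interrupt__"), then resumes and answers.
inputs = [
    {"role": "user", "content": "What's the weather in San Francisco?"},
    {"role": "user", "content": "Yes, please look it up."},
]
outputs = {
    "results": [
        "",
        "The weather in San Francisco is 75 degrees and sunny.",
    ],
    "steps": [
        ["__start__", "agent", "__interrupt__"],
        ["agent", "tools", "agent"],
    ],
}

result = graph_evaluator(inputs=inputs, outputs=outputs)
print(result)  # feedback keyed as "graph_trajectory_accuracy"

# In practice, a graph trajectory would typically be extracted from a LangGraph thread
# with the helpers in /python/agentevals/graph_trajectory/utils.py rather than written
# out by hand as above.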