├── README.md ├── build-eval-agent ├── .env.example ├── .gitignore ├── agent-eval.ipynb ├── chinook.db └── images │ ├── architecture.png │ ├── evals-conceptual.png │ ├── final-response.png │ ├── lookup.png │ ├── refund.png │ ├── single-step.png │ ├── trajectory.png │ └── with-supervisor.png └── evaluate-document-extraction ├── .env.example ├── .gitignore ├── aapl.pdf ├── build-eval-extraction.ipynb ├── evaluate-document-extraction-dataset.csv └── extraction-eval.png /README.md: -------------------------------------------------------------------------------- 1 | # The Judge 2 | 3 | This repo is a collection of notebooks for the 'The Judge' video series 4 | 5 | ### 1. Build and evaluate an agent 6 | 7 | architecture 8 | 9 | ### 2. Evaluate document extraction 10 | 11 | architecture 12 | -------------------------------------------------------------------------------- /build-eval-agent/.env.example: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | export LANGCHAIN_API_KEY= 3 | export LANGCHAIN_TRACING_V2=true 4 | export ANTHROPIC_API_KEY= 5 | -------------------------------------------------------------------------------- /build-eval-agent/.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /build-eval-agent/agent-eval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Build and evaluate a customer support agent 🦜🕸️" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\"architecture\"" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Build the agent" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "vscode": { 28 | "languageId": 
import requests

# Download a pre-built Chinook SQLite database used by the rest of the notebook.
url = "https://storage.googleapis.com/benchmarks-artifacts/chinook/Chinook.db"

response = requests.get(url)

if response.status_code == 200:
    # Open a local file in binary write mode and persist the response body.
    with open("chinook.db", "wb") as file:
        file.write(response.content)
    # Report the filename we actually wrote ("chinook.db"); the original
    # message incorrectly said "Chinook.db".
    print("File downloaded and saved as chinook.db")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")
Status code: {response.status_code}\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### List tables" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 34, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Tables in the database:\n", 128 | "Album\n", 129 | "Artist\n", 130 | "Customer\n", 131 | "Employee\n", 132 | "Genre\n", 133 | "Invoice\n", 134 | "InvoiceLine\n", 135 | "MediaType\n", 136 | "Playlist\n", 137 | "PlaylistTrack\n", 138 | "Track\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "import sqlite3\n", 144 | "\n", 145 | "# Connect to the database\n", 146 | "conn = sqlite3.connect(\"chinook.db\")\n", 147 | "cursor = conn.cursor()\n", 148 | "\n", 149 | "# Fetch the names of all tables\n", 150 | "cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n", 151 | "tables = cursor.fetchall()\n", 152 | "\n", 153 | "# Print the names of all tables\n", 154 | "print(\"Tables in the database:\")\n", 155 | "for table in tables:\n", 156 | " print(table[0])\n", 157 | "\n", 158 | "# Close the connection\n", 159 | "conn.close()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### Define refund subgraph " 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "\"refund\"" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "##### Helper functions to fetch invoice info and execute refunds" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 35, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "import sqlite3\n", 190 | "\n", 191 | "def _refund(invoice_id: int | None, invoice_line_ids: list[int] | None, mock: bool = False) -> float:\n", 192 | " \"\"\"Given an Invoice ID and/or Invoice Line IDs, delete the 
relevant Invoice/InvoiceLine records in the Chinook DB.\n", 193 | "\n", 194 | " Args:\n", 195 | " invoice_id: The Invoice to delete.\n", 196 | " invoice_line_ids: The Invoice Lines to delete.\n", 197 | " mock: If True, do not actually delete the specified Invoice/Invoice Lines. Used for testing purposes.\n", 198 | "\n", 199 | " Returns:\n", 200 | " float: The total dollar amount that was deleted (or mock deleted).\n", 201 | " \"\"\"\n", 202 | "\n", 203 | " if invoice_id is None and invoice_line_ids is None:\n", 204 | " return 0.0\n", 205 | "\n", 206 | " # Connect to the Chinook database\n", 207 | " conn = sqlite3.connect(\"chinook.db\")\n", 208 | " cursor = conn.cursor()\n", 209 | "\n", 210 | " total_refund = 0.0\n", 211 | "\n", 212 | " try:\n", 213 | " # If invoice_id is provided, delete entire invoice and its lines\n", 214 | " if invoice_id is not None:\n", 215 | " # First get the total amount for the invoice\n", 216 | " cursor.execute(\n", 217 | " \"\"\"\n", 218 | " SELECT Total\n", 219 | " FROM Invoice\n", 220 | " WHERE InvoiceId = ?\n", 221 | " \"\"\",\n", 222 | " (invoice_id,),\n", 223 | " )\n", 224 | "\n", 225 | " result = cursor.fetchone()\n", 226 | " if result:\n", 227 | " total_refund += result[0]\n", 228 | "\n", 229 | " # Delete invoice lines first (due to foreign key constraints)\n", 230 | " if not mock:\n", 231 | " cursor.execute(\n", 232 | " \"\"\"\n", 233 | " DELETE FROM InvoiceLine\n", 234 | " WHERE InvoiceId = ?\n", 235 | " \"\"\",\n", 236 | " (invoice_id,),\n", 237 | " )\n", 238 | "\n", 239 | " # Then delete the invoice\n", 240 | " cursor.execute(\n", 241 | " \"\"\"\n", 242 | " DELETE FROM Invoice\n", 243 | " WHERE InvoiceId = ?\n", 244 | " \"\"\",\n", 245 | " (invoice_id,),\n", 246 | " )\n", 247 | "\n", 248 | " # If specific invoice lines are provided\n", 249 | " if invoice_line_ids is not None:\n", 250 | " # Get the total amount for the specified invoice lines\n", 251 | " placeholders = \",\".join([\"?\" for _ in invoice_line_ids])\n", 252 | " 
cursor.execute(\n", 253 | " f\"\"\"\n", 254 | " SELECT SUM(UnitPrice * Quantity)\n", 255 | " FROM InvoiceLine\n", 256 | " WHERE InvoiceLineId IN ({placeholders})\n", 257 | " \"\"\",\n", 258 | " invoice_line_ids,\n", 259 | " )\n", 260 | "\n", 261 | " result = cursor.fetchone()\n", 262 | " if result and result[0]:\n", 263 | " total_refund += result[0]\n", 264 | "\n", 265 | " if not mock:\n", 266 | " # Delete the specified invoice lines\n", 267 | " cursor.execute(\n", 268 | " f\"\"\"\n", 269 | " DELETE FROM InvoiceLine\n", 270 | " WHERE InvoiceLineId IN ({placeholders})\n", 271 | " \"\"\",\n", 272 | " invoice_line_ids,\n", 273 | " )\n", 274 | "\n", 275 | " # Commit the changes\n", 276 | " conn.commit()\n", 277 | "\n", 278 | " except sqlite3.Error as e:\n", 279 | " # Roll back in case of error\n", 280 | " conn.rollback()\n", 281 | " raise e\n", 282 | "\n", 283 | " finally:\n", 284 | " # Close the connection\n", 285 | " conn.close()\n", 286 | "\n", 287 | " return float(total_refund)\n", 288 | "\n", 289 | "def _lookup(\n", 290 | " customer_first_name: str,\n", 291 | " customer_last_name: str,\n", 292 | " customer_phone: str,\n", 293 | " track_name: str | None,\n", 294 | " album_title: str | None,\n", 295 | " artist_name: str | None,\n", 296 | " purchase_date_iso_8601: str | None,\n", 297 | ") -> list[dict]:\n", 298 | " \"\"\"Find all of the Invoice Line IDs in the Chinook DB for the given filters.\n", 299 | "\n", 300 | " Returns:\n", 301 | " a list of dictionaries that contain keys: {\n", 302 | " 'invoice_line_id',\n", 303 | " 'track_name',\n", 304 | " 'artist_name',\n", 305 | " 'purchase_date',\n", 306 | " 'quantity_purchased',\n", 307 | " 'price_per_unit'\n", 308 | " }\n", 309 | " \"\"\"\n", 310 | "\n", 311 | " # Connect to the database\n", 312 | " conn = sqlite3.connect(\"chinook.db\")\n", 313 | " cursor = conn.cursor()\n", 314 | "\n", 315 | " # Base query joining all necessary tables\n", 316 | " query = \"\"\"\n", 317 | " SELECT\n", 318 | " il.InvoiceLineId,\n", 319 
| " t.Name as track_name,\n", 320 | " art.Name as artist_name,\n", 321 | " i.InvoiceDate as purchase_date,\n", 322 | " il.Quantity as quantity_purchased,\n", 323 | " il.UnitPrice as price_per_unit\n", 324 | " FROM InvoiceLine il\n", 325 | " JOIN Invoice i ON il.InvoiceId = i.InvoiceId\n", 326 | " JOIN Customer c ON i.CustomerId = c.CustomerId\n", 327 | " JOIN Track t ON il.TrackId = t.TrackId\n", 328 | " JOIN Album alb ON t.AlbumId = alb.AlbumId\n", 329 | " JOIN Artist art ON alb.ArtistId = art.ArtistId\n", 330 | " WHERE c.FirstName = ?\n", 331 | " AND c.LastName = ?\n", 332 | " AND c.Phone = ?\n", 333 | " \"\"\"\n", 334 | "\n", 335 | " # Parameters for the query\n", 336 | " params = [customer_first_name, customer_last_name, customer_phone]\n", 337 | "\n", 338 | " # Add optional filters\n", 339 | " if track_name:\n", 340 | " query += \" AND t.Name = ?\"\n", 341 | " params.append(track_name)\n", 342 | "\n", 343 | " if album_title:\n", 344 | " query += \" AND alb.Title = ?\"\n", 345 | " params.append(album_title)\n", 346 | "\n", 347 | " if artist_name:\n", 348 | " query += \" AND art.Name = ?\"\n", 349 | " params.append(artist_name)\n", 350 | "\n", 351 | " if purchase_date_iso_8601:\n", 352 | " query += \" AND date(i.InvoiceDate) = date(?)\"\n", 353 | " params.append(purchase_date_iso_8601)\n", 354 | "\n", 355 | " # Execute query\n", 356 | " cursor.execute(query, params)\n", 357 | "\n", 358 | " # Fetch results\n", 359 | " results = cursor.fetchall()\n", 360 | "\n", 361 | " # Convert results to list of dictionaries\n", 362 | " output = []\n", 363 | " for row in results:\n", 364 | " output.append(\n", 365 | " {\n", 366 | " \"invoice_line_id\": row[0],\n", 367 | " \"track_name\": row[1],\n", 368 | " \"artist_name\": row[2],\n", 369 | " \"purchase_date\": row[3],\n", 370 | " \"quantity_purchased\": row[4],\n", 371 | " \"price_per_unit\": row[5],\n", 372 | " }\n", 373 | " )\n", 374 | "\n", 375 | " # Close connection\n", 376 | " conn.close()\n", 377 | "\n", 378 | " return 
output" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "##### Build the graph" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 36, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from typing import Literal\n", 395 | "import json\n", 396 | "\n", 397 | "from langchain.chat_models import init_chat_model\n", 398 | "from langchain_core.runnables import RunnableConfig\n", 399 | "from langgraph.graph import END, StateGraph\n", 400 | "from langgraph.graph.message import AnyMessage, add_messages\n", 401 | "from langgraph.types import Command\n", 402 | "from tabulate import tabulate\n", 403 | "from typing_extensions import Annotated, TypedDict\n", 404 | "\n", 405 | "# Graph state.\n", 406 | "class State(TypedDict):\n", 407 | " \"\"\"Agent state.\"\"\"\n", 408 | " messages: Annotated[list[AnyMessage], add_messages]\n", 409 | " followup: str | None\n", 410 | "\n", 411 | " invoice_id: int | None\n", 412 | " invoice_line_ids: list[int] | None\n", 413 | " customer_first_name: str | None\n", 414 | " customer_last_name: str | None\n", 415 | " customer_phone: str | None\n", 416 | " track_name: str | None\n", 417 | " album_title: str | None\n", 418 | " artist_name: str | None\n", 419 | " purchase_date_iso_8601: str | None\n", 420 | "\n", 421 | "# Instructions for extracting the user/purchase info from the conversation.\n", 422 | "gather_info_instructions = \"\"\"You are managing an online music store that sells song tracks. \\\n", 423 | "Customers can buy multiple tracks at a time and these purchases are recorded in a database as \\\n", 424 | "an Invoice per purchase and an associated set of Invoice Lines for each purchased track.\n", 425 | "\n", 426 | "Your task is to help customers who would like a refund for one or more of the tracks they've \\\n", 427 | "purchased. 
# Extraction schema, mirrors the graph state.
class PurchaseInformation(TypedDict):
    """All of the known information about the invoice / invoice lines the customer would like refunded. Do not make up values, leave fields as null if you don't know their value."""

    invoice_id: int | None
    invoice_line_ids: list[int] | None
    customer_first_name: str | None
    customer_last_name: str | None
    customer_phone: str | None
    track_name: str | None
    album_title: str | None
    artist_name: str | None
    purchase_date_iso_8601: str | None
    followup: Annotated[
        str | None,
        ...,
        "If the user hasn't enough identifying information, please tell them what the required information is and ask them to specify it.",
    ]


# Model for performing extraction.
info_llm = init_chat_model("gpt-4o-mini").with_structured_output(
    PurchaseInformation, method="json_schema", include_raw=True
)


# Graph node for extracting user info and routing to lookup/refund/END.
async def gather_info(state: State) -> Command[Literal["lookup", "refund", "__end__"]]:
    """Extract purchase info from the conversation and route the turn.

    Routes to 'refund' when invoice identifiers are present, to 'lookup' when
    the full customer identity (first/last name + phone) is known, and
    otherwise ends the turn so the model's clarifying question is returned.
    """
    info = await info_llm.ainvoke(
        [
            {"role": "system", "content": gather_info_instructions},
            *state["messages"],
        ]
    )
    parsed = info["parsed"]
    if any(parsed[k] for k in ("invoice_id", "invoice_line_ids")):
        goto = "refund"
    elif all(
        parsed[k]
        for k in ("customer_first_name", "customer_last_name", "customer_phone")
    ):
        goto = "lookup"
    else:
        goto = END
    # Persist every extracted field onto the graph state alongside the raw
    # assistant message.
    update = {"messages": [info["raw"]], **parsed}
    return Command(update=update, goto=goto)


# Graph node for executing the refund.
# Note that here we inspect the runtime config for an "env" variable.
# If "env" is set to "test", then we don't actually delete any rows from our
# database. This will become important when we're running our evaluations.
def refund(state: State, config: RunnableConfig) -> dict:
    """Execute (or mock) the refund and report the refunded amount."""
    # Whether to mock the deletion. True if the configurable var 'env' is set to 'test'.
    mock = config.get("configurable", {}).get("env", "prod") == "test"
    refunded = _refund(
        invoice_id=state["invoice_id"],
        invoice_line_ids=state["invoice_line_ids"],
        mock=mock,
    )
    response = f"You have been refunded a total of: ${refunded:.2f}. Is there anything else I can help with?"
    return {
        "messages": [{"role": "assistant", "content": response}],
        "followup": response,
    }


# Graph node for looking up the users purchases.
def lookup(state: State) -> dict:
    """Look up the user's purchases and ask which ones to refund."""
    args = (
        state[k]
        for k in (
            "customer_first_name",
            "customer_last_name",
            "customer_phone",
            "track_name",
            "album_title",
            "artist_name",
            "purchase_date_iso_8601",
        )
    )
    results = _lookup(*args)
    if not results:
        response = "We did not find any purchases associated with the information you've provided. Are you sure you've entered all of your information correctly?"
        followup = response
    else:
        # Fix: the fenced code block needs a newline after "```json" to be
        # valid markdown (the original produced "```json{...").
        response = f"Which of the following purchases would you like to be refunded for?\n\n```json\n{json.dumps(results, indent=2)}\n```"
        followup = f"Which of the following purchases would you like to be refunded for?\n\n{tabulate(results, headers='keys')}"
    return {
        "messages": [{"role": "assistant", "content": response}],
        "followup": followup,
        "invoice_line_ids": [res["invoice_line_id"] for res in results],
    }


# Building our graph.
graph_builder = StateGraph(State)

graph_builder.add_node(gather_info)
graph_builder.add_node(refund)
graph_builder.add_node(lookup)

graph_builder.set_entry_point("gather_info")
graph_builder.add_edge("lookup", END)
graph_builder.add_edge("refund", END)

refund_graph = graph_builder.compile()
# We'll create vectorstore indexes for all of the artists, tracks and albums
# ahead of time and use those to disambiguate the user input. E.g. if a user
# searches for songs by "prince" and our DB records the artist as "Prince",
# ideally when we query our artist vectorstore for "prince" we'll get back the
# value "Prince", which we can then use in our SQL queries.
def index_fields() -> tuple[InMemoryVectorStore, InMemoryVectorStore, InMemoryVectorStore]:
    """Create an index for all artists, an index for all albums, and an index for all songs."""
    # Initialize to None so the finally block can't hit a NameError when
    # sqlite3.connect itself raises (the original referenced an unbound name).
    conn = None
    try:
        conn = sqlite3.connect("chinook.db")
        cursor = conn.cursor()

        tracks = cursor.execute("SELECT Name FROM Track").fetchall()
        artists = cursor.execute("SELECT Name FROM Artist").fetchall()
        albums = cursor.execute("SELECT Title FROM Album").fetchall()
    finally:
        if conn:
            conn.close()

    embeddings = init_embeddings("openai:text-embedding-3-small")

    track_store = InMemoryVectorStore(embeddings)
    artist_store = InMemoryVectorStore(embeddings)
    album_store = InMemoryVectorStore(embeddings)

    track_store.add_texts([t[0] for t in tracks])
    artist_store.add_texts([a[0] for a in artists])
    album_store.add_texts([a[0] for a in albums])
    return track_store, artist_store, album_store


track_store, artist_store, album_store = index_fields()


# Agent tools
@tool
def lookup_track(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[dict]:
    """Lookup a track in Chinook DB based on identifying information about the track.

    Returns:
        a list of dictionaries per matching track that contain keys {'track_name', 'artist_name', 'album_name'}
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT t.Name as track_name, ar.Name as artist_name, al.Title as album_name
        FROM Track t
        JOIN Album al ON t.AlbumId = al.AlbumId
        JOIN Artist ar ON al.ArtistId = ar.ArtistId
        WHERE 1=1
        """
        params = []

        # Each fuzzy user value is first resolved to the closest canonical
        # name via its vectorstore, then used as a LIKE filter.
        if track_name:
            track_name = track_store.similarity_search(track_name, k=1)[0].page_content
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            album_title = album_store.similarity_search(album_title, k=1)[0].page_content
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            artist_name = artist_store.similarity_search(artist_name, k=1)[0].page_content
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [
            {"track_name": row[0], "artist_name": row[1], "album_name": row[2]}
            for row in cursor.fetchall()
        ]
    finally:
        # Close even on query failure (the original leaked the connection).
        conn.close()


@tool
def lookup_album(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[dict]:
    """Lookup an album in Chinook DB based on identifying information about the album.

    Returns:
        a list of dictionaries per matching album that contain keys {'album_name', 'artist_name'}
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT al.Title as album_name, ar.Name as artist_name
        FROM Album al
        JOIN Artist ar ON al.ArtistId = ar.ArtistId
        LEFT JOIN Track t ON t.AlbumId = al.AlbumId
        WHERE 1=1
        """
        params = []

        if track_name:
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [
            {"album_name": row[0], "artist_name": row[1]}
            for row in cursor.fetchall()
        ]
    finally:
        conn.close()


@tool
def lookup_artist(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[str]:
    """Lookup an artist in Chinook DB based on identifying information about the artist.

    Returns:
        a list of matching artist names
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT ar.Name as artist_name
        FROM Artist ar
        LEFT JOIN Album al ON al.ArtistId = ar.ArtistId
        LEFT JOIN Track t ON t.AlbumId = al.AlbumId
        WHERE 1=1
        """
        params = []

        if track_name:
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [row[0] for row in cursor.fetchall()]
    finally:
        conn.close()


# Agent model
qa_llm = init_chat_model("claude-3-5-sonnet-latest")
# The prebuilt ReACT agent only expects State to have a 'messages' key, so the
# state we defined for the refund agent can also be passed to our lookup agent.
qa_graph = create_react_agent(qa_llm, [lookup_track, lookup_artist, lookup_album])
# Schema for routing user intent.
# We'll use structured outputs to enforce that the model returns only
# the desired output.
class UserIntent(TypedDict):
    """The user's current intent in the conversation"""

    intent: Literal["refund", "question_answering"]


# Routing model with structured output.
router_llm = init_chat_model("gpt-4o-mini").with_structured_output(
    UserIntent, method="json_schema", strict=True
)

# Instructions for routing. (Typo fix: "purhcase" -> "purchase".)
route_instructions = """You are managing an online music store that sells song tracks. \
You can help customers in two types of ways: (1) answering general questions about \
tracks sold at your store, (2) helping them get a refund on a purchase they made at your store.

Based on the following conversation, determine if the user is currently seeking general \
information about song tracks or if they are trying to refund a specific purchase.

Return 'refund' if they are trying to get a refund and 'question_answering' if they are \
asking a general music question. Do NOT return anything else. Do NOT try to respond to \
the user.
"""


# Node for routing.
async def intent_classifier(
    state: State,
) -> Command[Literal["refund_agent", "question_answering_agent"]]:
    """Classify the user's intent and route to the matching sub-agent."""
    # Use the async client so this async node doesn't block the event loop
    # (the original called the sync .invoke from an async function).
    response = await router_llm.ainvoke(
        [{"role": "system", "content": route_instructions}, *state["messages"]]
    )
    return Command(goto=response["intent"] + "_agent")


# Node for making sure the 'followup' key is set before our agent run completes.
def compile_followup(state: State) -> dict:
    """Set the followup to be the last message if it hasn't explicitly been set."""
    if not state.get("followup"):
        return {"followup": state["messages"][-1].content}
    return {}


# Agent definition.
graph_builder = StateGraph(State)
graph_builder.add_node(intent_classifier)
# Since all of our subagents have compatible state,
# we can add them as nodes directly.
graph_builder.add_node("refund_agent", refund_graph)
graph_builder.add_node("question_answering_agent", qa_graph)
graph_builder.add_node(compile_followup)

graph_builder.set_entry_point("intent_classifier")
graph_builder.add_edge("refund_agent", "compile_followup")
graph_builder.add_edge("question_answering_agent", "compile_followup")
graph_builder.add_edge("compile_followup", END)

graph = graph_builder.compile()
I bought some songs by Led Zeppelin that i'd like refunded\",\n", 832 | " }\n", 833 | "]})\n", 834 | "print(state[\"followup\"])" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "## Evaluate the agent" 842 | ] 843 | }, 844 | { 845 | "cell_type": "markdown", 846 | "metadata": {}, 847 | "source": [ 848 | "\"conceptual" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "### Final response" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "\"final" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "##### Create a dataset" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 39, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "from langsmith import Client\n", 879 | "\n", 880 | "client = Client()\n", 881 | "\n", 882 | "# Create a dataset\n", 883 | "examples = [\n", 884 | " {\n", 885 | " \"question\": \"How many songs do you have by James Brown\",\n", 886 | " \"response\": \"We have 20 songs by James Brown\",\n", 887 | " },\n", 888 | " {\n", 889 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund.\",\n", 890 | " \"response\": \"I need some more information to help you with the refund. Please specify your phone number, the invoice ID, or the line item IDs for the purchase you'd like refunded.\",\n", 891 | " },\n", 892 | " {\n", 893 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund on my Led Zeppelin purchases. 
My number is +1 (204) 452-6452\",\n", 894 | " \"response\": 'Which of the following purchases would you like to be refunded for?\\n\\n invoice_line_id track_name artist_name purchase_date quantity_purchased price_per_unit\\n----------------- -------------------------------- ------------- ------------------- -------------------- ----------------\\n 267 How Many More Times Led Zeppelin 2009-08-06 00:00:00 1 0.99\\n 268 What Is And What Should Never Be Led Zeppelin 2009-08-06 00:00:00 1 0.99',\n", 895 | " },\n", 896 | " {\n", 897 | " \"question\": \"Who recorded Wish You Were Here again?\",\n", 898 | " \"response\": \"Wish You Were Here is an album by Pink Floyd\",\n", 899 | " },\n", 900 | " { \n", 901 | " \"question\": \"I want a full refund for invoice 237\",\n", 902 | " \"response\": \"You have been refunded $0.99.\",\n", 903 | " },\n", 904 | "]\n", 905 | "\n", 906 | "dataset_name = \"Chinook Customer Service Bot: Final Response\"\n", 907 | "\n", 908 | "if not client.has_dataset(dataset_name=dataset_name):\n", 909 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 910 | " client.create_examples(\n", 911 | " inputs=[{\"question\": ex[\"question\"]} for ex in examples],\n", 912 | " outputs=[{\"response\": ex[\"response\"]} for ex in examples],\n", 913 | " dataset_id=dataset.id\n", 914 | " )" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "##### Define application logic to be evaluated" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 40, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [ 930 | "# Target function\n", 931 | "async def run_graph(inputs: dict) -> dict:\n", 932 | " \"\"\"Run graph and track the final response.\"\"\"\n", 933 | " result = await graph.ainvoke({\"messages\": [\n", 934 | " { \"role\": \"user\", \"content\": inputs['question']},\n", 935 | " ]}, config={\"env\": \"test\"})\n", 936 | " return {\"response\": result[\"followup\"]}" 937 | ] 
938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "##### Define evaluator" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 41, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "# LLM-as-judge instructions\n", 953 | "grader_instructions = \"\"\"You are a teacher grading a quiz.\n", 954 | "\n", 955 | "You will be given a QUESTION, the GROUND TRUTH (correct) RESPONSE, and the STUDENT RESPONSE.\n", 956 | "\n", 957 | "Here is the grade criteria to follow:\n", 958 | "(1) Grade the student responses based ONLY on their factual accuracy relative to the ground truth answer.\n", 959 | "(2) Ensure that the student response does not contain any conflicting statements.\n", 960 | "(3) It is OK if the student response contains more information than the ground truth response, as long as it is factually accurate relative to the ground truth response.\n", 961 | "\n", 962 | "Correctness:\n", 963 | "True means that the student's response meets all of the criteria.\n", 964 | "False means that the student's response does not meet all of the criteria.\n", 965 | "\n", 966 | "Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.\"\"\"\n", 967 | "\n", 968 | "# LLM-as-judge output schema\n", 969 | "class Grade(TypedDict):\n", 970 | " \"\"\"Compare the expected and actual answers and grade the actual answer.\"\"\"\n", 971 | " reasoning: Annotated[str, ..., \"Explain your reasoning for whether the actual response is correct or not.\"]\n", 972 | " is_correct: Annotated[bool, ..., \"True if the student response is mostly or exactly correct, otherwise False.\"]\n", 973 | "\n", 974 | "# Judge LLM\n", 975 | "grader_llm = init_chat_model(\"gpt-4o-mini\", temperature=0).with_structured_output(Grade, method=\"json_schema\", strict=True)\n", 976 | "\n", 977 | "# Evaluator function\n", 978 | "async def final_answer_correct(inputs: dict, outputs: dict, 
reference_outputs: dict) -> bool:\n", 979 | " \"\"\"Evaluate if the final response is equivalent to reference response.\"\"\"\n", 980 | " # Note that we assume the outputs has a 'response' dictionary. We'll need to make sure\n", 981 | " # that the target function we define includes this key.\n", 982 | " user = f\"\"\"QUESTION: {inputs['question']}\n", 983 | " GROUND TRUTH RESPONSE: {reference_outputs['response']}\n", 984 | " STUDENT RESPONSE: {outputs['response']}\"\"\"\n", 985 | "\n", 986 | " grade = await grader_llm.ainvoke([{\"role\": \"system\", \"content\": grader_instructions}, {\"role\": \"user\", \"content\": user}])\n", 987 | " return grade[\"is_correct\"]" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "##### Run evaluation" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": 42, 1000 | "metadata": {}, 1001 | "outputs": [ 1002 | { 1003 | "name": "stdout", 1004 | "output_type": "stream", 1005 | "text": [ 1006 | "View the evaluation results for experiment: 'sql-agent-gpt4o-e2e-d984d052' at:\n", 1007 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/18ed4055-e160-40cb-ba16-e1af9002b310/compare?selectedSessions=57f00a9d-9450-4c34-8396-6f139cff8e0a\n", 1008 | "\n", 1009 | "\n" 1010 | ] 1011 | }, 1012 | { 1013 | "name": "stderr", 1014 | "output_type": "stream", 1015 | "text": [ 1016 | "5it [00:11, 2.36s/it]\n" 1017 | ] 1018 | }, 1019 | { 1020 | "data": { 1021 | "text/html": [ 1022 | "
\n", 1023 | "\n", 1036 | "\n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | "
inputs.questionoutputs.responseerrorreference.responsefeedback.final_answer_correctexecution_timeexample_idid
0I want a full refund for invoice 237You have been refunded a total of: $0.99. Is t...NoneYou have been refunded $0.99.True3.750356b094b32b-94f6-49a4-9af7-a7ccb289cba52ab8e132-e3b0-4ffe-ba73-3d42aea64fda
1My name is Aaron Mitchell and I'd like a refun...We did not find any purchases associated with ...NoneWhich of the following purchases would you lik...False3.7489792274dab9-5682-4e35-8f30-e3171886b2a9c9d3bb87-0ec2-4557-b384-cbf038e18149
2My name is Aaron Mitchell and I'd like a refund.Please provide your phone number and either th...NoneI need some more information to help you with ...True3.10867716e7f941-7c18-4152-82fe-639ce4feca0e35d9a6f6-dbb7-445f-a214-a12af1d495b0
3How many songs do you have by James BrownThere are 20 James Brown songs in the database...NoneWe have 20 songs by James BrownTrue6.4461635a516727-b97f-40ea-a363-605a8b92669a21d7d353-accd-43f0-9776-56cffb8a3954
4Who recorded Wish You Were Here again?I apologize, but I'm not finding \"Wish You Wer...NoneWish You Were Here is an album by Pink FloydFalse10.35657527e113d9-498f-49ab-a66b-8395a4a6b32bc476f0a7-b2f5-4628-8e59-c6eb2a4acf4d
\n", 1108 | "
" 1109 | ], 1110 | "text/plain": [ 1111 | " inputs.question \\\n", 1112 | "0 I want a full refund for invoice 237 \n", 1113 | "1 My name is Aaron Mitchell and I'd like a refun... \n", 1114 | "2 My name is Aaron Mitchell and I'd like a refund. \n", 1115 | "3 How many songs do you have by James Brown \n", 1116 | "4 Who recorded Wish You Were Here again? \n", 1117 | "\n", 1118 | " outputs.response error \\\n", 1119 | "0 You have been refunded a total of: $0.99. Is t... None \n", 1120 | "1 We did not find any purchases associated with ... None \n", 1121 | "2 Please provide your phone number and either th... None \n", 1122 | "3 There are 20 James Brown songs in the database... None \n", 1123 | "4 I apologize, but I'm not finding \"Wish You Wer... None \n", 1124 | "\n", 1125 | " reference.response \\\n", 1126 | "0 You have been refunded $0.99. \n", 1127 | "1 Which of the following purchases would you lik... \n", 1128 | "2 I need some more information to help you with ... \n", 1129 | "3 We have 20 songs by James Brown \n", 1130 | "4 Wish You Were Here is an album by Pink Floyd \n", 1131 | "\n", 1132 | " feedback.final_answer_correct execution_time \\\n", 1133 | "0 True 3.750356 \n", 1134 | "1 False 3.748979 \n", 1135 | "2 True 3.108677 \n", 1136 | "3 True 6.446163 \n", 1137 | "4 False 10.356575 \n", 1138 | "\n", 1139 | " example_id id \n", 1140 | "0 b094b32b-94f6-49a4-9af7-a7ccb289cba5 2ab8e132-e3b0-4ffe-ba73-3d42aea64fda \n", 1141 | "1 2274dab9-5682-4e35-8f30-e3171886b2a9 c9d3bb87-0ec2-4557-b384-cbf038e18149 \n", 1142 | "2 16e7f941-7c18-4152-82fe-639ce4feca0e 35d9a6f6-dbb7-445f-a214-a12af1d495b0 \n", 1143 | "3 5a516727-b97f-40ea-a363-605a8b92669a 21d7d353-accd-43f0-9776-56cffb8a3954 \n", 1144 | "4 27e113d9-498f-49ab-a66b-8395a4a6b32b c476f0a7-b2f5-4628-8e59-c6eb2a4acf4d " 1145 | ] 1146 | }, 1147 | "execution_count": 42, 1148 | "metadata": {}, 1149 | "output_type": "execute_result" 1150 | } 1151 | ], 1152 | "source": [ 1153 | "from langsmith import Client\n", 1154 | "\n", 
1155 | "client = Client()\n", 1156 | "\n", 1157 | "# Evaluation job and results\n", 1158 | "experiment_results = await client.aevaluate(\n", 1159 | " run_graph,\n", 1160 | " data=dataset_name,\n", 1161 | " evaluators=[final_answer_correct],\n", 1162 | " experiment_prefix=\"sql-agent-gpt4o-e2e\",\n", 1163 | " num_repetitions=1,\n", 1164 | " max_concurrency=4,\n", 1165 | ")\n", 1166 | "experiment_results.to_pandas()" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "markdown", 1171 | "metadata": {}, 1172 | "source": [ 1173 | "### Single step evaluator" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "markdown", 1178 | "metadata": {}, 1179 | "source": [ 1180 | "\"single" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "markdown", 1185 | "metadata": {}, 1186 | "source": [ 1187 | "##### Create dataset" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "code", 1192 | "execution_count": 43, 1193 | "metadata": {}, 1194 | "outputs": [], 1195 | "source": [ 1196 | "# Create dataset\n", 1197 | "examples = [\n", 1198 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i bought some tracks recently and i dont like them\"}], \"route\": \"refund_agent\"},\n", 1199 | " {\"messages\": [{\"role\": \"user\", \"content\": \"I was thinking of purchasing some Rolling Stones tunes, any recommendations?\"}], \"route\": \"question_answering_agent\"},\n", 1200 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i want a refund on purchase 237\"}, {\"role\": \"assistant\", \"content\": \"I've refunded you a total of $1.98. How else can I help you today?\"}, {\"role\": \"user\", \"content\": \"did prince release any albums in 2000?\"}], \"route\": \"question_answering_agent\"},\n", 1201 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i purchased a cover of Yesterday recently but can't remember who it was by, which versions of it do you have?\"}], \"route\": \"question_answering_agent\"},\n", 1202 | " {\"messages\": [{\"role\": \"user\", \"content\": \"Can I get my money back? 
I bought an album from the store last week, but it was the wrong one.\"}], \"route\": \"refund_agent\"}\n", 1203 | "]\n", 1204 | "\n", 1205 | "dataset_name = \"Chinook Customer Service Bot: Single Step\"\n", 1206 | "if not client.has_dataset(dataset_name=dataset_name):\n", 1207 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 1208 | " client.create_examples(\n", 1209 | " inputs = [{\"messages\": ex[\"messages\"]} for ex in examples],\n", 1210 | " outputs = [{\"route\": ex[\"route\"]} for ex in examples],\n", 1211 | " dataset_id=dataset.id\n", 1212 | " )" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "##### Define application logic to be evaluated" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": 44, 1225 | "metadata": {}, 1226 | "outputs": [], 1227 | "source": [ 1228 | "# Target function for running the relevant step\n", 1229 | "async def run_intent_classifier(inputs: dict) -> dict:\n", 1230 | " # Note that we can access and run the intent_classifier node of our graph directly.\n", 1231 | " command = await graph.nodes['intent_classifier'].ainvoke(inputs)\n", 1232 | " return {\"route\": command.goto}" 1233 | ] 1234 | }, 1235 | { 1236 | "cell_type": "markdown", 1237 | "metadata": {}, 1238 | "source": [ 1239 | "##### Define evaluator" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "code", 1244 | "execution_count": 45, 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [ 1248 | "# Evaluator\n", 1249 | "def correct(outputs: dict, reference_outputs: dict) -> bool:\n", 1250 | " \"\"\"Check if the agent chose the correct route.\"\"\"\n", 1251 | " return outputs[\"route\"] == reference_outputs[\"route\"]" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "markdown", 1256 | "metadata": {}, 1257 | "source": [ 1258 | "##### Run evaluation" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": 46, 1264 | "metadata": {}, 1265 | 
"outputs": [ 1266 | { 1267 | "name": "stdout", 1268 | "output_type": "stream", 1269 | "text": [ 1270 | "View the evaluation results for experiment: 'sql-agent-gpt4o-intent-classifier-3b90c1a3' at:\n", 1271 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/fd928d25-c809-4ca3-b12a-35c0cb306022/compare?selectedSessions=8cdb4a0a-fd7b-40cd-bd83-024b82ae2a83\n", 1272 | "\n", 1273 | "\n" 1274 | ] 1275 | }, 1276 | { 1277 | "name": "stderr", 1278 | "output_type": "stream", 1279 | "text": [ 1280 | "5it [00:02, 2.07it/s]\n" 1281 | ] 1282 | }, 1283 | { 1284 | "data": { 1285 | "text/html": [ 1286 | "
\n", 1287 | "\n", 1300 | "\n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | "
inputs.messagesoutputs.routeerrorreference.routefeedback.correctexecution_timeexample_idid
0[{'role': 'user', 'content': 'Can I get my mon...refund_agentNonerefund_agentTrue0.63442547bb4665-da6f-4b58-a314-fefc886040c3f6ccda12-147a-41ad-8e3c-93da544b6ca5
1[{'role': 'user', 'content': 'i purchased a co...question_answering_agentNonequestion_answering_agentTrue0.412655e88b4749-4b72-42d0-9efd-e69ff276b4e247fb6f5c-0d54-4757-b3df-ba2d271d1cf3
2[{'role': 'user', 'content': 'i want a refund ...question_answering_agentNonequestion_answering_agentTrue0.4091380dc9df85-806c-477c-ba2e-b1872b445db9aacd798d-9afb-4b84-ab67-e2ec1c3adb22
3[{'role': 'user', 'content': 'I was thinking o...question_answering_agentNonequestion_answering_agentTrue0.410535cd503c74-547a-41fb-8763-ed808dcf9ba97770076c-0894-460f-a7ec-ed53ae491cf4
4[{'role': 'user', 'content': 'i bought some tr...refund_agentNonerefund_agentTrue0.342001ac23df0a-f0d7-442f-b85d-26e62126adceb227ce5d-5378-4195-ab76-3d52f1db1b2a
\n", 1372 | "
" 1373 | ], 1374 | "text/plain": [ 1375 | " inputs.messages \\\n", 1376 | "0 [{'role': 'user', 'content': 'Can I get my mon... \n", 1377 | "1 [{'role': 'user', 'content': 'i purchased a co... \n", 1378 | "2 [{'role': 'user', 'content': 'i want a refund ... \n", 1379 | "3 [{'role': 'user', 'content': 'I was thinking o... \n", 1380 | "4 [{'role': 'user', 'content': 'i bought some tr... \n", 1381 | "\n", 1382 | " outputs.route error reference.route feedback.correct \\\n", 1383 | "0 refund_agent None refund_agent True \n", 1384 | "1 question_answering_agent None question_answering_agent True \n", 1385 | "2 question_answering_agent None question_answering_agent True \n", 1386 | "3 question_answering_agent None question_answering_agent True \n", 1387 | "4 refund_agent None refund_agent True \n", 1388 | "\n", 1389 | " execution_time example_id \\\n", 1390 | "0 0.634425 47bb4665-da6f-4b58-a314-fefc886040c3 \n", 1391 | "1 0.412655 e88b4749-4b72-42d0-9efd-e69ff276b4e2 \n", 1392 | "2 0.409138 0dc9df85-806c-477c-ba2e-b1872b445db9 \n", 1393 | "3 0.410535 cd503c74-547a-41fb-8763-ed808dcf9ba9 \n", 1394 | "4 0.342001 ac23df0a-f0d7-442f-b85d-26e62126adce \n", 1395 | "\n", 1396 | " id \n", 1397 | "0 f6ccda12-147a-41ad-8e3c-93da544b6ca5 \n", 1398 | "1 47fb6f5c-0d54-4757-b3df-ba2d271d1cf3 \n", 1399 | "2 aacd798d-9afb-4b84-ab67-e2ec1c3adb22 \n", 1400 | "3 7770076c-0894-460f-a7ec-ed53ae491cf4 \n", 1401 | "4 b227ce5d-5378-4195-ab76-3d52f1db1b2a " 1402 | ] 1403 | }, 1404 | "execution_count": 46, 1405 | "metadata": {}, 1406 | "output_type": "execute_result" 1407 | } 1408 | ], 1409 | "source": [ 1410 | "# Run evaluation\n", 1411 | "experiment_results = await client.aevaluate(\n", 1412 | " run_intent_classifier,\n", 1413 | " data=dataset_name,\n", 1414 | " evaluators=[correct],\n", 1415 | " experiment_prefix=\"sql-agent-gpt4o-intent-classifier\",\n", 1416 | " max_concurrency=4,\n", 1417 | ")\n", 1418 | "experiment_results.to_pandas()" 1419 | ] 1420 | }, 1421 | { 1422 | "cell_type": 
"markdown", 1423 | "metadata": {}, 1424 | "source": [ 1425 | "### Trajectory evaluator" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "\"trajectory\"" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "markdown", 1437 | "metadata": {}, 1438 | "source": [ 1439 | "##### Create dataset" 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": 47, 1445 | "metadata": {}, 1446 | "outputs": [], 1447 | "source": [ 1448 | "from langsmith import Client\n", 1449 | "\n", 1450 | "client = Client()\n", 1451 | "\n", 1452 | "# Create a dataset\n", 1453 | "examples = [\n", 1454 | " {\n", 1455 | " \"question\": \"How many songs do you have by James Brown\",\n", 1456 | " \"trajectory\": [\"intent_classifier\", \"question_answering_agent\", \"agent\", \"tools\", \"lookup_track\", \"agent\", \"compile_followup\"]\n", 1457 | " },\n", 1458 | " {\n", 1459 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund.\",\n", 1460 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"compile_followup\"],\n", 1461 | " },\n", 1462 | " {\n", 1463 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund on my Led Zeppelin purchases. My number is +1 (204) 452-6452\",\n", 1464 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"lookup\", \"compile_followup\"],\n", 1465 | " },\n", 1466 | " {\n", 1467 | " \"question\": \"Who recorded Wish You Were Here again? What other albums by them do you have?\",\n", 1468 | " \"trajectory\": [\"intent_classifier\", \"question_answering_agent\", \"agent\", \"tools\", \"lookup_track\", \"agent\", \"tools\", \"lookup_album\", \"agent\", \"compile_followup\"],\n", 1469 | " },\n", 1470 | " {\n", 1471 | " \"question\": \"My name is Aaron Mitchell. 
My number is +1 (204) 452-6452 and I want a full refund for invoice id 237\",\n", 1472 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"refund\", \"compile_followup\"],\n", 1473 | " },\n", 1474 | "]\n", 1475 | "\n", 1476 | "dataset_name = \"Chinook Customer Service Bot: Trajectory\"\n", 1477 | "\n", 1478 | "if not client.has_dataset(dataset_name=dataset_name):\n", 1479 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 1480 | " client.create_examples(\n", 1481 | " inputs=[{\"question\": ex[\"question\"]} for ex in examples],\n", 1482 | " outputs=[{\"trajectory\": ex[\"trajectory\"]} for ex in examples],\n", 1483 | " dataset_id=dataset.id\n", 1484 | " )" 1485 | ] 1486 | }, 1487 | { 1488 | "cell_type": "markdown", 1489 | "metadata": {}, 1490 | "source": [ 1491 | "##### Define application logic to be evaluated" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "execution_count": 48, 1497 | "metadata": {}, 1498 | "outputs": [], 1499 | "source": [ 1500 | "async def run_graph(inputs: dict) -> dict:\n", 1501 | " \"\"\"Run graph and track the trajectory it takes along with the final response.\"\"\"\n", 1502 | " trajectory = []\n", 1503 | " # Set subgraph=True to stream events from subgraphs of the main graph: https://langchain-ai.github.io/langgraph/how-tos/streaming-subgraphs/\n", 1504 | " # Set stream_mode=\"debug\" to stream all possible events: https://langchain-ai.github.io/langgraph/concepts/streaming\n", 1505 | " async for chunk in graph.astream({\"messages\": [\n", 1506 | " {\n", 1507 | " \"role\": \"user\",\n", 1508 | " \"content\": inputs['question'],\n", 1509 | " }\n", 1510 | " ]}, subgraphs=True, stream_mode=\"debug\"):\n", 1511 | " # Event type for entering a node\n", 1512 | " if chunk[1]['type'] == 'task':\n", 1513 | " # Record the node name\n", 1514 | " trajectory.append(chunk[1]['payload']['name'])\n", 1515 | " # Given how we defined our dataset, we also need to track when specific tools are\n", 
1516 | " # called by our question answering ReACT agent. These tool calls can be found\n", 1517 | " # when the ToolsNode (named \"tools\") is invoked by looking at the AIMessage.tool_calls\n", 1518 | " # of the latest input message.\n", 1519 | " if chunk[1]['payload']['name'] == 'tools' and chunk[1]['type'] == 'task':\n", 1520 | " for tc in chunk[1]['payload']['input']['messages'][-1].tool_calls:\n", 1521 | " trajectory.append(tc['name'])\n", 1522 | " return {\"trajectory\": trajectory}" 1523 | ] 1524 | }, 1525 | { 1526 | "cell_type": "markdown", 1527 | "metadata": {}, 1528 | "source": [ 1529 | "##### Define evaluators" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": 49, 1535 | "metadata": {}, 1536 | "outputs": [], 1537 | "source": [ 1538 | "def evaluate_extra_steps(outputs: dict, reference_outputs: dict) -> dict:\n", 1539 | " \"\"\"Evaluate the number of extra steps in the agent's output.\"\"\"\n", 1540 | " extra_steps = len(outputs['trajectory']) - len(reference_outputs['trajectory'])\n", 1541 | " return {\n", 1542 | " \"key\": \"extra_steps\",\n", 1543 | " \"score\": extra_steps,\n", 1544 | " }\n", 1545 | "\n", 1546 | "def evaluate_unmatched_steps(outputs: dict, reference_outputs: dict) -> dict:\n", 1547 | " # [\"step1\", \"step2\", \"step3\"]\n", 1548 | " # [\"step3\", \"step2\", \"step1\"]\n", 1549 | " \"\"\"Evaluate the number of unmatched steps in the agent's output.\"\"\"\n", 1550 | " i = j = 0\n", 1551 | " unmatched_steps = 0\n", 1552 | "\n", 1553 | " while i < len(reference_outputs['trajectory']) and j < len(outputs['trajectory']):\n", 1554 | " if reference_outputs['trajectory'][i] == outputs['trajectory'][j]:\n", 1555 | " i += 1 # Match found, move to the next step in reference trajectory\n", 1556 | " else:\n", 1557 | " unmatched_steps += 1 # Step is not part of the reference trajectory\n", 1558 | " j += 1 # Always move to the next step in outputs trajectory\n", 1559 | "\n", 1560 | " # Count remaining unmatched steps in 
outputs beyond the comparison loop\n", 1561 | " unmatched_steps += len(outputs['trajectory']) - j\n", 1562 | "\n", 1563 | " return {\n", 1564 | " \"key\": \"unmatched_steps\",\n", 1565 | " \"score\": unmatched_steps,\n", 1566 | " }" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "markdown", 1571 | "metadata": {}, 1572 | "source": [ 1573 | "##### Run evaluation" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": 50, 1579 | "metadata": {}, 1580 | "outputs": [ 1581 | { 1582 | "name": "stdout", 1583 | "output_type": "stream", 1584 | "text": [ 1585 | "View the evaluation results for experiment: 'sql-agent-gpt4o-trajectory-c654f01b' at:\n", 1586 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/46c97213-3e6f-47e4-846c-cd79143192fc/compare?selectedSessions=f700d7fa-78ba-41d2-9177-9d490531cef7\n", 1587 | "\n", 1588 | "\n" 1589 | ] 1590 | }, 1591 | { 1592 | "name": "stderr", 1593 | "output_type": "stream", 1594 | "text": [ 1595 | "5it [00:20, 4.01s/it]\n" 1596 | ] 1597 | }, 1598 | { 1599 | "data": { 1600 | "text/html": [ 1601 | "
\n", 1602 | "\n", 1615 | "\n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | "
inputs.questionoutputs.trajectoryerrorreference.trajectoryfeedback.extra_stepsfeedback.unmatched_stepsexecution_timeexample_idid
0My name is Aaron Mitchell and I'd like a refund.[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...002.9935480f9e2190-ef0f-455e-86e1-f3965fe2dd2070f63270-7d0e-49f4-9e25-873ce8725324
1My name is Aaron Mitchell. My number is +1 (20...[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...003.585207f7573d70-d3c9-4273-bad8-96c1693d41e67e2b5eaf-3af9-475b-978b-c84167d9c3d8
2My name is Aaron Mitchell and I'd like a refun...[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...003.5969756c35522a-a1ca-4212-9edf-5c17566cfb05a748f4a7-bd7e-40b2-af6e-e9de37e56000
3How many songs do you have by James Brown[intent_classifier, question_answering_agent, ...None[intent_classifier, question_answering_agent, ...007.2496611c308dca-8a5e-44d8-9620-d71c36a0685245055ce2-bb55-475c-918f-3e69aaaecc40
4Who recorded Wish You Were Here again? What ot...[intent_classifier, question_answering_agent, ...None[intent_classifier, question_answering_agent, ...3319.67982414b71f30-6e70-4ea4-b527-3fdca46de008b416ff67-b582-402b-9161-3503f9bc5d87
\n", 1693 | "
" 1694 | ], 1695 | "text/plain": [ 1696 | " inputs.question \\\n", 1697 | "0 My name is Aaron Mitchell and I'd like a refund. \n", 1698 | "1 My name is Aaron Mitchell. My number is +1 (20... \n", 1699 | "2 My name is Aaron Mitchell and I'd like a refun... \n", 1700 | "3 How many songs do you have by James Brown \n", 1701 | "4 Who recorded Wish You Were Here again? What ot... \n", 1702 | "\n", 1703 | " outputs.trajectory error \\\n", 1704 | "0 [intent_classifier, refund_agent, gather_info,... None \n", 1705 | "1 [intent_classifier, refund_agent, gather_info,... None \n", 1706 | "2 [intent_classifier, refund_agent, gather_info,... None \n", 1707 | "3 [intent_classifier, question_answering_agent, ... None \n", 1708 | "4 [intent_classifier, question_answering_agent, ... None \n", 1709 | "\n", 1710 | " reference.trajectory feedback.extra_steps \\\n", 1711 | "0 [intent_classifier, refund_agent, gather_info,... 0 \n", 1712 | "1 [intent_classifier, refund_agent, gather_info,... 0 \n", 1713 | "2 [intent_classifier, refund_agent, gather_info,... 0 \n", 1714 | "3 [intent_classifier, question_answering_agent, ... 0 \n", 1715 | "4 [intent_classifier, question_answering_agent, ... 
3 \n", 1716 | "\n", 1717 | " feedback.unmatched_steps execution_time \\\n", 1718 | "0 0 2.993548 \n", 1719 | "1 0 3.585207 \n", 1720 | "2 0 3.596975 \n", 1721 | "3 0 7.249661 \n", 1722 | "4 3 19.679824 \n", 1723 | "\n", 1724 | " example_id id \n", 1725 | "0 0f9e2190-ef0f-455e-86e1-f3965fe2dd20 70f63270-7d0e-49f4-9e25-873ce8725324 \n", 1726 | "1 f7573d70-d3c9-4273-bad8-96c1693d41e6 7e2b5eaf-3af9-475b-978b-c84167d9c3d8 \n", 1727 | "2 6c35522a-a1ca-4212-9edf-5c17566cfb05 a748f4a7-bd7e-40b2-af6e-e9de37e56000 \n", 1728 | "3 1c308dca-8a5e-44d8-9620-d71c36a06852 45055ce2-bb55-475c-918f-3e69aaaecc40 \n", 1729 | "4 14b71f30-6e70-4ea4-b527-3fdca46de008 b416ff67-b582-402b-9161-3503f9bc5d87 " 1730 | ] 1731 | }, 1732 | "execution_count": 50, 1733 | "metadata": {}, 1734 | "output_type": "execute_result" 1735 | } 1736 | ], 1737 | "source": [ 1738 | "experiment_results = await client.aevaluate(\n", 1739 | " run_graph,\n", 1740 | " data=dataset_name,\n", 1741 | " evaluators=[evaluate_extra_steps, evaluate_unmatched_steps],\n", 1742 | " experiment_prefix=\"sql-agent-gpt4o-trajectory\",\n", 1743 | " num_repetitions=1,\n", 1744 | " max_concurrency=4,\n", 1745 | ")\n", 1746 | "experiment_results.to_pandas()" 1747 | ] 1748 | } 1749 | ], 1750 | "metadata": { 1751 | "kernelspec": { 1752 | "display_name": "Python 3", 1753 | "language": "python", 1754 | "name": "python3" 1755 | }, 1756 | "language_info": { 1757 | "codemirror_mode": { 1758 | "name": "ipython", 1759 | "version": 3 1760 | }, 1761 | "file_extension": ".py", 1762 | "mimetype": "text/x-python", 1763 | "name": "python", 1764 | "nbconvert_exporter": "python", 1765 | "pygments_lexer": "ipython3", 1766 | "version": "3.13.0" 1767 | } 1768 | }, 1769 | "nbformat": 4, 1770 | "nbformat_minor": 2 1771 | } 1772 | -------------------------------------------------------------------------------- /build-eval-agent/chinook.db: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/chinook.db -------------------------------------------------------------------------------- /build-eval-agent/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/architecture.png -------------------------------------------------------------------------------- /build-eval-agent/images/evals-conceptual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/evals-conceptual.png -------------------------------------------------------------------------------- /build-eval-agent/images/final-response.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/final-response.png -------------------------------------------------------------------------------- /build-eval-agent/images/lookup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/lookup.png -------------------------------------------------------------------------------- /build-eval-agent/images/refund.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/refund.png -------------------------------------------------------------------------------- /build-eval-agent/images/single-step.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/single-step.png -------------------------------------------------------------------------------- /build-eval-agent/images/trajectory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/trajectory.png -------------------------------------------------------------------------------- /build-eval-agent/images/with-supervisor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/with-supervisor.png -------------------------------------------------------------------------------- /evaluate-document-extraction/.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | LANGCHAIN_API_KEY= 3 | LANGCHAIN_TRACING_V2= 4 | LANGCHAIN_PROJECT= 5 | -------------------------------------------------------------------------------- /evaluate-document-extraction/.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /evaluate-document-extraction/aapl.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/evaluate-document-extraction/aapl.pdf -------------------------------------------------------------------------------- /evaluate-document-extraction/build-eval-extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Build and evaluate document extraction 🦜⛓️\n", 8 | "\n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Install dependencies" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%capture --no-stderr\n", 25 | "%pip install langsmith langchain-openai langchain-core langchain-community pydantic python-dotenv openai\n", 26 | "%pip install --upgrade langsmith" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import langsmith\n", 36 | "\n", 37 | "print(f\"\\nCurrent langsmith version: {langsmith.__version__}\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Load env" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from dotenv import load_dotenv\n", 54 | "\n", 55 | "load_dotenv()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Load the 10-K" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from langchain_community.document_loaders import PyPDFLoader\n", 72 | "\n", 73 | "def load_pdf():\n", 74 | " loader = PyPDFLoader(\"./aapl.pdf\")\n", 75 | " all_text = loader.load()\n", 76 | " return all_text" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Perform extraction" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from pydantic import BaseModel, Field\n", 93 | "from langsmith import wrappers, Client\n", 94 | "from openai import 
OpenAI\n", 95 | "openai_client = wrappers.wrap_openai(OpenAI())\n", 96 | "\n", 97 | "class UsefulInformation(BaseModel):\n", 98 | " products_and_services: list[str] = Field(description=\"A list of products and services provided by the company\")\n", 99 | " risk_factors: list[str] = Field(description=\"A list of risk factors described in the document\")\n", 100 | " irs_employer_id_number: list[str] = Field(description=\"The IRS Employer Identification Number of the company\")\n", 101 | " company_address: list[str] = Field(description=\"The address of the company\")\n", 102 | " earnings_per_share_basic: list[str] = Field(description=\"The basic earnings per share of the company\")\n", 103 | " net_income: list[str] = Field(description=\"The net income of the company\")\n", 104 | "\n", 105 | "def extract_information(doc):\n", 106 | " prompt = f\"\"\"\n", 107 | " The text below is an excerpt from a 10-K report. You must extract specific information and return it in a structured format.\n", 108 | " \n", 109 | " CRITICAL INSTRUCTIONS:\n", 110 | " 1. AVOID DUPLICATES: Never include duplicate items in any list\n", 111 | " 2. BE CONCISE: Keep each item brief and to the point\n", 112 | " 3. VALIDATE: Each piece of information must be explicitly stated in the text, do not make assumptions\n", 113 | " 4. 
FORMAT: All fields must be lists, even if empty or single item\n", 114 | " \n", 115 | " Examples of GOOD responses:\n", 116 | " - Products: [\"Google Search\", \"Google Cloud\", \"Android\"]\n", 117 | " - Address: [\"1600 Amphitheatre Parkway, Mountain View, CA 94043\"]\n", 118 | " - Phone: [\"+1 650-253-0000\"]\n", 119 | " \n", 120 | " Examples of BAD responses (avoid these):\n", 121 | " - Duplicates: [\"Google Search\", \"Search by Google\", \"Google Search Engine\"]\n", 122 | " - Too verbose: [\"Google Search is a web search engine that allows users to search the World Wide Web...\"]\n", 123 | " - Made up data: Do not include information unless explicitly found in the text\n", 124 | " \n", 125 | " Please extract:\n", 126 | " 1. Products and Services: List unique products/services (max 10 items)\n", 127 | " 2. Risk Factors: List unique, critical risks (max 10 items)\n", 128 | " 3. IRS Employer ID Number: List any EIN found\n", 129 | " 4. Company Address: List primary address of the company\n", 130 | " 5. Earnings Per Share (Basic): List basic EPS figure\n", 131 | " 6. 
Net Income: List net income figure\n", 132 | "\n", 133 | " Text from the 10-K report:\n", 134 | " {doc}\n", 135 | " \"\"\"\n", 136 | " try:\n", 137 | " response = openai_client.beta.chat.completions.parse(\n", 138 | " model=\"o1-2024-12-17\",\n", 139 | " messages=[\n", 140 | " { \"role\": \"user\", \"content\": prompt },\n", 141 | " ],\n", 142 | " response_format=UsefulInformation\n", 143 | " )\n", 144 | " return response.choices[0].message.content\n", 145 | " except Exception as e:\n", 146 | " print(f\"Error in structured output LLM call: {str(e)}\")\n", 147 | " print(f\"Error type: {type(e)}\")\n", 148 | " return UsefulInformation(\n", 149 | " products_and_services=[],\n", 150 | " risk_factors=[],\n", 151 | " irs_employer_id_number=[],\n", 152 | " company_address=[],\n", 153 | " earnings_per_share_basic=[],\n", 154 | " net_income=[]\n", 155 | " )\n", 156 | "\n", 157 | "def process_all_docs():\n", 158 | " all_text = load_pdf()\n", 159 | " results = extract_information(all_text)\n", 160 | " print(\"processed all docs...\")\n", 161 | " return results\n", 162 | "\n", 163 | "aggregated_info = process_all_docs()\n", 164 | "print(aggregated_info)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Evaluate extraction" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "\"extraction-eval\"" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "##### Load existing dataset" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "dataset_name = \"10-k extraction\"" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "##### Define application logic to be evaluated" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": 
[], 209 | "source": [ 210 | "from langsmith import traceable\n", 211 | "\n", 212 | "client = Client()\n", 213 | "\n", 214 | "@traceable\n", 215 | "def target(inputs: dict) -> dict:\n", 216 | " response = openai_client.beta.chat.completions.parse(\n", 217 | " model=\"gpt-4o\",\n", 218 | " messages=[\n", 219 | " { \"role\": \"user\", \"content\": inputs[\"input\"][0][\"content\"] },\n", 220 | " ],\n", 221 | " response_format=UsefulInformation\n", 222 | " )\n", 223 | " return { \"response\": response.choices[0].message.content }" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "##### Define evaluator" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 8, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "import json\n", 240 | "\n", 241 | "def format_objects_for_llm_judge(obj1, obj2):\n", 242 | " \"\"\"Formats two objects into natural language for easier LLM comparison.\"\"\"\n", 243 | " def format_single_object(obj, object_name):\n", 244 | " if isinstance(obj, str):\n", 245 | " obj = json.loads(obj)\n", 246 | " formatted_sections = []\n", 247 | " formatted_sections.append(f\"\\n{object_name} contains the following information:\")\n", 248 | " sorted_keys = sorted(obj.keys())\n", 249 | " for key in sorted_keys:\n", 250 | " values = obj[key]\n", 251 | " readable_key = key.replace('_', ' ').capitalize()\n", 252 | " if isinstance(values, list):\n", 253 | " if len(values) == 1:\n", 254 | " formatted_sections.append(f\"\\n{readable_key}: {values[0]}\")\n", 255 | " else:\n", 256 | " items = '\\n - '.join(values)\n", 257 | " formatted_sections.append(f\"\\n{readable_key}:\\n - {items}\")\n", 258 | " else:\n", 259 | " formatted_sections.append(f\"\\n{readable_key}: {values}\")\n", 260 | " \n", 261 | " return '\\n'.join(formatted_sections)\n", 262 | "\n", 263 | " object1_text = format_single_object(obj1, \"Actual Output\")\n", 264 | " object2_text = format_single_object(obj2, 
\"Reference Output\")\n", 265 | " return [object1_text, object2_text]\n", 266 | "\n", 267 | "@traceable(run_type=\"llm\")\n", 268 | "def run_llm_judge(formatted_text):\n", 269 | " class Score(BaseModel):\n", 270 | " \"\"\"Evaluate how well an extracted output matches a reference ground truth for 10-K document information.\"\"\"\n", 271 | " accuracy: bool = Field(\n", 272 | " description=(\n", 273 | " \"A binary score (0 or 1) that indicates whether the model's extraction adequately matches the reference ground truth. \"\n", 274 | " \"Score 1 if the model's output captures the same essential business information as the reference extraction, even if \"\n", 275 | " \"expressed differently. The goal is to verify that the model successfully extracted similar key business information \"\n", 276 | " \"as found in the reference ground truth, not to ensure identical representation.\"\n", 277 | " )\n", 278 | " )\n", 279 | " reason: str = Field(\n", 280 | " description=(\n", 281 | " \"An explanation of how well the model's extraction aligns with the reference ground truth. Consider how effectively \"\n", 282 | " \"the model captured the same key business information, financial data, and risk factors as the reference output. \"\n", 283 | " \"Acknowledge that variations in expression are acceptable as long as the same core information is captured.\"\n", 284 | " )\n", 285 | " )\n", 286 | " response = openai_client.beta.chat.completions.parse(\n", 287 | " model=\"gpt-4o\",\n", 288 | " messages=[\n", 289 | " {\n", 290 | " \"role\": \"system\",\n", 291 | " \"content\": (\n", 292 | " \"You are evaluating how well a model's extraction of 10-K document information matches a reference ground truth output. 
\"\n", 293 | " \"Your task is to determine if the model successfully captured the same essential business information as the reference, \"\n", 294 | " \"understanding that similar concepts may be expressed differently.\\n\\n\"\n", 295 | " \"Context:\\n\"\n", 296 | " \"- The reference output represents the ground truth extraction from a 10-K document\\n\"\n", 297 | " \"- The model's output is being evaluated against this reference for accuracy and completeness\\n\"\n", 298 | " \"- Both extractions contain key business information like products/services, risk factors, and financial metrics\\n\"\n", 299 | " \"- The goal is to verify the model captured similar information as the reference, not identical expression\\n\\n\"\n", 300 | " \"Evaluation Guidelines:\\n\"\n", 301 | " \"- Score 1 (true) if the model's output:\\n\"\n", 302 | " \" * Captures the same core business information as the reference\\n\"\n", 303 | " \" * Identifies similar risk factors, even if described differently\\n\"\n", 304 | " \" * Extracts matching or equivalent financial metrics\\n\"\n", 305 | " \" * Contains consistent company identifiers\\n\"\n", 306 | " \" * May include additional valid information beyond the reference\\n\\n\"\n", 307 | " \"- Score 0 (false) only if the model's output:\\n\"\n", 308 | " \" * Misses or contradicts critical information from the reference\\n\"\n", 309 | " \" * Shows fundamental misunderstanding of the business details\\n\"\n", 310 | " \" * Contains irreconcilable differences in key metrics\\n\"\n", 311 | " \" * Fails to capture the essential information found in the reference\\n\\n\"\n", 312 | " \"Remember: The reference output is our ground truth. 
Evaluate how well the model's extraction \"\n", 313 | " \"captures the same essential business information, allowing for variations in expression.\\n\\n\"\n", 314 | " \"Outputs to Evaluate:\\n\"\n", 315 | " f\"- **Model Output:** {formatted_text[0]}\\n\"\n", 316 | " f\"- **Reference Ground Truth:** {formatted_text[1]}\\n\"\n", 317 | " )\n", 318 | " }\n", 319 | " ],\n", 320 | " response_format=Score\n", 321 | " )\n", 322 | " response_object = json.loads(response.choices[0].message.content)\n", 323 | " return { \"response\": response_object }\n", 324 | "\n", 325 | "@traceable\n", 326 | "def evaluate_accuracy(outputs: dict, reference_outputs: dict) -> dict:\n", 327 | " actual_output = outputs[\"response\"]\n", 328 | " expected_output = reference_outputs['output']\n", 329 | " formatted_text = format_objects_for_llm_judge(actual_output, expected_output)\n", 330 | " object_response = run_llm_judge(formatted_text)[\"response\"]\n", 331 | " return { \"key\": \"accuracy\",\n", 332 | " \"score\": object_response[\"accuracy\"],\n", 333 | " \"reason\": object_response[\"reason\"] }" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "##### Run evaluation" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "experiment_results = client.evaluate(\n", 350 | " target,\n", 351 | " data=\"10-k extraction\",\n", 352 | " evaluators=[evaluate_accuracy],\n", 353 | " experiment_prefix=\"10-k-extraction-gpt-4o\",\n", 354 | " max_concurrency=5,\n", 355 | " num_repetitions=3\n", 356 | ")\n", 357 | "\n", 358 | "experiment_results.to_pandas()" 359 | ] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | 
"mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.13.0" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } 384 | -------------------------------------------------------------------------------- /evaluate-document-extraction/extraction-eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/evaluate-document-extraction/extraction-eval.png --------------------------------------------------------------------------------