├── README.md ├── build-eval-agent ├── .env.example ├── .gitignore ├── agent-eval.ipynb ├── chinook.db └── images │ ├── architecture.png │ ├── evals-conceptual.png │ ├── final-response.png │ ├── lookup.png │ ├── refund.png │ ├── single-step.png │ ├── trajectory.png │ └── with-supervisor.png └── evaluate-document-extraction ├── .env.example ├── .gitignore ├── aapl.pdf ├── build-eval-extraction.ipynb ├── evaluate-document-extraction-dataset.csv └── extraction-eval.png /README.md: -------------------------------------------------------------------------------- 1 | # The Judge 2 | 3 | This repo is a collection of notebooks for the 'The Judge' video series 4 | 5 | ### 1. Build and evaluate an agent 6 | 7 | architecture 8 | 9 | ### 2. Evaluate document extraction 10 | 11 | architecture 12 | -------------------------------------------------------------------------------- /build-eval-agent/.env.example: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | export LANGCHAIN_API_KEY= 3 | export LANGCHAIN_TRACING_V2=true 4 | export ANTHROPIC_API_KEY= 5 | -------------------------------------------------------------------------------- /build-eval-agent/.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /build-eval-agent/agent-eval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Build and evaluate a customer support agent 🦜🕸️" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\"architecture\"" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Build the agent" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "vscode": { 28 | "languageId": 
import requests

# Download a pre-built Chinook SQLite database used by the rest of the notebook.
url = "https://storage.googleapis.com/benchmarks-artifacts/chinook/Chinook.db"

response = requests.get(url)

if response.status_code == 200:
    # Open a local file in binary write mode and persist the response body.
    with open("chinook.db", "wb") as file:
        file.write(response.content)
    # Report the filename we actually wrote ("chinook.db"); the original
    # message incorrectly said "Chinook.db".
    print("File downloaded and saved as chinook.db")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")
Status code: {response.status_code}\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### List tables" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 34, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Tables in the database:\n", 128 | "Album\n", 129 | "Artist\n", 130 | "Customer\n", 131 | "Employee\n", 132 | "Genre\n", 133 | "Invoice\n", 134 | "InvoiceLine\n", 135 | "MediaType\n", 136 | "Playlist\n", 137 | "PlaylistTrack\n", 138 | "Track\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "import sqlite3\n", 144 | "\n", 145 | "# Connect to the database\n", 146 | "conn = sqlite3.connect(\"chinook.db\")\n", 147 | "cursor = conn.cursor()\n", 148 | "\n", 149 | "# Fetch the names of all tables\n", 150 | "cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n", 151 | "tables = cursor.fetchall()\n", 152 | "\n", 153 | "# Print the names of all tables\n", 154 | "print(\"Tables in the database:\")\n", 155 | "for table in tables:\n", 156 | " print(table[0])\n", 157 | "\n", 158 | "# Close the connection\n", 159 | "conn.close()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### Define refund subgraph " 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "\"refund\"" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "##### Helper functions to fetch invoice info and execute refunds" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 35, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "import sqlite3\n", 190 | "\n", 191 | "def _refund(invoice_id: int | None, invoice_line_ids: list[int] | None, mock: bool = False) -> float:\n", 192 | " \"\"\"Given an Invoice ID and/or Invoice Line IDs, delete the 
relevant Invoice/InvoiceLine records in the Chinook DB.\n", 193 | "\n", 194 | " Args:\n", 195 | " invoice_id: The Invoice to delete.\n", 196 | " invoice_line_ids: The Invoice Lines to delete.\n", 197 | " mock: If True, do not actually delete the specified Invoice/Invoice Lines. Used for testing purposes.\n", 198 | "\n", 199 | " Returns:\n", 200 | " float: The total dollar amount that was deleted (or mock deleted).\n", 201 | " \"\"\"\n", 202 | "\n", 203 | " if invoice_id is None and invoice_line_ids is None:\n", 204 | " return 0.0\n", 205 | "\n", 206 | " # Connect to the Chinook database\n", 207 | " conn = sqlite3.connect(\"chinook.db\")\n", 208 | " cursor = conn.cursor()\n", 209 | "\n", 210 | " total_refund = 0.0\n", 211 | "\n", 212 | " try:\n", 213 | " # If invoice_id is provided, delete entire invoice and its lines\n", 214 | " if invoice_id is not None:\n", 215 | " # First get the total amount for the invoice\n", 216 | " cursor.execute(\n", 217 | " \"\"\"\n", 218 | " SELECT Total\n", 219 | " FROM Invoice\n", 220 | " WHERE InvoiceId = ?\n", 221 | " \"\"\",\n", 222 | " (invoice_id,),\n", 223 | " )\n", 224 | "\n", 225 | " result = cursor.fetchone()\n", 226 | " if result:\n", 227 | " total_refund += result[0]\n", 228 | "\n", 229 | " # Delete invoice lines first (due to foreign key constraints)\n", 230 | " if not mock:\n", 231 | " cursor.execute(\n", 232 | " \"\"\"\n", 233 | " DELETE FROM InvoiceLine\n", 234 | " WHERE InvoiceId = ?\n", 235 | " \"\"\",\n", 236 | " (invoice_id,),\n", 237 | " )\n", 238 | "\n", 239 | " # Then delete the invoice\n", 240 | " cursor.execute(\n", 241 | " \"\"\"\n", 242 | " DELETE FROM Invoice\n", 243 | " WHERE InvoiceId = ?\n", 244 | " \"\"\",\n", 245 | " (invoice_id,),\n", 246 | " )\n", 247 | "\n", 248 | " # If specific invoice lines are provided\n", 249 | " if invoice_line_ids is not None:\n", 250 | " # Get the total amount for the specified invoice lines\n", 251 | " placeholders = \",\".join([\"?\" for _ in invoice_line_ids])\n", 252 | " 
cursor.execute(\n", 253 | " f\"\"\"\n", 254 | " SELECT SUM(UnitPrice * Quantity)\n", 255 | " FROM InvoiceLine\n", 256 | " WHERE InvoiceLineId IN ({placeholders})\n", 257 | " \"\"\",\n", 258 | " invoice_line_ids,\n", 259 | " )\n", 260 | "\n", 261 | " result = cursor.fetchone()\n", 262 | " if result and result[0]:\n", 263 | " total_refund += result[0]\n", 264 | "\n", 265 | " if not mock:\n", 266 | " # Delete the specified invoice lines\n", 267 | " cursor.execute(\n", 268 | " f\"\"\"\n", 269 | " DELETE FROM InvoiceLine\n", 270 | " WHERE InvoiceLineId IN ({placeholders})\n", 271 | " \"\"\",\n", 272 | " invoice_line_ids,\n", 273 | " )\n", 274 | "\n", 275 | " # Commit the changes\n", 276 | " conn.commit()\n", 277 | "\n", 278 | " except sqlite3.Error as e:\n", 279 | " # Roll back in case of error\n", 280 | " conn.rollback()\n", 281 | " raise e\n", 282 | "\n", 283 | " finally:\n", 284 | " # Close the connection\n", 285 | " conn.close()\n", 286 | "\n", 287 | " return float(total_refund)\n", 288 | "\n", 289 | "def _lookup(\n", 290 | " customer_first_name: str,\n", 291 | " customer_last_name: str,\n", 292 | " customer_phone: str,\n", 293 | " track_name: str | None,\n", 294 | " album_title: str | None,\n", 295 | " artist_name: str | None,\n", 296 | " purchase_date_iso_8601: str | None,\n", 297 | ") -> list[dict]:\n", 298 | " \"\"\"Find all of the Invoice Line IDs in the Chinook DB for the given filters.\n", 299 | "\n", 300 | " Returns:\n", 301 | " a list of dictionaries that contain keys: {\n", 302 | " 'invoice_line_id',\n", 303 | " 'track_name',\n", 304 | " 'artist_name',\n", 305 | " 'purchase_date',\n", 306 | " 'quantity_purchased',\n", 307 | " 'price_per_unit'\n", 308 | " }\n", 309 | " \"\"\"\n", 310 | "\n", 311 | " # Connect to the database\n", 312 | " conn = sqlite3.connect(\"chinook.db\")\n", 313 | " cursor = conn.cursor()\n", 314 | "\n", 315 | " # Base query joining all necessary tables\n", 316 | " query = \"\"\"\n", 317 | " SELECT\n", 318 | " il.InvoiceLineId,\n", 319 
| " t.Name as track_name,\n", 320 | " art.Name as artist_name,\n", 321 | " i.InvoiceDate as purchase_date,\n", 322 | " il.Quantity as quantity_purchased,\n", 323 | " il.UnitPrice as price_per_unit\n", 324 | " FROM InvoiceLine il\n", 325 | " JOIN Invoice i ON il.InvoiceId = i.InvoiceId\n", 326 | " JOIN Customer c ON i.CustomerId = c.CustomerId\n", 327 | " JOIN Track t ON il.TrackId = t.TrackId\n", 328 | " JOIN Album alb ON t.AlbumId = alb.AlbumId\n", 329 | " JOIN Artist art ON alb.ArtistId = art.ArtistId\n", 330 | " WHERE c.FirstName = ?\n", 331 | " AND c.LastName = ?\n", 332 | " AND c.Phone = ?\n", 333 | " \"\"\"\n", 334 | "\n", 335 | " # Parameters for the query\n", 336 | " params = [customer_first_name, customer_last_name, customer_phone]\n", 337 | "\n", 338 | " # Add optional filters\n", 339 | " if track_name:\n", 340 | " query += \" AND t.Name = ?\"\n", 341 | " params.append(track_name)\n", 342 | "\n", 343 | " if album_title:\n", 344 | " query += \" AND alb.Title = ?\"\n", 345 | " params.append(album_title)\n", 346 | "\n", 347 | " if artist_name:\n", 348 | " query += \" AND art.Name = ?\"\n", 349 | " params.append(artist_name)\n", 350 | "\n", 351 | " if purchase_date_iso_8601:\n", 352 | " query += \" AND date(i.InvoiceDate) = date(?)\"\n", 353 | " params.append(purchase_date_iso_8601)\n", 354 | "\n", 355 | " # Execute query\n", 356 | " cursor.execute(query, params)\n", 357 | "\n", 358 | " # Fetch results\n", 359 | " results = cursor.fetchall()\n", 360 | "\n", 361 | " # Convert results to list of dictionaries\n", 362 | " output = []\n", 363 | " for row in results:\n", 364 | " output.append(\n", 365 | " {\n", 366 | " \"invoice_line_id\": row[0],\n", 367 | " \"track_name\": row[1],\n", 368 | " \"artist_name\": row[2],\n", 369 | " \"purchase_date\": row[3],\n", 370 | " \"quantity_purchased\": row[4],\n", 371 | " \"price_per_unit\": row[5],\n", 372 | " }\n", 373 | " )\n", 374 | "\n", 375 | " # Close connection\n", 376 | " conn.close()\n", 377 | "\n", 378 | " return 
output" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "##### Build the graph" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 36, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from typing import Literal\n", 395 | "import json\n", 396 | "\n", 397 | "from langchain.chat_models import init_chat_model\n", 398 | "from langchain_core.runnables import RunnableConfig\n", 399 | "from langgraph.graph import END, StateGraph\n", 400 | "from langgraph.graph.message import AnyMessage, add_messages\n", 401 | "from langgraph.types import Command\n", 402 | "from tabulate import tabulate\n", 403 | "from typing_extensions import Annotated, TypedDict\n", 404 | "\n", 405 | "# Graph state.\n", 406 | "class State(TypedDict):\n", 407 | " \"\"\"Agent state.\"\"\"\n", 408 | " messages: Annotated[list[AnyMessage], add_messages]\n", 409 | " followup: str | None\n", 410 | "\n", 411 | " invoice_id: int | None\n", 412 | " invoice_line_ids: list[int] | None\n", 413 | " customer_first_name: str | None\n", 414 | " customer_last_name: str | None\n", 415 | " customer_phone: str | None\n", 416 | " track_name: str | None\n", 417 | " album_title: str | None\n", 418 | " artist_name: str | None\n", 419 | " purchase_date_iso_8601: str | None\n", 420 | "\n", 421 | "# Instructions for extracting the user/purchase info from the conversation.\n", 422 | "gather_info_instructions = \"\"\"You are managing an online music store that sells song tracks. \\\n", 423 | "Customers can buy multiple tracks at a time and these purchases are recorded in a database as \\\n", 424 | "an Invoice per purchase and an associated set of Invoice Lines for each purchased track.\n", 425 | "\n", 426 | "Your task is to help customers who would like a refund for one or more of the tracks they've \\\n", 427 | "purchased. 
# Extraction schema, mirrors the graph state.
class PurchaseInformation(TypedDict):
    """All of the known information about the invoice / invoice lines the customer would like refunded. Do not make up values, leave fields as null if you don't know their value."""

    invoice_id: int | None
    invoice_line_ids: list[int] | None
    customer_first_name: str | None
    customer_last_name: str | None
    customer_phone: str | None
    track_name: str | None
    album_title: str | None
    artist_name: str | None
    purchase_date_iso_8601: str | None
    followup: Annotated[
        str | None,
        ...,
        "If the user hasn't enough identifying information, please tell them what the required information is and ask them to specify it.",
    ]


# Model for performing extraction.
info_llm = init_chat_model("gpt-4o-mini").with_structured_output(
    PurchaseInformation, method="json_schema", include_raw=True
)


# Graph node for extracting user info and routing to lookup/refund/END.
async def gather_info(state: State) -> Command[Literal["lookup", "refund", "__end__"]]:
    """Extract purchase info from the conversation and route the turn.

    Routes to 'refund' when invoice identifiers are present, to 'lookup' when
    the full customer identity (first/last name + phone) is known, and
    otherwise ends the turn so the model's clarifying question is returned.
    """
    info = await info_llm.ainvoke(
        [
            {"role": "system", "content": gather_info_instructions},
            *state["messages"],
        ]
    )
    parsed = info["parsed"]
    if any(parsed[k] for k in ("invoice_id", "invoice_line_ids")):
        goto = "refund"
    elif all(
        parsed[k]
        for k in ("customer_first_name", "customer_last_name", "customer_phone")
    ):
        goto = "lookup"
    else:
        goto = END
    # Persist every extracted field onto the graph state alongside the raw
    # assistant message.
    update = {"messages": [info["raw"]], **parsed}
    return Command(update=update, goto=goto)


# Graph node for executing the refund.
# Note that here we inspect the runtime config for an "env" variable.
# If "env" is set to "test", then we don't actually delete any rows from our
# database. This will become important when we're running our evaluations.
def refund(state: State, config: RunnableConfig) -> dict:
    """Execute (or mock) the refund and report the refunded amount."""
    # Whether to mock the deletion. True if the configurable var 'env' is set to 'test'.
    mock = config.get("configurable", {}).get("env", "prod") == "test"
    refunded = _refund(
        invoice_id=state["invoice_id"],
        invoice_line_ids=state["invoice_line_ids"],
        mock=mock,
    )
    response = f"You have been refunded a total of: ${refunded:.2f}. Is there anything else I can help with?"
    return {
        "messages": [{"role": "assistant", "content": response}],
        "followup": response,
    }


# Graph node for looking up the users purchases.
def lookup(state: State) -> dict:
    """Look up the user's purchases and ask which ones to refund."""
    args = (
        state[k]
        for k in (
            "customer_first_name",
            "customer_last_name",
            "customer_phone",
            "track_name",
            "album_title",
            "artist_name",
            "purchase_date_iso_8601",
        )
    )
    results = _lookup(*args)
    if not results:
        response = "We did not find any purchases associated with the information you've provided. Are you sure you've entered all of your information correctly?"
        followup = response
    else:
        # Fix: the fenced code block needs a newline after "```json" to be
        # valid markdown (the original produced "```json{...").
        response = f"Which of the following purchases would you like to be refunded for?\n\n```json\n{json.dumps(results, indent=2)}\n```"
        followup = f"Which of the following purchases would you like to be refunded for?\n\n{tabulate(results, headers='keys')}"
    return {
        "messages": [{"role": "assistant", "content": response}],
        "followup": followup,
        "invoice_line_ids": [res["invoice_line_id"] for res in results],
    }


# Building our graph.
graph_builder = StateGraph(State)

graph_builder.add_node(gather_info)
graph_builder.add_node(refund)
graph_builder.add_node(lookup)

graph_builder.set_entry_point("gather_info")
graph_builder.add_edge("lookup", END)
graph_builder.add_edge("refund", END)

refund_graph = graph_builder.compile()
# We'll create vectorstore indexes for all of the artists, tracks and albums
# ahead of time and use those to disambiguate the user input. E.g. if a user
# searches for songs by "prince" and our DB records the artist as "Prince",
# ideally when we query our artist vectorstore for "prince" we'll get back the
# value "Prince", which we can then use in our SQL queries.
def index_fields() -> tuple[InMemoryVectorStore, InMemoryVectorStore, InMemoryVectorStore]:
    """Create an index for all artists, an index for all albums, and an index for all songs."""
    # Initialize to None so the finally block can't hit a NameError when
    # sqlite3.connect itself raises (the original referenced an unbound name).
    conn = None
    try:
        conn = sqlite3.connect("chinook.db")
        cursor = conn.cursor()

        tracks = cursor.execute("SELECT Name FROM Track").fetchall()
        artists = cursor.execute("SELECT Name FROM Artist").fetchall()
        albums = cursor.execute("SELECT Title FROM Album").fetchall()
    finally:
        if conn:
            conn.close()

    embeddings = init_embeddings("openai:text-embedding-3-small")

    track_store = InMemoryVectorStore(embeddings)
    artist_store = InMemoryVectorStore(embeddings)
    album_store = InMemoryVectorStore(embeddings)

    track_store.add_texts([t[0] for t in tracks])
    artist_store.add_texts([a[0] for a in artists])
    album_store.add_texts([a[0] for a in albums])
    return track_store, artist_store, album_store


track_store, artist_store, album_store = index_fields()


# Agent tools
@tool
def lookup_track(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[dict]:
    """Lookup a track in Chinook DB based on identifying information about the track.

    Returns:
        a list of dictionaries per matching track that contain keys {'track_name', 'artist_name', 'album_name'}
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT t.Name as track_name, ar.Name as artist_name, al.Title as album_name
        FROM Track t
        JOIN Album al ON t.AlbumId = al.AlbumId
        JOIN Artist ar ON al.ArtistId = ar.ArtistId
        WHERE 1=1
        """
        params = []

        # Each fuzzy user value is first resolved to the closest canonical
        # name via its vectorstore, then used as a LIKE filter.
        if track_name:
            track_name = track_store.similarity_search(track_name, k=1)[0].page_content
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            album_title = album_store.similarity_search(album_title, k=1)[0].page_content
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            artist_name = artist_store.similarity_search(artist_name, k=1)[0].page_content
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [
            {"track_name": row[0], "artist_name": row[1], "album_name": row[2]}
            for row in cursor.fetchall()
        ]
    finally:
        # Close even on query failure (the original leaked the connection).
        conn.close()


@tool
def lookup_album(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[dict]:
    """Lookup an album in Chinook DB based on identifying information about the album.

    Returns:
        a list of dictionaries per matching album that contain keys {'album_name', 'artist_name'}
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT al.Title as album_name, ar.Name as artist_name
        FROM Album al
        JOIN Artist ar ON al.ArtistId = ar.ArtistId
        LEFT JOIN Track t ON t.AlbumId = al.AlbumId
        WHERE 1=1
        """
        params = []

        if track_name:
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [
            {"album_name": row[0], "artist_name": row[1]}
            for row in cursor.fetchall()
        ]
    finally:
        conn.close()


@tool
def lookup_artist(
    track_name: str | None = None,
    album_title: str | None = None,
    artist_name: str | None = None,
) -> list[str]:
    """Lookup an artist in Chinook DB based on identifying information about the artist.

    Returns:
        a list of matching artist names
    """
    conn = sqlite3.connect("chinook.db")
    try:
        cursor = conn.cursor()

        query = """
        SELECT DISTINCT ar.Name as artist_name
        FROM Artist ar
        LEFT JOIN Album al ON al.ArtistId = ar.ArtistId
        LEFT JOIN Track t ON t.AlbumId = al.AlbumId
        WHERE 1=1
        """
        params = []

        if track_name:
            query += " AND t.Name LIKE ?"
            params.append(f"%{track_name}%")
        if album_title:
            query += " AND al.Title LIKE ?"
            params.append(f"%{album_title}%")
        if artist_name:
            query += " AND ar.Name LIKE ?"
            params.append(f"%{artist_name}%")

        cursor.execute(query, params)
        return [row[0] for row in cursor.fetchall()]
    finally:
        conn.close()


# Agent model
qa_llm = init_chat_model("claude-3-5-sonnet-latest")
# The prebuilt ReACT agent only expects State to have a 'messages' key, so the
# state we defined for the refund agent can also be passed to our lookup agent.
qa_graph = create_react_agent(qa_llm, [lookup_track, lookup_artist, lookup_album])
# Schema for routing user intent.
# We'll use structured outputs to enforce that the model returns only
# the desired output.
class UserIntent(TypedDict):
    """The user's current intent in the conversation"""

    intent: Literal["refund", "question_answering"]


# Routing model with structured output.
router_llm = init_chat_model("gpt-4o-mini").with_structured_output(
    UserIntent, method="json_schema", strict=True
)

# Instructions for routing. (Typo fix: "purhcase" -> "purchase".)
route_instructions = """You are managing an online music store that sells song tracks. \
You can help customers in two types of ways: (1) answering general questions about \
tracks sold at your store, (2) helping them get a refund on a purchase they made at your store.

Based on the following conversation, determine if the user is currently seeking general \
information about song tracks or if they are trying to refund a specific purchase.

Return 'refund' if they are trying to get a refund and 'question_answering' if they are \
asking a general music question. Do NOT return anything else. Do NOT try to respond to \
the user.
"""


# Node for routing.
async def intent_classifier(
    state: State,
) -> Command[Literal["refund_agent", "question_answering_agent"]]:
    """Classify the user's intent and route to the matching sub-agent."""
    # Use the async client so this async node doesn't block the event loop
    # (the original called the sync .invoke from an async function).
    response = await router_llm.ainvoke(
        [{"role": "system", "content": route_instructions}, *state["messages"]]
    )
    return Command(goto=response["intent"] + "_agent")


# Node for making sure the 'followup' key is set before our agent run completes.
def compile_followup(state: State) -> dict:
    """Set the followup to be the last message if it hasn't explicitly been set."""
    if not state.get("followup"):
        return {"followup": state["messages"][-1].content}
    return {}


# Agent definition.
graph_builder = StateGraph(State)
graph_builder.add_node(intent_classifier)
# Since all of our subagents have compatible state,
# we can add them as nodes directly.
graph_builder.add_node("refund_agent", refund_graph)
graph_builder.add_node("question_answering_agent", qa_graph)
graph_builder.add_node(compile_followup)

graph_builder.set_entry_point("intent_classifier")
graph_builder.add_edge("refund_agent", "compile_followup")
graph_builder.add_edge("question_answering_agent", "compile_followup")
graph_builder.add_edge("compile_followup", END)

graph = graph_builder.compile()
I bought some songs by Led Zeppelin that i'd like refunded\",\n", 832 | " }\n", 833 | "]})\n", 834 | "print(state[\"followup\"])" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "## Evaluate the agent" 842 | ] 843 | }, 844 | { 845 | "cell_type": "markdown", 846 | "metadata": {}, 847 | "source": [ 848 | "\"conceptual" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "### Final response" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "\"final" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "##### Create a dataset" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 39, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "from langsmith import Client\n", 879 | "\n", 880 | "client = Client()\n", 881 | "\n", 882 | "# Create a dataset\n", 883 | "examples = [\n", 884 | " {\n", 885 | " \"question\": \"How many songs do you have by James Brown\",\n", 886 | " \"response\": \"We have 20 songs by James Brown\",\n", 887 | " },\n", 888 | " {\n", 889 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund.\",\n", 890 | " \"response\": \"I need some more information to help you with the refund. Please specify your phone number, the invoice ID, or the line item IDs for the purchase you'd like refunded.\",\n", 891 | " },\n", 892 | " {\n", 893 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund on my Led Zeppelin purchases. 
My number is +1 (204) 452-6452\",\n", 894 | " \"response\": 'Which of the following purchases would you like to be refunded for?\\n\\n invoice_line_id track_name artist_name purchase_date quantity_purchased price_per_unit\\n----------------- -------------------------------- ------------- ------------------- -------------------- ----------------\\n 267 How Many More Times Led Zeppelin 2009-08-06 00:00:00 1 0.99\\n 268 What Is And What Should Never Be Led Zeppelin 2009-08-06 00:00:00 1 0.99',\n", 895 | " },\n", 896 | " {\n", 897 | " \"question\": \"Who recorded Wish You Were Here again?\",\n", 898 | " \"response\": \"Wish You Were Here is an album by Pink Floyd\",\n", 899 | " },\n", 900 | " { \n", 901 | " \"question\": \"I want a full refund for invoice 237\",\n", 902 | " \"response\": \"You have been refunded $0.99.\",\n", 903 | " },\n", 904 | "]\n", 905 | "\n", 906 | "dataset_name = \"Chinook Customer Service Bot: Final Response\"\n", 907 | "\n", 908 | "if not client.has_dataset(dataset_name=dataset_name):\n", 909 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 910 | " client.create_examples(\n", 911 | " inputs=[{\"question\": ex[\"question\"]} for ex in examples],\n", 912 | " outputs=[{\"response\": ex[\"response\"]} for ex in examples],\n", 913 | " dataset_id=dataset.id\n", 914 | " )" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "##### Define application logic to be evaluated" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 40, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [ 930 | "# Target function\n", 931 | "async def run_graph(inputs: dict) -> dict:\n", 932 | " \"\"\"Run graph and track the final response.\"\"\"\n", 933 | " result = await graph.ainvoke({\"messages\": [\n", 934 | " { \"role\": \"user\", \"content\": inputs['question']},\n", 935 | " ]}, config={\"env\": \"test\"})\n", 936 | " return {\"response\": result[\"followup\"]}" 937 | ] 
938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "##### Define evaluator" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 41, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "# LLM-as-judge instructions\n", 953 | "grader_instructions = \"\"\"You are a teacher grading a quiz.\n", 954 | "\n", 955 | "You will be given a QUESTION, the GROUND TRUTH (correct) RESPONSE, and the STUDENT RESPONSE.\n", 956 | "\n", 957 | "Here is the grade criteria to follow:\n", 958 | "(1) Grade the student responses based ONLY on their factual accuracy relative to the ground truth answer.\n", 959 | "(2) Ensure that the student response does not contain any conflicting statements.\n", 960 | "(3) It is OK if the student response contains more information than the ground truth response, as long as it is factually accurate relative to the ground truth response.\n", 961 | "\n", 962 | "Correctness:\n", 963 | "True means that the student's response meets all of the criteria.\n", 964 | "False means that the student's response does not meet all of the criteria.\n", 965 | "\n", 966 | "Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.\"\"\"\n", 967 | "\n", 968 | "# LLM-as-judge output schema\n", 969 | "class Grade(TypedDict):\n", 970 | " \"\"\"Compare the expected and actual answers and grade the actual answer.\"\"\"\n", 971 | " reasoning: Annotated[str, ..., \"Explain your reasoning for whether the actual response is correct or not.\"]\n", 972 | " is_correct: Annotated[bool, ..., \"True if the student response is mostly or exactly correct, otherwise False.\"]\n", 973 | "\n", 974 | "# Judge LLM\n", 975 | "grader_llm = init_chat_model(\"gpt-4o-mini\", temperature=0).with_structured_output(Grade, method=\"json_schema\", strict=True)\n", 976 | "\n", 977 | "# Evaluator function\n", 978 | "async def final_answer_correct(inputs: dict, outputs: dict, 
reference_outputs: dict) -> bool:\n", 979 | " \"\"\"Evaluate if the final response is equivalent to reference response.\"\"\"\n", 980 | " # Note that we assume the outputs has a 'response' dictionary. We'll need to make sure\n", 981 | " # that the target function we define includes this key.\n", 982 | " user = f\"\"\"QUESTION: {inputs['question']}\n", 983 | " GROUND TRUTH RESPONSE: {reference_outputs['response']}\n", 984 | " STUDENT RESPONSE: {outputs['response']}\"\"\"\n", 985 | "\n", 986 | " grade = await grader_llm.ainvoke([{\"role\": \"system\", \"content\": grader_instructions}, {\"role\": \"user\", \"content\": user}])\n", 987 | " return grade[\"is_correct\"]" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "##### Run evaluation" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": 42, 1000 | "metadata": {}, 1001 | "outputs": [ 1002 | { 1003 | "name": "stdout", 1004 | "output_type": "stream", 1005 | "text": [ 1006 | "View the evaluation results for experiment: 'sql-agent-gpt4o-e2e-d984d052' at:\n", 1007 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/18ed4055-e160-40cb-ba16-e1af9002b310/compare?selectedSessions=57f00a9d-9450-4c34-8396-6f139cff8e0a\n", 1008 | "\n", 1009 | "\n" 1010 | ] 1011 | }, 1012 | { 1013 | "name": "stderr", 1014 | "output_type": "stream", 1015 | "text": [ 1016 | "5it [00:11, 2.36s/it]\n" 1017 | ] 1018 | }, 1019 | { 1020 | "data": { 1021 | "text/html": [ 1022 | "
\n", 1023 | "\n", 1036 | "\n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | "
inputs.questionoutputs.responseerrorreference.responsefeedback.final_answer_correctexecution_timeexample_idid
0I want a full refund for invoice 237You have been refunded a total of: $0.99. Is t...NoneYou have been refunded $0.99.True3.750356b094b32b-94f6-49a4-9af7-a7ccb289cba52ab8e132-e3b0-4ffe-ba73-3d42aea64fda
1My name is Aaron Mitchell and I'd like a refun...We did not find any purchases associated with ...NoneWhich of the following purchases would you lik...False3.7489792274dab9-5682-4e35-8f30-e3171886b2a9c9d3bb87-0ec2-4557-b384-cbf038e18149
2My name is Aaron Mitchell and I'd like a refund.Please provide your phone number and either th...NoneI need some more information to help you with ...True3.10867716e7f941-7c18-4152-82fe-639ce4feca0e35d9a6f6-dbb7-445f-a214-a12af1d495b0
3How many songs do you have by James BrownThere are 20 James Brown songs in the database...NoneWe have 20 songs by James BrownTrue6.4461635a516727-b97f-40ea-a363-605a8b92669a21d7d353-accd-43f0-9776-56cffb8a3954
4Who recorded Wish You Were Here again?I apologize, but I'm not finding \"Wish You Wer...NoneWish You Were Here is an album by Pink FloydFalse10.35657527e113d9-498f-49ab-a66b-8395a4a6b32bc476f0a7-b2f5-4628-8e59-c6eb2a4acf4d
\n", 1108 | "
" 1109 | ], 1110 | "text/plain": [ 1111 | " inputs.question \\\n", 1112 | "0 I want a full refund for invoice 237 \n", 1113 | "1 My name is Aaron Mitchell and I'd like a refun... \n", 1114 | "2 My name is Aaron Mitchell and I'd like a refund. \n", 1115 | "3 How many songs do you have by James Brown \n", 1116 | "4 Who recorded Wish You Were Here again? \n", 1117 | "\n", 1118 | " outputs.response error \\\n", 1119 | "0 You have been refunded a total of: $0.99. Is t... None \n", 1120 | "1 We did not find any purchases associated with ... None \n", 1121 | "2 Please provide your phone number and either th... None \n", 1122 | "3 There are 20 James Brown songs in the database... None \n", 1123 | "4 I apologize, but I'm not finding \"Wish You Wer... None \n", 1124 | "\n", 1125 | " reference.response \\\n", 1126 | "0 You have been refunded $0.99. \n", 1127 | "1 Which of the following purchases would you lik... \n", 1128 | "2 I need some more information to help you with ... \n", 1129 | "3 We have 20 songs by James Brown \n", 1130 | "4 Wish You Were Here is an album by Pink Floyd \n", 1131 | "\n", 1132 | " feedback.final_answer_correct execution_time \\\n", 1133 | "0 True 3.750356 \n", 1134 | "1 False 3.748979 \n", 1135 | "2 True 3.108677 \n", 1136 | "3 True 6.446163 \n", 1137 | "4 False 10.356575 \n", 1138 | "\n", 1139 | " example_id id \n", 1140 | "0 b094b32b-94f6-49a4-9af7-a7ccb289cba5 2ab8e132-e3b0-4ffe-ba73-3d42aea64fda \n", 1141 | "1 2274dab9-5682-4e35-8f30-e3171886b2a9 c9d3bb87-0ec2-4557-b384-cbf038e18149 \n", 1142 | "2 16e7f941-7c18-4152-82fe-639ce4feca0e 35d9a6f6-dbb7-445f-a214-a12af1d495b0 \n", 1143 | "3 5a516727-b97f-40ea-a363-605a8b92669a 21d7d353-accd-43f0-9776-56cffb8a3954 \n", 1144 | "4 27e113d9-498f-49ab-a66b-8395a4a6b32b c476f0a7-b2f5-4628-8e59-c6eb2a4acf4d " 1145 | ] 1146 | }, 1147 | "execution_count": 42, 1148 | "metadata": {}, 1149 | "output_type": "execute_result" 1150 | } 1151 | ], 1152 | "source": [ 1153 | "from langsmith import Client\n", 1154 | "\n", 
1155 | "client = Client()\n", 1156 | "\n", 1157 | "# Evaluation job and results\n", 1158 | "experiment_results = await client.aevaluate(\n", 1159 | " run_graph,\n", 1160 | " data=dataset_name,\n", 1161 | " evaluators=[final_answer_correct],\n", 1162 | " experiment_prefix=\"sql-agent-gpt4o-e2e\",\n", 1163 | " num_repetitions=1,\n", 1164 | " max_concurrency=4,\n", 1165 | ")\n", 1166 | "experiment_results.to_pandas()" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "markdown", 1171 | "metadata": {}, 1172 | "source": [ 1173 | "### Single step evaluator" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "markdown", 1178 | "metadata": {}, 1179 | "source": [ 1180 | "\"single" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "markdown", 1185 | "metadata": {}, 1186 | "source": [ 1187 | "##### Create dataset" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "code", 1192 | "execution_count": 43, 1193 | "metadata": {}, 1194 | "outputs": [], 1195 | "source": [ 1196 | "# Create dataset\n", 1197 | "examples = [\n", 1198 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i bought some tracks recently and i dont like them\"}], \"route\": \"refund_agent\"},\n", 1199 | " {\"messages\": [{\"role\": \"user\", \"content\": \"I was thinking of purchasing some Rolling Stones tunes, any recommendations?\"}], \"route\": \"question_answering_agent\"},\n", 1200 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i want a refund on purchase 237\"}, {\"role\": \"assistant\", \"content\": \"I've refunded you a total of $1.98. How else can I help you today?\"}, {\"role\": \"user\", \"content\": \"did prince release any albums in 2000?\"}], \"route\": \"question_answering_agent\"},\n", 1201 | " {\"messages\": [{\"role\": \"user\", \"content\": \"i purchased a cover of Yesterday recently but can't remember who it was by, which versions of it do you have?\"}], \"route\": \"question_answering_agent\"},\n", 1202 | " {\"messages\": [{\"role\": \"user\", \"content\": \"Can I get my money back? 
I bought an album from the store last week, but it was the wrong one.\"}], \"route\": \"refund_agent\"}\n", 1203 | "]\n", 1204 | "\n", 1205 | "dataset_name = \"Chinook Customer Service Bot: Single Step\"\n", 1206 | "if not client.has_dataset(dataset_name=dataset_name):\n", 1207 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 1208 | " client.create_examples(\n", 1209 | " inputs = [{\"messages\": ex[\"messages\"]} for ex in examples],\n", 1210 | " outputs = [{\"route\": ex[\"route\"]} for ex in examples],\n", 1211 | " dataset_id=dataset.id\n", 1212 | " )" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "##### Define application logic to be evaluated" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": 44, 1225 | "metadata": {}, 1226 | "outputs": [], 1227 | "source": [ 1228 | "# Target function for running the relevant step\n", 1229 | "async def run_intent_classifier(inputs: dict) -> dict:\n", 1230 | " # Note that we can access and run the intent_classifier node of our graph directly.\n", 1231 | " command = await graph.nodes['intent_classifier'].ainvoke(inputs)\n", 1232 | " return {\"route\": command.goto}" 1233 | ] 1234 | }, 1235 | { 1236 | "cell_type": "markdown", 1237 | "metadata": {}, 1238 | "source": [ 1239 | "##### Define evaluator" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "code", 1244 | "execution_count": 45, 1245 | "metadata": {}, 1246 | "outputs": [], 1247 | "source": [ 1248 | "# Evaluator\n", 1249 | "def correct(outputs: dict, reference_outputs: dict) -> bool:\n", 1250 | " \"\"\"Check if the agent chose the correct route.\"\"\"\n", 1251 | " return outputs[\"route\"] == reference_outputs[\"route\"]" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "markdown", 1256 | "metadata": {}, 1257 | "source": [ 1258 | "##### Run evaluation" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": 46, 1264 | "metadata": {}, 1265 | 
"outputs": [ 1266 | { 1267 | "name": "stdout", 1268 | "output_type": "stream", 1269 | "text": [ 1270 | "View the evaluation results for experiment: 'sql-agent-gpt4o-intent-classifier-3b90c1a3' at:\n", 1271 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/fd928d25-c809-4ca3-b12a-35c0cb306022/compare?selectedSessions=8cdb4a0a-fd7b-40cd-bd83-024b82ae2a83\n", 1272 | "\n", 1273 | "\n" 1274 | ] 1275 | }, 1276 | { 1277 | "name": "stderr", 1278 | "output_type": "stream", 1279 | "text": [ 1280 | "5it [00:02, 2.07it/s]\n" 1281 | ] 1282 | }, 1283 | { 1284 | "data": { 1285 | "text/html": [ 1286 | "
\n", 1287 | "\n", 1300 | "\n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | "
inputs.messagesoutputs.routeerrorreference.routefeedback.correctexecution_timeexample_idid
0[{'role': 'user', 'content': 'Can I get my mon...refund_agentNonerefund_agentTrue0.63442547bb4665-da6f-4b58-a314-fefc886040c3f6ccda12-147a-41ad-8e3c-93da544b6ca5
1[{'role': 'user', 'content': 'i purchased a co...question_answering_agentNonequestion_answering_agentTrue0.412655e88b4749-4b72-42d0-9efd-e69ff276b4e247fb6f5c-0d54-4757-b3df-ba2d271d1cf3
2[{'role': 'user', 'content': 'i want a refund ...question_answering_agentNonequestion_answering_agentTrue0.4091380dc9df85-806c-477c-ba2e-b1872b445db9aacd798d-9afb-4b84-ab67-e2ec1c3adb22
3[{'role': 'user', 'content': 'I was thinking o...question_answering_agentNonequestion_answering_agentTrue0.410535cd503c74-547a-41fb-8763-ed808dcf9ba97770076c-0894-460f-a7ec-ed53ae491cf4
4[{'role': 'user', 'content': 'i bought some tr...refund_agentNonerefund_agentTrue0.342001ac23df0a-f0d7-442f-b85d-26e62126adceb227ce5d-5378-4195-ab76-3d52f1db1b2a
\n", 1372 | "
" 1373 | ], 1374 | "text/plain": [ 1375 | " inputs.messages \\\n", 1376 | "0 [{'role': 'user', 'content': 'Can I get my mon... \n", 1377 | "1 [{'role': 'user', 'content': 'i purchased a co... \n", 1378 | "2 [{'role': 'user', 'content': 'i want a refund ... \n", 1379 | "3 [{'role': 'user', 'content': 'I was thinking o... \n", 1380 | "4 [{'role': 'user', 'content': 'i bought some tr... \n", 1381 | "\n", 1382 | " outputs.route error reference.route feedback.correct \\\n", 1383 | "0 refund_agent None refund_agent True \n", 1384 | "1 question_answering_agent None question_answering_agent True \n", 1385 | "2 question_answering_agent None question_answering_agent True \n", 1386 | "3 question_answering_agent None question_answering_agent True \n", 1387 | "4 refund_agent None refund_agent True \n", 1388 | "\n", 1389 | " execution_time example_id \\\n", 1390 | "0 0.634425 47bb4665-da6f-4b58-a314-fefc886040c3 \n", 1391 | "1 0.412655 e88b4749-4b72-42d0-9efd-e69ff276b4e2 \n", 1392 | "2 0.409138 0dc9df85-806c-477c-ba2e-b1872b445db9 \n", 1393 | "3 0.410535 cd503c74-547a-41fb-8763-ed808dcf9ba9 \n", 1394 | "4 0.342001 ac23df0a-f0d7-442f-b85d-26e62126adce \n", 1395 | "\n", 1396 | " id \n", 1397 | "0 f6ccda12-147a-41ad-8e3c-93da544b6ca5 \n", 1398 | "1 47fb6f5c-0d54-4757-b3df-ba2d271d1cf3 \n", 1399 | "2 aacd798d-9afb-4b84-ab67-e2ec1c3adb22 \n", 1400 | "3 7770076c-0894-460f-a7ec-ed53ae491cf4 \n", 1401 | "4 b227ce5d-5378-4195-ab76-3d52f1db1b2a " 1402 | ] 1403 | }, 1404 | "execution_count": 46, 1405 | "metadata": {}, 1406 | "output_type": "execute_result" 1407 | } 1408 | ], 1409 | "source": [ 1410 | "# Run evaluation\n", 1411 | "experiment_results = await client.aevaluate(\n", 1412 | " run_intent_classifier,\n", 1413 | " data=dataset_name,\n", 1414 | " evaluators=[correct],\n", 1415 | " experiment_prefix=\"sql-agent-gpt4o-intent-classifier\",\n", 1416 | " max_concurrency=4,\n", 1417 | ")\n", 1418 | "experiment_results.to_pandas()" 1419 | ] 1420 | }, 1421 | { 1422 | "cell_type": 
"markdown", 1423 | "metadata": {}, 1424 | "source": [ 1425 | "### Trajectory evaluator" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "\"trajectory\"" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "markdown", 1437 | "metadata": {}, 1438 | "source": [ 1439 | "##### Create dataset" 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": 47, 1445 | "metadata": {}, 1446 | "outputs": [], 1447 | "source": [ 1448 | "from langsmith import Client\n", 1449 | "\n", 1450 | "client = Client()\n", 1451 | "\n", 1452 | "# Create a dataset\n", 1453 | "examples = [\n", 1454 | " {\n", 1455 | " \"question\": \"How many songs do you have by James Brown\",\n", 1456 | " \"trajectory\": [\"intent_classifier\", \"question_answering_agent\", \"agent\", \"tools\", \"lookup_track\", \"agent\", \"compile_followup\"]\n", 1457 | " },\n", 1458 | " {\n", 1459 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund.\",\n", 1460 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"compile_followup\"],\n", 1461 | " },\n", 1462 | " {\n", 1463 | " \"question\": \"My name is Aaron Mitchell and I'd like a refund on my Led Zeppelin purchases. My number is +1 (204) 452-6452\",\n", 1464 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"lookup\", \"compile_followup\"],\n", 1465 | " },\n", 1466 | " {\n", 1467 | " \"question\": \"Who recorded Wish You Were Here again? What other albums by them do you have?\",\n", 1468 | " \"trajectory\": [\"intent_classifier\", \"question_answering_agent\", \"agent\", \"tools\", \"lookup_track\", \"agent\", \"tools\", \"lookup_album\", \"agent\", \"compile_followup\"],\n", 1469 | " },\n", 1470 | " {\n", 1471 | " \"question\": \"My name is Aaron Mitchell. 
My number is +1 (204) 452-6452 and I want a full refund for invoice id 237\",\n", 1472 | " \"trajectory\": [\"intent_classifier\", \"refund_agent\", \"gather_info\", \"refund\", \"compile_followup\"],\n", 1473 | " },\n", 1474 | "]\n", 1475 | "\n", 1476 | "dataset_name = \"Chinook Customer Service Bot: Trajectory\"\n", 1477 | "\n", 1478 | "if not client.has_dataset(dataset_name=dataset_name):\n", 1479 | " dataset = client.create_dataset(dataset_name=dataset_name)\n", 1480 | " client.create_examples(\n", 1481 | " inputs=[{\"question\": ex[\"question\"]} for ex in examples],\n", 1482 | " outputs=[{\"trajectory\": ex[\"trajectory\"]} for ex in examples],\n", 1483 | " dataset_id=dataset.id\n", 1484 | " )" 1485 | ] 1486 | }, 1487 | { 1488 | "cell_type": "markdown", 1489 | "metadata": {}, 1490 | "source": [ 1491 | "##### Define application logic to be evaluated" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "execution_count": 48, 1497 | "metadata": {}, 1498 | "outputs": [], 1499 | "source": [ 1500 | "async def run_graph(inputs: dict) -> dict:\n", 1501 | " \"\"\"Run graph and track the trajectory it takes along with the final response.\"\"\"\n", 1502 | " trajectory = []\n", 1503 | " # Set subgraph=True to stream events from subgraphs of the main graph: https://langchain-ai.github.io/langgraph/how-tos/streaming-subgraphs/\n", 1504 | " # Set stream_mode=\"debug\" to stream all possible events: https://langchain-ai.github.io/langgraph/concepts/streaming\n", 1505 | " async for chunk in graph.astream({\"messages\": [\n", 1506 | " {\n", 1507 | " \"role\": \"user\",\n", 1508 | " \"content\": inputs['question'],\n", 1509 | " }\n", 1510 | " ]}, subgraphs=True, stream_mode=\"debug\"):\n", 1511 | " # Event type for entering a node\n", 1512 | " if chunk[1]['type'] == 'task':\n", 1513 | " # Record the node name\n", 1514 | " trajectory.append(chunk[1]['payload']['name'])\n", 1515 | " # Given how we defined our dataset, we also need to track when specific tools are\n", 
1516 | " # called by our question answering ReACT agent. These tool calls can be found\n", 1517 | " # when the ToolsNode (named \"tools\") is invoked by looking at the AIMessage.tool_calls\n", 1518 | " # of the latest input message.\n", 1519 | " if chunk[1]['payload']['name'] == 'tools' and chunk[1]['type'] == 'task':\n", 1520 | " for tc in chunk[1]['payload']['input']['messages'][-1].tool_calls:\n", 1521 | " trajectory.append(tc['name'])\n", 1522 | " return {\"trajectory\": trajectory}" 1523 | ] 1524 | }, 1525 | { 1526 | "cell_type": "markdown", 1527 | "metadata": {}, 1528 | "source": [ 1529 | "##### Define evaluators" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": 49, 1535 | "metadata": {}, 1536 | "outputs": [], 1537 | "source": [ 1538 | "def evaluate_extra_steps(outputs: dict, reference_outputs: dict) -> dict:\n", 1539 | " \"\"\"Evaluate the number of extra steps in the agent's output.\"\"\"\n", 1540 | " extra_steps = len(outputs['trajectory']) - len(reference_outputs['trajectory'])\n", 1541 | " return {\n", 1542 | " \"key\": \"extra_steps\",\n", 1543 | " \"score\": extra_steps,\n", 1544 | " }\n", 1545 | "\n", 1546 | "def evaluate_unmatched_steps(outputs: dict, reference_outputs: dict) -> dict:\n", 1547 | " # [\"step1\", \"step2\", \"step3\"]\n", 1548 | " # [\"step3\", \"step2\", \"step1\"]\n", 1549 | " \"\"\"Evaluate the number of unmatched steps in the agent's output.\"\"\"\n", 1550 | " i = j = 0\n", 1551 | " unmatched_steps = 0\n", 1552 | "\n", 1553 | " while i < len(reference_outputs['trajectory']) and j < len(outputs['trajectory']):\n", 1554 | " if reference_outputs['trajectory'][i] == outputs['trajectory'][j]:\n", 1555 | " i += 1 # Match found, move to the next step in reference trajectory\n", 1556 | " else:\n", 1557 | " unmatched_steps += 1 # Step is not part of the reference trajectory\n", 1558 | " j += 1 # Always move to the next step in outputs trajectory\n", 1559 | "\n", 1560 | " # Count remaining unmatched steps in 
outputs beyond the comparison loop\n", 1561 | " unmatched_steps += len(outputs['trajectory']) - j\n", 1562 | "\n", 1563 | " return {\n", 1564 | " \"key\": \"unmatched_steps\",\n", 1565 | " \"score\": unmatched_steps,\n", 1566 | " }" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "markdown", 1571 | "metadata": {}, 1572 | "source": [ 1573 | "##### Run evaluation" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": 50, 1579 | "metadata": {}, 1580 | "outputs": [ 1581 | { 1582 | "name": "stdout", 1583 | "output_type": "stream", 1584 | "text": [ 1585 | "View the evaluation results for experiment: 'sql-agent-gpt4o-trajectory-c654f01b' at:\n", 1586 | "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/46c97213-3e6f-47e4-846c-cd79143192fc/compare?selectedSessions=f700d7fa-78ba-41d2-9177-9d490531cef7\n", 1587 | "\n", 1588 | "\n" 1589 | ] 1590 | }, 1591 | { 1592 | "name": "stderr", 1593 | "output_type": "stream", 1594 | "text": [ 1595 | "5it [00:20, 4.01s/it]\n" 1596 | ] 1597 | }, 1598 | { 1599 | "data": { 1600 | "text/html": [ 1601 | "
\n", 1602 | "\n", 1615 | "\n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | "
inputs.questionoutputs.trajectoryerrorreference.trajectoryfeedback.extra_stepsfeedback.unmatched_stepsexecution_timeexample_idid
0My name is Aaron Mitchell and I'd like a refund.[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...002.9935480f9e2190-ef0f-455e-86e1-f3965fe2dd2070f63270-7d0e-49f4-9e25-873ce8725324
1My name is Aaron Mitchell. My number is +1 (20...[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...003.585207f7573d70-d3c9-4273-bad8-96c1693d41e67e2b5eaf-3af9-475b-978b-c84167d9c3d8
2My name is Aaron Mitchell and I'd like a refun...[intent_classifier, refund_agent, gather_info,...None[intent_classifier, refund_agent, gather_info,...003.5969756c35522a-a1ca-4212-9edf-5c17566cfb05a748f4a7-bd7e-40b2-af6e-e9de37e56000
3How many songs do you have by James Brown[intent_classifier, question_answering_agent, ...None[intent_classifier, question_answering_agent, ...007.2496611c308dca-8a5e-44d8-9620-d71c36a0685245055ce2-bb55-475c-918f-3e69aaaecc40
4Who recorded Wish You Were Here again? What ot...[intent_classifier, question_answering_agent, ...None[intent_classifier, question_answering_agent, ...3319.67982414b71f30-6e70-4ea4-b527-3fdca46de008b416ff67-b582-402b-9161-3503f9bc5d87
\n", 1693 | "
" 1694 | ], 1695 | "text/plain": [ 1696 | " inputs.question \\\n", 1697 | "0 My name is Aaron Mitchell and I'd like a refund. \n", 1698 | "1 My name is Aaron Mitchell. My number is +1 (20... \n", 1699 | "2 My name is Aaron Mitchell and I'd like a refun... \n", 1700 | "3 How many songs do you have by James Brown \n", 1701 | "4 Who recorded Wish You Were Here again? What ot... \n", 1702 | "\n", 1703 | " outputs.trajectory error \\\n", 1704 | "0 [intent_classifier, refund_agent, gather_info,... None \n", 1705 | "1 [intent_classifier, refund_agent, gather_info,... None \n", 1706 | "2 [intent_classifier, refund_agent, gather_info,... None \n", 1707 | "3 [intent_classifier, question_answering_agent, ... None \n", 1708 | "4 [intent_classifier, question_answering_agent, ... None \n", 1709 | "\n", 1710 | " reference.trajectory feedback.extra_steps \\\n", 1711 | "0 [intent_classifier, refund_agent, gather_info,... 0 \n", 1712 | "1 [intent_classifier, refund_agent, gather_info,... 0 \n", 1713 | "2 [intent_classifier, refund_agent, gather_info,... 0 \n", 1714 | "3 [intent_classifier, question_answering_agent, ... 0 \n", 1715 | "4 [intent_classifier, question_answering_agent, ... 
3 \n", 1716 | "\n", 1717 | " feedback.unmatched_steps execution_time \\\n", 1718 | "0 0 2.993548 \n", 1719 | "1 0 3.585207 \n", 1720 | "2 0 3.596975 \n", 1721 | "3 0 7.249661 \n", 1722 | "4 3 19.679824 \n", 1723 | "\n", 1724 | " example_id id \n", 1725 | "0 0f9e2190-ef0f-455e-86e1-f3965fe2dd20 70f63270-7d0e-49f4-9e25-873ce8725324 \n", 1726 | "1 f7573d70-d3c9-4273-bad8-96c1693d41e6 7e2b5eaf-3af9-475b-978b-c84167d9c3d8 \n", 1727 | "2 6c35522a-a1ca-4212-9edf-5c17566cfb05 a748f4a7-bd7e-40b2-af6e-e9de37e56000 \n", 1728 | "3 1c308dca-8a5e-44d8-9620-d71c36a06852 45055ce2-bb55-475c-918f-3e69aaaecc40 \n", 1729 | "4 14b71f30-6e70-4ea4-b527-3fdca46de008 b416ff67-b582-402b-9161-3503f9bc5d87 " 1730 | ] 1731 | }, 1732 | "execution_count": 50, 1733 | "metadata": {}, 1734 | "output_type": "execute_result" 1735 | } 1736 | ], 1737 | "source": [ 1738 | "experiment_results = await client.aevaluate(\n", 1739 | " run_graph,\n", 1740 | " data=dataset_name,\n", 1741 | " evaluators=[evaluate_extra_steps, evaluate_unmatched_steps],\n", 1742 | " experiment_prefix=\"sql-agent-gpt4o-trajectory\",\n", 1743 | " num_repetitions=1,\n", 1744 | " max_concurrency=4,\n", 1745 | ")\n", 1746 | "experiment_results.to_pandas()" 1747 | ] 1748 | } 1749 | ], 1750 | "metadata": { 1751 | "kernelspec": { 1752 | "display_name": "Python 3", 1753 | "language": "python", 1754 | "name": "python3" 1755 | }, 1756 | "language_info": { 1757 | "codemirror_mode": { 1758 | "name": "ipython", 1759 | "version": 3 1760 | }, 1761 | "file_extension": ".py", 1762 | "mimetype": "text/x-python", 1763 | "name": "python", 1764 | "nbconvert_exporter": "python", 1765 | "pygments_lexer": "ipython3", 1766 | "version": "3.13.0" 1767 | } 1768 | }, 1769 | "nbformat": 4, 1770 | "nbformat_minor": 2 1771 | } 1772 | -------------------------------------------------------------------------------- /build-eval-agent/chinook.db: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/chinook.db -------------------------------------------------------------------------------- /build-eval-agent/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/architecture.png -------------------------------------------------------------------------------- /build-eval-agent/images/evals-conceptual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/evals-conceptual.png -------------------------------------------------------------------------------- /build-eval-agent/images/final-response.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/final-response.png -------------------------------------------------------------------------------- /build-eval-agent/images/lookup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/lookup.png -------------------------------------------------------------------------------- /build-eval-agent/images/refund.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/refund.png -------------------------------------------------------------------------------- /build-eval-agent/images/single-step.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/single-step.png -------------------------------------------------------------------------------- /build-eval-agent/images/trajectory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/trajectory.png -------------------------------------------------------------------------------- /build-eval-agent/images/with-supervisor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/build-eval-agent/images/with-supervisor.png -------------------------------------------------------------------------------- /evaluate-document-extraction/.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | LANGCHAIN_API_KEY= 3 | LANGCHAIN_TRACING_V2= 4 | LANGCHAIN_PROJECT= 5 | -------------------------------------------------------------------------------- /evaluate-document-extraction/.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /evaluate-document-extraction/aapl.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/evaluate-document-extraction/aapl.pdf -------------------------------------------------------------------------------- /evaluate-document-extraction/build-eval-extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Build and evaluate document extraction 🦜⛓️\n", 8 | "\n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Install dependencies" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%capture --no-stderr\n", 25 | "%pip install langsmith langchain-openai langchain-core langchain-community pydantic python-dotenv openai\n", 26 | "%pip install --upgrade langsmith" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import langsmith\n", 36 | "\n", 37 | "print(f\"\\nCurrent langsmith version: {langsmith.__version__}\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Load env" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from dotenv import load_dotenv\n", 54 | "\n", 55 | "load_dotenv()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Load the 10-K" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from langchain_community.document_loaders import PyPDFLoader\n", 72 | "\n", 73 | "def load_pdf():\n", 74 | " loader = PyPDFLoader(\"./aapl.pdf\")\n", 75 | " all_text = loader.load()\n", 76 | " return all_text" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Perform extraction" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from pydantic import BaseModel, Field\n", 93 | "from langsmith import wrappers, Client\n", 94 | "from openai import 
OpenAI\n", 95 | "openai_client = wrappers.wrap_openai(OpenAI())\n", 96 | "\n", 97 | "class UsefulInformation(BaseModel):\n", 98 | " products_and_services: list[str] = Field(description=\"A list of products and services provided by the company\")\n", 99 | " risk_factors: list[str] = Field(description=\"A list of risk factors described in the document\")\n", 100 | " irs_employer_id_number: list[str] = Field(description=\"The IRS Employer Identification Number of the company\")\n", 101 | " company_address: list[str] = Field(description=\"The address of the company\")\n", 102 | " earnings_per_share_basic: list[str] = Field(description=\"The basic earnings per share of the company\")\n", 103 | " net_income: list[str] = Field(description=\"The net income of the company\")\n", 104 | "\n", 105 | "def extract_information(doc):\n", 106 | " prompt = f\"\"\"\n", 107 | " The text below is an excerpt from a 10-K report. You must extract specific information and return it in a structured format.\n", 108 | " \n", 109 | " CRITICAL INSTRUCTIONS:\n", 110 | " 1. AVOID DUPLICATES: Never include duplicate items in any list\n", 111 | " 2. BE CONCISE: Keep each item brief and to the point\n", 112 | " 3. VALIDATE: Each piece of information must be explicitly stated in the text, do not make assumptions\n", 113 | " 4. 
FORMAT: All fields must be lists, even if empty or single item\n", 114 | " \n", 115 | " Examples of GOOD responses:\n", 116 | " - Products: [\"Google Search\", \"Google Cloud\", \"Android\"]\n", 117 | " - Address: [\"1600 Amphitheatre Parkway, Mountain View, CA 94043\"]\n", 118 | " - Phone: [\"+1 650-253-0000\"]\n", 119 | " \n", 120 | " Examples of BAD responses (avoid these):\n", 121 | " - Duplicates: [\"Google Search\", \"Search by Google\", \"Google Search Engine\"]\n", 122 | " - Too verbose: [\"Google Search is a web search engine that allows users to search the World Wide Web...\"]\n", 123 | " - Made up data: Do not include information unless explicitly found in the text\n", 124 | " \n", 125 | " Please extract:\n", 126 | " 1. Products and Services: List unique products/services (max 10 items)\n", 127 | " 2. Risk Factors: List unique, critical risks (max 10 items)\n", 128 | " 3. IRS Employer ID Number: List any EIN found\n", 129 | " 4. Company Address: List primary address of the company\n", 130 | " 5. Earnings Per Share (Basic): List basic EPS figure\n", 131 | " 6. 
Net Income: List net income figure\n", 132 | "\n", 133 | " Text from the 10-K report:\n", 134 | " {doc}\n", 135 | " \"\"\"\n", 136 | " try:\n", 137 | " response = openai_client.beta.chat.completions.parse(\n", 138 | " model=\"o1-2024-12-17\",\n", 139 | " messages=[\n", 140 | " { \"role\": \"user\", \"content\": prompt },\n", 141 | " ],\n", 142 | " response_format=UsefulInformation\n", 143 | " )\n", 144 | " return response.choices[0].message.content\n", 145 | " except Exception as e:\n", 146 | " print(f\"Error in structured output LLM call: {str(e)}\")\n", 147 | " print(f\"Error type: {type(e)}\")\n", 148 | " return UsefulInformation(\n", 149 | " products_and_services=[],\n", 150 | " risk_factors=[],\n", 151 | " irs_employer_id_number=[],\n", 152 | " company_address=[],\n", 153 | " earnings_per_share_basic=[],\n", 154 | " net_income=[]\n", 155 | " )\n", 156 | "\n", 157 | "def process_all_docs():\n", 158 | " all_text = load_pdf()\n", 159 | " results = extract_information(all_text)\n", 160 | " print(\"processed all docs...\")\n", 161 | " return results\n", 162 | "\n", 163 | "aggregated_info = process_all_docs()\n", 164 | "print(aggregated_info)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Evaluate extraction" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "\"extraction-eval\"" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "##### Load existing dataset" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "dataset_name = \"10-k extraction\"" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "##### Define application logic to be evaluated" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": 
[], 209 | "source": [ 210 | "from langsmith import traceable\n", 211 | "\n", 212 | "client = Client()\n", 213 | "\n", 214 | "@traceable\n", 215 | "def target(inputs: dict) -> dict:\n", 216 | " response = openai_client.beta.chat.completions.parse(\n", 217 | " model=\"gpt-4o\",\n", 218 | " messages=[\n", 219 | " { \"role\": \"user\", \"content\": inputs[\"input\"][0][\"content\"] },\n", 220 | " ],\n", 221 | " response_format=UsefulInformation\n", 222 | " )\n", 223 | " return { \"response\": response.choices[0].message.content }" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "##### Define evaluator" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 8, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "import json\n", 240 | "\n", 241 | "def format_objects_for_llm_judge(obj1, obj2):\n", 242 | " \"\"\"Formats two objects into natural language for easier LLM comparison.\"\"\"\n", 243 | " def format_single_object(obj, object_name):\n", 244 | " if isinstance(obj, str):\n", 245 | " obj = json.loads(obj)\n", 246 | " formatted_sections = []\n", 247 | " formatted_sections.append(f\"\\n{object_name} contains the following information:\")\n", 248 | " sorted_keys = sorted(obj.keys())\n", 249 | " for key in sorted_keys:\n", 250 | " values = obj[key]\n", 251 | " readable_key = key.replace('_', ' ').capitalize()\n", 252 | " if isinstance(values, list):\n", 253 | " if len(values) == 1:\n", 254 | " formatted_sections.append(f\"\\n{readable_key}: {values[0]}\")\n", 255 | " else:\n", 256 | " items = '\\n - '.join(values)\n", 257 | " formatted_sections.append(f\"\\n{readable_key}:\\n - {items}\")\n", 258 | " else:\n", 259 | " formatted_sections.append(f\"\\n{readable_key}: {values}\")\n", 260 | " \n", 261 | " return '\\n'.join(formatted_sections)\n", 262 | "\n", 263 | " object1_text = format_single_object(obj1, \"Actual Output\")\n", 264 | " object2_text = format_single_object(obj2, 
\"Reference Output\")\n", 265 | " return [object1_text, object2_text]\n", 266 | "\n", 267 | "@traceable(run_type=\"llm\")\n", 268 | "def run_llm_judge(formatted_text):\n", 269 | " class Score(BaseModel):\n", 270 | " \"\"\"Evaluate how well an extracted output matches a reference ground truth for 10-K document information.\"\"\"\n", 271 | " accuracy: bool = Field(\n", 272 | " description=(\n", 273 | " \"A binary score (0 or 1) that indicates whether the model's extraction adequately matches the reference ground truth. \"\n", 274 | " \"Score 1 if the model's output captures the same essential business information as the reference extraction, even if \"\n", 275 | " \"expressed differently. The goal is to verify that the model successfully extracted similar key business information \"\n", 276 | " \"as found in the reference ground truth, not to ensure identical representation.\"\n", 277 | " )\n", 278 | " )\n", 279 | " reason: str = Field(\n", 280 | " description=(\n", 281 | " \"An explanation of how well the model's extraction aligns with the reference ground truth. Consider how effectively \"\n", 282 | " \"the model captured the same key business information, financial data, and risk factors as the reference output. \"\n", 283 | " \"Acknowledge that variations in expression are acceptable as long as the same core information is captured.\"\n", 284 | " )\n", 285 | " )\n", 286 | " response = openai_client.beta.chat.completions.parse(\n", 287 | " model=\"gpt-4o\",\n", 288 | " messages=[\n", 289 | " {\n", 290 | " \"role\": \"system\",\n", 291 | " \"content\": (\n", 292 | " \"You are evaluating how well a model's extraction of 10-K document information matches a reference ground truth output. 
\"\n", 293 | " \"Your task is to determine if the model successfully captured the same essential business information as the reference, \"\n", 294 | " \"understanding that similar concepts may be expressed differently.\\n\\n\"\n", 295 | " \"Context:\\n\"\n", 296 | " \"- The reference output represents the ground truth extraction from a 10-K document\\n\"\n", 297 | " \"- The model's output is being evaluated against this reference for accuracy and completeness\\n\"\n", 298 | " \"- Both extractions contain key business information like products/services, risk factors, and financial metrics\\n\"\n", 299 | " \"- The goal is to verify the model captured similar information as the reference, not identical expression\\n\\n\"\n", 300 | " \"Evaluation Guidelines:\\n\"\n", 301 | " \"- Score 1 (true) if the model's output:\\n\"\n", 302 | " \" * Captures the same core business information as the reference\\n\"\n", 303 | " \" * Identifies similar risk factors, even if described differently\\n\"\n", 304 | " \" * Extracts matching or equivalent financial metrics\\n\"\n", 305 | " \" * Contains consistent company identifiers\\n\"\n", 306 | " \" * May include additional valid information beyond the reference\\n\\n\"\n", 307 | " \"- Score 0 (false) only if the model's output:\\n\"\n", 308 | " \" * Misses or contradicts critical information from the reference\\n\"\n", 309 | " \" * Shows fundamental misunderstanding of the business details\\n\"\n", 310 | " \" * Contains irreconcilable differences in key metrics\\n\"\n", 311 | " \" * Fails to capture the essential information found in the reference\\n\\n\"\n", 312 | " \"Remember: The reference output is our ground truth. 
Evaluate how well the model's extraction \"\n", 313 | " \"captures the same essential business information, allowing for variations in expression.\\n\\n\"\n", 314 | " \"Outputs to Evaluate:\\n\"\n", 315 | " f\"- **Model Output:** {formatted_text[0]}\\n\"\n", 316 | " f\"- **Reference Ground Truth:** {formatted_text[1]}\\n\"\n", 317 | " )\n", 318 | " }\n", 319 | " ],\n", 320 | " response_format=Score\n", 321 | " )\n", 322 | " response_object = json.loads(response.choices[0].message.content)\n", 323 | " return { \"response\": response_object }\n", 324 | "\n", 325 | "@traceable\n", 326 | "def evaluate_accuracy(outputs: dict, reference_outputs: dict) -> dict:\n", 327 | " actual_output = outputs[\"response\"]\n", 328 | " expected_output = reference_outputs['output']\n", 329 | " formatted_text = format_objects_for_llm_judge(actual_output, expected_output)\n", 330 | " object_response = run_llm_judge(formatted_text)[\"response\"]\n", 331 | " return { \"key\": \"accuracy\",\n", 332 | " \"score\": object_response[\"accuracy\"],\n", 333 | " \"reason\": object_response[\"reason\"] }" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "##### Run evaluation" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "experiment_results = client.evaluate(\n", 350 | " target,\n", 351 | " data=\"10-k extraction\",\n", 352 | " evaluators=[evaluate_accuracy],\n", 353 | " experiment_prefix=\"10-k-extraction-gpt-4o\",\n", 354 | " max_concurrency=5,\n", 355 | " num_repetitions=3\n", 356 | ")\n", 357 | "\n", 358 | "experiment_results.to_pandas()" 359 | ] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | 
"mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.13.0" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } 384 | -------------------------------------------------------------------------------- /evaluate-document-extraction/extraction-eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/the-judge/b2793c9811dfc476993c42e3696a0adebc43bac9/evaluate-document-extraction/extraction-eval.png --------------------------------------------------------------------------------