├── src
    └── extraction_review
    │   ├── __init__.py
    │   ├── metadata_workflow.py
    │   ├── schema.py
    │   ├── clients.py
    │   ├── config.py
    │   ├── index_contract.py
    │   └── process_file.py
├── tests
    └── test_placeholder.py
├── .gitignore
├── .env.template
├── ui
    ├── postcss.config.mjs
    ├── src
    │   ├── lib
    │   │   ├── config.ts
    │   │   ├── utils.ts
    │   │   ├── export.ts
    │   │   ├── ToolbarContext.tsx
    │   │   ├── useMetadata.ts
    │   │   ├── client.ts
    │   │   ├── MetadataProvider.tsx
    │   │   ├── WorkflowProgress.tsx
    │   │   └── ContractsDropdown.tsx
    │   ├── main.tsx
    │   ├── pages
    │   │   ├── HomePage.module.css
    │   │   ├── HomePage.tsx
    │   │   └── ItemPage.tsx
    │   ├── vite-env.d.ts
    │   ├── App.tsx
    │   └── index.css
    ├── index.html
    ├── README.md
    ├── components.json
    ├── .gitignore
    ├── tsconfig.json
    ├── vite.config.ts
    └── package.json
├── {{ _copier_conf.answers_file }}.jinja
├── .copier-answers.yml
├── pyproject.toml
├── task.md
└── README.md


/src/extraction_review/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_placeholder.py:
--------------------------------------------------------------------------------
1 | def test_placeholder():
2 |     pass
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | __pycache__
3 | workflows.db
4 | 
5 | .venv
6 | package-lock.json
7 | node_modules
8 | 


--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | # copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
2 | # OPENAI_API_KEY=sk-xxx
3 | 


--------------------------------------------------------------------------------
/ui/postcss.config.mjs:
--------------------------------------------------------------------------------
1 | const config = {
2 |   plugins: {
3 |     "@tailwindcss/postcss": {},
4 |   },
5 | };
6 | 
7 | export default config; 


--------------------------------------------------------------------------------
/{{ _copier_conf.answers_file }}.jinja:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
2 | {{ _copier_answers|to_nice_yaml -}}


--------------------------------------------------------------------------------
/ui/src/lib/config.ts:
--------------------------------------------------------------------------------
1 | export const APP_TITLE = "Extraction Review";
2 | export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
3 | 


--------------------------------------------------------------------------------
/.copier-answers.yml:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
2 | _commit: v0.3.6
3 | _src_path: https://github.com/run-llama/template-workflow-data-extraction
4 | 


--------------------------------------------------------------------------------
/ui/src/lib/utils.ts:
--------------------------------------------------------------------------------
1 | import { clsx, type ClassValue } from "clsx";
2 | import { twMerge } from "tailwind-merge";
3 | 
4 | export function cn(...inputs: ClassValue[]) {
5 |   return twMerge(clsx(inputs));
6 | }
7 | 


--------------------------------------------------------------------------------
/ui/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="UTF-8" />
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
 6 |     <title>Review</title>
 7 |   </head>
 8 |   <body>
 9 |     <div id="root"></div>
10 |     <script type="module" src="/src/main.tsx"></script>
11 |   </body>
12 | </html>


--------------------------------------------------------------------------------
/ui/README.md:
--------------------------------------------------------------------------------
1 | # Data Extraction UI
2 | 
3 | This is a simple next.js template that builds on the @llamaindex/agent-app ui component library
4 | for showing displaying tables of extracted data.
5 | 
6 | Ideally run this with `llamactl` in the parent directory (See [README.md](../README.md)),
7 | but you can also run it standalone with `npm run dev`, but workflow integrations will not work


--------------------------------------------------------------------------------
/ui/src/main.tsx:
--------------------------------------------------------------------------------
 1 | import { StrictMode } from "react";
 2 | import { createRoot } from "react-dom/client";
 3 | import { HashRouter } from "react-router-dom";
 4 | import App from "./App";
 5 | import "@llamaindex/ui/styles.css";
 6 | import "./index.css";
 7 | 
 8 | createRoot(document.getElementById("root")!).render(
 9 |   <StrictMode>
10 |     <HashRouter>
11 |       <App />
12 |     </HashRouter>
13 |   </StrictMode>,
14 | );
15 | 


--------------------------------------------------------------------------------
/ui/src/pages/HomePage.module.css:
--------------------------------------------------------------------------------
 1 | .main {
 2 |   padding: 1rem;
 3 | }
 4 | 
 5 | .grid {
 6 |   display: flex;
 7 |   flex-direction: row;
 8 |   gap: 1rem;
 9 |   margin-bottom: 1rem;
10 |   & > * {
11 |     flex: 1;
12 |   }
13 | }
14 | 
15 | .commandBar {
16 |   display: flex;
17 |   justify-content: flex-end;
18 |   margin-bottom: 1rem;
19 |   gap: 1rem;
20 |   align-items: center;
21 | }
22 | 
23 | .progressBar {
24 |   margin-bottom: 1rem;
25 | }
26 | 


--------------------------------------------------------------------------------
/ui/src/vite-env.d.ts:
--------------------------------------------------------------------------------
 1 | /// <reference types="vite/client" />
 2 | 
 3 | interface ImportMetaEnv {
 4 |   readonly VITE_LLAMA_CLOUD_API_KEY?: string;
 5 |   readonly VITE_LLAMA_CLOUD_BASE_URL?: string;
 6 | 
 7 |   // injected from llama_deploy
 8 |   readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
 9 |   readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
10 |   readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string;
11 | }
12 | 
13 | interface ImportMeta {
14 |   readonly env: ImportMetaEnv;
15 | }
16 | 


--------------------------------------------------------------------------------
/ui/components.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "$schema": "https://ui.shadcn.com/schema.json",
 3 |   "style": "new-york",
 4 |   "rsc": true,
 5 |   "tsx": true,
 6 |   "tailwind": {
 7 |     "config": "",
 8 |     "css": "src/index.css",
 9 |     "baseColor": "zinc",
10 |     "cssVariables": true,
11 |     "prefix": ""
12 |   },
13 |   "aliases": {
14 |     "components": "@/components",
15 |     "utils": "@/lib/utils",
16 |     "ui": "@/components/ui",
17 |     "lib": "@/lib",
18 |     "hooks": "@/hooks"
19 |   },
20 |   "iconLibrary": "lucide"
21 | }


--------------------------------------------------------------------------------
/ui/.gitignore:
--------------------------------------------------------------------------------
 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 2 | 
 3 | # dependencies
 4 | /node_modules
 5 | /.pnp
 6 | .pnp.*
 7 | .yarn/*
 8 | !.yarn/patches
 9 | !.yarn/plugins
10 | !.yarn/releases
11 | !.yarn/versions
12 | 
13 | # testing
14 | /coverage
15 | 
16 | # next.js
17 | /.next/
18 | /out/
19 | /dist/
20 | 
21 | # production
22 | /build
23 | 
24 | # misc
25 | .DS_Store
26 | *.pem
27 | 
28 | # debug
29 | npm-debug.log*
30 | yarn-debug.log*
31 | yarn-error.log*
32 | .pnpm-debug.log*
33 | 
34 | # env files (can opt-in for committing if needed)
35 | .env*
36 | 
37 | # vercel
38 | .vercel
39 | 
40 | # typescript
41 | *.tsbuildinfo
42 | next-env.d.ts
43 | 
44 | 


--------------------------------------------------------------------------------
/ui/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2020",
 4 |     "useDefineForClassFields": true,
 5 |     "lib": ["ES2020", "DOM", "DOM.Iterable"],
 6 |     "module": "ESNext",
 7 |     "skipLibCheck": true,
 8 |     
 9 |     /* Bundler mode */
10 |     "moduleResolution": "bundler",
11 |     "allowImportingTsExtensions": true,
12 |     "resolveJsonModule": true,
13 |     "isolatedModules": true,
14 |     "noEmit": true,
15 |     "jsx": "react-jsx",
16 |     
17 |     /* Linting */
18 |     "strict": true,
19 |     "noUnusedLocals": true,
20 |     "noUnusedParameters": true,
21 |     "noFallthroughCasesInSwitch": true,
22 |     
23 |     /* Path mapping */
24 |     "baseUrl": ".",
25 |     "paths": {
26 |       "@/*": ["./src/*"]
27 |     }
28 |   },
29 |   "include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
30 |   "exclude": ["node_modules"]
31 | }
32 | 


--------------------------------------------------------------------------------
/ui/src/lib/export.ts:
--------------------------------------------------------------------------------
 1 | import type {
 2 |   ExtractedData,
 3 |   TypedAgentData,
 4 | } from "llama-cloud-services/beta/agent";
 5 | 
 6 | /**
 7 |  * Downloads data as a JSON file
 8 |  */
 9 | export function downloadJSON<T>(
10 |   data: T,
11 |   filename: string = "extraction-results.json",
12 | ) {
13 |   const jsonString = JSON.stringify(data, null, 2);
14 |   const blob = new Blob([jsonString], { type: "application/json" });
15 |   const url = URL.createObjectURL(blob);
16 | 
17 |   const link = document.createElement("a");
18 |   link.href = url;
19 |   link.download = filename;
20 |   document.body.appendChild(link);
21 |   link.click();
22 | 
23 |   // Cleanup
24 |   document.body.removeChild(link);
25 |   URL.revokeObjectURL(url);
26 | }
27 | 
28 | /**
29 |  * Downloads extracted data item as JSON
30 |  */
31 | export function downloadExtractedDataItem<T>(
32 |   item: TypedAgentData<ExtractedData<T>>,
33 | ) {
34 |   const fileName = item.data.file_name || "item";
35 |   const timestamp = item.createdAt.toISOString().split("T")[0];
36 |   const filename = `${fileName}-${timestamp}.json`;
37 | 
38 |   downloadJSON(item, filename);
39 | }
40 | 


--------------------------------------------------------------------------------
/src/extraction_review/metadata_workflow.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | from workflows import Workflow, step
 4 | from workflows.events import StartEvent, StopEvent
 5 | 
 6 | from extraction_review.schema import get_extraction_schema_json
 7 | 
 8 | from .clients import get_contracts_index
 9 | from .config import EXTRACTED_DATA_COLLECTION
10 | 
11 | 
12 | class MetadataResponse(StopEvent):
13 |     json_schema: dict[str, Any]
14 |     extracted_data_collection: str
15 |     contracts_pipeline_id: str
16 | 
17 | 
18 | class MetadataWorkflow(Workflow):
19 |     """
20 |     Simple single step workflow to expose configuration to the UI, such as the JSON schema and collection name.
21 |     """
22 | 
23 |     @step
24 |     async def get_metadata(self, _: StartEvent) -> MetadataResponse:
25 |         json_schema = await get_extraction_schema_json()
26 |         contracts_index = get_contracts_index()
27 |         return MetadataResponse(
28 |             json_schema=json_schema,
29 |             extracted_data_collection=EXTRACTED_DATA_COLLECTION,
30 |             contracts_pipeline_id=contracts_index.id,
31 |         )
32 | 
33 | 
34 | workflow = MetadataWorkflow(timeout=None)
35 | 


--------------------------------------------------------------------------------
/src/extraction_review/schema.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Type
 2 | 
 3 | import jsonref
 4 | from pydantic import BaseModel, Field, create_model
 5 | 
 6 | from extraction_review.config import InvoiceWithReconciliation
 7 | 
 8 | 
 9 | async def get_extraction_schema_json() -> dict[str, Any]:
10 |     json_schema = InvoiceWithReconciliation.model_json_schema()
11 |     json_schema = jsonref.replace_refs(json_schema, proxies=False)
12 |     return json_schema
13 | 
14 | 
15 | def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
16 |     """
17 |     Converts a JSON schema back to a Pydantic model.
18 |     """
19 |     typemap = {
20 |         "string": str,
21 |         "integer": int,
22 |         "number": float,
23 |         "boolean": bool,
24 |         "array": list,
25 |         "object": dict,
26 |     }
27 |     fields = {}
28 |     for prop, meta in schema.get("properties", {}).items():
29 |         py_type = typemap.get(meta.get("type"), Any)
30 |         default = ... if prop in schema.get("required", []) else None
31 |         fields[prop] = (py_type, Field(default, description=meta.get("description")))
32 |     return create_model(schema.get("title", "DynamicModel"), **fields)
33 | 


--------------------------------------------------------------------------------
/ui/src/lib/ToolbarContext.tsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import { APP_TITLE } from "./config";
 3 | 
 4 | export interface BreadcrumbItem {
 5 |   label: string;
 6 |   href?: string;
 7 |   isCurrentPage?: boolean;
 8 | }
 9 | 
10 | export const ToolbarCtx = React.createContext<{
11 |   buttons: React.ReactNode[];
12 |   setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
13 |   breadcrumbs: BreadcrumbItem[];
14 |   setBreadcrumbs: (items: BreadcrumbItem[]) => void;
15 | }>({
16 |   buttons: [],
17 |   setButtons: () => {},
18 |   breadcrumbs: [],
19 |   setBreadcrumbs: () => {},
20 | });
21 | 
22 | export const ToolbarProvider = ({
23 |   children,
24 | }: {
25 |   children: React.ReactNode;
26 | }) => {
27 |   const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
28 |   const [breadcrumbs, setBreadcrumbs] = React.useState<BreadcrumbItem[]>([
29 |     { label: APP_TITLE, href: "/" },
30 |   ]);
31 | 
32 |   return (
33 |     <ToolbarCtx.Provider
34 |       value={{ buttons, setButtons, breadcrumbs, setBreadcrumbs }}
35 |     >
36 |       {children}
37 |     </ToolbarCtx.Provider>
38 |   );
39 | };
40 | 
41 | export const useToolbar = () => React.useContext(ToolbarCtx);
42 | 


--------------------------------------------------------------------------------
/ui/vite.config.ts:
--------------------------------------------------------------------------------
 1 | import { defineConfig } from "vite";
 2 | import react from "@vitejs/plugin-react";
 3 | import path from "path";
 4 | 
 5 | // https://vitejs.dev/config/
 6 | export default defineConfig(({}) => {
 7 |   const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME;
 8 |   const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH;
 9 |   const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID;
10 |   const port = process.env.PORT ? Number(process.env.PORT) : 3000;
11 |   const baseUrl = process.env.LLAMA_CLOUD_BASE_URL;
12 |   return {
13 |     plugins: [react()],
14 |     resolve: {
15 |       alias: {
16 |         "@": path.resolve(__dirname, "./src"),
17 |       },
18 |     },
19 |     server: {
20 |       port: port,
21 |       host: true,
22 |     },
23 |     build: {
24 |       outDir: "dist",
25 |       sourcemap: true,
26 |     },
27 |     base: basePath,
28 |     define: {
29 |       // Primary define uses NAME
30 |       "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME": JSON.stringify(
31 |         deploymentName
32 |       ),
33 |       "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH": JSON.stringify(basePath),
34 |       ...(projectId && {
35 |         "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID":
36 |           JSON.stringify(projectId),
37 |       }),
38 |       ...(baseUrl && {
39 |         "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl),
40 |       }),
41 |     },
42 |   };
43 | });
44 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "extraction-review"
 3 | version = "0.1.0"
 4 | description = "Extracts data"
 5 | readme = "README.md"
 6 | requires-python = ">=3.12"
 7 | dependencies = [
 8 |     "llama-cloud-services>=0.6.69",
 9 |     "llama-index-workflows>=2.2.0,<3.0.0",
10 |     "python-dotenv>=1.1.0",
11 |     "jsonref>=1.1.0",
12 |     "click>=8.2.1,<8.3.0",
13 |     "httpx>=0.28.1",
14 |     "llama-index-core>=0.14.0",
15 |     "llama-index-llms-openai>=0.3.0",
16 | ]
17 | 
18 | [dependency-groups]
19 | dev = [
20 |     "ruff>=0.11.10",
21 |     "typescript>=0.0.12",
22 |     "ty>=0.0.1a16",
23 |     "pytest>=8.4.1",
24 |     "hatch>=1.14.1",
25 |     "llamactl>=0.3.0",
26 |     "pytest-asyncio>=1.3.0",
27 | ]
28 | 
29 | [build-system]
30 | requires = ["hatchling"]
31 | build-backend = "hatchling.build"
32 | 
33 | [tool.hatch.envs.default.scripts]
34 | "format" = "ruff format ."
35 | "format-check" = "ruff format --check ."
36 | "lint" = "ruff check --fix ."
37 | "lint-check" = ["ruff check ."]
38 | typecheck = "ty check src"
39 | test = "pytest"
40 | "all-check" = ["format-check", "lint-check", "test"]
41 | "all-fix" = ["format", "lint", "test"]
42 | 
43 | [tool.llamadeploy]
44 | env_files = [".env"]
45 | llama_cloud = true
46 | required_env_vars = ["OPENAI_API_KEY"]
47 | 
48 | [tool.llamadeploy.workflows]
49 | process-file = "extraction_review.process_file:workflow"
50 | metadata = "extraction_review.metadata_workflow:workflow"
51 | index-contract = "extraction_review.index_contract:workflow"
52 | 
53 | [tool.llamadeploy.ui]
54 | directory = "ui"
55 | 
56 | 


--------------------------------------------------------------------------------
/ui/src/lib/useMetadata.ts:
--------------------------------------------------------------------------------
 1 | import { useWorkflow } from "@llamaindex/ui";
 2 | import { useEffect, useRef, useState } from "react";
 3 | 
 4 | export interface Metadata {
 5 |   json_schema: any;
 6 |   extracted_data_collection: string;
 7 |   contracts_pipeline_id: string;
 8 | }
 9 | 
10 | export interface UseMetadataResult {
11 |   metadata: Metadata;
12 |   loading: boolean;
13 |   error: string | undefined;
14 | }
15 | 
16 | export function useMetadata() {
17 |   const wf = useWorkflow("metadata");
18 |   const [error, setError] = useState<string | undefined>(undefined);
19 |   const [loading, setLoading] = useState(true);
20 |   const [metadata, setMetadata] = useState<Metadata | undefined>(undefined);
21 |   const strictModeWorkaround = useRef(false);
22 |   useEffect(() => {
23 |     if (strictModeWorkaround.current) {
24 |       return;
25 |     }
26 |     strictModeWorkaround.current = true;
27 |     setLoading(true);
28 |     wf.runToCompletion({})
29 |       .then((handler) => {
30 |         if (handler.status === "completed") {
31 |           const result = handler.result?.data as unknown as Metadata;
32 |           setMetadata(result);
33 |         } else {
34 |           setError(
35 |             handler.error || `Unexpected workflow status: ${handler.status}`,
36 |           );
37 |         }
38 |       })
39 |       .catch((error) => {
40 |         setError(error instanceof Error ? error.message : String(error));
41 |       })
42 |       .finally(() => {
43 |         setLoading(false);
44 |       });
45 |   }, []);
46 | 
47 |   return { metadata, loading, error };
48 | }
49 | 


--------------------------------------------------------------------------------
/ui/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "extraction-review-ui",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "type": "module",
 6 |   "scripts": {
 7 |     "dev": "vite",
 8 |     "build": "tsc && vite build",
 9 |     "preview": "vite preview",
10 |     "lint": "tsc --noEmit",
11 |     "format": "prettier --write src",
12 |     "format-check": "prettier --check src",
13 |     "all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
14 |     "all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
15 |   },
16 |   "dependencies": {
17 |     "@babel/runtime": "^7.27.6",
18 |     "@lezer/highlight": "^1.2.1",
19 |     "@llamaindex/ui": "^3.2.0",
20 |     "@llamaindex/workflows-client": "^1.7.0",
21 |     "@radix-ui/themes": "^3.2.1",
22 |     "class-variance-authority": "^0.7.1",
23 |     "clsx": "^2.1.1",
24 |     "llama-cloud-services": "^0.3.4",
25 |     "lucide-react": "^0.514.0",
26 |     "react": "^19.0.0",
27 |     "react-dom": "^19.0.0",
28 |     "react-router-dom": "^6.30.0",
29 |     "sonner": "^2.0.5",
30 |     "tw-animate-css": "^1.3.5"
31 |   },
32 |   "devDependencies": {
33 |     "@tailwindcss/postcss": "^4.1.10",
34 |     "@types/node": "^20",
35 |     "@types/react": "^19",
36 |     "@types/react-dom": "^19",
37 |     "@vitejs/plugin-react": "^4.3.4",
38 |     "postcss": "^8.5.5",
39 |     "prettier": "^3.6.2",
40 |     "tailwind-merge": "^3.3.1",
41 |     "tailwindcss": "^4.1.8",
42 |     "typescript": "^5",
43 |     "vite": "^6.0.5"
44 |   },
45 |   "packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
46 | }
47 | 


--------------------------------------------------------------------------------
/ui/src/lib/client.ts:
--------------------------------------------------------------------------------
 1 | import { ExtractedData } from "llama-cloud-services/beta/agent";
 2 | import {
 3 |   ApiClients,
 4 |   createWorkflowsClient,
 5 |   createWorkflowsConfig,
 6 |   createCloudAgentClient,
 7 |   cloudApiClient,
 8 | } from "@llamaindex/ui";
 9 | import { AGENT_NAME } from "./config";
10 | import type { Metadata } from "./useMetadata";
11 | 
12 | const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
13 | const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
14 | const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
15 | 
16 | // Configure the platform client
17 | cloudApiClient.setConfig({
18 |   ...(apiBaseUrl && { baseUrl: apiBaseUrl }),
19 |   headers: {
20 |     // optionally use a backend API token scoped to a project. For local development,
21 |     ...(platformToken && { authorization: `Bearer ${platformToken}` }),
22 |     // This header is required for requests to correctly scope to the agent's project
23 |     // when authenticating with a user cookie
24 |     ...(projectId && { "Project-Id": projectId }),
25 |   },
26 | });
27 | 
28 | export function createBaseWorkflowClient(): ReturnType<
29 |   typeof createWorkflowsClient
30 | > {
31 |   return createWorkflowsClient(
32 |     createWorkflowsConfig({
33 |       baseUrl: `/deployments/${AGENT_NAME}/`,
34 |     }),
35 |   );
36 | }
37 | 
38 | export function createClients(metadata: Metadata): ApiClients {
39 |   const workflowsClient = createBaseWorkflowClient();
40 |   const agentClient = createCloudAgentClient<ExtractedData<any>>({
41 |     client: cloudApiClient,
42 |     windowUrl: typeof window !== "undefined" ? window.location.href : undefined,
43 |     collection: metadata.extracted_data_collection,
44 |   });
45 | 
46 |   return {
47 |     workflowsClient,
48 |     cloudApiClient,
49 |     agentDataClient: agentClient,
50 |   } as ApiClients;
51 | }
52 | 


--------------------------------------------------------------------------------
/task.md:
--------------------------------------------------------------------------------
 1 | We are building an invoice extraction and reconciliation workflow app.
 2 | 
 3 | Invoices are parsed into structured data, then compared against indexed contracts to reconcile the invoice with its matching contract. Update the invoice record with contract-derived information and any discrepancies.
 4 | 
 5 | Using the UI, the user should be able to:
 6 | - add and index new contracts
 7 | - add and reconcile new invoices
 8 | 
 9 | This should be based off of the base extraction review template, which has 2 pages, one that displays a table of all extracted items (one row per invoice), and one for the item details (the extracted data for one invoice, e.g. total and line items). The items and details view should show the invoices.
10 | 
11 | Contracts can remain largely invisible in the UI for now, but there should be a minimal way to add them. These should be placed into a LlamaCloud index (which parses PDFs to plain text for retrieval).
12 | 
13 | The stored schema should extend the extracted invoice schema with reconciliation fields, such as links to the matched contract, a match confidence/score, and a structured list of discrepancies.
14 | 
15 | Matching should retrieve candidate contracts and then use an LLM, with context for both the candidate contracts and the invoice data, to make the final selection and provide rationale. When no contract matches, record that outcome clearly.
16 | 
17 | When matching and reconciling, consider:
18 | - Whether there is any plausible matching contract versus only irrelevant results (e.g., vendor name, contract dates/ranges, contract or PO numbers).
19 | - Whether payment terms are matching (at minimum).
20 | - Optionally, check other obvious alignments if cheaply available (e.g., totals, vendor identifiers).
21 | 
22 | Represent reconciliation results in the details view with a clear, structured list of discrepancies (e.g., field, invoice_value, contract_value, optional note/severity).
23 | 
24 | The vast majority of this change should be kept in the python codebase. Some minor changes may need to be added to the UI, however do not do anything complex, just a button or small widget.
25 | 


--------------------------------------------------------------------------------
/ui/src/App.tsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import { Routes, Route } from "react-router-dom";
 3 | import { Theme } from "@radix-ui/themes";
 4 | import {
 5 |   Breadcrumb,
 6 |   BreadcrumbItem,
 7 |   BreadcrumbList,
 8 |   BreadcrumbSeparator,
 9 | } from "@llamaindex/ui";
10 | import { Link } from "react-router-dom";
11 | import { Toaster } from "sonner";
12 | import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
13 | import { MetadataProvider } from "@/lib/MetadataProvider";
14 | 
15 | // Import pages
16 | import HomePage from "./pages/HomePage";
17 | import ItemPage from "./pages/ItemPage";
18 | 
19 | export default function App() {
20 |   return (
21 |     <Theme>
22 |       <MetadataProvider>
23 |         <ToolbarProvider>
24 |           <div className="grid grid-rows-[auto_1fr] h-screen">
25 |             <Toolbar />
26 |             <main className="overflow-auto">
27 |               <Routes>
28 |                 <Route path="/" element={<HomePage />} />
29 |                 <Route path="/item/:itemId" element={<ItemPage />} />
30 |               </Routes>
31 |             </main>
32 |           </div>
33 |           <Toaster />
34 |         </ToolbarProvider>
35 |       </MetadataProvider>
36 |     </Theme>
37 |   );
38 | }
39 | 
40 | const Toolbar = () => {
41 |   const { buttons, breadcrumbs } = useToolbar();
42 | 
43 |   return (
44 |     <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
45 |       <Breadcrumb>
46 |         <BreadcrumbList>
47 |           {breadcrumbs.map((item, index) => (
48 |             <React.Fragment key={index}>
49 |               {index > 0 && <BreadcrumbSeparator />}
50 |               <BreadcrumbItem>
51 |                 {item.href && !item.isCurrentPage ? (
52 |                   <Link to={item.href} className="font-medium text-base">
53 |                     {item.label}
54 |                   </Link>
55 |                 ) : (
56 |                   <span
57 |                     className={`font-medium ${index === 0 ? "text-base" : ""}`}
58 |                   >
59 |                     {item.label}
60 |                   </span>
61 |                 )}
62 |               </BreadcrumbItem>
63 |             </React.Fragment>
64 |           ))}
65 |         </BreadcrumbList>
66 |       </Breadcrumb>
67 |       {buttons}
68 |     </header>
69 |   );
70 | };
71 | 


--------------------------------------------------------------------------------
/ui/src/lib/MetadataProvider.tsx:
--------------------------------------------------------------------------------
 1 | import { createContext, useContext, ReactNode, useMemo } from "react";
 2 | import { ApiProvider, ApiClients } from "@llamaindex/ui";
 3 | import { useMetadata, Metadata } from "./useMetadata";
 4 | import { createBaseWorkflowClient, createClients } from "./client";
 5 | import { Clock, XCircle } from "lucide-react";
 6 | 
 7 | interface MetadataContextValue {
 8 |   metadata: Metadata;
 9 |   clients: ApiClients;
10 | }
11 | 
12 | const MetadataContext = createContext<MetadataContextValue | null>(null);
13 | 
14 | export function MetadataProvider({ children }: { children: ReactNode }) {
15 |   const baseClients: ApiClients = useMemo(() => {
16 |     return {
17 |       workflowsClient: createBaseWorkflowClient(),
18 |     } as ApiClients;
19 |   }, []);
20 |   return (
21 |     <ApiProvider clients={baseClients}>
22 |       <InnerMetadataProvider>{children}</InnerMetadataProvider>
23 |     </ApiProvider>
24 |   );
25 | }
26 | 
27 | function InnerMetadataProvider({ children }: { children: ReactNode }) {
28 |   const { metadata, loading, error } = useMetadata();
29 |   const clients = useMemo(
30 |     () => (metadata ? createClients(metadata) : undefined),
31 |     [metadata],
32 |   );
33 | 
34 |   if (loading) {
35 |     return (
36 |       <div className="flex h-screen items-center justify-center">
37 |         <div className="text-center">
38 |           <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
39 |           <div className="text-sm text-gray-500">Loading configuration...</div>
40 |         </div>
41 |       </div>
42 |     );
43 |   }
44 | 
45 |   if (error || !metadata || !clients) {
46 |     return (
47 |       <div className="flex h-screen items-center justify-center">
48 |         <div className="text-center">
49 |           <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
50 |           <div className="text-sm text-gray-500">
51 |             Error loading configuration: {error || "Unknown error"}
52 |           </div>
53 |         </div>
54 |       </div>
55 |     );
56 |   }
57 | 
58 |   return (
59 |     <MetadataContext.Provider value={{ metadata, clients }}>
60 |       <ApiProvider clients={clients}>{children}</ApiProvider>
61 |     </MetadataContext.Provider>
62 |   );
63 | }
64 | 
65 | export function useMetadataContext() {
66 |   const context = useContext(MetadataContext);
67 |   if (!context) {
68 |     throw new Error("useMetadataContext must be used within MetadataProvider");
69 |   }
70 |   return context;
71 | }
72 | 


--------------------------------------------------------------------------------
/ui/src/pages/HomePage.tsx:
--------------------------------------------------------------------------------
 1 | import {
 2 |   ItemCount,
 3 |   WorkflowTrigger,
 4 |   ExtractedDataItemGrid,
 5 |   HandlerState,
 6 | } from "@llamaindex/ui";
 7 | import type { TypedAgentData } from "llama-cloud-services/beta/agent";
 8 | import styles from "./HomePage.module.css";
 9 | import { useNavigate } from "react-router-dom";
10 | import { useState } from "react";
11 | import { WorkflowProgress } from "@/lib/WorkflowProgress";
12 | import { ContractsDropdown } from "@/lib/ContractsDropdown";
13 | 
14 | export default function HomePage() {
15 |   return <TaskList />;
16 | }
17 | 
18 | function TaskList() {
19 |   const navigate = useNavigate();
20 |   const goToItem = (item: TypedAgentData) => {
21 |     navigate(`/item/${item.id}`);
22 |   };
23 |   const [reloadSignal, setReloadSignal] = useState(0);
24 |   const [handlers, setHandlers] = useState<HandlerState[]>([]);
25 | 
26 |   return (
27 |     <div className={styles.page}>
28 |       <main className={styles.main}>
29 |         <div className={styles.grid}>
30 |           <ItemCount title="Total Items" key={`total-items-${reloadSignal}`} />
31 |           <ItemCount
32 |             title="Reviewed"
33 |             filter={{
34 |               status: { includes: ["approved", "rejected"] },
35 |             }}
36 |             key={`reviewed-${reloadSignal}`}
37 |           />
38 |           <ItemCount
39 |             title="Needs Review"
40 |             filter={{
41 |               status: { eq: "pending_review" },
42 |             }}
43 |             key={`needs-review-${reloadSignal}`}
44 |           />
45 |         </div>
46 |         <div className={styles.commandBar}>
47 |           <WorkflowProgress
48 |             workflowName={["process-file", "index-contract"]}
49 |             handlers={handlers}
50 |             onWorkflowCompletion={() => {
51 |               setReloadSignal(reloadSignal + 1);
52 |             }}
53 |           />
54 |           <WorkflowTrigger
55 |             workflowName="process-file"
56 |             customWorkflowInput={(files) => {
57 |               return {
58 |                 file_id: files[0].fileId,
59 |               };
60 |             }}
61 |             title="Upload Invoice"
62 |             onSuccess={(handler) => {
63 |               setHandlers([...handlers, handler]);
64 |             }}
65 |           />
66 |           <div className="flex gap-1 items-center">
67 |             <WorkflowTrigger
68 |               workflowName="index-contract"
69 |               customWorkflowInput={(files) => {
70 |                 return {
71 |                   file_ids: files.map((file) => file.fileId),
72 |                 };
73 |               }}
74 |               title="Upload Contract"
75 |               multiple
76 |               onSuccess={(handler) => {
77 |                 setHandlers([...handlers, handler]);
78 |               }}
79 |             />
80 |             <ContractsDropdown />
81 |           </div>
82 |         </div>
83 | 
84 |         <ExtractedDataItemGrid
85 |           key={reloadSignal}
86 |           onRowClick={goToItem}
87 |           builtInColumns={{
88 |             fileName: true,
89 |             status: true,
90 |             createdAt: true,
91 |             itemsToReview: true,
92 |             actions: true,
93 |           }}
94 |         />
95 |       </main>
96 |     </div>
97 |   );
98 | }
99 | 


--------------------------------------------------------------------------------
/src/extraction_review/clients.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | import logging
 3 | import os
 4 | 
 5 | import httpx
 6 | from llama_cloud.client import AsyncLlamaCloud
 7 | from llama_cloud.core.api_error import ApiError
 8 | from llama_cloud_services import ExtractionAgent, LlamaExtract, LlamaCloudIndex
 9 | from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
10 | from llama_index.llms.openai import OpenAI
11 | 
12 | from extraction_review.config import (
13 |     CONTRACTS_INDEX_NAME,
14 |     EXTRACTED_DATA_COLLECTION,
15 |     EXTRACT_CONFIG,
16 |     EXTRACTION_AGENT_NAME,
17 |     InvoiceExtractionSchema,
18 |     InvoiceWithReconciliation,
19 | )
20 | 
21 | logger = logging.getLogger(__name__)
22 | 
23 | # deployed agents may infer their name from the deployment name
24 | # Note: Make sure that an agent deployment with this name actually exists
25 | # otherwise calls to get or set data will fail. You may need to adjust the `or `
26 | # name for development
27 | agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
28 | # required for all llama cloud calls
29 | api_key = os.getenv("LLAMA_CLOUD_API_KEY")
30 | # get this in case running against a different environment than production
31 | base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
32 | project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
33 | 
34 | 
35 | @functools.lru_cache(maxsize=None)
36 | def get_extract_agent() -> ExtractionAgent:
37 |     extract_api = LlamaExtract(
38 |         api_key=api_key, base_url=base_url, project_id=project_id
39 |     )
40 | 
41 |     try:
42 |         existing = extract_api.get_agent(EXTRACTION_AGENT_NAME)
43 |         existing.data_schema = InvoiceExtractionSchema
44 |         existing.config = EXTRACT_CONFIG
45 |         return existing
46 |     except ApiError as e:
47 |         if e.status_code == 404:
48 |             return extract_api.create_agent(
49 |                 name=EXTRACTION_AGENT_NAME,
50 |                 data_schema=InvoiceExtractionSchema,
51 |                 config=EXTRACT_CONFIG,
52 |             )
53 |         else:
54 |             raise
55 | 
56 | 
57 | @functools.lru_cache(maxsize=None)
58 | def get_data_client() -> AsyncAgentDataClient[ExtractedData[InvoiceWithReconciliation]]:
59 |     return AsyncAgentDataClient(
60 |         deployment_name=agent_name,
61 |         collection=EXTRACTED_DATA_COLLECTION,
62 |         type=ExtractedData[InvoiceWithReconciliation],
63 |         client=get_llama_cloud_client(),
64 |     )
65 | 
66 | 
67 | @functools.lru_cache(maxsize=None)
68 | def get_llama_cloud_client():
69 |     return AsyncLlamaCloud(
70 |         base_url=base_url,
71 |         token=api_key,
72 |         httpx_client=httpx.AsyncClient(
73 |             timeout=60, headers={"Project-Id": project_id} if project_id else None
74 |         ),
75 |     )
76 | 
77 | 
78 | @functools.lru_cache(maxsize=None)
79 | def get_contracts_index() -> LlamaCloudIndex:
80 |     """Get or create the contracts index for storing and retrieving contract documents"""
81 |     return LlamaCloudIndex.create_index(
82 |         name=CONTRACTS_INDEX_NAME,
83 |         project_id=project_id,
84 |         api_key=api_key,
85 |         base_url=base_url,
86 |     )
87 | 
88 | 
89 | @functools.lru_cache(maxsize=None)
90 | def get_llm() -> OpenAI:
91 |     """Get OpenAI LLM for structured predictions"""
92 |     return OpenAI(model="gpt-5-mini", temperature=0)
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Invoice Extraction and Contract Reconciliation
 2 | 
 3 | This template provides a LlamaAgents application for extracting structured data from invoices
 4 | and reconciling it against contract documents using LlamaExtract, LlamaCloud Index, and Agent Data.
 5 | It helps finance and operations teams validate that incoming invoices comply with agreed contract terms
 6 | by automatically detecting mismatches in payment terms, totals, and other key fields.
 7 | 
 8 | # Running the application
 9 | 
10 | This is a starter for LlamaAgents. See the
11 | [LlamaAgents (llamactl) getting started guide](https://developers.llamaindex.ai/python/llamaagents/llamactl/getting-started/)
12 | for context on local development and deployment.
13 | 
14 | To run the application locally, clone this repo, install [`uv`](https://docs.astral.sh/uv/) and run `uvx llamactl serve`.
15 | 
16 | This application can also be deployed directly to [LlamaCloud](https://cloud.llamaindex.ai) via the UI,
17 | or with `llamactl deployment create`.
18 | 
19 | ## Features
20 | 
21 | - **Invoice data extraction**: Uses a Pydantic `InvoiceExtractionSchema` to extract key invoice fields
22 |   (vendor, dates, PO number, line items, subtotals, tax, totals, and more) via a LlamaExtract agent.
23 | - **Contract indexing and retrieval**: Includes an `index-contract` workflow that downloads contract files
24 |   from LlamaCloud and indexes them into a dedicated `contracts` LlamaCloud Index for retrieval.
25 | - **Automated reconciliation**: Matches invoices to the most relevant contracts using retrieval plus an LLM,
26 |   then produces an `InvoiceWithReconciliation` record with match confidence, rationale, and structured discrepancies.
27 | - **Agent Data storage**: Stores reconciled invoice records in LlamaCloud Agent Data, deduplicated by file hash,
28 |   so that re-processing the same file replaces prior results instead of duplicating them.
29 | - **UI integration**: A web UI lets you upload invoices and contracts, monitor workflow progress,
30 |   and review or edit extracted and reconciled data.
31 | 
32 | ## Example Documents
33 | 
34 | You can find sample invoice and contract PDF files to test the application with
35 | [here](https://github.com/run-llama/llama-datasets/tree/main/llama_agents/invoice-contracts).
36 | 
37 | ## Configuration
38 | 
39 | All main configuration is in `src/extraction_review/config.py`.
40 | 
41 | ## How It Works
42 | 
43 | The application uses a multi-step workflow powered by LlamaIndex:
44 | 
45 | 1. **File Upload**: Users upload invoice or contract documents through the UI, which are stored in LlamaCloud.
46 | 2. **Index Contracts**: Contract files are processed by the `index-contract` workflow and indexed into
47 |    the `contracts` LlamaCloud Index.
48 | 3. **Download Invoice**: The `process-file` workflow downloads the selected invoice file from LlamaCloud storage.
49 | 4. **Extraction**: A LlamaExtract agent runs against the invoice using `InvoiceExtractionSchema`, returning
50 |    structured invoice data plus field-level metadata.
51 | 5. **Contract Retrieval**: The workflow queries the contracts index with a query built from invoice fields
52 |    (vendor, PO number, invoice number, etc.) and retrieves the most relevant contracts.
53 | 6. **Reconciliation**: An LLM compares the invoice to the retrieved contracts, selects the best match,
54 |    and produces an `InvoiceWithReconciliation` object with match confidence, rationale, and discrepancy list.
55 | 7. **Storage**: The reconciled invoice data is wrapped in an `ExtractedData` record (including file hash)
56 |    and stored in Agent Data, replacing any previous records for the same file hash.
57 | 8. **Review**: The UI displays the stored data for review, editing, and export.
58 | 
59 | ### Workflows
60 | 
61 | The application includes three main workflows:
62 | 
63 | - **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing invoices
64 |   end-to-end (download → extract → reconcile → store).
65 | - **`index-contract`** (`src/extraction_review/index_contract.py`): Workflow for downloading and indexing
66 |   contract documents into a LlamaCloud Index for later retrieval during reconciliation.
67 | - **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to the UI,
68 |   returning the JSON Schema for `InvoiceWithReconciliation` and the Agent Data collection name.
69 | 
70 | ## Linting and type checking
71 | 
72 | Python and javascript packages contain helpful scripts to lint, format, and type check the code.
73 | 
74 | To check and fix python code:
75 | 
76 | ```bash
77 | uv run hatch run lint
78 | uv run hatch run typecheck
79 | uv run hatch run test
80 | # run all at once
81 | uv run hatch run all-fix
82 | ```
83 | 
84 | To check and fix javascript code, within the `ui` directory:
85 | 
86 | ```bash
87 | pnpm run lint
88 | pnpm run typecheck
89 | pnpm run test
90 | # run all at once
91 | pnpm run all-fix
92 | ```
93 | 


--------------------------------------------------------------------------------
/ui/src/index.css:
--------------------------------------------------------------------------------
  1 | @import "tailwindcss";
  2 | @import "tw-animate-css";
  3 | 
  4 | @custom-variant dark (&:is(.dark *));
  5 | 
  6 | @theme inline {
  7 |   --radius-sm: calc(var(--radius) - 4px);
  8 |   --radius-md: calc(var(--radius) - 2px);
  9 |   --radius-lg: var(--radius);
 10 |   --radius-xl: calc(var(--radius) + 4px);
 11 |   --color-background: var(--background);
 12 |   --color-foreground: var(--foreground);
 13 |   --color-card: var(--card);
 14 |   --color-card-foreground: var(--card-foreground);
 15 |   --color-popover: var(--popover);
 16 |   --color-popover-foreground: var(--popover-foreground);
 17 |   --color-primary: var(--primary);
 18 |   --color-primary-foreground: var(--primary-foreground);
 19 |   --color-secondary: var(--secondary);
 20 |   --color-secondary-foreground: var(--secondary-foreground);
 21 |   --color-muted: var(--muted);
 22 |   --color-muted-foreground: var(--muted-foreground);
 23 |   --color-accent: var(--accent);
 24 |   --color-accent-foreground: var(--accent-foreground);
 25 |   --color-destructive: var(--destructive);
 26 |   --color-border: var(--border);
 27 |   --color-input: var(--input);
 28 |   --color-ring: var(--ring);
 29 |   --color-chart-1: var(--chart-1);
 30 |   --color-chart-2: var(--chart-2);
 31 |   --color-chart-3: var(--chart-3);
 32 |   --color-chart-4: var(--chart-4);
 33 |   --color-chart-5: var(--chart-5);
 34 |   --color-sidebar: var(--sidebar);
 35 |   --color-sidebar-foreground: var(--sidebar-foreground);
 36 |   --color-sidebar-primary: var(--sidebar-primary);
 37 |   --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
 38 |   --color-sidebar-accent: var(--sidebar-accent);
 39 |   --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
 40 |   --color-sidebar-border: var(--sidebar-border);
 41 |   --color-sidebar-ring: var(--sidebar-ring);
 42 | }
 43 | 
 44 | :root {
 45 |   --radius: 0.625rem;
 46 |   --card: oklch(1 0 0);
 47 |   --card-foreground: oklch(0.141 0.005 285.823);
 48 |   --popover: oklch(1 0 0);
 49 |   --popover-foreground: oklch(0.141 0.005 285.823);
 50 |   --primary: oklch(0.21 0.006 285.885);
 51 |   --primary-foreground: oklch(0.985 0 0);
 52 |   --secondary: oklch(0.967 0.001 286.375);
 53 |   --secondary-foreground: oklch(0.21 0.006 285.885);
 54 |   --muted: oklch(0.967 0.001 286.375);
 55 |   --muted-foreground: oklch(0.552 0.016 285.938);
 56 |   --accent: oklch(0.967 0.001 286.375);
 57 |   --accent-foreground: oklch(0.21 0.006 285.885);
 58 |   --destructive: oklch(0.577 0.245 27.325);
 59 |   --border: oklch(0.92 0.004 286.32);
 60 |   --input: oklch(0.92 0.004 286.32);
 61 |   --ring: oklch(0.705 0.015 286.067);
 62 |   --chart-1: oklch(0.646 0.222 41.116);
 63 |   --chart-2: oklch(0.6 0.118 184.704);
 64 |   --chart-3: oklch(0.398 0.07 227.392);
 65 |   --chart-4: oklch(0.828 0.189 84.429);
 66 |   --chart-5: oklch(0.769 0.188 70.08);
 67 |   --sidebar: oklch(0.985 0 0);
 68 |   --sidebar-foreground: oklch(0.141 0.005 285.823);
 69 |   --sidebar-primary: oklch(0.21 0.006 285.885);
 70 |   --sidebar-primary-foreground: oklch(0.985 0 0);
 71 |   --sidebar-accent: oklch(0.967 0.001 286.375);
 72 |   --sidebar-accent-foreground: oklch(0.21 0.006 285.885);
 73 |   --sidebar-border: oklch(0.92 0.004 286.32);
 74 |   --sidebar-ring: oklch(0.705 0.015 286.067);
 75 |   --background: oklch(1 0 0);
 76 |   --foreground: oklch(0.141 0.005 285.823);
 77 | }
 78 | 
 79 | .dark {
 80 |   --background: oklch(0.141 0.005 285.823);
 81 |   --foreground: oklch(0.985 0 0);
 82 |   --card: oklch(0.21 0.006 285.885);
 83 |   --card-foreground: oklch(0.985 0 0);
 84 |   --popover: oklch(0.21 0.006 285.885);
 85 |   --popover-foreground: oklch(0.985 0 0);
 86 |   --primary: oklch(0.92 0.004 286.32);
 87 |   --primary-foreground: oklch(0.21 0.006 285.885);
 88 |   --secondary: oklch(0.274 0.006 286.033);
 89 |   --secondary-foreground: oklch(0.985 0 0);
 90 |   --muted: oklch(0.274 0.006 286.033);
 91 |   --muted-foreground: oklch(0.705 0.015 286.067);
 92 |   --accent: oklch(0.274 0.006 286.033);
 93 |   --accent-foreground: oklch(0.985 0 0);
 94 |   --destructive: oklch(0.704 0.191 22.216);
 95 |   --border: oklch(1 0 0 / 10%);
 96 |   --input: oklch(1 0 0 / 15%);
 97 |   --ring: oklch(0.552 0.016 285.938);
 98 |   --chart-1: oklch(0.488 0.243 264.376);
 99 |   --chart-2: oklch(0.696 0.17 162.48);
100 |   --chart-3: oklch(0.769 0.188 70.08);
101 |   --chart-4: oklch(0.627 0.265 303.9);
102 |   --chart-5: oklch(0.645 0.246 16.439);
103 |   --sidebar: oklch(0.21 0.006 285.885);
104 |   --sidebar-foreground: oklch(0.985 0 0);
105 |   --sidebar-primary: oklch(0.488 0.243 264.376);
106 |   --sidebar-primary-foreground: oklch(0.985 0 0);
107 |   --sidebar-accent: oklch(0.274 0.006 286.033);
108 |   --sidebar-accent-foreground: oklch(0.985 0 0);
109 |   --sidebar-border: oklch(1 0 0 / 10%);
110 |   --sidebar-ring: oklch(0.552 0.016 285.938);
111 | }
112 | 
113 | @layer base {
114 |   * {
115 |     @apply border-border outline-ring/50;
116 |   }
117 |   body {
118 |     @apply bg-background text-foreground;
119 |   }
120 | }
121 | 


--------------------------------------------------------------------------------
/ui/src/pages/ItemPage.tsx:
--------------------------------------------------------------------------------
  1 | import { useEffect, useState } from "react";
  2 | import {
  3 |   AcceptReject,
  4 |   ExtractedDataDisplay,
  5 |   FilePreview,
  6 |   useItemData,
  7 |   type Highlight,
  8 |   Button,
  9 | } from "@llamaindex/ui";
 10 | import { Clock, XCircle, Download } from "lucide-react";
 11 | import { useParams } from "react-router-dom";
 12 | import { useToolbar } from "@/lib/ToolbarContext";
 13 | import { useNavigate } from "react-router-dom";
 14 | import { modifyJsonSchema } from "@llamaindex/ui/lib";
 15 | import { APP_TITLE } from "@/lib/config";
 16 | import { downloadExtractedDataItem } from "@/lib/export";
 17 | import { useMetadataContext } from "@/lib/MetadataProvider";
 18 | 
 19 | export default function ItemPage() {
 20 |   const { itemId } = useParams<{ itemId: string }>();
 21 |   const { setButtons, setBreadcrumbs } = useToolbar();
 22 |   const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
 23 |   const { metadata } = useMetadataContext();
 24 |   // Use the hook to fetch item data
 25 |   const itemHookData = useItemData<any>({
 26 |     // order/remove fields as needed here
 27 |     jsonSchema: modifyJsonSchema(metadata.json_schema, {}),
 28 |     itemId: itemId as string,
 29 |     isMock: false,
 30 |   });
 31 | 
 32 |   const navigate = useNavigate();
 33 | 
 34 |   // Update breadcrumb when item data loads
 35 |   useEffect(() => {
 36 |     const fileName = itemHookData.item?.data?.file_name;
 37 |     if (fileName) {
 38 |       setBreadcrumbs([
 39 |         { label: APP_TITLE, href: "/" },
 40 |         {
 41 |           label: fileName,
 42 |           isCurrentPage: true,
 43 |         },
 44 |       ]);
 45 |     }
 46 | 
 47 |     return () => {
 48 |       // Reset to default breadcrumb when leaving the page
 49 |       setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
 50 |     };
 51 |   }, [itemHookData.item?.data?.file_name, setBreadcrumbs]);
 52 | 
 53 |   useEffect(() => {
 54 |     setButtons(() => [
 55 |       <div className="ml-auto flex items-center gap-2">
 56 |         <Button
 57 |           variant="outline"
 58 |           size="sm"
 59 |           onClick={() => {
 60 |             if (itemData) {
 61 |               downloadExtractedDataItem(itemData);
 62 |             }
 63 |           }}
 64 |           disabled={!itemData}
 65 |         >
 66 |           <Download className="h-4 w-4 mr-2" />
 67 |           Export JSON
 68 |         </Button>
 69 |         <AcceptReject<any>
 70 |           itemData={itemHookData}
 71 |           onComplete={() => navigate("/")}
 72 |         />
 73 |       </div>,
 74 |     ]);
 75 |     return () => {
 76 |       setButtons(() => []);
 77 |     };
 78 |   }, [itemHookData.data, setButtons]);
 79 | 
 80 |   const {
 81 |     item: itemData,
 82 |     updateData,
 83 |     loading: isLoading,
 84 |     error,
 85 |   } = itemHookData;
 86 | 
 87 |   if (isLoading) {
 88 |     return (
 89 |       <div className="flex h-screen items-center justify-center">
 90 |         <div className="text-center">
 91 |           <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
 92 |           <div className="text-sm text-gray-500">Loading item...</div>
 93 |         </div>
 94 |       </div>
 95 |     );
 96 |   }
 97 | 
 98 |   if (error || !itemData) {
 99 |     return (
100 |       <div className="flex h-screen items-center justify-center">
101 |         <div className="text-center">
102 |           <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
103 |           <div className="text-sm text-gray-500">
104 |             Error loading item: {error || "Item not found"}
105 |           </div>
106 |         </div>
107 |       </div>
108 |     );
109 |   }
110 | 
111 |   return (
112 |     <div className="flex h-full bg-gray-50">
113 |       {/* Left Side - File Preview */}
114 |       <div className="w-1/2 border-r border-gray-200 bg-white">
115 |         {itemData.data.file_id && (
116 |           <FilePreview
117 |             fileId={itemData.data.file_id}
118 |             onBoundingBoxClick={(box, pageNumber) => {
119 |               console.log("Bounding box clicked:", box, "on page:", pageNumber);
120 |             }}
121 |             highlight={highlight}
122 |           />
123 |         )}
124 |       </div>
125 | 
126 |       {/* Right Side - Review Panel */}
127 |       <div className="flex-1 bg-white h-full overflow-y-auto">
128 |         <div className="p-4 space-y-4">
129 |           {/* Extracted Data */}
130 |           <ExtractedDataDisplay<any>
131 |             extractedData={itemData.data}
132 |             title="Extracted Data"
133 |             onChange={(updatedData) => {
134 |               updateData(updatedData);
135 |             }}
136 |             onClickField={(args) => {
137 |               // TODO: set multiple highlights
138 |               setHighlight({
139 |                 page: args.metadata?.citation?.[0]?.page ?? 1,
140 |                 x: 100,
141 |                 y: 100,
142 |                 width: 0,
143 |                 height: 0,
144 |               });
145 |             }}
146 |             jsonSchema={itemHookData.jsonSchema}
147 |           />
148 |         </div>
149 |       </div>
150 |     </div>
151 |   );
152 | }
153 | 


--------------------------------------------------------------------------------
/src/extraction_review/config.py:
--------------------------------------------------------------------------------
  1 | """
  2 | For simple configuration of the extraction review application, just customize this file.
  3 | 
  4 | If you need more control, feel free to edit the rest of the application
  5 | """
  6 | 
  7 | from __future__ import annotations
  8 | 
  9 | import os
 10 | 
 11 | from llama_cloud import ExtractConfig
 12 | from llama_cloud_services.extract import ExtractMode
 13 | from pydantic import BaseModel, Field
 14 | 
 15 | # The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
 16 | # Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
 17 | EXTRACTION_AGENT_NAME: str = (
 18 |     os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "invoice-reconciliation"
 19 | )
 20 | # The name of the collection to use for storing extracted data. This will be qualified by the agent name.
 21 | # When developing locally, this will use the _public collection (shared within the project), otherwise agent
 22 | # data is isolated to each agent
 23 | EXTRACTED_DATA_COLLECTION: str = "invoices"
 24 | 
 25 | # The name of the LlamaCloud index for storing contracts
 26 | CONTRACTS_INDEX_NAME: str = "contracts"
 27 | 
 28 | 
 29 | # Invoice extraction schema - extracted from invoice documents
 30 | class LineItem(BaseModel):
 31 |     description: str | None = Field(
 32 |         default=None, description="Description of the line item"
 33 |     )
 34 |     quantity: float | None = Field(default=None, description="Quantity of the item")
 35 |     unit_price: float | None = Field(
 36 |         default=None, description="Price per unit of the item"
 37 |     )
 38 |     total: float | None = Field(
 39 |         default=None, description="Total price for this line item"
 40 |     )
 41 | 
 42 | 
 43 | class InvoiceExtractionSchema(BaseModel):
 44 |     """Schema for extracting invoice data"""
 45 | 
 46 |     invoice_number: str | None = Field(
 47 |         default=None, description="Invoice number or identifier"
 48 |     )
 49 |     invoice_date: str | None = Field(
 50 |         default=None, description="Date of the invoice (YYYY-MM-DD format if possible)"
 51 |     )
 52 |     vendor_name: str | None = Field(
 53 |         default=None, description="Name of the vendor or supplier"
 54 |     )
 55 |     vendor_address: str | None = Field(
 56 |         default=None, description="Address of the vendor"
 57 |     )
 58 |     purchase_order_number: str | None = Field(
 59 |         default=None, description="Purchase order (PO) number if present"
 60 |     )
 61 |     payment_terms: str | None = Field(
 62 |         default=None,
 63 |         description="Payment terms (e.g., Net 30, Net 60, Due on receipt)",
 64 |     )
 65 |     line_items: list[LineItem] | None = Field(
 66 |         default=None, description="List of line items on the invoice"
 67 |     )
 68 |     subtotal: float | None = Field(
 69 |         default=None, description="Subtotal before tax and other charges"
 70 |     )
 71 |     tax: float | None = Field(default=None, description="Tax amount")
 72 |     total: float | None = Field(
 73 |         default=None, description="Total amount due on the invoice"
 74 |     )
 75 | 
 76 | 
 77 | # For backward compatibility
 78 | ExtractionSchema = InvoiceExtractionSchema
 79 | 
 80 | 
 81 | # Reconciliation schema - extends invoice data with contract matching and discrepancy information
 82 | class Discrepancy(BaseModel):
 83 |     """Represents a single discrepancy between invoice and contract"""
 84 | 
 85 |     field: str = Field(description="Field name where discrepancy was found")
 86 |     invoice_value: str | None = Field(
 87 |         default=None, description="Value from the invoice"
 88 |     )
 89 |     contract_value: str | None = Field(
 90 |         default=None, description="Expected value from the contract"
 91 |     )
 92 |     severity: str | None = Field(
 93 |         default=None,
 94 |         description="Severity of the discrepancy (e.g., 'high', 'medium', 'low')",
 95 |     )
 96 |     note: str | None = Field(
 97 |         default=None, description="Additional notes about the discrepancy"
 98 |     )
 99 | 
100 | 
101 | class InvoiceWithReconciliation(InvoiceExtractionSchema):
102 |     """Invoice data with reconciliation information"""
103 | 
104 |     matched_contract_id: str | None = Field(
105 |         default=None, description="ID of the matched contract file in LlamaCloud"
106 |     )
107 |     matched_contract_name: str | None = Field(
108 |         default=None, description="Name of the matched contract file"
109 |     )
110 |     match_confidence: str | None = Field(
111 |         default=None,
112 |         description="Confidence level of the match (e.g., 'high', 'medium', 'low', 'none')",
113 |     )
114 |     match_rationale: str | None = Field(
115 |         default=None, description="Explanation of why this contract was matched"
116 |     )
117 |     discrepancies: list[Discrepancy] | None = Field(
118 |         default=None,
119 |         description="List of discrepancies found between invoice and contract",
120 |     )
121 | 
122 | 
123 | EXTRACT_CONFIG = ExtractConfig(
124 |     extraction_mode=ExtractMode.PREMIUM,
125 |     system_prompt=None,
126 |     # advanced. Only compatible with Premium mode.
127 |     use_reasoning=False,
128 |     cite_sources=False,
129 |     confidence_scores=True,
130 | )
131 | 


--------------------------------------------------------------------------------
/src/extraction_review/index_contract.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Workflow for indexing contract documents into LlamaCloud Index for retrieval.
  3 | """
  4 | 
  5 | import logging
  6 | import os
  7 | import tempfile
  8 | from pathlib import Path
  9 | from typing import Literal
 10 | 
 11 | import httpx
 12 | from llama_index.core import Document
 13 | from pydantic import BaseModel
 14 | from workflows import Context, Workflow, step
 15 | from workflows.events import Event, StartEvent, StopEvent
 16 | 
 17 | from .clients import get_contracts_index, get_llama_cloud_client
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | 
 21 | 
 22 | class ContractFileEvent(StartEvent):
 23 |     """Event to start contract indexing with a file ID"""
 24 | 
 25 |     file_ids: list[str]
 26 | 
 27 | 
 28 | class DownloadContractEvent(Event):
 29 |     """Event to trigger contract download"""
 30 | 
 31 |     file_id: str
 32 | 
 33 | 
 34 | class ContractDownloadedEvent(Event):
 35 |     """Event indicating contract has been downloaded"""
 36 | 
 37 |     file_id: str
 38 |     file_path: str
 39 |     filename: str
 40 | 
 41 | 
 42 | class ContractIndexedEvent(Event):
 43 |     """Event indicating a single contract has been indexed"""
 44 | 
 45 |     file_id: str
 46 |     filename: str
 47 | 
 48 | 
 49 | class Status(Event):
 50 |     """Event to show toast notifications in the UI"""
 51 | 
 52 |     level: Literal["info", "warning", "error"]
 53 |     message: str
 54 | 
 55 | 
 56 | class ContractIndexState(BaseModel):
 57 |     """State for contract indexing workflow"""
 58 | 
 59 |     total_files: int = 0
 60 |     # Store file info keyed by file_id
 61 |     file_paths: dict[str, str] = {}
 62 |     filenames: dict[str, str] = {}
 63 | 
 64 | 
 65 | class IndexContractWorkflow(Workflow):
 66 |     """
 67 |     Workflow to download and index a contract document into LlamaCloud Index.
 68 |     """
 69 | 
 70 |     @step()
 71 |     async def start_indexing(
 72 |         self, event: ContractFileEvent, ctx: Context[ContractIndexState]
 73 |     ) -> DownloadContractEvent | None:
 74 |         """Initialize the workflow with multiple file IDs and fan out to parallel downloads"""
 75 |         logger.info(f"Starting contract indexing for {len(event.file_ids)} files")
 76 |         async with ctx.store.edit_state() as state:
 77 |             state.total_files = len(event.file_ids)
 78 | 
 79 |         # Fan out: emit one download event per file
 80 |         for file_id in event.file_ids:
 81 |             ctx.send_event(DownloadContractEvent(file_id=file_id))
 82 | 
 83 |         return None
 84 | 
 85 |     @step(num_workers=4)
 86 |     async def download_contract(
 87 |         self, event: DownloadContractEvent, ctx: Context[ContractIndexState]
 88 |     ) -> ContractDownloadedEvent:
 89 |         """Download the contract file from LlamaCloud storage (runs in parallel)"""
 90 |         file_id = event.file_id
 91 | 
 92 |         file_metadata = await get_llama_cloud_client().files.get_file(id=file_id)
 93 |         file_url = await get_llama_cloud_client().files.read_file_content(file_id)
 94 | 
 95 |         temp_dir = tempfile.gettempdir()
 96 |         filename = file_metadata.name
 97 |         file_path = os.path.join(temp_dir, filename)
 98 | 
 99 |         logger.info(f"Downloading contract {filename} from {file_url.url}")
100 |         ctx.write_event_to_stream(
101 |             Status(level="info", message=f"Downloading contract: {filename}")
102 |         )
103 | 
104 |         client = httpx.AsyncClient()
105 |         async with client.stream("GET", file_url.url) as response:
106 |             with open(file_path, "wb") as f:
107 |                 async for chunk in response.aiter_bytes():
108 |                     f.write(chunk)
109 | 
110 |         logger.info(f"Downloaded contract to {file_path}")
111 |         async with ctx.store.edit_state() as state:
112 |             state.file_paths[file_id] = file_path
113 |             state.filenames[file_id] = filename
114 | 
115 |         return ContractDownloadedEvent(
116 |             file_id=file_id, file_path=file_path, filename=filename
117 |         )
118 | 
119 |     @step(num_workers=4)
120 |     async def index_contract(
121 |         self, event: ContractDownloadedEvent, ctx: Context[ContractIndexState]
122 |     ) -> ContractIndexedEvent:
123 |         """Index the contract document into LlamaCloud Index (runs in parallel)"""
124 |         file_id = event.file_id
125 |         file_path = event.file_path
126 |         filename = event.filename
127 | 
128 |         logger.info(f"Indexing contract {filename}")
129 |         ctx.write_event_to_stream(
130 |             Status(level="info", message=f"Indexing contract: {filename}")
131 |         )
132 | 
133 |         # Create a document with metadata
134 |         file_content = Path(file_path).read_text(errors="ignore")
135 |         document = Document(
136 |             text=file_content,
137 |             metadata={
138 |                 "filename": filename,
139 |                 "file_id": file_id,
140 |                 "document_type": "contract",
141 |             },
142 |         )
143 | 
144 |         # Get the contracts index and insert the document
145 |         index = get_contracts_index()
146 |         await index.ainsert(document)
147 | 
148 |         logger.info(f"Successfully indexed contract {filename}")
149 |         ctx.write_event_to_stream(
150 |             Status(
151 |                 level="info",
152 |                 message=f"Successfully indexed contract: {filename}",
153 |             )
154 |         )
155 | 
156 |         return ContractIndexedEvent(file_id=file_id, filename=filename)
157 | 
158 |     @step()
159 |     async def collect_results(
160 |         self, event: ContractIndexedEvent, ctx: Context[ContractIndexState]
161 |     ) -> StopEvent | None:
162 |         """Collect all indexed contracts and return final results (fan-in)"""
163 |         state = await ctx.store.get_state()
164 | 
165 |         # Collect all ContractIndexedEvent events - one for each file
166 |         events = ctx.collect_events(event, [ContractIndexedEvent] * state.total_files)
167 | 
168 |         if events is None:
169 |             # Not all files have been indexed yet
170 |             return None
171 | 
172 |         # All files have been indexed, return aggregated results
173 |         results = [{"file_id": ev.file_id, "filename": ev.filename} for ev in events]
174 | 
175 |         logger.info(f"Successfully indexed all {len(results)} contracts")
176 |         ctx.write_event_to_stream(
177 |             Status(
178 |                 level="info",
179 |                 message=f"Successfully indexed all {len(results)} contracts",
180 |             )
181 |         )
182 | 
183 |         return StopEvent(result={"contracts": results, "total": len(results)})
184 | 
185 | 
186 | workflow = IndexContractWorkflow(timeout=None)
187 | 
188 | if __name__ == "__main__":
189 |     import asyncio
190 |     from dotenv import load_dotenv
191 | 
192 |     load_dotenv()
193 |     logging.basicConfig(level=logging.INFO)
194 | 
195 |     async def main():
196 |         # Example usage - upload a contract and index it
197 |         file = await get_llama_cloud_client().files.upload_file(
198 |             upload_file=Path("sample_contract.pdf").open("rb")
199 |         )
200 |         result = await workflow.run(start_event=ContractFileEvent(file_ids=[file.id]))
201 |         print(f"Indexed contract: {result}")
202 | 
203 |     asyncio.run(main())
204 | 


--------------------------------------------------------------------------------
/ui/src/lib/WorkflowProgress.tsx:
--------------------------------------------------------------------------------
  1 | import {
  2 |   useHandlers,
  3 |   WorkflowEvent,
  4 |   StreamOperation,
  5 |   HandlerState,
  6 | } from "@llamaindex/ui";
  7 | import { useEffect, useRef, useState } from "react";
  8 | import { Loader2 } from "lucide-react";
  9 | import { cn } from "./utils";
 10 | 
 11 | interface StatusMessage {
 12 |   type: "Status";
 13 |   data: {
 14 |     level: "info" | "warning" | "error";
 15 |     message: string;
 16 |   };
 17 | }
 18 | /**
 19 |  * Given a workflow type, keeps track of the number of running handlers and the maximum number of running handlers.
 20 |  * Has hooks to notify when a workflow handler is completed.
 21 |  */
 22 | export const WorkflowProgress = ({
 23 |   workflowName,
 24 |   onWorkflowCompletion,
 25 |   handlers = [],
 26 |   sync = true,
 27 | }: {
 28 |   workflowName: string[];
 29 |   onWorkflowCompletion?: (handlerIds: string[]) => void;
 30 |   handlers?: HandlerState[]; // specific handlers to track, e.g. after triggering a workflow run
 31 |   sync?: boolean; // whether to sync the handlers with the query on mount
 32 | }) => {
 33 |   const handlersService = useHandlers({
 34 |     query: { workflow_name: workflowName, status: ["running"] },
 35 |     sync: sync,
 36 |   });
 37 |   const seenHandlers = useRef<Set<string>>(new Set());
 38 |   useEffect(() => {
 39 |     for (const handler of handlers) {
 40 |       if (!seenHandlers.current.has(handler.handler_id)) {
 41 |         seenHandlers.current.add(handler.handler_id);
 42 |         handlersService.setHandler(handler);
 43 |       }
 44 |     }
 45 |   }, [handlers, handlersService]);
 46 | 
 47 |   const subscribed = useRef<Record<string, StreamOperation<WorkflowEvent>>>({});
 48 | 
 49 |   const [statusMessage, setStatusMessage] = useState<
 50 |     StatusMessage["data"] | undefined
 51 |   >();
 52 |   const [statusVisible, setStatusVisible] = useState(false);
 53 |   const hideTimerRef = useRef<number | undefined>(undefined);
 54 |   const clearTimerRef = useRef<number | undefined>(undefined);
 55 |   const [hasHadRunning, setHasHadRunning] = useState(false);
 56 | 
 57 |   const runningHandlers = Object.values(handlersService.state.handlers).filter(
 58 |     (handler) => handler.status === "running",
 59 |   );
 60 |   const runningHandlersKey = runningHandlers
 61 |     .map((handler) => handler.handler_id)
 62 |     .sort()
 63 |     .join(",");
 64 |   // subscribe to all running handlers and disconnect when they complete
 65 |   useEffect(() => {
 66 |     for (const handler of runningHandlers) {
 67 |       if (!subscribed.current[handler.handler_id]) {
 68 |         handlersService.actions(handler.handler_id).subscribeToEvents({
 69 |           onComplete() {
 70 |             subscribed.current[handler.handler_id]?.disconnect();
 71 |             delete subscribed.current[handler.handler_id];
 72 |           },
 73 |           onData(data) {
 74 |             if (data.type === "Status") {
 75 |               setStatusMessage(data.data as StatusMessage["data"]);
 76 |             }
 77 |           },
 78 |         });
 79 |       }
 80 |     }
 81 |   }, [runningHandlersKey]);
 82 |   const lastHandlers = useRef<string[]>([]);
 83 |   useEffect(() => {
 84 |     const newRunningHandlers = runningHandlers.map(
 85 |       (handler) => handler.handler_id,
 86 |     );
 87 |     const anyRemoved = lastHandlers.current.some(
 88 |       (handler) => !newRunningHandlers.includes(handler),
 89 |     );
 90 |     if (anyRemoved) {
 91 |       onWorkflowCompletion?.(lastHandlers.current);
 92 |     }
 93 |     lastHandlers.current = newRunningHandlers;
 94 |   }, [runningHandlersKey]);
 95 | 
 96 |   // unsubscribe on unmount
 97 |   useEffect(() => {
 98 |     return () => {
 99 |       for (const [key, handler] of Object.entries(subscribed.current)) {
100 |         handler.disconnect();
101 |         delete subscribed.current[key];
102 |       }
103 |       if (hideTimerRef.current !== undefined) {
104 |         clearTimeout(hideTimerRef.current);
105 |         hideTimerRef.current = undefined;
106 |       }
107 |       if (clearTimerRef.current !== undefined) {
108 |         clearTimeout(clearTimerRef.current);
109 |         clearTimerRef.current = undefined;
110 |       }
111 |     };
112 |   }, []);
113 | 
114 |   // Animate in on new messages and auto-hide after 15s
115 |   useEffect(() => {
116 |     if (!statusMessage) {
117 |       return;
118 |     }
119 |     if (hideTimerRef.current !== undefined) {
120 |       clearTimeout(hideTimerRef.current);
121 |       hideTimerRef.current = undefined;
122 |     }
123 |     if (clearTimerRef.current !== undefined) {
124 |       clearTimeout(clearTimerRef.current);
125 |       clearTimerRef.current = undefined;
126 |     }
127 |     setStatusVisible(false);
128 |     requestAnimationFrame(() => {
129 |       setStatusVisible(true);
130 |     });
131 |     hideTimerRef.current = window.setTimeout(() => {
132 |       setStatusVisible(false);
133 |       clearTimerRef.current = window.setTimeout(() => {
134 |         setStatusMessage(undefined);
135 |       }, 300);
136 |     }, 15000);
137 |     // eslint-disable-next-line react-hooks/exhaustive-deps
138 |   }, [statusMessage?.level, statusMessage?.message]);
139 | 
140 |   // Track if we've ever had any running workflows in this session
141 |   useEffect(() => {
142 |     if (runningHandlers.length > 0 && !hasHadRunning) {
143 |       setHasHadRunning(true);
144 |     }
145 |   }, [runningHandlers.length, hasHadRunning]);
146 | 
147 |   if (!runningHandlers.length && !hasHadRunning) {
148 |     return null;
149 |   }
150 |   return (
151 |     <div className="relative w-full rounded-full bg-muted text-muted-foreground border border-border px-4 py-2 flex items-center gap-1 text-xs overflow-hidden shadow-sm">
152 |       <span
153 |         aria-hidden="true"
154 |         className="pointer-events-none absolute inset-0 rounded-full z-0 [box-shadow:inset_0_1px_0_rgba(255,255,255,0.6)] dark:[box-shadow:inset_0_1px_0_rgba(0,0,0,0.35)]"
155 |       />
156 |       <span
157 |         aria-hidden="true"
158 |         className="pointer-events-none absolute inset-0 rounded-full z-0 opacity-60 dark:opacity-25 [background:linear-gradient(to_bottom,rgba(255,255,255,0.55),rgba(255,255,255,0.15))] dark:[background:linear-gradient(to_bottom,rgba(255,255,255,0.08),rgba(255,255,255,0.02))]"
159 |       />
160 |       <div className="relative z-10 flex items-center gap-1">
161 |         {runningHandlers.length > 0 ? (
162 |           <>
163 |             <Loader2
164 |               className="h-3 w-3 animate-spin shrink-0"
165 |               aria-hidden="true"
166 |             />
167 |             <span>
168 |               {runningHandlers.length} running workflow
169 |               {runningHandlers.length === 1 ? "" : "s"}
170 |             </span>
171 |           </>
172 |         ) : (
173 |           <span>all workflows completed</span>
174 |         )}
175 |         {statusMessage && (
176 |           <span
177 |             className={cn(
178 |               "ml-2 transition-all duration-300",
179 |               statusVisible
180 |                 ? "opacity-100 translate-x-0"
181 |                 : "opacity-0 translate-x-2",
182 |               statusMessage.level === "error"
183 |                 ? "text-red-500"
184 |                 : statusMessage.level === "warning"
185 |                   ? "text-yellow-600"
186 |                   : undefined,
187 |             )}
188 |           >
189 |             {statusMessage.message}
190 |           </span>
191 |         )}
192 |       </div>
193 |     </div>
194 |   );
195 | };
196 | 


--------------------------------------------------------------------------------
/ui/src/lib/ContractsDropdown.tsx:
--------------------------------------------------------------------------------
  1 | import { useState, useEffect, useCallback } from "react";
  2 | import {
  3 |   paginatedListPipelineDocumentsApiV1PipelinesPipelineIdDocumentsPaginatedGet,
  4 |   deletePipelineDocumentApiV1PipelinesPipelineIdDocumentsDocumentIdDelete,
  5 |   readFileContentApiV1FilesIdContentGet,
  6 | } from "llama-cloud-services/api";
  7 | import {
  8 |   Button,
  9 |   DropdownMenu,
 10 |   DropdownMenuContent,
 11 |   DropdownMenuTrigger,
 12 |   DropdownMenuSeparator,
 13 |   ScrollArea,
 14 | } from "@llamaindex/ui";
 15 | import type { CloudDocument } from "llama-cloud-services/api";
 16 | import { Trash2, ChevronDown, Loader2 } from "lucide-react";
 17 | import { useMetadata } from "./useMetadata";
 18 | 
 19 | const LIMIT = 20;
 20 | 
 21 | interface UseContractsLoaderResult {
 22 |   contracts: CloudDocument[];
 23 |   total: number | null;
 24 |   loading: boolean;
 25 |   hasMore: boolean;
 26 |   loadMore: () => void;
 27 |   handleScroll: (event: React.UIEvent<HTMLDivElement>) => void;
 28 |   removeContract: (id: string) => void;
 29 | }
 30 | 
 31 | function useContractsLoader(
 32 |   pipelineId: string | undefined,
 33 |   isOpen: boolean,
 34 | ): UseContractsLoaderResult {
 35 |   const [contracts, setContracts] = useState<CloudDocument[]>([]);
 36 |   const [total, setTotal] = useState<number | null>(null);
 37 |   const [offset, setOffset] = useState(0);
 38 |   const [loading, setLoading] = useState(false);
 39 |   const [hasMore, setHasMore] = useState(true);
 40 | 
 41 |   const loadContracts = useCallback(
 42 |     async (reset = false) => {
 43 |       if (!pipelineId || loading) return;
 44 | 
 45 |       setLoading(true);
 46 |       try {
 47 |         const currentOffset = reset ? 0 : offset;
 48 |         const response =
 49 |           await paginatedListPipelineDocumentsApiV1PipelinesPipelineIdDocumentsPaginatedGet(
 50 |             {
 51 |               path: {
 52 |                 pipeline_id: pipelineId,
 53 |               },
 54 |               query: {
 55 |                 offset: currentOffset,
 56 |                 limit: LIMIT,
 57 |               },
 58 |             },
 59 |           );
 60 | 
 61 |         if (response.data) {
 62 |           setTotal(response.data.total_count);
 63 |           setContracts((prev) =>
 64 |             reset
 65 |               ? response.data!.documents
 66 |               : [...prev, ...response.data!.documents],
 67 |           );
 68 |           setOffset(currentOffset + response.data.documents.length);
 69 |           setHasMore(
 70 |             currentOffset + response.data.documents.length <
 71 |               response.data.total_count,
 72 |           );
 73 |         }
 74 |       } catch (error) {
 75 |         console.error("Failed to load contracts:", error);
 76 |       } finally {
 77 |         setLoading(false);
 78 |       }
 79 |     },
 80 |     [pipelineId, offset, loading],
 81 |   );
 82 | 
 83 |   useEffect(() => {
 84 |     if (isOpen && pipelineId && contracts.length === 0) {
 85 |       loadContracts(true);
 86 |     }
 87 |   }, [isOpen, pipelineId]);
 88 | 
 89 |   const handleScroll = useCallback(
 90 |     (event: React.UIEvent<HTMLDivElement>) => {
 91 |       if (!hasMore || loading) return;
 92 | 
 93 |       const target = event.currentTarget;
 94 |       const { scrollTop, scrollHeight, clientHeight } = target;
 95 |       if (scrollTop + clientHeight >= scrollHeight - 50) {
 96 |         loadContracts();
 97 |       }
 98 |     },
 99 |     [hasMore, loading, loadContracts],
100 |   );
101 | 
102 |   const removeContract = useCallback((id: string) => {
103 |     setContracts((prev) => prev.filter((doc) => doc.id !== id));
104 |     setTotal((prev) => (prev !== null ? prev - 1 : null));
105 |   }, []);
106 | 
107 |   return {
108 |     contracts,
109 |     total,
110 |     loading,
111 |     hasMore,
112 |     loadMore: loadContracts,
113 |     handleScroll,
114 |     removeContract,
115 |   };
116 | }
117 | 
118 | interface UseDeleteContractResult {
119 |   deleteConfirmId: string | null;
120 |   deletingId: string | null;
121 |   showDeleteConfirm: (id: string) => void;
122 |   cancelDelete: () => void;
123 |   confirmDelete: (id: string) => Promise<void>;
124 | }
125 | 
126 | function useDeleteContract(
127 |   pipelineId: string | undefined,
128 |   onSuccess?: () => void,
129 | ): UseDeleteContractResult {
130 |   const [deleteConfirmId, setDeleteConfirmId] = useState<string | null>(null);
131 |   const [deletingId, setDeletingId] = useState<string | null>(null);
132 | 
133 |   const confirmDelete = useCallback(
134 |     async (documentId: string) => {
135 |       if (!pipelineId) return;
136 | 
137 |       setDeletingId(documentId);
138 |       try {
139 |         await deletePipelineDocumentApiV1PipelinesPipelineIdDocumentsDocumentIdDelete(
140 |           {
141 |             path: {
142 |               pipeline_id: pipelineId,
143 |               document_id: documentId,
144 |             },
145 |           },
146 |         );
147 | 
148 |         setDeleteConfirmId(null);
149 |         onSuccess?.();
150 |       } catch (error) {
151 |         console.error("Failed to delete contract:", error);
152 |         alert("Failed to delete contract. Please try again.");
153 |       } finally {
154 |         setDeletingId(null);
155 |       }
156 |     },
157 |     [pipelineId, onSuccess],
158 |   );
159 | 
160 |   return {
161 |     deleteConfirmId,
162 |     deletingId,
163 |     showDeleteConfirm: setDeleteConfirmId,
164 |     cancelDelete: () => setDeleteConfirmId(null),
165 |     confirmDelete,
166 |   };
167 | }
168 | 
169 | interface ContractsDropdownProps {
170 |   onDeleteSuccess?: () => void;
171 | }
172 | 
173 | export function ContractsDropdown({ onDeleteSuccess }: ContractsDropdownProps) {
174 |   const { metadata, loading: metadataLoading } = useMetadata();
175 |   const [isOpen, setIsOpen] = useState(false);
176 | 
177 |   const { contracts, total, loading, handleScroll, removeContract } =
178 |     useContractsLoader(metadata?.contracts_pipeline_id, isOpen);
179 | 
180 |   const {
181 |     deleteConfirmId,
182 |     deletingId,
183 |     showDeleteConfirm,
184 |     cancelDelete,
185 |     confirmDelete,
186 |   } = useDeleteContract(metadata?.contracts_pipeline_id, onDeleteSuccess);
187 | 
188 |   const handleDelete = async (documentId: string) => {
189 |     await confirmDelete(documentId);
190 |     removeContract(documentId);
191 |   };
192 | 
193 |   const handleDownload = async (contract: CloudDocument) => {
194 |     const fileId = contract.metadata?.file_id as string;
195 |     if (!fileId) {
196 |       console.error("No file_id found in contract metadata");
197 |       alert("Cannot download: file information not available");
198 |       return;
199 |     }
200 | 
201 |     try {
202 |       const response = await readFileContentApiV1FilesIdContentGet({
203 |         path: { id: fileId },
204 |       });
205 | 
206 |       if (response.data?.url) {
207 |         // Create a temporary link and trigger download
208 |         const link = document.createElement("a");
209 |         link.href = response.data.url;
210 |         link.download = (contract.metadata?.filename as string) || "contract";
211 |         document.body.appendChild(link);
212 |         link.click();
213 |         document.body.removeChild(link);
214 |       }
215 |     } catch (error) {
216 |       console.error("Failed to download contract:", error);
217 |       alert("Failed to download contract. Please try again.");
218 |     }
219 |   };
220 | 
221 |   if (metadataLoading) {
222 |     return null;
223 |   }
224 | 
225 |   return (
226 |     <DropdownMenu open={isOpen} onOpenChange={setIsOpen}>
227 |       <DropdownMenuTrigger asChild>
228 |         <Button
229 |           variant="default"
230 |           size="icon"
231 |           className="cursor-pointer bg-black hover:bg-black/90 text-white"
232 |           aria-label="View contracts"
233 |         >
234 |           <ChevronDown className="h-4 w-4" />
235 |         </Button>
236 |       </DropdownMenuTrigger>
237 | 
238 |       <DropdownMenuContent className="w-96" align="end">
239 |         {total !== null && (
240 |           <>
241 |             <div className="px-2 py-1.5 text-sm font-semibold">
242 |               Total Contracts: {total}
243 |             </div>
244 |             <DropdownMenuSeparator />
245 |           </>
246 |         )}
247 | 
248 |         <ScrollArea className="h-96" onScrollCapture={handleScroll}>
249 |           {contracts.length === 0 && !loading ? (
250 |             <div className="p-4 text-center text-sm text-muted-foreground">
251 |               No contracts found
252 |             </div>
253 |           ) : (
254 |             <div>
255 |               {contracts.map((contract, index) => (
256 |                 <div key={contract.id}>
257 |                   {deleteConfirmId === contract.id ? (
258 |                     <div className="flex flex-col gap-2 px-4 py-3 bg-accent/50">
259 |                       <p className="text-sm font-medium">
260 |                         Delete "
261 |                         {(contract.metadata?.filename as string) || "Untitled"}
262 |                         "?
263 |                       </p>
264 |                       <div className="flex items-center gap-2">
265 |                         <Button
266 |                           onClick={() => handleDelete(contract.id)}
267 |                           disabled={deletingId === contract.id}
268 |                           variant="destructive"
269 |                           size="sm"
270 |                           className="flex-1 cursor-pointer"
271 |                           aria-label={`Confirm delete ${(contract.metadata?.filename as string) || "Untitled"}`}
272 |                         >
273 |                           {deletingId === contract.id
274 |                             ? "Deleting..."
275 |                             : "Confirm"}
276 |                         </Button>
277 |                         <Button
278 |                           onClick={cancelDelete}
279 |                           disabled={deletingId === contract.id}
280 |                           variant="outline"
281 |                           size="sm"
282 |                           className="flex-1 cursor-pointer"
283 |                           aria-label="Cancel delete"
284 |                         >
285 |                           Cancel
286 |                         </Button>
287 |                       </div>
288 |                     </div>
289 |                   ) : (
290 |                     <div className="flex items-center justify-between gap-2 px-2 py-2 hover:bg-accent group">
291 |                       <button
292 |                         className="flex-1 min-w-0 cursor-pointer px-2 py-1 text-left bg-transparent border-none outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 rounded"
293 |                         onClick={() => handleDownload(contract)}
294 |                         aria-label={`Download ${(contract.metadata?.filename as string) || "Untitled"}`}
295 |                         type="button"
296 |                       >
297 |                         <p className="text-sm font-medium truncate group-hover:text-primary">
298 |                           {(contract.metadata?.filename as string) ||
299 |                             "Untitled"}
300 |                         </p>
301 |                       </button>
302 | 
303 |                       <Button
304 |                         variant="ghost"
305 |                         size="icon"
306 |                         className="h-8 w-8 flex-shrink-0 text-muted-foreground hover:text-destructive cursor-pointer"
307 |                         onClick={(e) => {
308 |                           e.stopPropagation();
309 |                           showDeleteConfirm(contract.id);
310 |                         }}
311 |                         aria-label={`Delete ${(contract.metadata?.filename as string) || "Untitled"}`}
312 |                         type="button"
313 |                       >
314 |                         <Trash2 className="h-4 w-4" />
315 |                       </Button>
316 |                     </div>
317 |                   )}
318 |                   {index < contracts.length - 1 && <DropdownMenuSeparator />}
319 |                 </div>
320 |               ))}
321 |             </div>
322 |           )}
323 | 
324 |           {loading && (
325 |             <div className="flex justify-center items-center gap-2 p-4 text-sm text-muted-foreground">
326 |               <Loader2 className="h-4 w-4 animate-spin" />
327 |               <span>Loading...</span>
328 |             </div>
329 |           )}
330 |         </ScrollArea>
331 |       </DropdownMenuContent>
332 |     </DropdownMenu>
333 |   );
334 | }
335 | 


--------------------------------------------------------------------------------
/src/extraction_review/process_file.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import hashlib
  3 | import logging
  4 | import os
  5 | import tempfile
  6 | from pathlib import Path
  7 | from typing import Any, Literal
  8 | 
  9 | import httpx
 10 | from llama_cloud import ExtractRun
 11 | from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
 12 | from llama_cloud_services.extract import SourceText
 13 | from llama_index.core.prompts import PromptTemplate
 14 | from pydantic import BaseModel, Field
 15 | from workflows import Context, Workflow, step
 16 | from workflows.events import Event, StartEvent, StopEvent
 17 | 
 18 | from .clients import (
 19 |     get_contracts_index,
 20 |     get_data_client,
 21 |     get_extract_agent,
 22 |     get_llama_cloud_client,
 23 |     get_llm,
 24 | )
 25 | from .config import Discrepancy, InvoiceExtractionSchema, InvoiceWithReconciliation
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | 
 30 | class FileEvent(StartEvent):
 31 |     file_id: str
 32 | 
 33 | 
 34 | class DownloadFileEvent(Event):
 35 |     pass
 36 | 
 37 | 
 38 | class FileDownloadedEvent(Event):
 39 |     pass
 40 | 
 41 | 
 42 | class Status(Event):
 43 |     level: Literal["info", "warning", "error"]
 44 |     message: str
 45 | 
 46 | 
 47 | class ExtractedEvent(Event):
 48 |     """Event when invoice data is successfully extracted"""
 49 | 
 50 |     invoice_data: InvoiceExtractionSchema
 51 |     field_metadata: dict[str, Any]
 52 | 
 53 | 
 54 | class ExtractedInvalidEvent(Event):
 55 |     """Event when extraction validation fails"""
 56 | 
 57 |     data: ExtractedData[dict[str, Any]]
 58 | 
 59 | 
 60 | class ReconciledEvent(Event):
 61 |     """Event when invoice is reconciled with contracts"""
 62 | 
 63 |     data: ExtractedData[InvoiceWithReconciliation]
 64 | 
 65 | 
 66 | class ExtractionState(BaseModel):
 67 |     file_id: str | None = None
 68 |     file_path: str | None = None
 69 |     filename: str | None = None
 70 | 
 71 | 
 72 | class ProcessFileWorkflow(Workflow):
 73 |     """
 74 |     Given a file path, this workflow will process a single file through the custom extraction logic.
 75 |     """
 76 | 
 77 |     @step()
 78 |     async def run_file(self, event: FileEvent, ctx: Context) -> DownloadFileEvent:
 79 |         logger.info(f"Running file {event.file_id}")
 80 |         async with ctx.store.edit_state() as state:
 81 |             state.file_id = event.file_id
 82 |         return DownloadFileEvent()
 83 | 
 84 |     @step()
 85 |     async def download_file(
 86 |         self, event: DownloadFileEvent, ctx: Context[ExtractionState]
 87 |     ) -> FileDownloadedEvent:
 88 |         """Download the file reference from the cloud storage"""
 89 |         state = await ctx.store.get_state()
 90 |         if state.file_id is None:
 91 |             raise ValueError("File ID is not set")
 92 |         try:
 93 |             file_metadata = await get_llama_cloud_client().files.get_file(
 94 |                 id=state.file_id
 95 |             )
 96 |             file_url = await get_llama_cloud_client().files.read_file_content(
 97 |                 state.file_id
 98 |             )
 99 | 
100 |             temp_dir = tempfile.gettempdir()
101 |             filename = file_metadata.name
102 |             file_path = os.path.join(temp_dir, filename)
103 |             client = httpx.AsyncClient()
104 |             # Report progress to the UI
105 |             logger.info(f"Downloading file {file_url.url} to {file_path}")
106 | 
107 |             async with client.stream("GET", file_url.url) as response:
108 |                 with open(file_path, "wb") as f:
109 |                     async for chunk in response.aiter_bytes():
110 |                         f.write(chunk)
111 |             logger.info(f"Downloaded file {file_url.url} to {file_path}")
112 |             async with ctx.store.edit_state() as state:
113 |                 state.file_path = file_path
114 |                 state.filename = filename
115 |             return FileDownloadedEvent()
116 | 
117 |         except Exception as e:
118 |             logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
119 |             ctx.write_event_to_stream(
120 |                 Status(
121 |                     level="error",
122 |                     message=f"Error downloading file {state.file_id}: {e}",
123 |                 )
124 |             )
125 |             raise e
126 | 
127 |     @step()
128 |     async def process_file(
129 |         self, event: FileDownloadedEvent, ctx: Context[ExtractionState]
130 |     ) -> ExtractedEvent | ExtractedInvalidEvent:
131 |         """Runs the extraction against the file"""
132 |         state = await ctx.store.get_state()
133 |         if state.file_path is None or state.filename is None:
134 |             raise ValueError("File path or filename is not set")
135 |         try:
136 |             agent = get_extract_agent()
137 |             source_text = SourceText(
138 |                 file=state.file_path,
139 |                 filename=state.filename,
140 |             )
141 |             logger.info(f"Extracting data from file {state.filename}")
142 |             ctx.write_event_to_stream(
143 |                 Status(
144 |                     level="info", message=f"Extracting data from file {state.filename}"
145 |                 )
146 |             )
147 |             extracted_result: ExtractRun = await agent.aextract(source_text)
148 | 
149 |             # Validate the extracted data
150 |             if not extracted_result.data:
151 |                 raise ValueError("No data extracted from invoice")
152 | 
153 |             invoice_data = InvoiceExtractionSchema.model_validate(extracted_result.data)
154 |             logger.info(f"Extracted invoice data: {invoice_data}")
155 |             # Extract only the field_metadata we need, not the entire ExtractRun object
156 |             field_metadata = extracted_result.extraction_metadata.get(
157 |                 "field_metadata", {}
158 |             )
159 |             return ExtractedEvent(
160 |                 invoice_data=invoice_data, field_metadata=field_metadata
161 |             )
162 |         except InvalidExtractionData as e:
163 |             logger.error(f"Error validating extracted data: {e}", exc_info=True)
164 |             return ExtractedInvalidEvent(data=e.invalid_item)
165 |         except Exception as e:
166 |             logger.error(
167 |                 f"Error extracting data from file {state.filename}: {e}",
168 |                 exc_info=True,
169 |             )
170 |             ctx.write_event_to_stream(
171 |                 Status(
172 |                     level="error",
173 |                     message=f"Error extracting data from file {state.filename}: {e}",
174 |                 )
175 |             )
176 |             raise e
177 | 
178 |     @step()
179 |     async def reconcile_with_contract(
180 |         self, event: ExtractedEvent, ctx: Context[ExtractionState]
181 |     ) -> ReconciledEvent:
182 |         """Reconcile the invoice with matching contracts using retrieval and LLM"""
183 |         state = await ctx.store.get_state()
184 |         invoice_data = event.invoice_data
185 | 
186 |         logger.info("Reconciling invoice with contracts")
187 |         ctx.write_event_to_stream(
188 |             Status(level="info", message="Matching invoice with contracts...")
189 |         )
190 | 
191 |         try:
192 |             # Build a query from invoice data for contract retrieval
193 |             query_parts = []
194 |             if invoice_data.vendor_name:
195 |                 query_parts.append(f"vendor: {invoice_data.vendor_name}")
196 |             if invoice_data.purchase_order_number:
197 |                 query_parts.append(f"PO: {invoice_data.purchase_order_number}")
198 |             if invoice_data.invoice_number:
199 |                 query_parts.append(f"invoice: {invoice_data.invoice_number}")
200 | 
201 |             query = " ".join(query_parts) if query_parts else "contract agreement"
202 | 
203 |             # Retrieve relevant contracts
204 |             index = get_contracts_index()
205 |             retriever = index.as_retriever(similarity_top_k=3)
206 |             retrieved_nodes = await retriever.aretrieve(query)
207 | 
208 |             if not retrieved_nodes:
209 |                 logger.info("No contracts found in index")
210 |                 # No contracts available - create reconciliation data with no match
211 |                 reconciled_data = InvoiceWithReconciliation(
212 |                     **invoice_data.model_dump(),
213 |                     match_confidence="none",
214 |                     match_rationale="No contracts found in the system",
215 |                     discrepancies=[],
216 |                 )
217 |             else:
218 |                 # Use LLM to match and reconcile
219 |                 reconciled_data = await self._match_and_reconcile(
220 |                     invoice_data, retrieved_nodes
221 |                 )
222 | 
223 |             # Create ExtractedData with reconciliation information
224 |             file_content = Path(state.file_path).read_bytes()
225 |             file_hash = hashlib.sha256(file_content).hexdigest()
226 | 
227 |             # Get field metadata from extraction event
228 |             field_metadata = event.field_metadata
229 | 
230 |             extracted_data = ExtractedData.create(
231 |                 data=reconciled_data,
232 |                 file_id=state.file_id,
233 |                 file_name=state.filename,
234 |                 file_hash=file_hash,
235 |                 field_metadata=field_metadata,
236 |             )
237 | 
238 |             logger.info(f"Reconciliation complete: {reconciled_data.match_confidence}")
239 |             return ReconciledEvent(data=extracted_data)
240 | 
241 |         except Exception as e:
242 |             logger.error(f"Error during reconciliation: {e}", exc_info=True)
243 |             # If reconciliation fails, still create data without reconciliation
244 |             reconciled_data = InvoiceWithReconciliation(
245 |                 **invoice_data.model_dump(),
246 |                 match_confidence="error",
247 |                 match_rationale=f"Error during reconciliation: {str(e)}",
248 |                 discrepancies=[],
249 |             )
250 | 
251 |             file_content = Path(state.file_path).read_bytes()
252 |             file_hash = hashlib.sha256(file_content).hexdigest()
253 |             field_metadata = event.field_metadata
254 | 
255 |             extracted_data = ExtractedData.create(
256 |                 data=reconciled_data,
257 |                 file_id=state.file_id,
258 |                 file_name=state.filename,
259 |                 file_hash=file_hash,
260 |                 field_metadata=field_metadata,
261 |             )
262 | 
263 |             return ReconciledEvent(data=extracted_data)
264 | 
265 |     async def _match_and_reconcile(
266 |         self, invoice_data: InvoiceExtractionSchema, retrieved_nodes: list
267 |     ) -> InvoiceWithReconciliation:
268 |         """Use LLM to match invoice with contract and identify discrepancies"""
269 | 
270 |         # Define structured output schema for LLM
271 |         class ContractMatchResult(BaseModel):
272 |             """Result of matching invoice to contract"""
273 | 
274 |             is_match: bool = Field(
275 |                 description="Whether a plausible contract match was found"
276 |             )
277 |             matched_contract_index: int | None = Field(
278 |                 default=None,
279 |                 description="Index (0-based) of the matched contract in the provided list, or None if no match",
280 |             )
281 |             match_confidence: str = Field(
282 |                 description="Confidence level: 'high', 'medium', 'low', or 'none'"
283 |             )
284 |             match_rationale: str = Field(
285 |                 description="Explanation of why this contract was or was not matched"
286 |             )
287 |             contract_payment_terms: str | None = Field(
288 |                 default=None, description="Payment terms found in the matched contract"
289 |             )
290 |             discrepancies: list[Discrepancy] = Field(
291 |                 default_factory=list,
292 |                 description="List of discrepancies found between invoice and contract",
293 |             )
294 | 
295 |         # Prepare contract context
296 |         contracts_text = "\n\n".join(
297 |             [
298 |                 f"Contract {i} (File: {node.metadata.get('filename', 'Unknown')}):\n{node.text[:1000]}"
299 |                 for i, node in enumerate(retrieved_nodes)
300 |             ]
301 |         )
302 | 
303 |         # Create prompt for matching
304 |         prompt_template = PromptTemplate(
305 |             """You are analyzing an invoice to match it with the correct contract and identify any discrepancies.
306 | 
307 | Invoice Details:
308 | - Vendor: {vendor_name}
309 | - Invoice Number: {invoice_number}
310 | - Invoice Date: {invoice_date}
311 | - PO Number: {po_number}
312 | - Payment Terms: {payment_terms}
313 | - Total: {total}
314 | 
315 | Retrieved Contracts:
316 | {contracts_text}
317 | 
318 | Task:
319 | 1. Determine if any of the retrieved contracts plausibly matches this invoice based on:
320 |    - Vendor name matching or similarity
321 |    - PO number or invoice number references
322 |    - Date ranges or validity periods
323 |    - Any other relevant identifiers
324 | 
325 | 2. If a match is found, identify discrepancies between invoice and contract, focusing on:
326 |    - Payment terms differences (CRITICAL)
327 |    - Total amount mismatches if contract specifies amounts
328 |    - Vendor name discrepancies
329 |    - Any other obvious conflicts
330 | 
331 | 3. Assess match confidence:
332 |    - 'high': Clear match with strong vendor/PO/identifier alignment
333 |    - 'medium': Probable match with some uncertainty
334 |    - 'low': Weak match, possibly relevant but uncertain
335 |    - 'none': No plausible match found
336 | 
337 | Provide your analysis in the specified format."""
338 |         )
339 | 
340 |         # Use LLM with structured prediction
341 |         llm = get_llm()
342 |         result = await llm.astructured_predict(
343 |             ContractMatchResult,
344 |             prompt_template,
345 |             **{
346 |                 "vendor_name": invoice_data.vendor_name or "N/A",
347 |                 "invoice_number": invoice_data.invoice_number or "N/A",
348 |                 "invoice_date": invoice_data.invoice_date or "N/A",
349 |                 "po_number": invoice_data.purchase_order_number or "N/A",
350 |                 "payment_terms": invoice_data.payment_terms or "N/A",
351 |                 "total": invoice_data.total or "N/A",
352 |                 "contracts_text": contracts_text,
353 |             },
354 |         )
355 | 
356 |         # Build reconciled invoice data
357 |         matched_contract_id = None
358 |         matched_contract_name = None
359 | 
360 |         if result.is_match and result.matched_contract_index is not None:
361 |             matched_node = retrieved_nodes[result.matched_contract_index]
362 |             matched_contract_id = matched_node.metadata.get("file_id")
363 |             matched_contract_name = matched_node.metadata.get("filename")
364 | 
365 |         return InvoiceWithReconciliation(
366 |             **invoice_data.model_dump(),
367 |             matched_contract_id=matched_contract_id,
368 |             matched_contract_name=matched_contract_name,
369 |             match_confidence=result.match_confidence,
370 |             match_rationale=result.match_rationale,
371 |             discrepancies=result.discrepancies,
372 |         )
373 | 
374 |     @step()
375 |     async def record_extracted_data(
376 |         self, event: ReconciledEvent | ExtractedInvalidEvent, ctx: Context
377 |     ) -> StopEvent:
378 |         """Records the extracted data to the agent data API"""
379 |         try:
380 |             logger.info(f"Recorded extracted data for file {event.data.file_name}")
381 |             ctx.write_event_to_stream(
382 |                 Status(
383 |                     level="info",
384 |                     message=f"Recorded extracted data for file {event.data.file_name}",
385 |                 )
386 |             )
387 |             # remove past data when reprocessing the same file
388 |             if event.data.file_hash:
389 |                 await get_data_client().delete(
390 |                     filter={
391 |                         "file_hash": {
392 |                             "eq": event.data.file_hash,
393 |                         },
394 |                     },
395 |                 )
396 |                 logger.info(
397 |                     f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
398 |                 )
399 |             # finally, save the new data
400 |             item_id = await get_data_client().create_item(event.data)
401 |             return StopEvent(
402 |                 result=item_id.id,
403 |             )
404 |         except Exception as e:
405 |             logger.error(
406 |                 f"Error recording extracted data for file {event.data.file_name}: {e}",
407 |                 exc_info=True,
408 |             )
409 |             ctx.write_event_to_stream(
410 |                 Status(
411 |                     level="error",
412 |                     message=f"Error recording extracted data for file {event.data.file_name}: {e}",
413 |                 )
414 |             )
415 |             raise e
416 | 
417 | 
418 | workflow = ProcessFileWorkflow(timeout=None)
419 | 
420 | if __name__ == "__main__":
421 |     from dotenv import load_dotenv
422 | 
423 |     load_dotenv()
424 |     logging.basicConfig(level=logging.INFO)
425 | 
426 |     async def main():
427 |         file = await get_llama_cloud_client().files.upload_file(
428 |             upload_file=Path("test.pdf").open("rb")
429 |         )
430 |         await workflow.run(start_event=FileEvent(file_id=file.id))
431 | 
432 |     asyncio.run(main())
433 | 


--------------------------------------------------------------------------------