├── src └── extraction_review │ ├── __init__.py │ ├── metadata_workflow.py │ ├── schema.py │ ├── clients.py │ ├── config.py │ ├── index_contract.py │ └── process_file.py ├── tests └── test_placeholder.py ├── .gitignore ├── .env.template ├── ui ├── postcss.config.mjs ├── src │ ├── lib │ │ ├── config.ts │ │ ├── utils.ts │ │ ├── export.ts │ │ ├── ToolbarContext.tsx │ │ ├── useMetadata.ts │ │ ├── client.ts │ │ ├── MetadataProvider.tsx │ │ ├── WorkflowProgress.tsx │ │ └── ContractsDropdown.tsx │ ├── main.tsx │ ├── pages │ │ ├── HomePage.module.css │ │ ├── HomePage.tsx │ │ └── ItemPage.tsx │ ├── vite-env.d.ts │ ├── App.tsx │ └── index.css ├── index.html ├── README.md ├── components.json ├── .gitignore ├── tsconfig.json ├── vite.config.ts └── package.json ├── {{ _copier_conf.answers_file }}.jinja ├── .copier-answers.yml ├── pyproject.toml ├── task.md └── README.md /src/extraction_review/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_placeholder.py: -------------------------------------------------------------------------------- 1 | def test_placeholder(): 2 | pass 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __pycache__ 3 | workflows.db 4 | 5 | .venv 6 | package-lock.json 7 | node_modules 8 | -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | # copy to .env and place any needed secrets here. 
LLAMA_CLOUD_API_KEY will be automatically set 2 | # OPENAI_API_KEY=sk-xxx 3 | -------------------------------------------------------------------------------- /ui/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | const config = { 2 | plugins: { 3 | "@tailwindcss/postcss": {}, 4 | }, 5 | }; 6 | 7 | export default config; -------------------------------------------------------------------------------- /{{ _copier_conf.answers_file }}.jinja: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier; NEVER EDIT MANUALLY 2 | {{ _copier_answers|to_nice_yaml -}} -------------------------------------------------------------------------------- /ui/src/lib/config.ts: -------------------------------------------------------------------------------- 1 | export const APP_TITLE = "Extraction Review"; 2 | export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME; 3 | -------------------------------------------------------------------------------- /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier; NEVER EDIT MANUALLY 2 | _commit: v0.3.6 3 | _src_path: https://github.com/run-llama/template-workflow-data-extraction 4 | -------------------------------------------------------------------------------- /ui/src/lib/utils.ts: -------------------------------------------------------------------------------- 1 | import { clsx, type ClassValue } from "clsx"; 2 | import { twMerge } from "tailwind-merge"; 3 | 4 | export function cn(...inputs: ClassValue[]) { 5 | return twMerge(clsx(inputs)); 6 | } 7 | -------------------------------------------------------------------------------- /ui/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Review 7 | 8 | 9 |
10 | 11 | 12 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | # Data Extraction UI 2 | 3 | This is a simple Vite + React template that builds on the @llamaindex/ui component library 4 | for displaying tables of extracted data. 5 | 6 | Ideally run this with `llamactl` in the parent directory (see [README.md](../README.md)). 7 | You can also run it standalone with `npm run dev`, but workflow integrations will not work. -------------------------------------------------------------------------------- /ui/src/main.tsx: -------------------------------------------------------------------------------- 1 | import { StrictMode } from "react"; 2 | import { createRoot } from "react-dom/client"; 3 | import { HashRouter } from "react-router-dom"; 4 | import App from "./App"; 5 | import "@llamaindex/ui/styles.css"; 6 | import "./index.css"; 7 | 8 | createRoot(document.getElementById("root")!).render( 9 | 10 | 11 | 12 | 13 | , 14 | ); 15 | -------------------------------------------------------------------------------- /ui/src/pages/HomePage.module.css: -------------------------------------------------------------------------------- 1 | .main { 2 | padding: 1rem; 3 | } 4 | 5 | .grid { 6 | display: flex; 7 | flex-direction: row; 8 | gap: 1rem; 9 | margin-bottom: 1rem; 10 | & > * { 11 | flex: 1; 12 | } 13 | } 14 | 15 | .commandBar { 16 | display: flex; 17 | justify-content: flex-end; 18 | margin-bottom: 1rem; 19 | gap: 1rem; 20 | align-items: center; 21 | } 22 | 23 | .progressBar { 24 | margin-bottom: 1rem; 25 | } 26 | -------------------------------------------------------------------------------- /ui/src/vite-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | 3 | interface ImportMetaEnv { 4 | readonly VITE_LLAMA_CLOUD_API_KEY?: string; 5 | readonly VITE_LLAMA_CLOUD_BASE_URL?: 
string; 6 | 7 | // injected from llama_deploy 8 | readonly VITE_LLAMA_DEPLOY_BASE_PATH: string; 9 | readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string; 10 | readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string; 11 | } 12 | 13 | interface ImportMeta { 14 | readonly env: ImportMetaEnv; 15 | } 16 | -------------------------------------------------------------------------------- /ui/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "new-york", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "", 8 | "css": "src/index.css", 9 | "baseColor": "zinc", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | }, 20 | "iconLibrary": "lucide" 21 | } -------------------------------------------------------------------------------- /ui/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.* 7 | .yarn/* 8 | !.yarn/patches 9 | !.yarn/plugins 10 | !.yarn/releases 11 | !.yarn/versions 12 | 13 | # testing 14 | /coverage 15 | 16 | # next.js 17 | /.next/ 18 | /out/ 19 | /dist/ 20 | 21 | # production 22 | /build 23 | 24 | # misc 25 | .DS_Store 26 | *.pem 27 | 28 | # debug 29 | npm-debug.log* 30 | yarn-debug.log* 31 | yarn-error.log* 32 | .pnpm-debug.log* 33 | 34 | # env files (can opt-in for committing if needed) 35 | .env* 36 | 37 | # vercel 38 | .vercel 39 | 40 | # typescript 41 | *.tsbuildinfo 42 | next-env.d.ts 43 | 44 | -------------------------------------------------------------------------------- /ui/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "useDefineForClassFields": true, 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "noEmit": true, 15 | "jsx": "react-jsx", 16 | 17 | /* Linting */ 18 | "strict": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "noFallthroughCasesInSwitch": true, 22 | 23 | /* Path mapping */ 24 | "baseUrl": ".", 25 | "paths": { 26 | "@/*": ["./src/*"] 27 | } 28 | }, 29 | "include": ["src", "vite.config.ts", "src/vite-env.d.ts"], 30 | "exclude": ["node_modules"] 31 | } 32 | -------------------------------------------------------------------------------- /ui/src/lib/export.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | ExtractedData, 3 | TypedAgentData, 4 | } from "llama-cloud-services/beta/agent"; 5 | 6 | /** 7 | * Downloads data as a JSON file 8 | */ 9 | export function downloadJSON( 10 | data: T, 11 | filename: string = "extraction-results.json", 12 | ) { 13 | 
const jsonString = JSON.stringify(data, null, 2); 14 | const blob = new Blob([jsonString], { type: "application/json" }); 15 | const url = URL.createObjectURL(blob); 16 | 17 | const link = document.createElement("a"); 18 | link.href = url; 19 | link.download = filename; 20 | document.body.appendChild(link); 21 | link.click(); 22 | 23 | // Cleanup 24 | document.body.removeChild(link); 25 | URL.revokeObjectURL(url); 26 | } 27 | 28 | /** 29 | * Downloads extracted data item as JSON 30 | */ 31 | export function downloadExtractedDataItem( 32 | item: TypedAgentData>, 33 | ) { 34 | const fileName = item.data.file_name || "item"; 35 | const timestamp = item.createdAt.toISOString().split("T")[0]; 36 | const filename = `${fileName}-${timestamp}.json`; 37 | 38 | downloadJSON(item, filename); 39 | } 40 | -------------------------------------------------------------------------------- /src/extraction_review/metadata_workflow.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from workflows import Workflow, step 4 | from workflows.events import StartEvent, StopEvent 5 | 6 | from extraction_review.schema import get_extraction_schema_json 7 | 8 | from .clients import get_contracts_index 9 | from .config import EXTRACTED_DATA_COLLECTION 10 | 11 | 12 | class MetadataResponse(StopEvent): 13 | json_schema: dict[str, Any] 14 | extracted_data_collection: str 15 | contracts_pipeline_id: str 16 | 17 | 18 | class MetadataWorkflow(Workflow): 19 | """ 20 | Simple single step workflow to expose configuration to the UI, such as the JSON schema and collection name. 
21 | """ 22 | 23 | @step 24 | async def get_metadata(self, _: StartEvent) -> MetadataResponse: 25 | json_schema = await get_extraction_schema_json() 26 | contracts_index = get_contracts_index() 27 | return MetadataResponse( 28 | json_schema=json_schema, 29 | extracted_data_collection=EXTRACTED_DATA_COLLECTION, 30 | contracts_pipeline_id=contracts_index.id, 31 | ) 32 | 33 | 34 | workflow = MetadataWorkflow(timeout=None) 35 | -------------------------------------------------------------------------------- /src/extraction_review/schema.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Type 2 | 3 | import jsonref 4 | from pydantic import BaseModel, Field, create_model 5 | 6 | from extraction_review.config import InvoiceWithReconciliation 7 | 8 | 9 | async def get_extraction_schema_json() -> dict[str, Any]: 10 | json_schema = InvoiceWithReconciliation.model_json_schema() 11 | json_schema = jsonref.replace_refs(json_schema, proxies=False) 12 | return json_schema 13 | 14 | 15 | def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]: 16 | """ 17 | Converts a JSON schema back to a Pydantic model. 18 | """ 19 | typemap = { 20 | "string": str, 21 | "integer": int, 22 | "number": float, 23 | "boolean": bool, 24 | "array": list, 25 | "object": dict, 26 | } 27 | fields = {} 28 | for prop, meta in schema.get("properties", {}).items(): 29 | py_type = typemap.get(meta.get("type"), Any) 30 | default = ... 
if prop in schema.get("required", []) else None 31 | fields[prop] = (py_type, Field(default, description=meta.get("description"))) 32 | return create_model(schema.get("title", "DynamicModel"), **fields) 33 | -------------------------------------------------------------------------------- /ui/src/lib/ToolbarContext.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { APP_TITLE } from "./config"; 3 | 4 | export interface BreadcrumbItem { 5 | label: string; 6 | href?: string; 7 | isCurrentPage?: boolean; 8 | } 9 | 10 | export const ToolbarCtx = React.createContext<{ 11 | buttons: React.ReactNode[]; 12 | setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void; 13 | breadcrumbs: BreadcrumbItem[]; 14 | setBreadcrumbs: (items: BreadcrumbItem[]) => void; 15 | }>({ 16 | buttons: [], 17 | setButtons: () => {}, 18 | breadcrumbs: [], 19 | setBreadcrumbs: () => {}, 20 | }); 21 | 22 | export const ToolbarProvider = ({ 23 | children, 24 | }: { 25 | children: React.ReactNode; 26 | }) => { 27 | const [buttons, setButtons] = React.useState([]); 28 | const [breadcrumbs, setBreadcrumbs] = React.useState([ 29 | { label: APP_TITLE, href: "/" }, 30 | ]); 31 | 32 | return ( 33 | 36 | {children} 37 | 38 | ); 39 | }; 40 | 41 | export const useToolbar = () => React.useContext(ToolbarCtx); 42 | -------------------------------------------------------------------------------- /ui/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vite"; 2 | import react from "@vitejs/plugin-react"; 3 | import path from "path"; 4 | 5 | // https://vitejs.dev/config/ 6 | export default defineConfig(({}) => { 7 | const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME; 8 | const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH; 9 | const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID; 10 | const port = process.env.PORT ? 
Number(process.env.PORT) : 3000; 11 | const baseUrl = process.env.LLAMA_CLOUD_BASE_URL; 12 | return { 13 | plugins: [react()], 14 | resolve: { 15 | alias: { 16 | "@": path.resolve(__dirname, "./src"), 17 | }, 18 | }, 19 | server: { 20 | port: port, 21 | host: true, 22 | }, 23 | build: { 24 | outDir: "dist", 25 | sourcemap: true, 26 | }, 27 | base: basePath, 28 | define: { 29 | // Primary define uses NAME 30 | "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME": JSON.stringify( 31 | deploymentName 32 | ), 33 | "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH": JSON.stringify(basePath), 34 | ...(projectId && { 35 | "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID": 36 | JSON.stringify(projectId), 37 | }), 38 | ...(baseUrl && { 39 | "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl), 40 | }), 41 | }, 42 | }; 43 | }); 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "extraction-review" 3 | version = "0.1.0" 4 | description = "Extracts data" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "llama-cloud-services>=0.6.69", 9 | "llama-index-workflows>=2.2.0,<3.0.0", 10 | "python-dotenv>=1.1.0", 11 | "jsonref>=1.1.0", 12 | "click>=8.2.1,<8.3.0", 13 | "httpx>=0.28.1", 14 | "llama-index-core>=0.14.0", 15 | "llama-index-llms-openai>=0.3.0", 16 | ] 17 | 18 | [dependency-groups] 19 | dev = [ 20 | "ruff>=0.11.10", 21 | "typescript>=0.0.12", 22 | "ty>=0.0.1a16", 23 | "pytest>=8.4.1", 24 | "hatch>=1.14.1", 25 | "llamactl>=0.3.0", 26 | "pytest-asyncio>=1.3.0", 27 | ] 28 | 29 | [build-system] 30 | requires = ["hatchling"] 31 | build-backend = "hatchling.build" 32 | 33 | [tool.hatch.envs.default.scripts] 34 | "format" = "ruff format ." 35 | "format-check" = "ruff format --check ." 36 | "lint" = "ruff check --fix ." 
37 | "lint-check" = ["ruff check ."] 38 | typecheck = "ty check src" 39 | test = "pytest" 40 | "all-check" = ["format-check", "lint-check", "test"] 41 | "all-fix" = ["format", "lint", "test"] 42 | 43 | [tool.llamadeploy] 44 | env_files = [".env"] 45 | llama_cloud = true 46 | required_env_vars = ["OPENAI_API_KEY"] 47 | 48 | [tool.llamadeploy.workflows] 49 | process-file = "extraction_review.process_file:workflow" 50 | metadata = "extraction_review.metadata_workflow:workflow" 51 | index-contract = "extraction_review.index_contract:workflow" 52 | 53 | [tool.llamadeploy.ui] 54 | directory = "ui" 55 | 56 | -------------------------------------------------------------------------------- /ui/src/lib/useMetadata.ts: -------------------------------------------------------------------------------- 1 | import { useWorkflow } from "@llamaindex/ui"; 2 | import { useEffect, useRef, useState } from "react"; 3 | 4 | export interface Metadata { 5 | json_schema: any; 6 | extracted_data_collection: string; 7 | contracts_pipeline_id: string; 8 | } 9 | 10 | export interface UseMetadataResult { 11 | metadata: Metadata; 12 | loading: boolean; 13 | error: string | undefined; 14 | } 15 | 16 | export function useMetadata() { 17 | const wf = useWorkflow("metadata"); 18 | const [error, setError] = useState(undefined); 19 | const [loading, setLoading] = useState(true); 20 | const [metadata, setMetadata] = useState(undefined); 21 | const strictModeWorkaround = useRef(false); 22 | useEffect(() => { 23 | if (strictModeWorkaround.current) { 24 | return; 25 | } 26 | strictModeWorkaround.current = true; 27 | setLoading(true); 28 | wf.runToCompletion({}) 29 | .then((handler) => { 30 | if (handler.status === "completed") { 31 | const result = handler.result?.data as unknown as Metadata; 32 | setMetadata(result); 33 | } else { 34 | setError( 35 | handler.error || `Unexpected workflow status: ${handler.status}`, 36 | ); 37 | } 38 | }) 39 | .catch((error) => { 40 | setError(error instanceof Error ? 
error.message : String(error)); 41 | }) 42 | .finally(() => { 43 | setLoading(false); 44 | }); 45 | }, []); 46 | 47 | return { metadata, loading, error }; 48 | } 49 | -------------------------------------------------------------------------------- /ui/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extraction-review-ui", 3 | "version": "0.1.0", 4 | "private": true, 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc && vite build", 9 | "preview": "vite preview", 10 | "lint": "tsc --noEmit", 11 | "format": "prettier --write src", 12 | "format-check": "prettier --check src", 13 | "all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build", 14 | "all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build" 15 | }, 16 | "dependencies": { 17 | "@babel/runtime": "^7.27.6", 18 | "@lezer/highlight": "^1.2.1", 19 | "@llamaindex/ui": "^3.2.0", 20 | "@llamaindex/workflows-client": "^1.7.0", 21 | "@radix-ui/themes": "^3.2.1", 22 | "class-variance-authority": "^0.7.1", 23 | "clsx": "^2.1.1", 24 | "llama-cloud-services": "^0.3.4", 25 | "lucide-react": "^0.514.0", 26 | "react": "^19.0.0", 27 | "react-dom": "^19.0.0", 28 | "react-router-dom": "^6.30.0", 29 | "sonner": "^2.0.5", 30 | "tw-animate-css": "^1.3.5" 31 | }, 32 | "devDependencies": { 33 | "@tailwindcss/postcss": "^4.1.10", 34 | "@types/node": "^20", 35 | "@types/react": "^19", 36 | "@types/react-dom": "^19", 37 | "@vitejs/plugin-react": "^4.3.4", 38 | "postcss": "^8.5.5", 39 | "prettier": "^3.6.2", 40 | "tailwind-merge": "^3.3.1", 41 | "tailwindcss": "^4.1.8", 42 | "typescript": "^5", 43 | "vite": "^6.0.5" 44 | }, 45 | "packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912" 46 | } 47 | -------------------------------------------------------------------------------- /ui/src/lib/client.ts: 
-------------------------------------------------------------------------------- 1 | import { ExtractedData } from "llama-cloud-services/beta/agent"; 2 | import { 3 | ApiClients, 4 | createWorkflowsClient, 5 | createWorkflowsConfig, 6 | createCloudAgentClient, 7 | cloudApiClient, 8 | } from "@llamaindex/ui"; 9 | import { AGENT_NAME } from "./config"; 10 | import type { Metadata } from "./useMetadata"; 11 | 12 | const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY; 13 | const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL; 14 | const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID; 15 | 16 | // Configure the platform client 17 | cloudApiClient.setConfig({ 18 | ...(apiBaseUrl && { baseUrl: apiBaseUrl }), 19 | headers: { 20 | // optionally use a backend API token scoped to a project. For local development, 21 | ...(platformToken && { authorization: `Bearer ${platformToken}` }), 22 | // This header is required for requests to correctly scope to the agent's project 23 | // when authenticating with a user cookie 24 | ...(projectId && { "Project-Id": projectId }), 25 | }, 26 | }); 27 | 28 | export function createBaseWorkflowClient(): ReturnType< 29 | typeof createWorkflowsClient 30 | > { 31 | return createWorkflowsClient( 32 | createWorkflowsConfig({ 33 | baseUrl: `/deployments/${AGENT_NAME}/`, 34 | }), 35 | ); 36 | } 37 | 38 | export function createClients(metadata: Metadata): ApiClients { 39 | const workflowsClient = createBaseWorkflowClient(); 40 | const agentClient = createCloudAgentClient>({ 41 | client: cloudApiClient, 42 | windowUrl: typeof window !== "undefined" ? 
window.location.href : undefined, 43 | collection: metadata.extracted_data_collection, 44 | }); 45 | 46 | return { 47 | workflowsClient, 48 | cloudApiClient, 49 | agentDataClient: agentClient, 50 | } as ApiClients; 51 | } 52 | -------------------------------------------------------------------------------- /task.md: -------------------------------------------------------------------------------- 1 | We are building an invoice extraction and reconciliation workflow app. 2 | 3 | Invoices are parsed into structured data, then compared against indexed contracts to reconcile the invoice with its matching contract. Update the invoice record with contract-derived information and any discrepancies. 4 | 5 | Using the UI, the user should be able to: 6 | - add and index new contracts 7 | - add and reconcile new invoices 8 | 9 | This should be based off of the base extraction review template, which has 2 pages, one that displays a table of all extracted items (one row per invoice), and one for the item details (the extracted data for one invoice, e.g. total and line items). The items and details view should show the invoices. 10 | 11 | Contracts can remain largely invisible in the UI for now, but there should be a minimal way to add them. These should be placed into a LlamaCloud index (which parses PDFs to plain text for retrieval). 12 | 13 | The stored schema should extend the extracted invoice schema with reconciliation fields, such as links to the matched contract, a match confidence/score, and a structured list of discrepancies. 14 | 15 | Matching should retrieve candidate contracts and then use an LLM, with context for both the candidate contracts and the invoice data, to make the final selection and provide rationale. When no contract matches, record that outcome clearly. 
16 | 17 | When matching and reconciling, consider: 18 | - Whether there is any plausible matching contract versus only irrelevant results (e.g., vendor name, contract dates/ranges, contract or PO numbers). 19 | - Whether payment terms match (at minimum). 20 | - Optionally, check other obvious alignments if cheaply available (e.g., totals, vendor identifiers). 21 | 22 | Represent reconciliation results in the details view with a clear, structured list of discrepancies (e.g., field, invoice_value, contract_value, optional note/severity). 23 | 24 | The vast majority of this change should be kept in the Python codebase. Some minor changes may be needed in the UI; however, do not do anything complex there, just a button or small widget. 25 | -------------------------------------------------------------------------------- /ui/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { Routes, Route } from "react-router-dom"; 3 | import { Theme } from "@radix-ui/themes"; 4 | import { 5 | Breadcrumb, 6 | BreadcrumbItem, 7 | BreadcrumbList, 8 | BreadcrumbSeparator, 9 | } from "@llamaindex/ui"; 10 | import { Link } from "react-router-dom"; 11 | import { Toaster } from "sonner"; 12 | import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext"; 13 | import { MetadataProvider } from "@/lib/MetadataProvider"; 14 | 15 | // Import pages 16 | import HomePage from "./pages/HomePage"; 17 | import ItemPage from "./pages/ItemPage"; 18 | 19 | export default function App() { 20 | return ( 21 | 22 | 23 | 24 |
25 | 26 |
27 | 28 | } /> 29 | } /> 30 | 31 |
32 |
33 | 34 |
35 |
36 |
37 | ); 38 | } 39 | 40 | const Toolbar = () => { 41 | const { buttons, breadcrumbs } = useToolbar(); 42 | 43 | return ( 44 |
45 | 46 | 47 | {breadcrumbs.map((item, index) => ( 48 | 49 | {index > 0 && } 50 | 51 | {item.href && !item.isCurrentPage ? ( 52 | 53 | {item.label} 54 | 55 | ) : ( 56 | 59 | {item.label} 60 | 61 | )} 62 | 63 | 64 | ))} 65 | 66 | 67 | {buttons} 68 |
69 | ); 70 | }; 71 | -------------------------------------------------------------------------------- /ui/src/lib/MetadataProvider.tsx: -------------------------------------------------------------------------------- 1 | import { createContext, useContext, ReactNode, useMemo } from "react"; 2 | import { ApiProvider, ApiClients } from "@llamaindex/ui"; 3 | import { useMetadata, Metadata } from "./useMetadata"; 4 | import { createBaseWorkflowClient, createClients } from "./client"; 5 | import { Clock, XCircle } from "lucide-react"; 6 | 7 | interface MetadataContextValue { 8 | metadata: Metadata; 9 | clients: ApiClients; 10 | } 11 | 12 | const MetadataContext = createContext(null); 13 | 14 | export function MetadataProvider({ children }: { children: ReactNode }) { 15 | const baseClients: ApiClients = useMemo(() => { 16 | return { 17 | workflowsClient: createBaseWorkflowClient(), 18 | } as ApiClients; 19 | }, []); 20 | return ( 21 | 22 | {children} 23 | 24 | ); 25 | } 26 | 27 | function InnerMetadataProvider({ children }: { children: ReactNode }) { 28 | const { metadata, loading, error } = useMetadata(); 29 | const clients = useMemo( 30 | () => (metadata ? createClients(metadata) : undefined), 31 | [metadata], 32 | ); 33 | 34 | if (loading) { 35 | return ( 36 |
37 |
38 | 39 |
Loading configuration...
40 |
41 |
42 | ); 43 | } 44 | 45 | if (error || !metadata || !clients) { 46 | return ( 47 |
48 |
49 | 50 |
51 | Error loading configuration: {error || "Unknown error"} 52 |
53 |
54 |
55 | ); 56 | } 57 | 58 | return ( 59 | 60 | {children} 61 | 62 | ); 63 | } 64 | 65 | export function useMetadataContext() { 66 | const context = useContext(MetadataContext); 67 | if (!context) { 68 | throw new Error("useMetadataContext must be used within MetadataProvider"); 69 | } 70 | return context; 71 | } 72 | -------------------------------------------------------------------------------- /ui/src/pages/HomePage.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | ItemCount, 3 | WorkflowTrigger, 4 | ExtractedDataItemGrid, 5 | HandlerState, 6 | } from "@llamaindex/ui"; 7 | import type { TypedAgentData } from "llama-cloud-services/beta/agent"; 8 | import styles from "./HomePage.module.css"; 9 | import { useNavigate } from "react-router-dom"; 10 | import { useState } from "react"; 11 | import { WorkflowProgress } from "@/lib/WorkflowProgress"; 12 | import { ContractsDropdown } from "@/lib/ContractsDropdown"; 13 | 14 | export default function HomePage() { 15 | return ; 16 | } 17 | 18 | function TaskList() { 19 | const navigate = useNavigate(); 20 | const goToItem = (item: TypedAgentData) => { 21 | navigate(`/item/${item.id}`); 22 | }; 23 | const [reloadSignal, setReloadSignal] = useState(0); 24 | const [handlers, setHandlers] = useState([]); 25 | 26 | return ( 27 |
28 |
29 |
30 | 31 | 38 | 45 |
46 |
47 | { 51 | setReloadSignal(reloadSignal + 1); 52 | }} 53 | /> 54 | { 57 | return { 58 | file_id: files[0].fileId, 59 | }; 60 | }} 61 | title="Upload Invoice" 62 | onSuccess={(handler) => { 63 | setHandlers([...handlers, handler]); 64 | }} 65 | /> 66 |
67 | { 70 | return { 71 | file_ids: files.map((file) => file.fileId), 72 | }; 73 | }} 74 | title="Upload Contract" 75 | multiple 76 | onSuccess={(handler) => { 77 | setHandlers([...handlers, handler]); 78 | }} 79 | /> 80 | 81 |
82 |
83 | 84 | 95 |
96 |
97 | ); 98 | } 99 | -------------------------------------------------------------------------------- /src/extraction_review/clients.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | import os 4 | 5 | import httpx 6 | from llama_cloud.client import AsyncLlamaCloud 7 | from llama_cloud.core.api_error import ApiError 8 | from llama_cloud_services import ExtractionAgent, LlamaExtract, LlamaCloudIndex 9 | from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData 10 | from llama_index.llms.openai import OpenAI 11 | 12 | from extraction_review.config import ( 13 | CONTRACTS_INDEX_NAME, 14 | EXTRACTED_DATA_COLLECTION, 15 | EXTRACT_CONFIG, 16 | EXTRACTION_AGENT_NAME, 17 | InvoiceExtractionSchema, 18 | InvoiceWithReconciliation, 19 | ) 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | # deployed agents may infer their name from the deployment name 24 | # Note: Make sure that an agent deployment with this name actually exists 25 | # otherwise calls to get or set data will fail. 
You may need to adjust the `or ` 26 | # name for development 27 | agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") 28 | # required for all llama cloud calls 29 | api_key = os.getenv("LLAMA_CLOUD_API_KEY") 30 | # get this in case running against a different environment than production 31 | base_url = os.getenv("LLAMA_CLOUD_BASE_URL") 32 | project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID") 33 | 34 | 35 | @functools.lru_cache(maxsize=None) 36 | def get_extract_agent() -> ExtractionAgent: 37 | extract_api = LlamaExtract( 38 | api_key=api_key, base_url=base_url, project_id=project_id 39 | ) 40 | 41 | try: 42 | existing = extract_api.get_agent(EXTRACTION_AGENT_NAME) 43 | existing.data_schema = InvoiceExtractionSchema 44 | existing.config = EXTRACT_CONFIG 45 | return existing 46 | except ApiError as e: 47 | if e.status_code == 404: 48 | return extract_api.create_agent( 49 | name=EXTRACTION_AGENT_NAME, 50 | data_schema=InvoiceExtractionSchema, 51 | config=EXTRACT_CONFIG, 52 | ) 53 | else: 54 | raise 55 | 56 | 57 | @functools.lru_cache(maxsize=None) 58 | def get_data_client() -> AsyncAgentDataClient[ExtractedData[InvoiceWithReconciliation]]: 59 | return AsyncAgentDataClient( 60 | deployment_name=agent_name, 61 | collection=EXTRACTED_DATA_COLLECTION, 62 | type=ExtractedData[InvoiceWithReconciliation], 63 | client=get_llama_cloud_client(), 64 | ) 65 | 66 | 67 | @functools.lru_cache(maxsize=None) 68 | def get_llama_cloud_client(): 69 | return AsyncLlamaCloud( 70 | base_url=base_url, 71 | token=api_key, 72 | httpx_client=httpx.AsyncClient( 73 | timeout=60, headers={"Project-Id": project_id} if project_id else None 74 | ), 75 | ) 76 | 77 | 78 | @functools.lru_cache(maxsize=None) 79 | def get_contracts_index() -> LlamaCloudIndex: 80 | """Get or create the contracts index for storing and retrieving contract documents""" 81 | return LlamaCloudIndex.create_index( 82 | name=CONTRACTS_INDEX_NAME, 83 | project_id=project_id, 84 | api_key=api_key, 85 | base_url=base_url, 86 | ) 87 
| 88 | 89 | @functools.lru_cache(maxsize=None) 90 | def get_llm() -> OpenAI: 91 | """Get OpenAI LLM for structured predictions""" 92 | return OpenAI(model="gpt-5-mini", temperature=0) 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Invoice Extraction and Contract Reconciliation 2 | 3 | This template provides a LlamaAgents application for extracting structured data from invoices 4 | and reconciling it against contract documents using LlamaExtract, LlamaCloud Index, and Agent Data. 5 | It helps finance and operations teams validate that incoming invoices comply with agreed contract terms 6 | by automatically detecting mismatches in payment terms, totals, and other key fields. 7 | 8 | # Running the application 9 | 10 | This is a starter for LlamaAgents. See the 11 | [LlamaAgents (llamactl) getting started guide](https://developers.llamaindex.ai/python/llamaagents/llamactl/getting-started/) 12 | for context on local development and deployment. 13 | 14 | To run the application locally, clone this repo, install [`uv`](https://docs.astral.sh/uv/) and run `uvx llamactl serve`. 15 | 16 | This application can also be deployed directly to [LlamaCloud](https://cloud.llamaindex.ai) via the UI, 17 | or with `llamactl deployment create`. 18 | 19 | ## Features 20 | 21 | - **Invoice data extraction**: Uses a Pydantic `InvoiceExtractionSchema` to extract key invoice fields 22 | (vendor, dates, PO number, line items, subtotals, tax, totals, and more) via a LlamaExtract agent. 23 | - **Contract indexing and retrieval**: Includes an `index-contract` workflow that downloads contract files 24 | from LlamaCloud and indexes them into a dedicated `contracts` LlamaCloud Index for retrieval. 
25 | - **Automated reconciliation**: Matches invoices to the most relevant contracts using retrieval plus an LLM, 26 | then produces an `InvoiceWithReconciliation` record with match confidence, rationale, and structured discrepancies. 27 | - **Agent Data storage**: Stores reconciled invoice records in LlamaCloud Agent Data, deduplicated by file hash, 28 | so that re-processing the same file replaces prior results instead of duplicating them. 29 | - **UI integration**: A web UI lets you upload invoices and contracts, monitor workflow progress, 30 | and review or edit extracted and reconciled data. 31 | 32 | ## Example Documents 33 | 34 | You can find sample invoice and contract PDF files to test the application with 35 | [here](https://github.com/run-llama/llama-datasets/tree/main/llama_agents/invoice-contracts). 36 | 37 | ## Configuration 38 | 39 | All main configuration is in `src/extraction_review/config.py`. 40 | 41 | ## How It Works 42 | 43 | The application uses a multi-step workflow powered by LlamaIndex: 44 | 45 | 1. **File Upload**: Users upload invoice or contract documents through the UI, which are stored in LlamaCloud. 46 | 2. **Index Contracts**: Contract files are processed by the `index-contract` workflow and indexed into 47 | the `contracts` LlamaCloud Index. 48 | 3. **Download Invoice**: The `process-file` workflow downloads the selected invoice file from LlamaCloud storage. 49 | 4. **Extraction**: A LlamaExtract agent runs against the invoice using `InvoiceExtractionSchema`, returning 50 | structured invoice data plus field-level metadata. 51 | 5. **Contract Retrieval**: The workflow queries the contracts index with a query built from invoice fields 52 | (vendor, PO number, invoice number, etc.) and retrieves the most relevant contracts. 53 | 6. 
**Reconciliation**: An LLM compares the invoice to the retrieved contracts, selects the best match, 54 | and produces an `InvoiceWithReconciliation` object with match confidence, rationale, and a discrepancy list. 55 | 7. **Storage**: The reconciled invoice data is wrapped in an `ExtractedData` record (including file hash) 56 | and stored in Agent Data, replacing any previous records for the same file hash. 57 | 8. **Review**: The UI displays the stored data for review, editing, and export. 58 | 59 | ### Workflows 60 | 61 | The application includes three main workflows: 62 | 63 | - **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing invoices 64 | end-to-end (download → extract → reconcile → store). 65 | - **`index-contract`** (`src/extraction_review/index_contract.py`): Workflow for downloading and indexing 66 | contract documents into a LlamaCloud Index for later retrieval during reconciliation. 67 | - **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to the UI, 68 | returning the JSON Schema for `InvoiceWithReconciliation` and the Agent Data collection name. 69 | 70 | ## Linting and type checking 71 | 72 | The Python and JavaScript packages contain helpful scripts to lint, format, and type-check the code.
73 | 74 | To check and fix python code: 75 | 76 | ```bash 77 | uv run hatch run lint 78 | uv run hatch run typecheck 79 | uv run hatch run test 80 | # run all at once 81 | uv run hatch run all-fix 82 | ``` 83 | 84 | To check and fix javascript code, within the `ui` directory: 85 | 86 | ```bash 87 | pnpm run lint 88 | pnpm run typecheck 89 | pnpm run test 90 | # run all at once 91 | pnpm run all-fix 92 | ``` 93 | -------------------------------------------------------------------------------- /ui/src/index.css: -------------------------------------------------------------------------------- 1 | @import "tailwindcss"; 2 | @import "tw-animate-css"; 3 | 4 | @custom-variant dark (&:is(.dark *)); 5 | 6 | @theme inline { 7 | --radius-sm: calc(var(--radius) - 4px); 8 | --radius-md: calc(var(--radius) - 2px); 9 | --radius-lg: var(--radius); 10 | --radius-xl: calc(var(--radius) + 4px); 11 | --color-background: var(--background); 12 | --color-foreground: var(--foreground); 13 | --color-card: var(--card); 14 | --color-card-foreground: var(--card-foreground); 15 | --color-popover: var(--popover); 16 | --color-popover-foreground: var(--popover-foreground); 17 | --color-primary: var(--primary); 18 | --color-primary-foreground: var(--primary-foreground); 19 | --color-secondary: var(--secondary); 20 | --color-secondary-foreground: var(--secondary-foreground); 21 | --color-muted: var(--muted); 22 | --color-muted-foreground: var(--muted-foreground); 23 | --color-accent: var(--accent); 24 | --color-accent-foreground: var(--accent-foreground); 25 | --color-destructive: var(--destructive); 26 | --color-border: var(--border); 27 | --color-input: var(--input); 28 | --color-ring: var(--ring); 29 | --color-chart-1: var(--chart-1); 30 | --color-chart-2: var(--chart-2); 31 | --color-chart-3: var(--chart-3); 32 | --color-chart-4: var(--chart-4); 33 | --color-chart-5: var(--chart-5); 34 | --color-sidebar: var(--sidebar); 35 | --color-sidebar-foreground: var(--sidebar-foreground); 36 | 
--color-sidebar-primary: var(--sidebar-primary); 37 | --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); 38 | --color-sidebar-accent: var(--sidebar-accent); 39 | --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); 40 | --color-sidebar-border: var(--sidebar-border); 41 | --color-sidebar-ring: var(--sidebar-ring); 42 | } 43 | 44 | :root { 45 | --radius: 0.625rem; 46 | --card: oklch(1 0 0); 47 | --card-foreground: oklch(0.141 0.005 285.823); 48 | --popover: oklch(1 0 0); 49 | --popover-foreground: oklch(0.141 0.005 285.823); 50 | --primary: oklch(0.21 0.006 285.885); 51 | --primary-foreground: oklch(0.985 0 0); 52 | --secondary: oklch(0.967 0.001 286.375); 53 | --secondary-foreground: oklch(0.21 0.006 285.885); 54 | --muted: oklch(0.967 0.001 286.375); 55 | --muted-foreground: oklch(0.552 0.016 285.938); 56 | --accent: oklch(0.967 0.001 286.375); 57 | --accent-foreground: oklch(0.21 0.006 285.885); 58 | --destructive: oklch(0.577 0.245 27.325); 59 | --border: oklch(0.92 0.004 286.32); 60 | --input: oklch(0.92 0.004 286.32); 61 | --ring: oklch(0.705 0.015 286.067); 62 | --chart-1: oklch(0.646 0.222 41.116); 63 | --chart-2: oklch(0.6 0.118 184.704); 64 | --chart-3: oklch(0.398 0.07 227.392); 65 | --chart-4: oklch(0.828 0.189 84.429); 66 | --chart-5: oklch(0.769 0.188 70.08); 67 | --sidebar: oklch(0.985 0 0); 68 | --sidebar-foreground: oklch(0.141 0.005 285.823); 69 | --sidebar-primary: oklch(0.21 0.006 285.885); 70 | --sidebar-primary-foreground: oklch(0.985 0 0); 71 | --sidebar-accent: oklch(0.967 0.001 286.375); 72 | --sidebar-accent-foreground: oklch(0.21 0.006 285.885); 73 | --sidebar-border: oklch(0.92 0.004 286.32); 74 | --sidebar-ring: oklch(0.705 0.015 286.067); 75 | --background: oklch(1 0 0); 76 | --foreground: oklch(0.141 0.005 285.823); 77 | } 78 | 79 | .dark { 80 | --background: oklch(0.141 0.005 285.823); 81 | --foreground: oklch(0.985 0 0); 82 | --card: oklch(0.21 0.006 285.885); 83 | --card-foreground: oklch(0.985 0 
0); 84 | --popover: oklch(0.21 0.006 285.885); 85 | --popover-foreground: oklch(0.985 0 0); 86 | --primary: oklch(0.92 0.004 286.32); 87 | --primary-foreground: oklch(0.21 0.006 285.885); 88 | --secondary: oklch(0.274 0.006 286.033); 89 | --secondary-foreground: oklch(0.985 0 0); 90 | --muted: oklch(0.274 0.006 286.033); 91 | --muted-foreground: oklch(0.705 0.015 286.067); 92 | --accent: oklch(0.274 0.006 286.033); 93 | --accent-foreground: oklch(0.985 0 0); 94 | --destructive: oklch(0.704 0.191 22.216); 95 | --border: oklch(1 0 0 / 10%); 96 | --input: oklch(1 0 0 / 15%); 97 | --ring: oklch(0.552 0.016 285.938); 98 | --chart-1: oklch(0.488 0.243 264.376); 99 | --chart-2: oklch(0.696 0.17 162.48); 100 | --chart-3: oklch(0.769 0.188 70.08); 101 | --chart-4: oklch(0.627 0.265 303.9); 102 | --chart-5: oklch(0.645 0.246 16.439); 103 | --sidebar: oklch(0.21 0.006 285.885); 104 | --sidebar-foreground: oklch(0.985 0 0); 105 | --sidebar-primary: oklch(0.488 0.243 264.376); 106 | --sidebar-primary-foreground: oklch(0.985 0 0); 107 | --sidebar-accent: oklch(0.274 0.006 286.033); 108 | --sidebar-accent-foreground: oklch(0.985 0 0); 109 | --sidebar-border: oklch(1 0 0 / 10%); 110 | --sidebar-ring: oklch(0.552 0.016 285.938); 111 | } 112 | 113 | @layer base { 114 | * { 115 | @apply border-border outline-ring/50; 116 | } 117 | body { 118 | @apply bg-background text-foreground; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /ui/src/pages/ItemPage.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useState } from "react"; 2 | import { 3 | AcceptReject, 4 | ExtractedDataDisplay, 5 | FilePreview, 6 | useItemData, 7 | type Highlight, 8 | Button, 9 | } from "@llamaindex/ui"; 10 | import { Clock, XCircle, Download } from "lucide-react"; 11 | import { useParams } from "react-router-dom"; 12 | import { useToolbar } from "@/lib/ToolbarContext"; 13 | import { useNavigate } from 
"react-router-dom"; 14 | import { modifyJsonSchema } from "@llamaindex/ui/lib"; 15 | import { APP_TITLE } from "@/lib/config"; 16 | import { downloadExtractedDataItem } from "@/lib/export"; 17 | import { useMetadataContext } from "@/lib/MetadataProvider"; 18 | 19 | export default function ItemPage() { 20 | const { itemId } = useParams<{ itemId: string }>(); 21 | const { setButtons, setBreadcrumbs } = useToolbar(); 22 | const [highlight, setHighlight] = useState(undefined); 23 | const { metadata } = useMetadataContext(); 24 | // Use the hook to fetch item data 25 | const itemHookData = useItemData({ 26 | // order/remove fields as needed here 27 | jsonSchema: modifyJsonSchema(metadata.json_schema, {}), 28 | itemId: itemId as string, 29 | isMock: false, 30 | }); 31 | 32 | const navigate = useNavigate(); 33 | 34 | // Update breadcrumb when item data loads 35 | useEffect(() => { 36 | const fileName = itemHookData.item?.data?.file_name; 37 | if (fileName) { 38 | setBreadcrumbs([ 39 | { label: APP_TITLE, href: "/" }, 40 | { 41 | label: fileName, 42 | isCurrentPage: true, 43 | }, 44 | ]); 45 | } 46 | 47 | return () => { 48 | // Reset to default breadcrumb when leaving the page 49 | setBreadcrumbs([{ label: APP_TITLE, href: "/" }]); 50 | }; 51 | }, [itemHookData.item?.data?.file_name, setBreadcrumbs]); 52 | 53 | useEffect(() => { 54 | setButtons(() => [ 55 |
56 | 69 | 70 | itemData={itemHookData} 71 | onComplete={() => navigate("/")} 72 | /> 73 |
, 74 | ]); 75 | return () => { 76 | setButtons(() => []); 77 | }; 78 | }, [itemHookData.data, setButtons]); 79 | 80 | const { 81 | item: itemData, 82 | updateData, 83 | loading: isLoading, 84 | error, 85 | } = itemHookData; 86 | 87 | if (isLoading) { 88 | return ( 89 |
90 |
91 | 92 |
Loading item...
93 |
94 |
95 | ); 96 | } 97 | 98 | if (error || !itemData) { 99 | return ( 100 |
101 |
102 | 103 |
104 | Error loading item: {error || "Item not found"} 105 |
106 |
107 |
108 | ); 109 | } 110 | 111 | return ( 112 |
113 | {/* Left Side - File Preview */} 114 |
115 | {itemData.data.file_id && ( 116 | { 119 | console.log("Bounding box clicked:", box, "on page:", pageNumber); 120 | }} 121 | highlight={highlight} 122 | /> 123 | )} 124 |
125 | 126 | {/* Right Side - Review Panel */} 127 |
128 |
129 | {/* Extracted Data */} 130 | 131 | extractedData={itemData.data} 132 | title="Extracted Data" 133 | onChange={(updatedData) => { 134 | updateData(updatedData); 135 | }} 136 | onClickField={(args) => { 137 | // TODO: set multiple highlights 138 | setHighlight({ 139 | page: args.metadata?.citation?.[0]?.page ?? 1, 140 | x: 100, 141 | y: 100, 142 | width: 0, 143 | height: 0, 144 | }); 145 | }} 146 | jsonSchema={itemHookData.jsonSchema} 147 | /> 148 |
149 |
150 |
151 | ); 152 | } 153 | -------------------------------------------------------------------------------- /src/extraction_review/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | For simple configuration of the extraction review application, just customize this file. 3 | 4 | If you need more control, feel free to edit the rest of the application 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import os 10 | 11 | from llama_cloud import ExtractConfig 12 | from llama_cloud_services.extract import ExtractMode 13 | from pydantic import BaseModel, Field 14 | 15 | # The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments. 16 | # Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist. 17 | EXTRACTION_AGENT_NAME: str = ( 18 | os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "invoice-reconciliation" 19 | ) 20 | # The name of the collection to use for storing extracted data. This will be qualified by the agent name. 
21 | # When developing locally, this will use the _public collection (shared within the project), otherwise agent 22 | # data is isolated to each agent 23 | EXTRACTED_DATA_COLLECTION: str = "invoices" 24 | 25 | # The name of the LlamaCloud index for storing contracts 26 | CONTRACTS_INDEX_NAME: str = "contracts" 27 | 28 | 29 | # Invoice extraction schema - extracted from invoice documents 30 | class LineItem(BaseModel): 31 | description: str | None = Field( 32 | default=None, description="Description of the line item" 33 | ) 34 | quantity: float | None = Field(default=None, description="Quantity of the item") 35 | unit_price: float | None = Field( 36 | default=None, description="Price per unit of the item" 37 | ) 38 | total: float | None = Field( 39 | default=None, description="Total price for this line item" 40 | ) 41 | 42 | 43 | class InvoiceExtractionSchema(BaseModel): 44 | """Schema for extracting invoice data""" 45 | 46 | invoice_number: str | None = Field( 47 | default=None, description="Invoice number or identifier" 48 | ) 49 | invoice_date: str | None = Field( 50 | default=None, description="Date of the invoice (YYYY-MM-DD format if possible)" 51 | ) 52 | vendor_name: str | None = Field( 53 | default=None, description="Name of the vendor or supplier" 54 | ) 55 | vendor_address: str | None = Field( 56 | default=None, description="Address of the vendor" 57 | ) 58 | purchase_order_number: str | None = Field( 59 | default=None, description="Purchase order (PO) number if present" 60 | ) 61 | payment_terms: str | None = Field( 62 | default=None, 63 | description="Payment terms (e.g., Net 30, Net 60, Due on receipt)", 64 | ) 65 | line_items: list[LineItem] | None = Field( 66 | default=None, description="List of line items on the invoice" 67 | ) 68 | subtotal: float | None = Field( 69 | default=None, description="Subtotal before tax and other charges" 70 | ) 71 | tax: float | None = Field(default=None, description="Tax amount") 72 | total: float | None = Field( 73 | 
default=None, description="Total amount due on the invoice" 74 | ) 75 | 76 | 77 | # For backward compatibility 78 | ExtractionSchema = InvoiceExtractionSchema 79 | 80 | 81 | # Reconciliation schema - extends invoice data with contract matching and discrepancy information 82 | class Discrepancy(BaseModel): 83 | """Represents a single discrepancy between invoice and contract""" 84 | 85 | field: str = Field(description="Field name where discrepancy was found") 86 | invoice_value: str | None = Field( 87 | default=None, description="Value from the invoice" 88 | ) 89 | contract_value: str | None = Field( 90 | default=None, description="Expected value from the contract" 91 | ) 92 | severity: str | None = Field( 93 | default=None, 94 | description="Severity of the discrepancy (e.g., 'high', 'medium', 'low')", 95 | ) 96 | note: str | None = Field( 97 | default=None, description="Additional notes about the discrepancy" 98 | ) 99 | 100 | 101 | class InvoiceWithReconciliation(InvoiceExtractionSchema): 102 | """Invoice data with reconciliation information""" 103 | 104 | matched_contract_id: str | None = Field( 105 | default=None, description="ID of the matched contract file in LlamaCloud" 106 | ) 107 | matched_contract_name: str | None = Field( 108 | default=None, description="Name of the matched contract file" 109 | ) 110 | match_confidence: str | None = Field( 111 | default=None, 112 | description="Confidence level of the match (e.g., 'high', 'medium', 'low', 'none')", 113 | ) 114 | match_rationale: str | None = Field( 115 | default=None, description="Explanation of why this contract was matched" 116 | ) 117 | discrepancies: list[Discrepancy] | None = Field( 118 | default=None, 119 | description="List of discrepancies found between invoice and contract", 120 | ) 121 | 122 | 123 | EXTRACT_CONFIG = ExtractConfig( 124 | extraction_mode=ExtractMode.PREMIUM, 125 | system_prompt=None, 126 | # advanced. Only compatible with Premium mode. 
127 | use_reasoning=False, 128 | cite_sources=False, 129 | confidence_scores=True, 130 | ) 131 | -------------------------------------------------------------------------------- /src/extraction_review/index_contract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Workflow for indexing contract documents into LlamaCloud Index for retrieval. 3 | """ 4 | 5 | import logging 6 | import os 7 | import tempfile 8 | from pathlib import Path 9 | from typing import Literal 10 | 11 | import httpx 12 | from llama_index.core import Document 13 | from pydantic import BaseModel 14 | from workflows import Context, Workflow, step 15 | from workflows.events import Event, StartEvent, StopEvent 16 | 17 | from .clients import get_contracts_index, get_llama_cloud_client 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class ContractFileEvent(StartEvent): 23 | """Event to start contract indexing with a file ID""" 24 | 25 | file_ids: list[str] 26 | 27 | 28 | class DownloadContractEvent(Event): 29 | """Event to trigger contract download""" 30 | 31 | file_id: str 32 | 33 | 34 | class ContractDownloadedEvent(Event): 35 | """Event indicating contract has been downloaded""" 36 | 37 | file_id: str 38 | file_path: str 39 | filename: str 40 | 41 | 42 | class ContractIndexedEvent(Event): 43 | """Event indicating a single contract has been indexed""" 44 | 45 | file_id: str 46 | filename: str 47 | 48 | 49 | class Status(Event): 50 | """Event to show toast notifications in the UI""" 51 | 52 | level: Literal["info", "warning", "error"] 53 | message: str 54 | 55 | 56 | class ContractIndexState(BaseModel): 57 | """State for contract indexing workflow""" 58 | 59 | total_files: int = 0 60 | # Store file info keyed by file_id 61 | file_paths: dict[str, str] = {} 62 | filenames: dict[str, str] = {} 63 | 64 | 65 | class IndexContractWorkflow(Workflow): 66 | """ 67 | Workflow to download and index a contract document into LlamaCloud Index. 
68 | """ 69 | 70 | @step() 71 | async def start_indexing( 72 | self, event: ContractFileEvent, ctx: Context[ContractIndexState] 73 | ) -> DownloadContractEvent | None: 74 | """Initialize the workflow with multiple file IDs and fan out to parallel downloads""" 75 | logger.info(f"Starting contract indexing for {len(event.file_ids)} files") 76 | async with ctx.store.edit_state() as state: 77 | state.total_files = len(event.file_ids) 78 | 79 | # Fan out: emit one download event per file 80 | for file_id in event.file_ids: 81 | ctx.send_event(DownloadContractEvent(file_id=file_id)) 82 | 83 | return None 84 | 85 | @step(num_workers=4) 86 | async def download_contract( 87 | self, event: DownloadContractEvent, ctx: Context[ContractIndexState] 88 | ) -> ContractDownloadedEvent: 89 | """Download the contract file from LlamaCloud storage (runs in parallel)""" 90 | file_id = event.file_id 91 | 92 | file_metadata = await get_llama_cloud_client().files.get_file(id=file_id) 93 | file_url = await get_llama_cloud_client().files.read_file_content(file_id) 94 | 95 | temp_dir = tempfile.gettempdir() 96 | filename = file_metadata.name 97 | file_path = os.path.join(temp_dir, filename) 98 | 99 | logger.info(f"Downloading contract {filename} from {file_url.url}") 100 | ctx.write_event_to_stream( 101 | Status(level="info", message=f"Downloading contract: {filename}") 102 | ) 103 | 104 | client = httpx.AsyncClient() 105 | async with client.stream("GET", file_url.url) as response: 106 | with open(file_path, "wb") as f: 107 | async for chunk in response.aiter_bytes(): 108 | f.write(chunk) 109 | 110 | logger.info(f"Downloaded contract to {file_path}") 111 | async with ctx.store.edit_state() as state: 112 | state.file_paths[file_id] = file_path 113 | state.filenames[file_id] = filename 114 | 115 | return ContractDownloadedEvent( 116 | file_id=file_id, file_path=file_path, filename=filename 117 | ) 118 | 119 | @step(num_workers=4) 120 | async def index_contract( 121 | self, event: 
ContractDownloadedEvent, ctx: Context[ContractIndexState] 122 | ) -> ContractIndexedEvent: 123 | """Index the contract document into LlamaCloud Index (runs in parallel)""" 124 | file_id = event.file_id 125 | file_path = event.file_path 126 | filename = event.filename 127 | 128 | logger.info(f"Indexing contract {filename}") 129 | ctx.write_event_to_stream( 130 | Status(level="info", message=f"Indexing contract: {filename}") 131 | ) 132 | 133 | # Create a document with metadata 134 | file_content = Path(file_path).read_text(errors="ignore") 135 | document = Document( 136 | text=file_content, 137 | metadata={ 138 | "filename": filename, 139 | "file_id": file_id, 140 | "document_type": "contract", 141 | }, 142 | ) 143 | 144 | # Get the contracts index and insert the document 145 | index = get_contracts_index() 146 | await index.ainsert(document) 147 | 148 | logger.info(f"Successfully indexed contract {filename}") 149 | ctx.write_event_to_stream( 150 | Status( 151 | level="info", 152 | message=f"Successfully indexed contract: {filename}", 153 | ) 154 | ) 155 | 156 | return ContractIndexedEvent(file_id=file_id, filename=filename) 157 | 158 | @step() 159 | async def collect_results( 160 | self, event: ContractIndexedEvent, ctx: Context[ContractIndexState] 161 | ) -> StopEvent | None: 162 | """Collect all indexed contracts and return final results (fan-in)""" 163 | state = await ctx.store.get_state() 164 | 165 | # Collect all ContractIndexedEvent events - one for each file 166 | events = ctx.collect_events(event, [ContractIndexedEvent] * state.total_files) 167 | 168 | if events is None: 169 | # Not all files have been indexed yet 170 | return None 171 | 172 | # All files have been indexed, return aggregated results 173 | results = [{"file_id": ev.file_id, "filename": ev.filename} for ev in events] 174 | 175 | logger.info(f"Successfully indexed all {len(results)} contracts") 176 | ctx.write_event_to_stream( 177 | Status( 178 | level="info", 179 | message=f"Successfully 
indexed all {len(results)} contracts", 180 | ) 181 | ) 182 | 183 | return StopEvent(result={"contracts": results, "total": len(results)}) 184 | 185 | 186 | workflow = IndexContractWorkflow(timeout=None) 187 | 188 | if __name__ == "__main__": 189 | import asyncio 190 | from dotenv import load_dotenv 191 | 192 | load_dotenv() 193 | logging.basicConfig(level=logging.INFO) 194 | 195 | async def main(): 196 | # Example usage - upload a contract and index it 197 | file = await get_llama_cloud_client().files.upload_file( 198 | upload_file=Path("sample_contract.pdf").open("rb") 199 | ) 200 | result = await workflow.run(start_event=ContractFileEvent(file_ids=[file.id])) 201 | print(f"Indexed contract: {result}") 202 | 203 | asyncio.run(main()) 204 | -------------------------------------------------------------------------------- /ui/src/lib/WorkflowProgress.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | useHandlers, 3 | WorkflowEvent, 4 | StreamOperation, 5 | HandlerState, 6 | } from "@llamaindex/ui"; 7 | import { useEffect, useRef, useState } from "react"; 8 | import { Loader2 } from "lucide-react"; 9 | import { cn } from "./utils"; 10 | 11 | interface StatusMessage { 12 | type: "Status"; 13 | data: { 14 | level: "info" | "warning" | "error"; 15 | message: string; 16 | }; 17 | } 18 | /** 19 | * Given a workflow type, keeps track of the number of running handlers and the maximum number of running handlers. 20 | * Has hooks to notify when a workflow handler is completed. 21 | */ 22 | export const WorkflowProgress = ({ 23 | workflowName, 24 | onWorkflowCompletion, 25 | handlers = [], 26 | sync = true, 27 | }: { 28 | workflowName: string[]; 29 | onWorkflowCompletion?: (handlerIds: string[]) => void; 30 | handlers?: HandlerState[]; // specific handlers to track, e.g. 
after triggering a workflow run 31 | sync?: boolean; // whether to sync the handlers with the query on mount 32 | }) => { 33 | const handlersService = useHandlers({ 34 | query: { workflow_name: workflowName, status: ["running"] }, 35 | sync: sync, 36 | }); 37 | const seenHandlers = useRef>(new Set()); 38 | useEffect(() => { 39 | for (const handler of handlers) { 40 | if (!seenHandlers.current.has(handler.handler_id)) { 41 | seenHandlers.current.add(handler.handler_id); 42 | handlersService.setHandler(handler); 43 | } 44 | } 45 | }, [handlers, handlersService]); 46 | 47 | const subscribed = useRef>>({}); 48 | 49 | const [statusMessage, setStatusMessage] = useState< 50 | StatusMessage["data"] | undefined 51 | >(); 52 | const [statusVisible, setStatusVisible] = useState(false); 53 | const hideTimerRef = useRef(undefined); 54 | const clearTimerRef = useRef(undefined); 55 | const [hasHadRunning, setHasHadRunning] = useState(false); 56 | 57 | const runningHandlers = Object.values(handlersService.state.handlers).filter( 58 | (handler) => handler.status === "running", 59 | ); 60 | const runningHandlersKey = runningHandlers 61 | .map((handler) => handler.handler_id) 62 | .sort() 63 | .join(","); 64 | // subscribe to all running handlers and disconnect when they complete 65 | useEffect(() => { 66 | for (const handler of runningHandlers) { 67 | if (!subscribed.current[handler.handler_id]) { 68 | handlersService.actions(handler.handler_id).subscribeToEvents({ 69 | onComplete() { 70 | subscribed.current[handler.handler_id]?.disconnect(); 71 | delete subscribed.current[handler.handler_id]; 72 | }, 73 | onData(data) { 74 | if (data.type === "Status") { 75 | setStatusMessage(data.data as StatusMessage["data"]); 76 | } 77 | }, 78 | }); 79 | } 80 | } 81 | }, [runningHandlersKey]); 82 | const lastHandlers = useRef([]); 83 | useEffect(() => { 84 | const newRunningHandlers = runningHandlers.map( 85 | (handler) => handler.handler_id, 86 | ); 87 | const anyRemoved = 
lastHandlers.current.some( 88 | (handler) => !newRunningHandlers.includes(handler), 89 | ); 90 | if (anyRemoved) { 91 | onWorkflowCompletion?.(lastHandlers.current); 92 | } 93 | lastHandlers.current = newRunningHandlers; 94 | }, [runningHandlersKey]); 95 | 96 | // unsubscribe on unmount 97 | useEffect(() => { 98 | return () => { 99 | for (const [key, handler] of Object.entries(subscribed.current)) { 100 | handler.disconnect(); 101 | delete subscribed.current[key]; 102 | } 103 | if (hideTimerRef.current !== undefined) { 104 | clearTimeout(hideTimerRef.current); 105 | hideTimerRef.current = undefined; 106 | } 107 | if (clearTimerRef.current !== undefined) { 108 | clearTimeout(clearTimerRef.current); 109 | clearTimerRef.current = undefined; 110 | } 111 | }; 112 | }, []); 113 | 114 | // Animate in on new messages and auto-hide after 15s 115 | useEffect(() => { 116 | if (!statusMessage) { 117 | return; 118 | } 119 | if (hideTimerRef.current !== undefined) { 120 | clearTimeout(hideTimerRef.current); 121 | hideTimerRef.current = undefined; 122 | } 123 | if (clearTimerRef.current !== undefined) { 124 | clearTimeout(clearTimerRef.current); 125 | clearTimerRef.current = undefined; 126 | } 127 | setStatusVisible(false); 128 | requestAnimationFrame(() => { 129 | setStatusVisible(true); 130 | }); 131 | hideTimerRef.current = window.setTimeout(() => { 132 | setStatusVisible(false); 133 | clearTimerRef.current = window.setTimeout(() => { 134 | setStatusMessage(undefined); 135 | }, 300); 136 | }, 15000); 137 | // eslint-disable-next-line react-hooks/exhaustive-deps 138 | }, [statusMessage?.level, statusMessage?.message]); 139 | 140 | // Track if we've ever had any running workflows in this session 141 | useEffect(() => { 142 | if (runningHandlers.length > 0 && !hasHadRunning) { 143 | setHasHadRunning(true); 144 | } 145 | }, [runningHandlers.length, hasHadRunning]); 146 | 147 | if (!runningHandlers.length && !hasHadRunning) { 148 | return null; 149 | } 150 | return ( 151 |
152 |
194 | ); 195 | }; 196 | -------------------------------------------------------------------------------- /ui/src/lib/ContractsDropdown.tsx: -------------------------------------------------------------------------------- 1 | import { useState, useEffect, useCallback } from "react"; 2 | import { 3 | paginatedListPipelineDocumentsApiV1PipelinesPipelineIdDocumentsPaginatedGet, 4 | deletePipelineDocumentApiV1PipelinesPipelineIdDocumentsDocumentIdDelete, 5 | readFileContentApiV1FilesIdContentGet, 6 | } from "llama-cloud-services/api"; 7 | import { 8 | Button, 9 | DropdownMenu, 10 | DropdownMenuContent, 11 | DropdownMenuTrigger, 12 | DropdownMenuSeparator, 13 | ScrollArea, 14 | } from "@llamaindex/ui"; 15 | import type { CloudDocument } from "llama-cloud-services/api"; 16 | import { Trash2, ChevronDown, Loader2 } from "lucide-react"; 17 | import { useMetadata } from "./useMetadata"; 18 | 19 | const LIMIT = 20; 20 | 21 | interface UseContractsLoaderResult { 22 | contracts: CloudDocument[]; 23 | total: number | null; 24 | loading: boolean; 25 | hasMore: boolean; 26 | loadMore: () => void; 27 | handleScroll: (event: React.UIEvent) => void; 28 | removeContract: (id: string) => void; 29 | } 30 | 31 | function useContractsLoader( 32 | pipelineId: string | undefined, 33 | isOpen: boolean, 34 | ): UseContractsLoaderResult { 35 | const [contracts, setContracts] = useState([]); 36 | const [total, setTotal] = useState(null); 37 | const [offset, setOffset] = useState(0); 38 | const [loading, setLoading] = useState(false); 39 | const [hasMore, setHasMore] = useState(true); 40 | 41 | const loadContracts = useCallback( 42 | async (reset = false) => { 43 | if (!pipelineId || loading) return; 44 | 45 | setLoading(true); 46 | try { 47 | const currentOffset = reset ? 
0 : offset; 48 | const response = 49 | await paginatedListPipelineDocumentsApiV1PipelinesPipelineIdDocumentsPaginatedGet( 50 | { 51 | path: { 52 | pipeline_id: pipelineId, 53 | }, 54 | query: { 55 | offset: currentOffset, 56 | limit: LIMIT, 57 | }, 58 | }, 59 | ); 60 | 61 | if (response.data) { 62 | setTotal(response.data.total_count); 63 | setContracts((prev) => 64 | reset 65 | ? response.data!.documents 66 | : [...prev, ...response.data!.documents], 67 | ); 68 | setOffset(currentOffset + response.data.documents.length); 69 | setHasMore( 70 | currentOffset + response.data.documents.length < 71 | response.data.total_count, 72 | ); 73 | } 74 | } catch (error) { 75 | console.error("Failed to load contracts:", error); 76 | } finally { 77 | setLoading(false); 78 | } 79 | }, 80 | [pipelineId, offset, loading], 81 | ); 82 | 83 | useEffect(() => { 84 | if (isOpen && pipelineId && contracts.length === 0) { 85 | loadContracts(true); 86 | } 87 | }, [isOpen, pipelineId]); 88 | 89 | const handleScroll = useCallback( 90 | (event: React.UIEvent) => { 91 | if (!hasMore || loading) return; 92 | 93 | const target = event.currentTarget; 94 | const { scrollTop, scrollHeight, clientHeight } = target; 95 | if (scrollTop + clientHeight >= scrollHeight - 50) { 96 | loadContracts(); 97 | } 98 | }, 99 | [hasMore, loading, loadContracts], 100 | ); 101 | 102 | const removeContract = useCallback((id: string) => { 103 | setContracts((prev) => prev.filter((doc) => doc.id !== id)); 104 | setTotal((prev) => (prev !== null ? 
prev - 1 : null)); 105 | }, []); 106 | 107 | return { 108 | contracts, 109 | total, 110 | loading, 111 | hasMore, 112 | loadMore: loadContracts, 113 | handleScroll, 114 | removeContract, 115 | }; 116 | } 117 | 118 | interface UseDeleteContractResult { 119 | deleteConfirmId: string | null; 120 | deletingId: string | null; 121 | showDeleteConfirm: (id: string) => void; 122 | cancelDelete: () => void; 123 | confirmDelete: (id: string) => Promise; 124 | } 125 | 126 | function useDeleteContract( 127 | pipelineId: string | undefined, 128 | onSuccess?: () => void, 129 | ): UseDeleteContractResult { 130 | const [deleteConfirmId, setDeleteConfirmId] = useState(null); 131 | const [deletingId, setDeletingId] = useState(null); 132 | 133 | const confirmDelete = useCallback( 134 | async (documentId: string) => { 135 | if (!pipelineId) return; 136 | 137 | setDeletingId(documentId); 138 | try { 139 | await deletePipelineDocumentApiV1PipelinesPipelineIdDocumentsDocumentIdDelete( 140 | { 141 | path: { 142 | pipeline_id: pipelineId, 143 | document_id: documentId, 144 | }, 145 | }, 146 | ); 147 | 148 | setDeleteConfirmId(null); 149 | onSuccess?.(); 150 | } catch (error) { 151 | console.error("Failed to delete contract:", error); 152 | alert("Failed to delete contract. 
Please try again."); 153 | } finally { 154 | setDeletingId(null); 155 | } 156 | }, 157 | [pipelineId, onSuccess], 158 | ); 159 | 160 | return { 161 | deleteConfirmId, 162 | deletingId, 163 | showDeleteConfirm: setDeleteConfirmId, 164 | cancelDelete: () => setDeleteConfirmId(null), 165 | confirmDelete, 166 | }; 167 | } 168 | 169 | interface ContractsDropdownProps { 170 | onDeleteSuccess?: () => void; 171 | } 172 | 173 | export function ContractsDropdown({ onDeleteSuccess }: ContractsDropdownProps) { 174 | const { metadata, loading: metadataLoading } = useMetadata(); 175 | const [isOpen, setIsOpen] = useState(false); 176 | 177 | const { contracts, total, loading, handleScroll, removeContract } = 178 | useContractsLoader(metadata?.contracts_pipeline_id, isOpen); 179 | 180 | const { 181 | deleteConfirmId, 182 | deletingId, 183 | showDeleteConfirm, 184 | cancelDelete, 185 | confirmDelete, 186 | } = useDeleteContract(metadata?.contracts_pipeline_id, onDeleteSuccess); 187 | 188 | const handleDelete = async (documentId: string) => { 189 | await confirmDelete(documentId); 190 | removeContract(documentId); 191 | }; 192 | 193 | const handleDownload = async (contract: CloudDocument) => { 194 | const fileId = contract.metadata?.file_id as string; 195 | if (!fileId) { 196 | console.error("No file_id found in contract metadata"); 197 | alert("Cannot download: file information not available"); 198 | return; 199 | } 200 | 201 | try { 202 | const response = await readFileContentApiV1FilesIdContentGet({ 203 | path: { id: fileId }, 204 | }); 205 | 206 | if (response.data?.url) { 207 | // Create a temporary link and trigger download 208 | const link = document.createElement("a"); 209 | link.href = response.data.url; 210 | link.download = (contract.metadata?.filename as string) || "contract"; 211 | document.body.appendChild(link); 212 | link.click(); 213 | document.body.removeChild(link); 214 | } 215 | } catch (error) { 216 | console.error("Failed to download contract:", error); 217 | 
alert("Failed to download contract. Please try again."); 218 | } 219 | }; 220 | 221 | if (metadataLoading) { 222 | return null; 223 | } 224 | 225 | return ( 226 | 227 | 228 | 236 | 237 | 238 | 239 | {total !== null && ( 240 | <> 241 |
242 | Total Contracts: {total} 243 |
244 | 245 | 246 | )} 247 | 248 | 249 | {contracts.length === 0 && !loading ? ( 250 |
251 | No contracts found 252 |
253 | ) : ( 254 |
255 | {contracts.map((contract, index) => ( 256 |
257 | {deleteConfirmId === contract.id ? ( 258 |
259 |

260 | Delete " 261 | {(contract.metadata?.filename as string) || "Untitled"} 262 | "? 263 |

264 |
265 | 277 | 287 |
288 |
289 | ) : ( 290 |
291 | 302 | 303 | 316 |
317 | )} 318 | {index < contracts.length - 1 && } 319 |
320 | ))} 321 |
322 | )} 323 | 324 | {loading && ( 325 |
326 | 327 | Loading... 328 |
329 | )} 330 |
331 |
332 |
333 | ); 334 | } 335 | -------------------------------------------------------------------------------- /src/extraction_review/process_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | import logging 4 | import os 5 | import tempfile 6 | from pathlib import Path 7 | from typing import Any, Literal 8 | 9 | import httpx 10 | from llama_cloud import ExtractRun 11 | from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData 12 | from llama_cloud_services.extract import SourceText 13 | from llama_index.core.prompts import PromptTemplate 14 | from pydantic import BaseModel, Field 15 | from workflows import Context, Workflow, step 16 | from workflows.events import Event, StartEvent, StopEvent 17 | 18 | from .clients import ( 19 | get_contracts_index, 20 | get_data_client, 21 | get_extract_agent, 22 | get_llama_cloud_client, 23 | get_llm, 24 | ) 25 | from .config import Discrepancy, InvoiceExtractionSchema, InvoiceWithReconciliation 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class FileEvent(StartEvent): 31 | file_id: str 32 | 33 | 34 | class DownloadFileEvent(Event): 35 | pass 36 | 37 | 38 | class FileDownloadedEvent(Event): 39 | pass 40 | 41 | 42 | class Status(Event): 43 | level: Literal["info", "warning", "error"] 44 | message: str 45 | 46 | 47 | class ExtractedEvent(Event): 48 | """Event when invoice data is successfully extracted""" 49 | 50 | invoice_data: InvoiceExtractionSchema 51 | field_metadata: dict[str, Any] 52 | 53 | 54 | class ExtractedInvalidEvent(Event): 55 | """Event when extraction validation fails""" 56 | 57 | data: ExtractedData[dict[str, Any]] 58 | 59 | 60 | class ReconciledEvent(Event): 61 | """Event when invoice is reconciled with contracts""" 62 | 63 | data: ExtractedData[InvoiceWithReconciliation] 64 | 65 | 66 | class ExtractionState(BaseModel): 67 | file_id: str | None = None 68 | file_path: str | None = None 69 | 
filename: str | None = None  # final field of ExtractionState; the class header sits on the preceding dump line


class ProcessFileWorkflow(Workflow):
    """Process a single uploaded invoice file end to end.

    Steps, in order: remember the file id, download the file to the local
    temp directory, run the extraction agent on it, reconcile the extracted
    invoice against the contracts index, and persist the result through the
    agent data client.
    """

    @step()
    async def run_file(
        self, event: FileEvent, ctx: Context[ExtractionState]
    ) -> DownloadFileEvent:
        """Store the incoming file id in workflow state and start the download."""
        logger.info(f"Running file {event.file_id}")
        async with ctx.store.edit_state() as state:
            state.file_id = event.file_id
        return DownloadFileEvent()

    @step()
    async def download_file(
        self, event: DownloadFileEvent, ctx: Context[ExtractionState]
    ) -> FileDownloadedEvent:
        """Download the file referenced by ``state.file_id`` into the temp directory.

        On success ``state.file_path`` and ``state.filename`` are set.

        Raises:
            ValueError: if no file id has been stored in state.
            Exception: any lookup/download failure is logged, reported to the
                event stream as an error ``Status`` and re-raised.
        """
        state = await ctx.store.get_state()
        if state.file_id is None:
            raise ValueError("File ID is not set")
        try:
            llama_client = get_llama_cloud_client()
            file_metadata = await llama_client.files.get_file(id=state.file_id)
            file_url = await llama_client.files.read_file_content(state.file_id)

            filename = file_metadata.name
            # The name comes from remote metadata: keep only its final path
            # component so an absolute name or one containing ".." cannot make
            # us write outside the temp directory.
            local_name = Path(filename).name or state.file_id
            file_path = os.path.join(tempfile.gettempdir(), local_name)
            # NOTE(review): the download URL is logged verbatim; if it carries a
            # signed token, consider redacting it.
            logger.info(f"Downloading file {file_url.url} to {file_path}")

            # Context managers guarantee the HTTP client and the response are
            # closed even when the download fails part-way.
            async with httpx.AsyncClient() as client:
                async with client.stream("GET", file_url.url) as response:
                    # Without this an HTTP error page would be saved to disk
                    # and later fed to the extraction agent.
                    response.raise_for_status()
                    with open(file_path, "wb") as f:
                        async for chunk in response.aiter_bytes():
                            f.write(chunk)
            logger.info(f"Downloaded file {file_url.url} to {file_path}")
            async with ctx.store.edit_state() as state:
                state.file_path = file_path
                state.filename = filename
            return FileDownloadedEvent()

        except Exception as e:
            logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
            ctx.write_event_to_stream(
                Status(
                    level="error",
                    message=f"Error downloading file {state.file_id}: {e}",
                )
            )
            raise

    @step()
    async def process_file(
        self, event: FileDownloadedEvent, ctx: Context[ExtractionState]
    ) -> ExtractedEvent | ExtractedInvalidEvent:
        """Run the extraction agent on the downloaded file.

        Returns:
            ``ExtractedEvent`` with the validated invoice and its field
            metadata, or ``ExtractedInvalidEvent`` when
            ``InvalidExtractionData`` is raised.

        Raises:
            ValueError: if the file path/filename are missing from state, or
                the agent returned no data.
            Exception: any other failure is logged, reported to the event
                stream as an error ``Status`` and re-raised.
        """
        state = await ctx.store.get_state()
        if state.file_path is None or state.filename is None:
            raise ValueError("File path or filename is not set")
        try:
            agent = get_extract_agent()
            source_text = SourceText(
                file=state.file_path,
                filename=state.filename,
            )
            logger.info(f"Extracting data from file {state.filename}")
            ctx.write_event_to_stream(
                Status(
                    level="info", message=f"Extracting data from file {state.filename}"
                )
            )
            extracted_result: ExtractRun = await agent.aextract(source_text)

            if not extracted_result.data:
                raise ValueError("No data extracted from invoice")

            invoice_data = InvoiceExtractionSchema.model_validate(extracted_result.data)
            logger.info(f"Extracted invoice data: {invoice_data}")
            # Keep only the field metadata, not the whole ExtractRun. Both the
            # metadata mapping and its "field_metadata" entry are treated as
            # possibly absent so a run without metadata does not fail here.
            run_metadata = extracted_result.extraction_metadata or {}
            field_metadata = run_metadata.get("field_metadata") or {}
            return ExtractedEvent(
                invoice_data=invoice_data, field_metadata=field_metadata
            )
        except InvalidExtractionData as e:
            # NOTE(review): model_validate raises pydantic's ValidationError, so
            # this branch only fires if the agent call itself raises
            # InvalidExtractionData - confirm it is still reachable.
            logger.error(f"Error validating extracted data: {e}", exc_info=True)
            return ExtractedInvalidEvent(data=e.invalid_item)
        except Exception as e:
            logger.error(
                f"Error extracting data from file {state.filename}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                Status(
                    level="error",
                    message=f"Error extracting data from file {state.filename}: {e}",
                )
            )
            raise

    def _build_contract_query(self, invoice_data: InvoiceExtractionSchema) -> str:
        """Build the retrieval query from whichever invoice identifiers are present."""
        query_parts = []
        if invoice_data.vendor_name:
            query_parts.append(f"vendor: {invoice_data.vendor_name}")
        if invoice_data.purchase_order_number:
            query_parts.append(f"PO: {invoice_data.purchase_order_number}")
        if invoice_data.invoice_number:
            query_parts.append(f"invoice: {invoice_data.invoice_number}")
        return " ".join(query_parts) if query_parts else "contract agreement"

    def _package_reconciliation(
        self,
        state: ExtractionState,
        file_hash: str,
        reconciled_data: InvoiceWithReconciliation,
        field_metadata: dict[str, Any],
    ) -> ReconciledEvent:
        """Wrap reconciled invoice data in ``ExtractedData`` and a ``ReconciledEvent``."""
        extracted_data = ExtractedData.create(
            data=reconciled_data,
            file_id=state.file_id,
            file_name=state.filename,
            file_hash=file_hash,
            field_metadata=field_metadata,
        )
        return ReconciledEvent(data=extracted_data)

    @step()
    async def reconcile_with_contract(
        self, event: ExtractedEvent, ctx: Context[ExtractionState]
    ) -> ReconciledEvent:
        """Reconcile the invoice with matching contracts using retrieval and an LLM.

        Retrieves up to three candidate contracts and asks the LLM to pick a
        match and list discrepancies. If anything in that process fails, the
        invoice is still returned, with ``match_confidence="error"`` and the
        error text as rationale.

        Raises:
            ValueError: if no file path is stored in state.
        """
        state = await ctx.store.get_state()
        if state.file_path is None:
            raise ValueError("File path is not set")
        invoice_data = event.invoice_data

        logger.info("Reconciling invoice with contracts")
        ctx.write_event_to_stream(
            Status(level="info", message="Matching invoice with contracts...")
        )

        # Hash the file once, before the retrieval/LLM work: both the success
        # and the fallback result need it, and an unreadable file should fail
        # before an LLM call is spent.
        file_hash = hashlib.sha256(Path(state.file_path).read_bytes()).hexdigest()

        try:
            query = self._build_contract_query(invoice_data)

            index = get_contracts_index()
            retriever = index.as_retriever(similarity_top_k=3)
            retrieved_nodes = await retriever.aretrieve(query)

            if not retrieved_nodes:
                logger.info("No contracts found in index")
                reconciled_data = InvoiceWithReconciliation(
                    **invoice_data.model_dump(),
                    match_confidence="none",
                    match_rationale="No contracts found in the system",
                    discrepancies=[],
                )
            else:
                reconciled_data = await self._match_and_reconcile(
                    invoice_data, retrieved_nodes
                )

            reconciled_event = self._package_reconciliation(
                state, file_hash, reconciled_data, event.field_metadata
            )
            logger.info(f"Reconciliation complete: {reconciled_data.match_confidence}")
            return reconciled_event

        except Exception as e:
            logger.error(f"Error during reconciliation: {e}", exc_info=True)
            # Deliberate best effort: keep the extracted invoice even though
            # reconciliation failed, and record why.
            reconciled_data = InvoiceWithReconciliation(
                **invoice_data.model_dump(),
                match_confidence="error",
                match_rationale=f"Error during reconciliation: {str(e)}",
                discrepancies=[],
            )
            return self._package_reconciliation(
                state, file_hash, reconciled_data, event.field_metadata
            )

    async def _match_and_reconcile(
        self, invoice_data: InvoiceExtractionSchema, retrieved_nodes: list
    ) -> InvoiceWithReconciliation:
        """Ask the LLM to match the invoice to one retrieved contract.

        Args:
            invoice_data: the extracted invoice.
            retrieved_nodes: candidate contract nodes; each is read for
                ``.metadata`` and the first 1000 characters of ``.text``.

        Returns:
            The invoice fields plus match confidence, rationale, discrepancies
            and, when the LLM names a valid candidate, that contract's
            ``file_id`` and ``filename`` metadata.
        """

        # Structured output schema the LLM is asked to fill in.
        class ContractMatchResult(BaseModel):
            """Result of matching invoice to contract"""

            is_match: bool = Field(
                description="Whether a plausible contract match was found"
            )
            matched_contract_index: int | None = Field(
                default=None,
                description="Index (0-based) of the matched contract in the provided list, or None if no match",
            )
            match_confidence: str = Field(
                description="Confidence level: 'high', 'medium', 'low', or 'none'"
            )
            match_rationale: str = Field(
                description="Explanation of why this contract was or was not matched"
            )
            contract_payment_terms: str | None = Field(
                default=None, description="Payment terms found in the matched contract"
            )
            discrepancies: list[Discrepancy] = Field(
                default_factory=list,
                description="List of discrepancies found between invoice and contract",
            )

        # Candidates are numbered with the same 0-based index the LLM is asked
        # to return; text is truncated to bound the prompt size.
        contracts_text = "\n\n".join(
            f"Contract {i} (File: {node.metadata.get('filename', 'Unknown')}):\n{node.text[:1000]}"
            for i, node in enumerate(retrieved_nodes)
        )

        prompt_template = PromptTemplate(
            """You are analyzing an invoice to match it with the correct contract and identify any discrepancies.

Invoice Details:
- Vendor: {vendor_name}
- Invoice Number: {invoice_number}
- Invoice Date: {invoice_date}
- PO Number: {po_number}
- Payment Terms: {payment_terms}
- Total: {total}

Retrieved Contracts:
{contracts_text}

Task:
1. Determine if any of the retrieved contracts plausibly matches this invoice based on:
   - Vendor name matching or similarity
   - PO number or invoice number references
   - Date ranges or validity periods
   - Any other relevant identifiers

2. If a match is found, identify discrepancies between invoice and contract, focusing on:
   - Payment terms differences (CRITICAL)
   - Total amount mismatches if contract specifies amounts
   - Vendor name discrepancies
   - Any other obvious conflicts

3. Assess match confidence:
   - 'high': Clear match with strong vendor/PO/identifier alignment
   - 'medium': Probable match with some uncertainty
   - 'low': Weak match, possibly relevant but uncertain
   - 'none': No plausible match found

Provide your analysis in the specified format."""
        )

        # A zero total is a real value; only a missing/empty one becomes "N/A".
        total = invoice_data.total
        total_for_prompt = "N/A" if total is None or total == "" else total

        llm = get_llm()
        result = await llm.astructured_predict(
            ContractMatchResult,
            prompt_template,
            vendor_name=invoice_data.vendor_name or "N/A",
            invoice_number=invoice_data.invoice_number or "N/A",
            invoice_date=invoice_data.invoice_date or "N/A",
            po_number=invoice_data.purchase_order_number or "N/A",
            payment_terms=invoice_data.payment_terms or "N/A",
            total=total_for_prompt,
            contracts_text=contracts_text,
        )

        matched_contract_id = None
        matched_contract_name = None

        contract_index = result.matched_contract_index
        if result.is_match and contract_index is not None:
            # The index is model output: validate it. An out-of-range value
            # would raise IndexError and discard the whole analysis; a negative
            # one would silently select the wrong contract.
            if 0 <= contract_index < len(retrieved_nodes):
                matched_node = retrieved_nodes[contract_index]
                matched_contract_id = matched_node.metadata.get("file_id")
                matched_contract_name = matched_node.metadata.get("filename")
            else:
                logger.warning(
                    "LLM returned contract index %s outside the %d retrieved contracts; "
                    "leaving matched contract unset",
                    contract_index,
                    len(retrieved_nodes),
                )

        return InvoiceWithReconciliation(
            **invoice_data.model_dump(),
            matched_contract_id=matched_contract_id,
            matched_contract_name=matched_contract_name,
            match_confidence=result.match_confidence,
            match_rationale=result.match_rationale,
            discrepancies=result.discrepancies,
        )

    @step()
    async def record_extracted_data(
        self, event: ReconciledEvent | ExtractedInvalidEvent, ctx: Context
    ) -> StopEvent:
        """Persist the extracted data through the agent data client.

        Items previously stored with the same file hash are deleted first so
        that reprocessing a file replaces its record.

        Returns:
            ``StopEvent`` whose result is the id of the created item.

        Raises:
            Exception: any failure is logged, reported to the event stream as
                an error ``Status`` and re-raised.
        """
        try:
            data_client = get_data_client()
            # remove past data when reprocessing the same file
            # NOTE(review): delete-then-create is not atomic; if create_item
            # fails, the previous record is already gone.
            if event.data.file_hash:
                logger.info(
                    f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
                )
                await data_client.delete(
                    filter={
                        "file_hash": {
                            "eq": event.data.file_hash,
                        },
                    },
                )
            item_id = await data_client.create_item(event.data)
            # Announce success only after the item has actually been stored.
            logger.info(f"Recorded extracted data for file {event.data.file_name}")
            ctx.write_event_to_stream(
                Status(
                    level="info",
                    message=f"Recorded extracted data for file {event.data.file_name}",
                )
            )
            return StopEvent(
                result=item_id.id,
            )
        except Exception as e:
            logger.error(
                f"Error recording extracted data for file {event.data.file_name}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                Status(
                    level="error",
                    message=f"Error recording extracted data for file {event.data.file_name}: {e}",
                )
            )
            raise


workflow = ProcessFileWorkflow(timeout=None)

if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        """Upload ``test.pdf`` and run the workflow on it (manual smoke test)."""
        # Close the local file handle once the upload has finished.
        with Path("test.pdf").open("rb") as upload:
            file = await get_llama_cloud_client().files.upload_file(
                upload_file=upload
            )
        await workflow.run(start_event=FileEvent(file_id=file.id))

    asyncio.run(main())