113 | {/* Left Side - File Preview */}
114 |
115 | {itemData.data.file_id && (
116 | {
119 | console.log("Bounding box clicked:", box, "on page:", pageNumber);
120 | }}
121 | highlight={highlight}
122 | />
123 | )}
124 |
125 |
126 | {/* Right Side - Review Panel */}
127 |
128 |
129 | {/* Extracted Data */}
130 |
131 | extractedData={itemData.data}
132 | title="Extracted Data"
133 | onChange={(updatedData) => {
134 | updateData(updatedData);
135 | }}
136 | onClickField={(args) => {
137 | // TODO: set multiple highlights
138 | setHighlight({
139 | page: args.metadata?.citation?.[0]?.page ?? 1,
140 | x: 100,
141 | y: 100,
142 | width: 0,
143 | height: 0,
144 | });
145 | }}
146 | jsonSchema={itemHookData.jsonSchema}
147 | />
148 |
149 |
150 |
151 | );
152 | }
153 |
--------------------------------------------------------------------------------
/src/extraction_review/config.py:
--------------------------------------------------------------------------------
1 | """
2 | For simple configuration of the extraction review application, just customize this file.
3 |
4 | If you need more control, feel free to edit the rest of the application
5 | """
6 |
7 | from __future__ import annotations
8 |
9 | import os
10 |
11 | from llama_cloud import ExtractConfig
12 | from llama_cloud_services.extract import ExtractMode
13 | from pydantic import BaseModel, Field
14 |
# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
# Falls back to "invoice-reconciliation" when LLAMA_DEPLOY_DEPLOYMENT_NAME is unset or empty (local development).
EXTRACTION_AGENT_NAME: str = (
    os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "invoice-reconciliation"
)
# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
# When developing locally, this will use the _public collection (shared within the project), otherwise agent
# data is isolated to each agent.
EXTRACTED_DATA_COLLECTION: str = "invoices"

# The name of the LlamaCloud index used to store and retrieve contract documents.
CONTRACTS_INDEX_NAME: str = "contracts"
27 |
28 |
# Invoice extraction schema - extracted from invoice documents
class LineItem(BaseModel):
    """A single billable row on an invoice.

    Every field is optional so extraction can still succeed when a value is
    missing or unreadable in the source document. The ``description=`` text on
    each Field is surfaced in the generated schema, so it doubles as an
    extraction hint.
    """

    # Free-text description of the billed product or service.
    description: str | None = Field(
        default=None, description="Description of the line item"
    )
    # Number of units billed.
    quantity: float | None = Field(default=None, description="Quantity of the item")
    # Price for a single unit.
    unit_price: float | None = Field(
        default=None, description="Price per unit of the item"
    )
    # Extended total for this row (presumably quantity * unit_price — not enforced).
    total: float | None = Field(
        default=None, description="Total price for this line item"
    )
41 |
42 |
class InvoiceExtractionSchema(BaseModel):
    """Schema for extracting invoice data.

    All fields are optional: values absent from the document are left as
    ``None`` rather than failing validation. Field descriptions double as
    extraction hints for the agent built from this schema.
    """

    # Vendor-assigned invoice identifier.
    invoice_number: str | None = Field(
        default=None, description="Invoice number or identifier"
    )
    # Kept as a string (not datetime.date) because source formats vary.
    invoice_date: str | None = Field(
        default=None, description="Date of the invoice (YYYY-MM-DD format if possible)"
    )
    vendor_name: str | None = Field(
        default=None, description="Name of the vendor or supplier"
    )
    vendor_address: str | None = Field(
        default=None, description="Address of the vendor"
    )
    # PO number used downstream to match the invoice to a contract.
    purchase_order_number: str | None = Field(
        default=None, description="Purchase order (PO) number if present"
    )
    payment_terms: str | None = Field(
        default=None,
        description="Payment terms (e.g., Net 30, Net 60, Due on receipt)",
    )
    # Itemized charges; see LineItem above.
    line_items: list[LineItem] | None = Field(
        default=None, description="List of line items on the invoice"
    )
    subtotal: float | None = Field(
        default=None, description="Subtotal before tax and other charges"
    )
    tax: float | None = Field(default=None, description="Tax amount")
    # Grand total due (presumably subtotal + tax — not enforced here).
    total: float | None = Field(
        default=None, description="Total amount due on the invoice"
    )
75 |
76 |
# For backward compatibility: older code imports `ExtractionSchema` directly,
# so keep this alias pointing at the current invoice schema.
ExtractionSchema = InvoiceExtractionSchema
79 |
80 |
# Reconciliation schema - extends invoice data with contract matching and discrepancy information
class Discrepancy(BaseModel):
    """Represents a single discrepancy between invoice and contract."""

    # The only required field: which invoice/contract field disagrees.
    field: str = Field(description="Field name where discrepancy was found")
    # Values are stored as strings so any field type can be represented.
    invoice_value: str | None = Field(
        default=None, description="Value from the invoice"
    )
    contract_value: str | None = Field(
        default=None, description="Expected value from the contract"
    )
    # Free-form string, not an enum — the description suggests the expected values.
    severity: str | None = Field(
        default=None,
        description="Severity of the discrepancy (e.g., 'high', 'medium', 'low')",
    )
    note: str | None = Field(
        default=None, description="Additional notes about the discrepancy"
    )
99 |
100 |
class InvoiceWithReconciliation(InvoiceExtractionSchema):
    """Invoice data with reconciliation information.

    Inherits every extracted invoice field and adds the outcome of matching
    the invoice against an indexed contract.
    """

    # LlamaCloud file ID of the matched contract (None when no match was found).
    matched_contract_id: str | None = Field(
        default=None, description="ID of the matched contract file in LlamaCloud"
    )
    matched_contract_name: str | None = Field(
        default=None, description="Name of the matched contract file"
    )
    # Free-form string, not an enum — the description suggests the expected values.
    match_confidence: str | None = Field(
        default=None,
        description="Confidence level of the match (e.g., 'high', 'medium', 'low', 'none')",
    )
    match_rationale: str | None = Field(
        default=None, description="Explanation of why this contract was matched"
    )
    discrepancies: list[Discrepancy] | None = Field(
        default=None,
        description="List of discrepancies found between invoice and contract",
    )
121 |
122 |
# Extraction settings handed to the LlamaCloud extraction agent.
EXTRACT_CONFIG = ExtractConfig(
    extraction_mode=ExtractMode.PREMIUM,
    system_prompt=None,  # no extra system prompt; rely on field descriptions
    # advanced. Only compatible with Premium mode.
    use_reasoning=False,
    cite_sources=False,
    confidence_scores=True,  # per-field confidence shown in the review UI
)
131 |
--------------------------------------------------------------------------------
/src/extraction_review/index_contract.py:
--------------------------------------------------------------------------------
1 | """
2 | Workflow for indexing contract documents into LlamaCloud Index for retrieval.
3 | """
4 |
5 | import logging
6 | import os
7 | import tempfile
8 | from pathlib import Path
9 | from typing import Literal
10 |
11 | import httpx
12 | from llama_index.core import Document
13 | from pydantic import BaseModel
14 | from workflows import Context, Workflow, step
15 | from workflows.events import Event, StartEvent, StopEvent
16 |
17 | from .clients import get_contracts_index, get_llama_cloud_client
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
class ContractFileEvent(StartEvent):
    """Start event: kicks off indexing for a batch of contract files."""

    # LlamaCloud file IDs of the contracts to download and index.
    file_ids: list[str]
26 |
27 |
class DownloadContractEvent(Event):
    """Event to trigger contract download (one per file, emitted in the fan-out)."""

    # LlamaCloud file ID of a single contract to download.
    file_id: str
32 |
33 |
class ContractDownloadedEvent(Event):
    """Event indicating contract has been downloaded."""

    file_id: str
    # Local temp-directory path the file was written to.
    file_path: str
    # Original filename as stored in LlamaCloud.
    filename: str
40 |
41 |
class ContractIndexedEvent(Event):
    """Event indicating a single contract has been indexed (collected in the fan-in)."""

    file_id: str
    filename: str
47 |
48 |
class Status(Event):
    """Event streamed to the UI to show toast notifications."""

    # Severity shown to the user; mirrors the UI's StatusMessage levels.
    level: Literal["info", "warning", "error"]
    message: str
54 |
55 |
class ContractIndexState(BaseModel):
    """State for contract indexing workflow."""

    # Total number of files in this run; collect_results uses it to know
    # when every file has produced a ContractIndexedEvent.
    total_files: int = 0
    # Store file info keyed by file_id. (Pydantic copies mutable defaults
    # per instance, so the shared-mutable-default pitfall does not apply.)
    file_paths: dict[str, str] = {}
    filenames: dict[str, str] = {}
63 |
64 |
class IndexContractWorkflow(Workflow):
    """
    Workflow to download and index contract documents into LlamaCloud Index.

    Fan-out/fan-in shape: ``start_indexing`` emits one ``DownloadContractEvent``
    per file; downloads and indexing run in parallel (``num_workers=4``);
    ``collect_results`` waits until every file has produced a
    ``ContractIndexedEvent`` before stopping.
    """

    @step()
    async def start_indexing(
        self, event: ContractFileEvent, ctx: Context[ContractIndexState]
    ) -> DownloadContractEvent | None:
        """Initialize the workflow with multiple file IDs and fan out to parallel downloads."""
        logger.info("Starting contract indexing for %d files", len(event.file_ids))
        async with ctx.store.edit_state() as state:
            # Record the expected count so collect_results knows when to stop.
            state.total_files = len(event.file_ids)

        # Fan out: emit one download event per file
        for file_id in event.file_ids:
            ctx.send_event(DownloadContractEvent(file_id=file_id))

        return None

    @step(num_workers=4)
    async def download_contract(
        self, event: DownloadContractEvent, ctx: Context[ContractIndexState]
    ) -> ContractDownloadedEvent:
        """Download the contract file from LlamaCloud storage (runs in parallel).

        Streams the file into the system temp directory and records the local
        path and filename in the shared workflow state.
        """
        file_id = event.file_id

        file_metadata = await get_llama_cloud_client().files.get_file(id=file_id)
        file_url = await get_llama_cloud_client().files.read_file_content(file_id)

        temp_dir = tempfile.gettempdir()
        filename = file_metadata.name
        file_path = os.path.join(temp_dir, filename)

        # BUG FIX: these messages previously logged the literal "(unknown)"
        # (dead f-strings with no placeholder) instead of the filename.
        logger.info("Downloading contract %s from %s", filename, file_url.url)
        ctx.write_event_to_stream(
            Status(level="info", message=f"Downloading contract: {filename}")
        )

        # BUG FIX: the AsyncClient was created but never closed, leaking its
        # connection pool. Use it as an async context manager instead.
        async with httpx.AsyncClient() as client:
            async with client.stream("GET", file_url.url) as response:
                # Fail fast on HTTP errors rather than writing an error body to disk.
                response.raise_for_status()
                with open(file_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        f.write(chunk)

        logger.info("Downloaded contract to %s", file_path)
        async with ctx.store.edit_state() as state:
            state.file_paths[file_id] = file_path
            state.filenames[file_id] = filename

        return ContractDownloadedEvent(
            file_id=file_id, file_path=file_path, filename=filename
        )

    @step(num_workers=4)
    async def index_contract(
        self, event: ContractDownloadedEvent, ctx: Context[ContractIndexState]
    ) -> ContractIndexedEvent:
        """Index the contract document into LlamaCloud Index (runs in parallel)."""
        file_id = event.file_id
        file_path = event.file_path
        filename = event.filename

        # BUG FIX: interpolate the filename instead of logging "(unknown)".
        logger.info("Indexing contract %s", filename)
        ctx.write_event_to_stream(
            Status(level="info", message=f"Indexing contract: {filename}")
        )

        # Create a document with metadata.
        # NOTE(review): read_text(errors="ignore") will mangle binary formats
        # such as PDF — confirm uploads are text, or parse before indexing.
        file_content = Path(file_path).read_text(errors="ignore")
        document = Document(
            text=file_content,
            metadata={
                "filename": filename,
                "file_id": file_id,
                "document_type": "contract",
            },
        )

        # Get the contracts index and insert the document
        index = get_contracts_index()
        await index.ainsert(document)

        logger.info("Successfully indexed contract %s", filename)
        ctx.write_event_to_stream(
            Status(
                level="info",
                message=f"Successfully indexed contract: {filename}",
            )
        )

        return ContractIndexedEvent(file_id=file_id, filename=filename)

    @step()
    async def collect_results(
        self, event: ContractIndexedEvent, ctx: Context[ContractIndexState]
    ) -> StopEvent | None:
        """Collect all indexed contracts and return final results (fan-in)."""
        state = await ctx.store.get_state()

        # Collect all ContractIndexedEvent events - one for each file
        events = ctx.collect_events(event, [ContractIndexedEvent] * state.total_files)

        if events is None:
            # Not all files have been indexed yet
            return None

        # All files have been indexed, return aggregated results
        results = [{"file_id": ev.file_id, "filename": ev.filename} for ev in events]

        logger.info("Successfully indexed all %d contracts", len(results))
        ctx.write_event_to_stream(
            Status(
                level="info",
                message=f"Successfully indexed all {len(results)} contracts",
            )
        )

        return StopEvent(result={"contracts": results, "total": len(results)})
184 |
185 |
# Module-level workflow instance used by the deployment and the demo below.
workflow = IndexContractWorkflow(timeout=None)

if __name__ == "__main__":
    import asyncio
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        """Example usage - upload a sample contract and index it."""
        # BUG FIX: the file handle was opened inline and never closed; use a
        # context manager so it is released even if the upload fails.
        with Path("sample_contract.pdf").open("rb") as fh:
            file = await get_llama_cloud_client().files.upload_file(upload_file=fh)
        result = await workflow.run(start_event=ContractFileEvent(file_ids=[file.id]))
        print(f"Indexed contract: {result}")

    asyncio.run(main())
204 |
--------------------------------------------------------------------------------
/ui/src/lib/WorkflowProgress.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | useHandlers,
3 | WorkflowEvent,
4 | StreamOperation,
5 | HandlerState,
6 | } from "@llamaindex/ui";
7 | import { useEffect, useRef, useState } from "react";
8 | import { Loader2 } from "lucide-react";
9 | import { cn } from "./utils";
10 |
/**
 * Shape of a streamed workflow `Status` event — presumably matching the
 * `Status(level, message)` events emitted by the backend workflows; verify
 * against the Python event definitions.
 */
interface StatusMessage {
  type: "Status";
  data: {
    level: "info" | "warning" | "error";
    message: string;
  };
}
18 | /**
19 | * Given a workflow type, keeps track of the number of running handlers and the maximum number of running handlers.
20 | * Has hooks to notify when a workflow handler is completed.
21 | */
22 | export const WorkflowProgress = ({
23 | workflowName,
24 | onWorkflowCompletion,
25 | handlers = [],
26 | sync = true,
27 | }: {
28 | workflowName: string[];
29 | onWorkflowCompletion?: (handlerIds: string[]) => void;
30 | handlers?: HandlerState[]; // specific handlers to track, e.g. after triggering a workflow run
31 | sync?: boolean; // whether to sync the handlers with the query on mount
32 | }) => {
33 | const handlersService = useHandlers({
34 | query: { workflow_name: workflowName, status: ["running"] },
35 | sync: sync,
36 | });
37 | const seenHandlers = useRef