├── .gitignore ├── requirements.txt ├── bill-of-sale.pdf ├── loan-application.pdf ├── README.md └── pdf-processing.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | /.env 2 | /.venv -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | llmwhisperer-client 2 | openai 3 | python-dotenv -------------------------------------------------------------------------------- /bill-of-sale.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svpino/unstract-llmwhisperer-sample/HEAD/bill-of-sale.pdf -------------------------------------------------------------------------------- /loan-application.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svpino/unstract-llmwhisperer-sample/HEAD/loan-application.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Better PDF processing with LLMs 2 | 3 | ## Setup 4 | 5 | 1. Create a virtual environment and install the required packages: 6 | 7 | ```bash 8 | $ python3 -m venv .venv 9 | $ source .venv/bin/activate 10 | $ pip install -r requirements.txt 11 | ``` 12 | 13 | 2. Create a free LLMWhisperer account and get your API key from [here](https://unstract.com/llmwhisperer/). 14 | 15 | 3. 
Create a `.env` file with the following variables: 16 | 17 | ```bash 18 | OPENAI_API_KEY = [YOUR OPENAI API KEY GOES HERE] 19 | LLMWHISPERER_API_KEY = [YOUR LLMWHISPERER API KEY GOES HERE] 20 | ``` 21 | -------------------------------------------------------------------------------- /pdf-processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Better PDF processing with LLMs" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%load_ext autoreload\n", 17 | "%autoreload 2\n", 18 | "%load_ext dotenv\n", 19 | "%dotenv" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from openai import OpenAI\n", 29 | "\n", 30 | "client = OpenAI()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Setting Things Up" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Let's create a function that turns a document into text using LLMWhisperer:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stderr", 54 | "output_type": "stream", 55 | "text": [ 56 | "2024-08-24 09:52:45,962 - unstract.llmwhisperer.client - DEBUG - logging_level set to DEBUG\n", 57 | "2024-08-24 09:52:45,962 - unstract.llmwhisperer.client - DEBUG - base_url set to https://llmwhisperer-api.unstract.com/v1\n", 58 | "2024-08-24 09:52:45,962 - unstract.llmwhisperer.client - DEBUG - api_key set to e7dbxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "import time\n", 64 | "import os\n", 65 | "from unstract.llmwhisperer.client import LLMWhispererClient\n", 66 | "\n", 67 | "LLMWHISPERER_API_KEY = 
os.getenv(\"LLMWHISPERER_API_KEY\")\n", 68 | "\n", 69 | "llm_whisperer_client = LLMWhispererClient(\n", 70 | " base_url=\"https://llmwhisperer-api.unstract.com/v1\", api_key=LLMWHISPERER_API_KEY\n", 71 | ")\n", 72 | "\n", 73 | "\n", 74 | "def preprocess_document(file_path):\n", 75 | " response = llm_whisperer_client.whisper(file_path=file_path)\n", 76 | " whisper_hash = response[\"whisper_hash\"]\n", 77 | "\n", 78 | " print(\"Status:\", response[\"status_code\"])\n", 79 | " print(\"Hash:\", whisper_hash)\n", 80 | "\n", 81 | " text = response.get(\"extracted_text\")\n", 82 | "\n", 83 | " while True:\n", 84 | " status = llm_whisperer_client.whisper_status(whisper_hash=whisper_hash)\n", 85 | "\n", 86 | " if status[\"status\"] == \"processed\":\n", 87 | " text = llm_whisperer_client.whisper_retrieve(whisper_hash=whisper_hash)[\n", 88 | " \"extracted_text\"\n", 89 | " ]\n", 90 | " break\n", 91 | " elif status[\"status\"] != \"processing\":\n", 92 | " break\n", 93 | "\n", 94 | " time.sleep(2)\n", 95 | "\n", 96 | " return text" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Let's create a function that uses OpenAI's Assistants API to answer questions from a document:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def answer_from_document(instructions, question, file_path):\n", 113 | " assistant = client.beta.assistants.create(\n", 114 | " name=\"Assistant\",\n", 115 | " instructions=instructions,\n", 116 | " model=\"gpt-4o\",\n", 117 | " tools=[{\"type\": \"file_search\"}],\n", 118 | " )\n", 119 | "\n", 120 | " message_file = client.files.create(file=open(file_path, \"rb\"), purpose=\"assistants\")\n", 121 | "\n", 122 | " thread = client.beta.threads.create(\n", 123 | " messages=[\n", 124 | " {\n", 125 | " \"role\": \"user\",\n", 126 | " \"content\": question,\n", 127 | " \"attachments\": [\n", 128 | " {\"file_id\": 
message_file.id, \"tools\": [{\"type\": \"file_search\"}]}\n", 129 | " ],\n", 130 | " }\n", 131 | " ]\n", 132 | " )\n", 133 | "\n", 134 | " run = client.beta.threads.runs.create_and_poll(\n", 135 | " thread_id=thread.id, assistant_id=assistant.id\n", 136 | " )\n", 137 | "\n", 138 | " messages = list(\n", 139 | " client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)\n", 140 | " )\n", 141 | " return messages[0].content[0].text.value" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Let's now create a function that uses OpenAI's Chat Completions API to answer a question:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def answer(question):\n", 158 | " completion = client.chat.completions.create(\n", 159 | " model=\"gpt-4o\", messages=[{\"role\": \"user\", \"content\": question}], stream=False\n", 160 | " )\n", 161 | "\n", 162 | " return completion.choices[0].message.content" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## GPT-4o + Bill of Sale PDF" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "A total of 125 bats were ordered【4:0†bill-of-sale.pdf】.\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "response = answer_from_document(\n", 187 | " instructions=\"You are a sales assistant. 
Answer questions about the supplied bill of sale.\",\n", 188 | " question=\"How many bats where ordered?\",\n", 189 | " file_path=\"bill-of-sale.pdf\",\n", 190 | ")\n", 191 | "print(response)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## GPT-4o + Extracted Text From The Bill of Sale" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 13, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stderr", 208 | "output_type": "stream", 209 | "text": [ 210 | "2024-08-24 10:06:46,808 - unstract.llmwhisperer.client - DEBUG - whisper called\n", 211 | "2024-08-24 10:06:46,808 - unstract.llmwhisperer.client - DEBUG - api_url: https://llmwhisperer-api.unstract.com/v1/whisper\n", 212 | "2024-08-24 10:06:46,808 - unstract.llmwhisperer.client - DEBUG - params: {'url': '', 'processing_mode': 'ocr', 'output_mode': 'line-printer', 'page_seperator': '<<<', 'force_text_processing': False, 'pages_to_extract': '', 'timeout': 200, 'store_metadata_for_highlighting': False, 'median_filter_size': 0, 'gaussian_blur_radius': 0, 'ocr_provider': 'advanced', 'line_splitter_tolerance': 0.4, 'horizontal_stretch_factor': 1.0}\n", 213 | "2024-08-24 10:06:53,012 - unstract.llmwhisperer.client - DEBUG - whisper_status called\n", 214 | "2024-08-24 10:06:53,012 - unstract.llmwhisperer.client - DEBUG - url: https://llmwhisperer-api.unstract.com/v1/whisper-status\n" 215 | ] 216 | }, 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Status: 200\n", 222 | "Hash: c96b3ffe|d5a80b735b076cfa184e1c3b0fb86897\n", 223 | "\n", 224 | "\n", 225 | "\n", 226 | " Al, Spalding Bros. SPALDING PLEASE REMIT TO SPALDING SALES CORP. \n", 227 | " SION OF SPALDING SALES CORPORATION MARK \n", 228 | "\n", 229 | "\n", 230 | " #1 \n", 231 | " STORE NO. FOLIO C \n", 232 | "\n", 233 | "\n", 234 | " FAMOUS FOR ATHLETIC EQUIPMENT \n", 235 | " INVOICE NO. 
S 2812 \n", 236 | "\n", 237 | "\n", 238 | " CUSTOMER'S \n", 239 | " Sold To DATE 6/1/39 Ship To ORDER NO. \n", 240 | "\n", 241 | "\n", 242 | " BKLYN EAGLES B B CLUB DELD TO DIRK LUNDY \n", 243 | " EMANLEY - \n", 244 | " ADDRESS ADDRESS \n", 245 | " 101 MONTGOMERY STREET \n", 246 | " TOWN NEWARK, N.J. STATE TOWN STATE \n", 247 | " TERMS: \n", 248 | " 2% CASH TO DAYS-NET 30 DAYS- VIA \n", 249 | "\n", 250 | "\n", 251 | " DEALER INST. GOLF PRO. ORDER TAKEN BY SALESMAN'S NAME NO, \n", 252 | " CLASS \n", 253 | " OF \n", 254 | " BALE A GOODWIN TAGUER 106 \n", 255 | "\n", 256 | "\n", 257 | " ITEM QUANTITY \n", 258 | " 86 NO. DESCRIPTION OF ARTICLE ORDERED SHIPPED UNIT PRICE AMOUNT \n", 259 | "\n", 260 | "\n", 261 | "125 BATS 9 9 EA 1 75 15 75 \n", 262 | " - \n", 263 | "120 BATS 1 1 EA 1 75 \n", 264 | " - \n", 265 | "200 BATS 6 6 EA 1 00 6 00 \n", 266 | "\n", 267 | "\n", 268 | " 1 40 \n", 269 | "1 30 BATS 2 2 EA 2 80 \n", 270 | " 26 30 \n", 271 | " 150 \n", 272 | " - \n", 273 | " 80- \n", 274 | "\n", 275 | "\n", 276 | "- \n", 277 | "\n", 278 | "\n", 279 | " SEP / / 1933 \n", 280 | "\n", 281 | "\n", 282 | " Form F 21 1-39-M \n", 283 | " NO RETURN OF MERCHANDISE FROM THIS INVOICE WILL BE ACCEPTED UNLESS YOU HAVE OUR WRITTEN PERMISSION. 
\n", 284 | "<<<\n", 285 | "\f\n", 286 | "\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "text = preprocess_document(\"bill-of-sale.pdf\")\n", 292 | "print(text)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 14, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "The bill of sale indicates that the following quantities of bats were ordered:\n", 305 | "\n", 306 | "- Item 125: 9 bats\n", 307 | "- Item 120: 1 bat\n", 308 | "- Item 200: 6 bats\n", 309 | "- Item 130: 2 bats\n", 310 | "\n", 311 | "Adding these quantities:\n", 312 | "\n", 313 | "9 (Item 125) + 1 (Item 120) + 6 (Item 200) + 2 (Item 130) = 18 bats\n", 314 | "\n", 315 | "Therefore, 18 bats were ordered.\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "prompt = \"\"\"\n", 321 | "Look at the following bill of sale and answer the following question:\n", 322 | "\n", 323 | "Question: How many bats where ordered?\n", 324 | "\n", 325 | "Bill of sale:\n", 326 | "\"\"\"\n", 327 | "\n", 328 | "response = answer(prompt + text)\n", 329 | "print(response)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## GPT-4o + Loan Application PDF" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 9, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "I wasn't able to locate the applicant’s full address using the search function. Please provide the exact section or type of document where this information may be found, or alternatively, I can attempt a detailed manual review.\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "response = answer_from_document(\n", 354 | " instructions=\"You are a loan application assistant. 
Answer questions about the supplied loan application.\",\n", 355 | " question=\"What's the full address of the applicant?\",\n", 356 | " file_path=\"loan-application.pdf\",\n", 357 | ")\n", 358 | "print(response)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## GPT-4o + Extracted Text From The Loan Application" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 10, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "2024-08-24 09:53:34,570 - unstract.llmwhisperer.client - DEBUG - whisper called\n", 378 | "2024-08-24 09:53:34,571 - unstract.llmwhisperer.client - DEBUG - api_url: https://llmwhisperer-api.unstract.com/v1/whisper\n", 379 | "2024-08-24 09:53:34,571 - unstract.llmwhisperer.client - DEBUG - params: {'url': '', 'processing_mode': 'ocr', 'output_mode': 'line-printer', 'page_seperator': '<<<', 'force_text_processing': False, 'pages_to_extract': '', 'timeout': 200, 'store_metadata_for_highlighting': False, 'median_filter_size': 0, 'gaussian_blur_radius': 0, 'ocr_provider': 'advanced', 'line_splitter_tolerance': 0.4, 'horizontal_stretch_factor': 1.0}\n", 380 | "2024-08-24 09:53:50,390 - unstract.llmwhisperer.client - DEBUG - whisper_status called\n", 381 | "2024-08-24 09:53:50,391 - unstract.llmwhisperer.client - DEBUG - url: https://llmwhisperer-api.unstract.com/v1/whisper-status\n" 382 | ] 383 | }, 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "Status: 200\n", 389 | "Hash: 4fea52d2|a50523a76fbcf5cb5d802aadf86b4574\n", 390 | "\n", 391 | "\n", 392 | "\n", 393 | " To be completed by the Lender: \n", 394 | " Lender Loan No./Universal Loan Identifier Agency Case No. \n", 395 | "\n", 396 | "\n", 397 | "Uniform Residential Loan Application \n", 398 | "Verify and complete the information on this application. 
If you are applying for this loan with others, each additional Borrower must provide \n", 399 | "information as directed by your Lender. \n", 400 | "\n", 401 | "\n", 402 | "Section 1: Borrower Information. This section asks about your personal information and your income from \n", 403 | "employment and other sources, such as retirement, that you want considered to qualify for this loan. \n", 404 | "\n", 405 | "\n", 406 | " 1a. Personal Information \n", 407 | "Name (First, Middle, Last, Suffix) Social Security Number 175-678-910 \n", 408 | " IMA CARDHOLDER (or Individual Taxpayer Identification Number) \n", 409 | "Alternate Names - List any names by which you are known or any names Date of Birth Citizenship \n", 410 | "under which credit was previously received (First, Middle, Last, Suffix) (mm/dd/yyyy) [X] U.S. Citizen \n", 411 | " 08/31 / 1977 [ ] Permanent Resident Alien \n", 412 | " [ ] Non-Permanent Resident Alien \n", 413 | "Type of Credit List Name(s) of Other Borrower(s) Applying for this Loan \n", 414 | "[X] I am applying for individual credit. (First, Middle, Last, Suffix) - Use a separator between names \n", 415 | "[ ] I am applying for joint credit. Total Number of Borrowers: \n", 416 | " Each Borrower intends to apply for joint credit. Your initials: \n", 417 | "\n", 418 | "\n", 419 | "Marital Status Dependents (not listed by another Borrower) Contact Information \n", 420 | "[X] Married Number Home Phone ( ) - \n", 421 | "[ ] Separated Ages Cell Phone (408) 123-4567 \n", 422 | "[ ] Unmarried Work Phone 1 1 Ext. \n", 423 | " (Single, Divorced, Widowed, Civil Union, Domestic Partnership, Registered \n", 424 | " Email ima1977@gmail.com \n", 425 | " Reciprocal Beneficiary Relationship) \n", 426 | "Current Address \n", 427 | "Street 1024, SULLIVAN STREET Unit # \n", 428 | "City LOS ANGELES State CA ZIP 90210 Country USA \n", 429 | "How Long at Current Address? 
3 Years 5 Months Housing [ ] No primary housing expense [ ] Own [X] Rent ($ 1,300 /month) \n", 430 | "\n", 431 | "\n", 432 | "If at Current Address for LESS than 2 years, list Former Address [X] Does not apply \n", 433 | "Street Unit # \n", 434 | "City State ZIP Country \n", 435 | "How Long at Former Address? Years Months Housing [ ] No primary housing expense [ ] Own [ ] Rent ($ /month) \n", 436 | "\n", 437 | "\n", 438 | "Mailing Address - if different from Current Address [X] Does not apply \n", 439 | "Street Unit # \n", 440 | "City State ZIP Country \n", 441 | "\n", 442 | "\n", 443 | " 1b. Current Employment/Self-Employment and Income [ ] Does not apply \n", 444 | " Gross Monthly Income \n", 445 | "Employer or Business Name CAFFIENATED Phone (408) 109-8765 \n", 446 | " Base $ 8000 /month \n", 447 | "Street 2048, MAIN STREET Unit # \n", 448 | " Overtime $ /month \n", 449 | "City LOS ANGELES State CA ZIP 90210 Country USA \n", 450 | " Bonus $ /month \n", 451 | "Position or Title CEO Check if this statement applies: Commission $ 0.00 /month \n", 452 | "Start Date 02 [ ] I am employed by a family member, \n", 453 | " / 04/2009 \n", 454 | " property seller, real estate agent, or other Military \n", 455 | "How long in this line of work? 15 Years 5 Months party to the transaction. Entitlements $ /month \n", 456 | " Other $ /month \n", 457 | "[X] Check if you are the Business [ ] I have an ownership share of less than 25%. Monthly Income (or Loss) \n", 458 | " TOTAL $ 8000 /month \n", 459 | " Owner or Self-Employed [X] I have an ownership share of 25% or more. $ 8000 \n", 460 | "\n", 461 | "\n", 462 | "Uniform Residential Loan Application \n", 463 | "Freddie Mac Form 65 . 
Fannie Mae Form 1003 \n", 464 | "Effective 1/2021 \n", 465 | "<<<\n", 466 | "\f\n", 467 | "\n", 468 | "\n", 469 | "Californiausa DRIVER LICENSE \n", 470 | "\n", 471 | "\n", 472 | " CLASS C \n", 473 | " DL /1234568 \n", 474 | " EXP 08/31/2014 END NONE \n", 475 | " LNCARDHOLDER \n", 476 | " FNIMA \n", 477 | " 2570 24TH STREET \n", 478 | " ANYTOWN. CA 95818 \n", 479 | " DOB 08/31/1977 \n", 480 | " RSTR NONE 08311977 \n", 481 | "\n", 482 | "\n", 483 | " VETERAN \n", 484 | " SEX F HAIR BRN EYES BRN \n", 485 | "Ima HGT 5'-05 WGT 125 1b \n", 486 | " 08/31/2009 \n", 487 | "<<<\n", 488 | "\f\n", 489 | "\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "text = preprocess_document(\"loan-application.pdf\")\n", 495 | "print(text)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 11, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "The full address of the applicant, as listed in the loan application, is:\n", 508 | "\n", 509 | "1024 Sullivan Street, Los Angeles, CA 90210, USA\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "prompt = \"\"\"\n", 515 | "Look at the following loan application and answer the following question:\n", 516 | "\n", 517 | "Question: What's the full address of the applicant?\n", 518 | "\n", 519 | "Loan application:\n", 520 | "\"\"\"\n", 521 | "\n", 522 | "response = answer(prompt + text)\n", 523 | "print(response)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": ".venv", 537 | "language": "python", 538 | "name": "python3" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | 
"pygments_lexer": "ipython3", 550 | "version": "3.10.11" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | --------------------------------------------------------------------------------