├── .env ├── .gitignore ├── LICENSE ├── README.md ├── assets └── imgs │ └── chase_cc.png ├── extract.py └── requirements.txt /.env: -------------------------------------------------------------------------------- 1 | LLMWHISPERER_API_KEY= 2 | OPENAI_API_KEY= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Zipstack 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extracting Structured JSON from Credit Card Statements with Langchain and Pydantic 2 | 3 | This repository contains the code for the blog post [Comparing approaches for using LLMs for Structured Data Extraction from PDFs](https://unstract.com/blog/comparing-approaches-for-using-llms-for-structured-data-extraction-from-pdfs/). The idea is to be able to develop generalized prompts that can extract structured data from credit card statements. 
We use Langchain to create the prompts and Pydantic to make sure that the data is in the schema we need. 4 | 5 | 6 | This full-code approach is then contrasted with using a purpose-built environment like Unstract's Prompt Studio. The blog post goes into more detail about the pros and cons of each approach. 7 | 8 | ## Supported operating systems 9 | You should be able to run this on Linux or on a Mac. Windows is not supported. 10 | 11 | ## Keys you'll need 12 | You'll need keys for OpenAI and [LLMWhisperer](https://unstract.com/llmwhisperer/), which you can get for free. Please read the blog post for more information. Once you have the keys, please add them to the `.env` file in the root of the project. 13 | 14 | ## Running the code 15 | Clone this repo and change to the `structured-extraction` directory. We suggest you run the code after you've created a Python virtual environment. You can create a virtual environment by running the following command: 16 | 17 | ```bash 18 | python3 -m venv .venv 19 | ``` 20 | 21 | Next, activate the virtual environment: 22 | 23 | ```bash 24 | source .venv/bin/activate 25 | ``` 26 | 27 | Now, install the dependencies: 28 | 29 | ```bash 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | Finally, run the code: 34 | 35 | ```bash 36 | python extract.py 37 | ``` 38 | 39 | # Results 40 | 41 | ## Sample input (page 1): 42 | ![img](assets/imgs/chase_cc.png) 43 | 44 | ## Generated LLMWhisperer output 45 | 46 | ``` 47 | Manage your account online at: Customer Service: Mobile: Download the 48 | www.chase.com/cardhelp 1-800-524-3880 Chase Mobile® app today 49 | freedom 50 | 51 | New Balance 52 | February 2024 CHASE FREEDOM: ULTIMATE 53 | S M T W T F S $5,084.29 REWARDS® SUMMARY 54 | Minimum Payment Due 55 | 28 29 30 31 1 2 3 56 | Previous points balance 40,468 57 | $50.00 5,085 58 | 4 5 6 7 8 9 10 + 1% (1 Pt)/$1 earned on all purchases 59 | Payment Due Date 60 | 11 12 13 14 15 16 17 Total points available for 61 | 02/28/24 62 | 18 19 20 21 
22 23 24 redemption 45,553 63 | 25 26 27 28 29 1 2 Start redeeming today. Visit Ultimate Rewards® at 64 | www.ultimaterewards.com 65 | 3 4 5 6 7 8 9 66 | 67 | You always earn unlimited 1% cash back on all your purchases. 68 | Late Payment Warning: If we do not receive your minimum payment Activate new bonus categories every quarter. You'll earn an 69 | by the date listed above, you may have to pay a late fee of up to additional 4% cash back, for a total of 5% cash back on up to 70 | $40.00 and your APR's will be subject to increase to a maximum $1,500 in combined bonus category purchases each quarter. 71 | Penalty APR of 29.99%. Activate for free at chase.com/freedom, visit a Chase branch or 72 | call the number on the back of your card. 73 | Minimum Payment Warning: If you make only the minimum 74 | payment each period, you will pay more in interest and it will take you 75 | longer to pay off your balance. For example: 76 | 77 | If you make no You will pay off the And you will end up 78 | additional charges balance shown on this paying an estimated 79 | using this card and statement in about ... total of ... 80 | each month you pay ... 81 | 82 | Only the minimum 15 years $12,128 83 | payment 84 | 85 | $189 3 years $6,817 86 | (Savings=$5,311) 87 | 88 | If you would like information about credit counseling services, call 89 | 1-866-797-2885. 
90 | 91 | ACCOUNT SUMMARY 92 | Account Number: 4342 3780 1050 7320 93 | Previous Balance $4,233.99 94 | Payment, Credits -$4,233.99 95 | Purchases +$5,084.29 96 | Cash Advances $0.00 97 | Balance Transfers $0.00 98 | Fees Charged $0.00 99 | Interest Charged $0.00 100 | New Balance $5,084.29 101 | Opening/Closing Date 01/04/24 - 02/03/24 102 | Credit Access Line $31,700 103 | Available Credit $26,615 104 | Cash Access Line $1,585 105 | Available for Cash $1,585 106 | Past Due Amount $0.00 107 | Balance over the Credit Access Line $0.00 108 | 109 | YOUR ACCOUNT MESSAGES 110 | 111 | Reminder: It is important to continue making your payments on time. Your APRs may increase if the minimum payment is not made on 112 | time or payments are returned. 113 | 114 | Your next AutoPay payment for $5,084.29 will be deducted from your Pay From account and credited on your due 115 | date. If your due date falls on a Saturday, we'll credit your payment the Friday before. 116 | 117 | 0000001 FIS33339 D 12 Y 9 03 24/02/03 Page 1 of 3 06610 MA MA 34942 03410000120003494201 118 | 0404 119 | 120 | 43323770106074170000500000508429000000002 121 | freedom® 122 | 123 | P.O. BOX 15123 124 | AUTOPAY IS ON Payment Due Date: 02/28/24 125 | WILMINGTON, DE 19850-5123 126 | See Your Account New Balance: $5,084.29 127 | For Undeliverable Mail Only 128 | Messages for details. 129 | Minimum Payment Due: $50.00 130 | Account number: 4342 3780 1050 7320 131 | 132 | $ Amount Enclosed 133 | 34942 BEX 9 03424 D AUTOPAY IS ON 134 | LARRY PAGE 135 | C PAGE 136 | 24917 KEYSTONE AVE 137 | LOS ANGELES CA 97015-5505 138 | CARDMEMBER SERVICE 139 | PO BOX 6294 140 | CAROL STREAM IL 60197-6294 141 | 142 | 50001602832370106074177 [ ] 143 | <<< 144 | 145 | 146 | To contact us regarding your account: 147 | Call Customer Service: 148 | In U.S. 1-800-524-3880 ? 149 | Spanish 1-888-446-3308 150 | Pay by phone 1-800-436-7958 Send Inquiries to: Mail Payments to: Visit Our Website: 151 | International 1-302-594-8200 P.O. 
Box 15298 P.O. Box 6294 www.chase.com/cardhelp 152 | We accept operator relay calls Wilmington, DE 19850-5298 Carol Stream, IL 60197-6294 153 | 154 | Information About Your Account cash advance, or check transaction in the amount stated in your Account Agreement. 155 | There is a foreign transaction fee of 3% of the U.S. dollar amount of any foreign 156 | Making Your Payments: The amount of your payment should be at least your 157 | transaction for some accounts. Please see your Account Agreement for information 158 | minimum payment due, payable in U.S. dollars and drawn on or payable through a 159 | about these fees. 160 | U.S. financial institution or the U.S. branch of a foreign financial institution. You can 161 | pay down balances faster by paying more than the minimum payment or the total We add transactions and fees to your daily balance no earlier than: 162 | unpaid balance on your account. 163 | 1. the date of the transaction - for new purchases, balance transfers, overdraft 164 | You may make payments electronically through our website or by one of our customer advances, cash advances, or My Chase Loans; 165 | service phone numbers above. In using any of these channels, you are authorizing us 166 | to withdraw funds as a one-time electronic funds transfer from your bank account. 2. the date the payee deposits the check - for new cash advance checks or 167 | In our automated phone system, this authorization is provided via entry of a personal balance transfer checks; 168 | identification number. You may revoke this authorization by cancelling your payment 169 | 3. the date of a related transaction, the date they are posted to your account, or 170 | through our website or customer service telephone numbers prior to the payment 171 | the last day of the billing cycle, whichever we may choose - for fees 172 | processing. If we receive your completed payment request through one of these 173 | channels by 11:59 p.m. 
Eastern Time, we will credit your payment as of that day, If How To Avoid Paying Interest On Purchases: Your due date will be a minimum of 21 174 | we receive your request after 11:59 p.m. Eastern Time, we will credit your payment days after the close of each billing cycle. If you pay your account jor Interest Saving 175 | as of the next calendar day. If you specify a future date in your request we will credit Balance) in full each billing period by the date and time due, no interest is charged 176 | your payment as of that day. on new purchases month to month. Also, we will not impose interest charges on any 177 | portion of a purchase balance you repay while that balance is subject to an interest- 178 | If you pay by regular U.S. mail to the Payments address shown on this statement, 179 | free period. Subject to any interest-free period for new purchases, we will begin 180 | write your account number on your check or money order and include the payment 181 | coupon in the envelope. Do not send more than one payment or coupon per envelope. charging interest from the date a transaction (including any balance transfer, cash 182 | advance or overdraft advancej, fee or interest charge is added to your daily balance 183 | Do not staple, clip or tape the documents. Do not include correspondence. Do not 184 | until your account is paid in full. Because we apply payments in excess of your 185 | send cash. If we receive your properly prepared payment on any day by 5 p.m. local 186 | time at our Payments address on this statement, we will credit to your account that minimum payment first to higher rate balances, you may not be able to avoid interest 187 | day. If your payment is received after 5 p.m. local time at our Payments address on charges on new purchases if you have another balance at a higher interest rate unless 188 | this statement, we will credit it to your account as of the next calendar day. you pay your balance for Interest Saving Balance) in full each month. 
189 | For all other payments or for any payment type above for which you do not follow our Credil Limit: If you want to inquire about your options to help prevent your account 190 | payment instructions, crediting of your payments may be delayed for up to 5 days. from exceeding your credit limit, please call the number on the back of your card. 191 | Account Information Reported To Credit Bureau: We may report information about What To Do If You Think You Find A Mistake On Your Stalement: If you think there 192 | your Account to credit bureaus. Late payments, missed payments or other defaults is an error on your statement, write to us on a separate sheet at Customer Service, 193 | on your Account may be reflected in your credit report. If you think we have reported P.O. Box 15299, Wilmington, DE 19850-5299. 194 | inaccurate information to a credit bureau, please write to us at Chase Card Services 195 | In your letter, give us the following information: 196 | P.O. Box 15369, Wilmington, DE 19850-5369. 197 | . Account information: Your name and Account number. 198 | To Service And Manage Any Of Your Account(s): By providing my mobile phone 199 | number, I am giving permission to be contacted at that number about all of my . Dollar amount: The dollar amount of the suspected error. 200 | accounts by JPMorgan Chase and companies working on its behalf. My consent 201 | allows the use of text messages, artificial or prerecorded voice messages and . Description of Problem: If you think there is an error on your bill, describe 202 | automatic dialing technology for informational and account servicing, but not for what you believe is wrong and why you believe it is a mistake. 203 | sales or telemarketing. Message and data rates may apply. 204 | You must contact us within 60 days after the error appeared on your statement. 205 | Authorization To Converl Your Check To An Electronic Transfer Debit: When you 206 | You must notify us of any potential errors in writing. 
You may call us or notify us 207 | provide a check as payment, you authorize us either to use information from your 208 | electronically, but if you do we are not required to investigate any potential errors and 209 | check to make a one-time electronic fund transfer from your account or to process 210 | you may have to pay the amount in question. 211 | the payment as a check. Your bank account may be debited as soon as the same day 212 | we receive your payment. You will not receive your check back from your institution. While we investigate whether or not there has been an error, the following are true; 213 | Condillonal Payments: Any payment check or other form of payment that you send . We cannot try to collect the amount in question, or report you as delinquent on 214 | us for less than the full balance due that is marked "paid in full" or contains a similar that amount. 215 | notation, or that you otherwise tender in full satisfaction of a disputed amount, 216 | must be sent to Card Services, P.O. Box 15049, Wilmington, DE 19850-5049. We . The charge in question may remain on your statement, and we may continue 217 | reserve all our rights regarding these payments (e.g., if it is determined there is no to charge you interest on that amount. But, if we determine that we made a 218 | valid dispute or if any such check is received at any other address, we may accept mistake, you will not have to pay the amount in question or any interest or 219 | the check and you will still owe any remaining balance). We may refuse to accept other fees related to that amount. 220 | any such payment by returning it to you, not cashing it or destroying it. All other 221 | . While you do not have to pay the amount in question, you are responsible for 222 | payments that you make should be sent to the regular Payment address shown on 223 | the remainder of your balance. 224 | this statement. 225 | . We can apply any unpaid amount against your credit limit. 
226 | Annual Renewal Notice: If your Account Agreement has an annual membership 227 | fee, you are responsible for it every year your Account is open. We will add your Your Rights II You Are Dissatisfied With Your Credit Card Purchases: If you are 228 | annual membership fee to your monthly billing statement once a year, whether or not dissatisfied with the goods or services that you have purchased with your credit card, 229 | you use your account. Your annual membership fee will be added to your purchase and you have tried in good faith to correct the problem with the merchant, you may 230 | balance and may incur interest. The annual membership fee is non-refundable unless have the right not to pay the remaining amount due on the purchase. 231 | you notify us that you wish to close your account within 30 days or one billing cycle 232 | (whichever is less) after we provide the statement on which the annual membership To use this right, all of the following must be true: 233 | fee is billed. Your payment of the annual membership fee does not affect our rights 234 | 1. The purchase must have been made in your home state or within 100 miles 235 | to close your Account and to limit your right to make transactions on your Account. 236 | of your current mailing address, and the purchase price must have been 237 | If your Account is closed by you or us, the annual membership fee will no longer be 238 | more than $50. (Note: Neither of these are necessary if your purchase was 239 | billed to your Account. 240 | based on an advertisement we mailed to you, or if we own the company that 241 | Calculation Ol Balance Subject To Interest Rate: To figure your periodic interest sold you the goods or services.) 242 | charges for each billing cycle when a daily periodic rate(s) applies, we use the 243 | 2. You must have used your credit card for the purchase. Purchases made with 244 | daily balance method (including new transactions). 
To figure your periodic interest 245 | charges for each billing cycle when a monthly periodic rate(s) applies, we use the cash advances from an ATM or with a check that accesses your credit card 246 | average daily balance method (including new transactions). For an explanation of Account do not qualify. 247 | either method, or questions about a particular interest charge calculation on your 3. You must not yet have fully paid for the purchase. 248 | statement, please call us at the toll free customer service phone number listed above. 249 | If all of the criteria above are met and you are still dissatisfied with the purchase, contact 250 | We calculate periodic interest charges separately for each feature (for example, us in writing at Customer Service, P.O. Box 15299, Wilmington, DE 19850-5299. 251 | purchases, balance transfers, cash advances or overdraft advances). These 252 | calculations may combine different categories with the same periodic rates. Variable While we investigate, the same rules apply to the disputed amount as discussed 253 | rates will vary with the market based on the Prime Rate or such index described above. After we finish our investigation, we will tell you our decision. At that point, if 254 | in your Account Agreement. There is a transaction fee for each balance transfer, we think you owe an amount and you do not pay we may report you as delinquent. 255 | 256 | MA05042021 257 | 258 | 7 259 | 260 | To manage your account, including card payments, alerts, and change of address, visit 261 | www.chase.com/cardhelp or call the customer service number which appears on your 262 | account statement. 
263 | 264 | L 265 | <<< 266 | 267 | 268 | Manage your account online at: Customer Service: Mobile: Download the 269 | www.chase.com/cardhelp 1-800-524-3880 Chase Mobile® app today 270 | freedom 271 | 272 | YOUR ACCOUNT MESSAGES (CONTINUED) 273 | Your AutoPay amount will be reduced by any payments or merchant credits that post to your account before we 274 | process your AutoPay payment. If the total of these payments and merchant credits is more than your set AutoPay 275 | amount, your AutoPay payment for that month will be zero. 276 | 277 | ACCOUNT ACTIVITY 278 | 279 | Date of 280 | Transaction Merchant Name or Transaction Description $ Amount 281 | 282 | PAYMENTS AND OTHER CREDITS 283 | 01/28 AUTOMATIC PAYMENT - THANK YOU -4,233.99 284 | 285 | PURCHASE 286 | 01/04 LARRY HOPKINS HONDA 7074304151 CA 265.40 287 | 01/04 CICEROS PIZZA SAN JOSE CA 28.18 288 | 01/05 USPS PO 0545640143 LOS ALTOS CA 15.60 289 | 01/07 TRINETHRA SUPER MARKET CUPERTINO CA 7.92 290 | 01/04 SPEEDWAY 5447 LOS ALTOS HIL CA 31.94 291 | 01/06 ATT*BILL PAYMENT 800-288-2020 TX 300.29 292 | 01/07 AMZN Mktp US*RT4G124P0 Amzn.com/bill WA 6.53 293 | 01/07 AMZN Mktp US*RT0Y474Q0 Amzn.com/bill WA 21.81 294 | 01/05 HALAL MEATS SAN JOSE CA 24.33 295 | 01/09 VIVINT INC/US 800-216-5232 UT 52.14 296 | 01/09 COSTCO WHSE #0143 MOUNTAIN VIEW CA 75.57 297 | 01/11 WALGREENS #689 MOUNTAIN VIEW CA 18.54 298 | 01/12 GOOGLE *YouTubePremium g.co/helppay# CA 22.99 299 | 01/13 FEDEX789226298200 Collierville TN 117.86 300 | 01/19 SHELL OIL 57444212500 FREMONT CA 7.16 301 | 01/19 LEXUS OF FREMONT FREMONT CA 936.10 302 | 01/19 STARBUCKS STORE 10885 CUPERTINO CA 11.30 303 | 01/22 TST* CHAAT BHAVAN MOUNTAI MOUNTAIN VIEW CA 28.95 304 | 01/23 AMZN Mktp US*R06VS6MNO Amzn.com/bill WA 7.67 305 | 01/23 UALR REMOTE PAY 501-569-3202 AR 2,163.19 306 | 01/23 UALR REMOTE PAY 501-569-3202 AR 50.00 307 | 01/24 AMZN Mktp US*R02SO5L22 Amzn.com/bill WA 8.61 308 | 01/24 TIRUPATHI BHIMAS MILPITAS CA 58.18 309 | 01/25 AMZN Mktp US*R09PP5NE2 
Amzn.com/bill WA 28.36 310 | 01/26 COSTCO WHSE #0143 MOUNTAIN VIEW CA 313.61 311 | 01/29 AMZN Mktp US*R25221T90 Amzn.com/bill WA 8.72 312 | 01/29 COMCAST CALIFORNIA 800-COMCAST CA 97.00 313 | 01/29 TRADER JOE S #127 LOS ALTOS CA 20.75 314 | 01/30 Netflix 1 8445052993 CA 15.49 315 | 01/30 ATT*BILL PAYMENT 800-288-2020 TX 300.35 316 | 01/30 APNI MANDI FARMERS MARKE SUNNYVALE CA 36.76 317 | 02/01 APPLE.COM/BILL 866-712-7753 CA 2.99 318 | 319 | 2024 Totals Year-to-Date 320 | Total fees charged in 2024 $0.00 321 | Total interest charged in 2024 $0.00 322 | Year-to-date totals do not reflect any fee or interest refunds 323 | you may have received. 324 | 325 | INTEREST CHARGES 326 | Your Annual Percentage Rate (APR) is the annual interest rate on your account. 327 | Annual Balance 328 | Balance Type Percentage Subject To Interest 329 | Rate (APR) Interest Rate Charges 330 | 331 | PURCHASES 332 | Purchases 19.99%(v)(d) - 0 - - 0 - 333 | CASH ADVANCES 334 | Cash Advances 29.99%(v)(d) - 0 - - 0 - 335 | BALANCE TRANSFERS 336 | Balance Transfers 19.99%(v)(d) - 0 - - 0 - 337 | 338 | LARRY PAGE Page 2 of 3 Statement Date: 02/03/24 339 | 0000001 FIS33339 D 12 Y 9 03 24/02/03 Page 2 of 3 06610 MA MA 34942 03410000120003494202 340 | <<< 341 | 342 | 343 | 31 Days in Billing Period 344 | (v) = Variable Rate 345 | (d) = Daily Balance Method (including new transactions) 346 | (a) = Average Daily Balance Method (including new transactions) 347 | Please see Information About Your Account section for the Calculation of Balance Subject to Interest Rate, Annual Renewal Notice, How 348 | to Avoid Interest on Purchases, and other important information, as applicable. 349 | 350 | IMPORTANT NEWS 351 | 352 | Get 5% cash back on up to $1,500 in combined purchases in 353 | this quarter's bonus categories from 1/1/24-3/31/24. 354 | Learn more & activate at chase.com/freedom 355 | or call 1-800-524-3880 by March 14, 2024. 
356 | 357 | [X] X 0000001 FIS33339 D 12 Y 9 03 24/02/03 Page 3 of 3 06610 MA MA 34942 03410000120003494202 358 | <<< 359 | 360 | ``` 361 | 362 | ## Extracted JSON 363 | 364 | ```json 365 | { 366 | "issuer_name": "CHASE FREEDOM", 367 | "customer_name": "Larry Page", 368 | "customer_address": { 369 | "zip_code": "97015-5505", 370 | "city": "LOS ANGELES", 371 | "full_address": "24917 KEYSTONE AVE, LOS ANGELES, CA 97015-5505" 372 | }, 373 | "payment_info": { 374 | "due_date": "2024-02-28T00:00:00Z", 375 | "minimum_payment": 50.00, 376 | "new_balance": 5084.29 377 | }, 378 | "spend_line_items": [ 379 | { 380 | "spend_date": "2024-01-04T00:00:00Z", 381 | "spend_description": "LARRY HOPKINS HONDA 7074304151 CA", 382 | "amount": 265.40 383 | }, 384 | { 385 | "spend_date": "2024-01-04T00:00:00Z", 386 | "spend_description": "CICEROS PIZZA SAN JOSE CA", 387 | "amount": 28.18 388 | }, 389 | { 390 | "spend_date": "2024-01-05T00:00:00Z", 391 | "spend_description": "USPS PO 0545640143 LOS ALTOS CA", 392 | "amount": 15.60 393 | }, 394 | { 395 | "spend_date": "2024-01-07T00:00:00Z", 396 | "spend_description": "TRINETHRA SUPER MARKET CUPERTINO CA", 397 | "amount": 7.92 398 | }, 399 | { 400 | "spend_date": "2024-01-04T00:00:00Z", 401 | "spend_description": "SPEEDWAY 5447 LOS ALTOS HIL CA", 402 | "amount": 31.94 403 | }, 404 | { 405 | "spend_date": "2024-01-06T00:00:00Z", 406 | "spend_description": "ATT*BILL PAYMENT 800-288-2020 TX", 407 | "amount": 300.29 408 | }, 409 | { 410 | "spend_date": "2024-01-07T00:00:00Z", 411 | "spend_description": "AMZN Mktp US*RT4G124P0 Amzn.com/bill WA", 412 | "amount": 6.53 413 | }, 414 | { 415 | "spend_date": "2024-01-07T00:00:00Z", 416 | "spend_description": "AMZN Mktp US*RT0Y474Q0 Amzn.com/bill WA", 417 | "amount": 21.81 418 | }, 419 | { 420 | "spend_date": "2024-01-05T00:00:00Z", 421 | "spend_description": "HALAL MEATS SAN JOSE CA", 422 | "amount": 24.33 423 | }, 424 | { 425 | "spend_date": "2024-01-09T00:00:00Z", 426 | "spend_description": "VIVINT 
INC/US 800-216-5232 UT", 427 | "amount": 52.14 428 | }, 429 | { 430 | "spend_date": "2024-01-09T00:00:00Z", 431 | "spend_description": "COSTCO WHSE #0143 MOUNTAIN VIEW CA", 432 | "amount": 75.57 433 | }, 434 | { 435 | "spend_date": "2024-01-11T00:00:00Z", 436 | "spend_description": "WALGREENS #689 MOUNTAIN VIEW CA", 437 | "amount": 18.54 438 | }, 439 | { 440 | "spend_date": "2024-01-12T00:00:00Z", 441 | "spend_description": "GOOGLE *YouTubePremium g.co/helppay# CA", 442 | "amount": 22.99 443 | }, 444 | { 445 | "spend_date": "2024-01-13T00:00:00Z", 446 | "spend_description": "FEDEX789226298200 Collierville TN", 447 | "amount": 117.86 448 | }, 449 | { 450 | "spend_date": "2024-01-19T00:00:00Z", 451 | "spend_description": "SHELL OIL 57444212500 FREMONT CA", 452 | "amount": 7.16 453 | }, 454 | { 455 | "spend_date": "2024-01-19T00:00:00Z", 456 | "spend_description": "LEXUS OF FREMONT FREMONT CA", 457 | "amount": 936.10 458 | }, 459 | { 460 | "spend_date": "2024-01-19T00:00:00Z", 461 | "spend_description": "STARBUCKS STORE 10885 CUPERTINO CA", 462 | "amount": 11.30 463 | }, 464 | { 465 | "spend_date": "2024-01-22T00:00:00Z", 466 | "spend_description": "TST* CHAAT BHAVAN MOUNTAI MOUNTAIN VIEW CA", 467 | "amount": 28.95 468 | }, 469 | { 470 | "spend_date": "2024-01-23T00:00:00Z", 471 | "spend_description": "AMZN Mktp US*R06VS6MNO Amzn.com/bill WA", 472 | "amount": 7.67 473 | }, 474 | { 475 | "spend_date": "2024-01-23T00:00:00Z", 476 | "spend_description": "UALR REMOTE PAY 501-569-3202 AR", 477 | "amount": 2163.19 478 | }, 479 | { 480 | "spend_date": "2024-01-23T00:00:00Z", 481 | "spend_description": "UALR REMOTE PAY 501-569-3202 AR", 482 | "amount": 50.00 483 | }, 484 | { 485 | "spend_date": "2024-01-24T00:00:00Z", 486 | "spend_description": "AMZN Mktp US*R02SO5L22 Amzn.com/bill WA", 487 | "amount": 8.61 488 | }, 489 | { 490 | "spend_date": "2024-01-24T00:00:00Z", 491 | "spend_description": "TIRUPATHI BHIMAS MILPITAS CA", 492 | "amount": 58.18 493 | }, 494 | { 495 | 
"spend_date": "2024-01-25T00:00:00Z", 496 | "spend_description": "AMZN Mktp US*R09PP5NE2 Amzn.com/bill WA", 497 | "amount": 28.36 498 | }, 499 | { 500 | "spend_date": "2024-01-26T00:00:00Z", 501 | "spend_description": "COSTCO WHSE #0143 MOUNTAIN VIEW CA", 502 | "amount": 313.61 503 | }, 504 | { 505 | "spend_date": "2024-01-29T00:00:00Z", 506 | "spend_description": "AMZN Mktp US*R25221T90 Amzn.com/bill WA", 507 | "amount": 8.72 508 | }, 509 | { 510 | "spend_date": "2024-01-29T00:00:00Z", 511 | "spend_description": "COMCAST CALIFORNIA 800-COMCAST CA", 512 | "amount": 97.00 513 | }, 514 | { 515 | "spend_date": "2024-01-29T00:00:00Z", 516 | "spend_description": "TRADER JOE S #127 LOS ALTOS CA", 517 | "amount": 20.75 518 | }, 519 | { 520 | "spend_date": "2024-01-30T00:00:00Z", 521 | "spend_description": "Netflix 1 8445052993 CA", 522 | "amount": 15.49 523 | }, 524 | { 525 | "spend_date": "2024-01-30T00:00:00Z", 526 | "spend_description": "ATT*BILL PAYMENT 800-288-2020 TX", 527 | "amount": 300.35 528 | }, 529 | { 530 | "spend_date": "2024-01-30T00:00:00Z", 531 | "spend_description": "APNI MANDI FARMERS MARKE SUNNYVALE CA", 532 | "amount": 36.76 533 | }, 534 | { 535 | "spend_date": "2024-02-01T00:00:00Z", 536 | "spend_description": "APPLE.COM/BILL 866-712-7753 CA", 537 | "amount": 2.99 538 | } 539 | ] 540 | } 541 | ``` -------------------------------------------------------------------------------- /assets/imgs/chase_cc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zipstack/structured-extraction/14b10bbf5f24dc57aef7832b21b0da3198f74f5a/assets/imgs/chase_cc.png -------------------------------------------------------------------------------- /extract.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import sys 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import requests 8 | from dotenv import load_dotenv 
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, \
    HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from unstract.llmwhisperer.client import LLMWhispererClient

# Number of bytes hashed from each end of a PDF to build its cache key.
_CACHE_BLOCK_SIZE = 4096


# NOTE: the models below are documented with `#` comments instead of class
# docstrings on purpose. pydantic copies a class docstring into the model's
# JSON schema, and that schema is embedded in the format instructions sent to
# the LLM, so a docstring would silently change the prompt.

# Postal address of the card holder.
class CustomerAddress(BaseModel):
    zip_code: str = Field(description="Should contain the zip code alone")
    city: str = Field(description="Should hold the city name from the address")
    full_address: str = Field(description="Should hold the full address of the customer")


# Amounts and deadline the customer has to act on for this statement.
class PaymentInfo(BaseModel):
    due_date: datetime = Field(description="The due date of the credit card statement. Also known as the payment due "
                                           "date")
    minimum_payment: float = Field(description="the minimum amount that is due")
    new_balance: float = Field(description="the total new balance amount that can be paid")


# One transaction row from the statement.
class SpendLineItem(BaseModel):
    spend_date: datetime = Field(description="The date of the transaction. If the year part isn't mentioned in the "
                                             "line item explicitly, pick up the year from the statement date and use "
                                             "it instead.")
    spend_description: str = Field(description="The description of the spend")
    amount: float = Field(description="The amount of the transaction")


# Top-level structure the LLM is asked to return for one statement.
class ParsedCreditCardStatement(BaseModel):
    issuer_name: str = Field(description="What is the name of the issuer or the bank who has issued this credit card? "
                                         "I am not interested in the legal entity, but the primary brand name of the "
                                         "credit card.")
    customer_name: str = Field(description="What is the name of the customer to whom this credit card statement "
                                           "belongs to? Format the name of the customer well with the first letter of "
                                           "each name capitalized.")
    customer_address: CustomerAddress = Field(description="Since there might be multiple addresses in the context "
                                                          "provided to you, first gather all addresses. Try to "
                                                          "understand whom this credit card statement is being "
                                                          "addressed to or in other words, the name of the customer. "
                                                          "Find the address that matches that person's. Be sure to "
                                                          "return the customer's address, for whom this credit card "
                                                          "statement is for. Do not respond with any other address.")
    payment_info: PaymentInfo = Field(description="Payment information is important part of any credit card statement "
                                                  "and it consists of the new balance or the full amount due for the "
                                                  "current statement, the minimum payment due and the payment due "
                                                  "date.")
    spend_line_items: list[SpendLineItem] = Field(description="This credit card statement contains spending details "
                                                              "line items. Spend details can be split across the "
                                                              "provided context. Respond with details of all the "
                                                              "spend items by looking at the whole context always.")


def make_llm_whisperer_call(file_path):
    """Send ``file_path`` to LLMWhisperer and return the extracted text.

    The request uses OCR processing mode and line-printer output mode.
    """
    print(f"Processing file:{file_path}...")
    # LLMWhisperer API key is picked up from the environment variable
    client = LLMWhispererClient()
    result = client.whisper(file_path=file_path, processing_mode="ocr", output_mode="line-printer")
    return result["extracted_text"]


def generate_cache_file_name(file_path):
    """Return the cache file path used for the extracted text of ``file_path``.

    The name is built from the MD5 digests of the first and the last
    4096-byte blocks of the file. Exits the process (via ``error_exit``) when
    the file is smaller than one block.
    """
    # For our use case, PDFs won't be less than 4096, practically speaking.
    if os.path.getsize(file_path) < _CACHE_BLOCK_SIZE:
        error_exit("File too small to process.")
    with open(file_path, "rb") as f:
        first_block = f.read(_CACHE_BLOCK_SIZE)
        # Jump to the start of the last block and read it. The previous
        # version issued an extra read here that consumed the block, so the
        # value that got hashed was always empty and the second half of the
        # key was a constant. Cache names therefore differ from the ones the
        # old code produced; stale cache files are simply never matched again.
        f.seek(-_CACHE_BLOCK_SIZE, os.SEEK_END)
        last_block = f.read(_CACHE_BLOCK_SIZE)

    first_md5_hash = hashlib.md5(first_block).hexdigest()
    last_md5_hash = hashlib.md5(last_block).hexdigest()
    return f"/tmp/{first_md5_hash}_{last_md5_hash}.txt"


def is_file_cached(file_path):
    """Return True when a cache file already exists for ``file_path``."""
    return Path(generate_cache_file_name(file_path)).is_file()


def extract_text(file_path):
    """Return the extracted text for ``file_path``, using the on-disk cache.

    On a cache miss the text is fetched through ``make_llm_whisperer_call``
    and written to the cache file before being returned.
    """
    # Compute the cache name once: every call re-reads and re-hashes the file.
    cache_file = Path(generate_cache_file_name(file_path))
    if cache_file.is_file():
        print(f"Info: File {file_path} is already cached.")
        return cache_file.read_text(encoding="utf-8")

    data = make_llm_whisperer_call(file_path)
    # Explicit UTF-8 so non-ASCII OCR output does not depend on the locale.
    cache_file.write_text(data, encoding="utf-8")
    return data


def error_exit(error_message):
    """Print ``error_message`` and terminate the process with exit status 1."""
    print(error_message)
    sys.exit(1)


def show_usage_and_exit():
    """Print the usage hint and terminate the process with exit status 1."""
    error_exit("Please pass name of directory or file to process.")


def _has_pdf_extension(path):
    """Return True when ``path`` ends in a ``.pdf`` extension (case-insensitive)."""
    return os.path.splitext(path)[1][1:].strip().lower() == 'pdf'


def enumerate_pdf_files(file_path):
    """Return the PDF files designated by ``file_path``.

    ``file_path`` may be a single file (returned only if it has a PDF
    extension) or a directory (its direct children with a PDF extension are
    returned; sub-directories are not descended into). Any other kind of path
    terminates the process via ``error_exit``.
    """
    # Users can pass a directory or a file name
    if os.path.isfile(file_path):
        return [file_path] if _has_pdf_extension(file_path) else []

    if os.path.isdir(file_path):
        candidates = (os.path.join(file_path, file_name) for file_name in os.listdir(file_path))
        return [path for path in candidates if os.path.isfile(path) and _has_pdf_extension(path)]

    error_exit(f"Error. {file_path} should be a file or a directory.")
    return []


def extract_values_from_file(raw_file_data):
    """Ask the chat model to turn raw statement text into structured JSON.

    ``raw_file_data`` is the text extracted from one statement. The prompt is
    made of a system preamble plus a human message holding the parser's
    format instructions, the raw text and a postamble. Returns the content of
    the model's reply as-is; it is not parsed or validated here.
    """
    preamble = ("\n"
                "Your ability to extract and summarize this information accurately is essential for effective "
                "credit card statement analysis. Pay close attention to the credit card statement's language, "
                "structure, and any cross-references to ensure a comprehensive and precise extraction of "
                "information. Do not use prior knowledge or information from outside the context to answer the "
                "questions. Only use the information provided in the context to answer the questions.\n")
    postamble = "Do not include any explanation in the reply. Only include the extracted information in the reply."
    system_template = "{preamble}"
    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_template = "{format_instructions}\n{raw_file_data}\n{postamble}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    parser = PydanticOutputParser(pydantic_object=ParsedCreditCardStatement)
    # Built once and reused for both the log line and the prompt.
    format_instructions = parser.get_format_instructions()
    print(format_instructions)

    # compile chat template
    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    request = chat_prompt.format_prompt(preamble=preamble,
                                        format_instructions=format_instructions,
                                        raw_file_data=raw_file_data,
                                        postamble=postamble).to_messages()
    # Temperature is set on the model and the request goes through invoke();
    # calling the model object directly is deprecated in LangChain.
    model = ChatOpenAI(temperature=0)
    print("Querying model...")
    result = model.invoke(request)
    print("Response from model:")
    print(result.content)
    return result.content


def process_pdf_files(file_list):
    """Extract every file in ``file_list`` and write ``<file_path>.json`` next to it."""
    for file_path in file_list:
        raw_file_data = extract_text(file_path)
        print(f"Extracted text for file {file_path}:\n{raw_file_data}")
        extracted_json = extract_values_from_file(raw_file_data)
        json_file_path = f"{file_path}.json"
        with open(json_file_path, "w", encoding="utf-8") as f:
            f.write(extracted_json)


def main():
    """Command-line entry point: process the file or directory given as argv[1]."""
    load_dotenv()
    if len(sys.argv) < 2:
        show_usage_and_exit()

    print(f"Processing path {sys.argv[1]}...")
    file_list = enumerate_pdf_files(sys.argv[1])
    # An empty directory or a single non-PDF file yields no work; stop with a
    # message instead of failing on file_list[0] below.
    if not file_list:
        error_exit(f"Error. No PDF files found at {sys.argv[1]}.")
    print(f"Processing {len(file_list)} files...")
    print(f"Processing first file: {file_list[0]}...")
    process_pdf_files(file_list)


if __name__ == '__main__':
    main()

# NOTE(review): extract.py ends above. The lines below are the requirements.txt
# section that follows it in this dump, kept as comments so no content is lost.
# --------------------------------------------------------------------------------
# /requirements.txt:
# --------------------------------------------------------------------------------
# langchain==0.2.4
# langchain-openai==0.1.8
# llmwhisperer-client==0.2.0
# pydantic==2.7.4
# python-dotenv==1.0.1
# --------------------------------------------------------------------------------