├── .gitignore ├── requirements.txt ├── README.md ├── results └── Bank-Statement-Template-2-TemplateLab_extracted_20250827_070642.json ├── pdf_text_extractor_test.py └── bank_statement_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __pycache__/* 3 | sample_pdfs/* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyPDF2==3.0.1 2 | openai>=1.0.0 3 | python-dotenv==1.0.0 4 | pdfplumber==0.9.0 5 | pdf2image==1.17.0 6 | pytesseract==0.3.10 7 | PyMuPDF==1.23.0 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bank Statement Parser 2 | 3 | A GUI application that extracts structured data from bank statement PDFs using OpenAI's GPT API. 4 | 5 | ## Features 6 | 7 | - **GUI Interface**: Easy-to-use interface with file selection, output folder configuration, and processing controls 8 | - **PDF Text Extraction**: Extracts text from PDF bank statements using pdfplumber 9 | - **Structured Data Extraction**: Uses OpenAI GPT-4 to extract structured information from bank statements 10 | - **Real-time Logging**: Displays processing progress and logs in the GUI 11 | - **Progress Indicator**: Shows processing status with progress bar 12 | - **JSON Output**: Saves extracted data in structured JSON format 13 | 14 | ## Installation 15 | 16 | 1. Install required dependencies: 17 | ```bash 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | 2. Set up your OpenAI API key in the `.env` file: 22 | ``` 23 | OPENAI_API_KEY=your_actual_openai_api_key_here 24 | ``` 25 | 26 | ## Usage 27 | 28 | 1. Run the application: 29 | ```bash 30 | python bank_statement_parser.py 31 | ``` 32 | 33 | 2. Use the GUI to: 34 | - Select a PDF bank statement file 35 | - Choose an output folder (defaults to 'results') 36 | - Click "Process PDF" to start extraction 37 | - Monitor progress in the log display 38 | 39 | ## Extracted Data Structure 40 | 41 | The application extracts the following information when available in the PDF: 42 | 43 | - **Bank Information**: Name, address, phone, website 44 | - **Account Holder**: Name, address, phone, email 45 | - **Account Details**: Account number, type, routing number, sort code 46 | - **Statement Period**: Start date, end date, statement date 47 | - **Balance Information**: Opening balance, closing balance, available balance, currency 48 | - **Transactions**: Date, description, amount, balance after transaction, transaction type, reference number 49 | 50 | ## Important Notes 51 | 52 | - The application only extracts information that is explicitly present in the PDF text 53 | - No assumptions or inferred data is added 54 | - Missing information is marked as `null` in the JSON output 55 | - Requires a valid OpenAI API key to function 56 | - Processing time depends on PDF size and OpenAI API response time 57 | 58 | ## Output 59 | 60 | Results are saved as JSON files in the specified output folder with timestamps: 61 | ``` 62 | {original_filename}_extracted_{timestamp}.json 63 | ``` 64 | -------------------------------------------------------------------------------- /results/Bank-Statement-Template-2-TemplateLab_extracted_20250827_070642.json: -------------------------------------------------------------------------------- 1 | { 2 | "bank_information": { 3 | "bank_name": "", 4 | "bank_address": "231 Valley Farms Street Santa Monica, CA", 5 | "bank_phone": null, 6 | "bank_website": "bickslowbank@domain.com" 7 | }, 8 | "account_holder": { 9 | "name": "Bit Manufacturing Ltd", 10 | "address": "2450 Courage St, STE 108 Brownsville, TX 78521", 11 | "phone": null, 12 | "email": null 13 | }, 14 | "account_details": { 15 | "account_number": "111-234-567-890", 16 | "account_type": null, 17 | "routing_number": null, 18 | "sort_code": null 19 | }, 20 | "statement_period": { 21 | "start_date": "mm/dd/yyyy", 22 | "end_date": "mm/dd/yyyy", 23 | "statement_date": "mm/dd/yyyy" 24 | }, 25 | "balance_information": { 26 | "opening_balance": "8313.3", 27 | "closing_balance": "5799.640000000001", 28 | "available_balance": null, 29 | "currency": null 30 | }, 31 | "transactions": [ 32 | { 33 | "date": "mm/dd/yyyy", 34 | "description": "Amazon", 35 | "amount": "132.30", 36 | "balance_after_transaction": "8180.999999999999", 37 | "transaction_type": "Fast Payment", 38 | "reference_number": null 39 | }, 40 | { 41 | "date": "mm/dd/yyyy", 42 | "description": "eBAY Trading Co.", 43 | "amount": "515.22", 44 | "balance_after_transaction": "7665.779999999999", 45 | "transaction_type": "BACS", 46 | "reference_number": null 47 | }, 48 | { 49 | "date": "mm/dd/yyyy", 50 | "description": "Morrisons Petrol", 51 | "amount": "80.00", 52 | "balance_after_transaction": "7585.779999999999", 53 | "transaction_type": "Fast Payment", 54 | "reference_number": null 55 | }, 56 | { 57 | "date": "mm/dd/yyyy", 58 | "description": "Business Loan", 59 | "amount": "20000", 60 | "balance_after_transaction": "27585.78", 61 | "transaction_type": "BACS", 62 | "reference_number": null 63 | }, 64 | { 65 | "date": "mm/dd/yyyy", 66 | "description": "Jumes White Media", 67 | "amount": "2416.85", 68 | "balance_after_transaction": "25168.93", 69 | "transaction_type": "BACS", 70 | "reference_number": null 71 | }, 72 | { 73 | "date": "mm/dd/yyyy", 74 | "description": "ATM High Street", 75 | "amount": "100", 76 | "balance_after_transaction": "25068.93", 77 | "transaction_type": "Fast Payment", 78 | "reference_number": null 79 | }, 80 | { 81 | "date": "mm/dd/yyyy", 82 | "description": "Accorn Advertising Studios", 83 | "amount": "150", 84 | "balance_after_transaction": "24918.93", 85 | "transaction_type": "BACS", 86 | "reference_number": null 87 | }, 88 | { 89 | "date": "mm/dd/yyyy", 90 | "description": "Marriott Hotels", 91 | "amount": "177", 92 | "balance_after_transaction": "24741.93", 93 | "transaction_type": "Fast Payment", 94 | "reference_number": null 95 | }, 96 | { 97 | "date": "mm/dd/yyyy", 98 | "description": "Abelio Scotrail Ltd", 99 | "amount": "122.22", 100 | "balance_after_transaction": "24619.71", 101 | "transaction_type": "Fast Payment", 102 | "reference_number": null 103 | }, 104 | { 105 | "date": "mm/dd/yyyy", 106 | "description": "Cheque 000234", 107 | "amount": "1200", 108 | "balance_after_transaction": "23419.71", 109 | "transaction_type": "Fast Payment", 110 | "reference_number": null 111 | }, 112 | { 113 | "date": "mm/dd/yyyy", 114 | "description": "Interest Paid", 115 | "amount": "9.33", 116 | "balance_after_transaction": "23429.04", 117 | "transaction_type": "Int. Bank", 118 | "reference_number": null 119 | }, 120 | { 121 | "date": "mm/dd/yyyy", 122 | "description": "OVO Energy", 123 | "amount": "270", 124 | "balance_after_transaction": "23159.04", 125 | "transaction_type": "DD", 126 | "reference_number": null 127 | }, 128 | { 129 | "date": "mm/dd/yyyy", 130 | "description": "Toyota Online", 131 | "amount": "10525.40", 132 | "balance_after_transaction": "12633.640000000001", 133 | "transaction_type": "BACS", 134 | "reference_number": null 135 | }, 136 | { 137 | "date": "mm/dd/yyyy", 138 | "description": "HMRC", 139 | "amount": "1000", 140 | "balance_after_transaction": "11633.640000000001", 141 | "transaction_type": "BACS", 142 | "reference_number": null 143 | }, 144 | { 145 | "date": "mm/dd/yyyy", 146 | "description": "OVLA", 147 | "amount": "280", 148 | "balance_after_transaction": "11353.640000000001", 149 | "transaction_type": "DD", 150 | "reference_number": null 151 | }, 152 | { 153 | "date": "mm/dd/yyyy", 154 | "description": "Michael Kor Salary", 155 | "amount": "1554", 156 | "balance_after_transaction": "9799.640000000001", 157 | "transaction_type": "EBP", 158 | "reference_number": null 159 | }, 160 | { 161 | "date": "mm/dd/yyyy", 162 | "description": "BOS Mastercard", 163 | "amount": "4000", 164 | "balance_after_transaction": "5799.640000000001", 165 | "transaction_type": "DD", 166 | "reference_number": null 167 | } 168 | ] 169 | } -------------------------------------------------------------------------------- /pdf_text_extractor_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Dedicated PDF Text Extraction Test Tool 4 | Tests multiple PDF extraction methods to find the best approach 5 | """ 6 | 7 | import os 8 | import sys 9 | from datetime import datetime 10 | 11 | # Test different PDF libraries 12 | def test_pdfplumber(pdf_path): 13 | """Test pdfplumber extraction methods""" 14 | print("\\n" + "="*50) 15 | print("TESTING PDFPLUMBER") 16 | print("="*50) 17 | 18 | try: 19 | import pdfplumber 20 | 21 | with pdfplumber.open(pdf_path) as pdf: 22 | for page_num, page in enumerate(pdf.pages, 1): 23 | print(f"\\n--- Page {page_num} with pdfplumber ---") 24 | 25 | # Method 1: Standard extraction 26 | text1 = page.extract_text() 27 | print(f"Standard extraction: {len(text1 or '')} characters") 28 | if text1: print(f"Preview: {text1[:200]}...") 29 | 30 | # Method 2: With layout 31 | text2 = page.extract_text(layout=True) 32 | print(f"Layout extraction: {len(text2 or '')} characters") 33 | if text2 and len(text2) > len(text1 or ""): print(f"Preview: {text2[:200]}...") 34 | 35 | # Method 3: Words 36 | try: 37 | words = page.extract_words() 38 | if words: 39 | word_text = " ".join([w['text'] for w in words]) 40 | print(f"Word extraction: {len(word_text)} characters") 41 | if len(word_text) > len(text1 or ""): print(f"Preview: {word_text[:200]}...") 42 | except Exception as e: 43 | print(f"Word extraction failed: {e}") 44 | 45 | # Method 4: Characters 46 | chars = page.chars 47 | if chars: 48 | char_text = "".join([c['text'] for c in chars]) 49 | print(f"Character extraction: {len(char_text)} characters") 50 | if len(char_text) > len(text1 or ""): print(f"Preview: {char_text[:200]}...") 51 | 52 | # Method 5: Tables 53 | try: 54 | tables = page.extract_tables() 55 | if tables: 56 | table_text = "\\n".join([ 57 | "\\t".join([str(cell) if cell else "" for cell in row]) 58 | for table in tables for row in table if row 59 | ]) 60 | print(f"Table extraction: {len(table_text)} characters") 61 | if table_text: print(f"Preview: {table_text[:200]}...") 62 | except Exception as e: 63 | print(f"Table extraction failed: {e}") 64 | 65 | # Method 6: Annotations 66 | try: 67 | if hasattr(page, 'annots') and page.annots: 68 | print(f"Found {len(page.annots)} annotations") 69 | for i, annot in enumerate(page.annots): 70 | print(f" Annotation {i}: {annot}") 71 | except Exception as e: 72 | print(f"Annotation check failed: {e}") 73 | 74 | # Method 7: Objects 75 | try: 76 | if hasattr(page, 'objects'): 77 | print(f"Found {len(page.objects)} objects") 78 | for i, obj in enumerate(page.objects[:5]): # Show first 5 79 | print(f" Object {i}: {obj}") 80 | except Exception as e: 81 | print(f"Object check failed: {e}") 82 | 83 | except ImportError: 84 | print("pdfplumber not available") 85 | except Exception as e: 86 | print(f"pdfplumber failed: {e}") 87 | 88 | def test_pypdf2(pdf_path): 89 | """Test PyPDF2 extraction methods""" 90 | print("\\n" + "="*50) 91 | print("TESTING PYPDF2") 92 | print("="*50) 93 | 94 | try: 95 | import PyPDF2 96 | 97 | with open(pdf_path, 'rb') as file: 98 | pdf_reader = PyPDF2.PdfReader(file) 99 | 100 | print(f"Total pages: {len(pdf_reader.pages)}") 101 | 102 | # Check for form fields 103 | try: 104 | form_fields = pdf_reader.get_form_text_fields() 105 | if form_fields: 106 | print(f"Found {len(form_fields)} form fields:") 107 | for name, value in form_fields.items(): 108 | print(f" {name}: {value}") 109 | else: 110 | print("No form fields found") 111 | except Exception as e: 112 | print(f"Form field check failed: {e}") 113 | 114 | for page_num in range(len(pdf_reader.pages)): 115 | page = pdf_reader.pages[page_num] 116 | print(f"\\n--- Page {page_num + 1} with PyPDF2 ---") 117 | 118 | # Standard extraction 119 | text = page.extract_text() 120 | print(f"Standard extraction: {len(text)} characters") 121 | if text: print(f"Preview: {text[:200]}...") 122 | 123 | # Check annotations 124 | try: 125 | if '/Annots' in page: 126 | annotations = page['/Annots'] 127 | print(f"Found {len(annotations)} annotations") 128 | for i, annot_ref in enumerate(annotations): 129 | try: 130 | annot = annot_ref.get_object() 131 | print(f" Annotation {i}: {annot}") 132 | if '/V' in annot: 133 | print(f" Value: {annot['/V']}") 134 | if '/T' in annot: 135 | print(f" Title: {annot['/T']}") 136 | except Exception as e: 137 | print(f" Annotation {i} failed: {e}") 138 | except Exception as e: 139 | print(f"Annotation check failed: {e}") 140 | 141 | except ImportError: 142 | print("PyPDF2 not available") 143 | except Exception as e: 144 | print(f"PyPDF2 failed: {e}") 145 | 146 | def test_pymupdf(pdf_path): 147 | """Test PyMuPDF (fitz) extraction methods""" 148 | print("\\n" + "="*50) 149 | print("TESTING PYMUPDF (FITZ)") 150 | print("="*50) 151 | 152 | try: 153 | import fitz 154 | 155 | doc = fitz.open(pdf_path) 156 | print(f"Total pages: {doc.page_count}") 157 | 158 | for page_num in range(doc.page_count): 159 | page = doc.load_page(page_num) 160 | print(f"\\n--- Page {page_num + 1} with PyMuPDF ---") 161 | 162 | # Method 1: Standard text 163 | text1 = page.get_text() 164 | print(f"Standard get_text(): {len(text1)} characters") 165 | if text1: print(f"Preview: {text1[:200]}...") 166 | 167 | # Method 2: Text with layout 168 | text2 = page.get_text("text") 169 | print(f"get_text('text'): {len(text2)} characters") 170 | if text2 and len(text2) > len(text1): print(f"Preview: {text2[:200]}...") 171 | 172 | # Method 3: Text blocks 173 | try: 174 | blocks = page.get_text("blocks") 175 | if blocks: 176 | block_text = "\\n".join([block[4] for block in blocks if len(block) > 4 and block[4].strip()]) 177 | print(f"Text blocks: {len(block_text)} characters") 178 | if len(block_text) > len(text1): print(f"Preview: {block_text[:200]}...") 179 | except Exception as e: 180 | print(f"Text blocks failed: {e}") 181 | 182 | # Method 4: Text dictionary 183 | try: 184 | text_dict = page.get_text("dict") 185 | text_parts = [] 186 | for block in text_dict.get("blocks", []): 187 | if "lines" in block: 188 | for line in block["lines"]: 189 | for span in line.get("spans", []): 190 | if "text" in span and span["text"].strip(): 191 | text_parts.append(span["text"]) 192 | if text_parts: 193 | dict_text = "\\n".join(text_parts) 194 | print(f"Text dictionary: {len(dict_text)} characters") 195 | if len(dict_text) > len(text1): print(f"Preview: {dict_text[:200]}...") 196 | except Exception as e: 197 | print(f"Text dictionary failed: {e}") 198 | 199 | # Method 5: Widgets (form fields) 200 | try: 201 | widgets = page.widgets() 202 | if widgets: 203 | print(f"Found {len(widgets)} widgets:") 204 | for i, widget in enumerate(widgets): 205 | print(f" Widget {i}: {widget}") 206 | if hasattr(widget, 'field_name'): print(f" Name: {widget.field_name}") 207 | if hasattr(widget, 'field_value'): print(f" Value: {widget.field_value}") 208 | if hasattr(widget, 'field_display'): print(f" Display: {widget.field_display}") 209 | else: 210 | print("No widgets found") 211 | except Exception as e: 212 | print(f"Widget check failed: {e}") 213 | 214 | # Method 6: Annotations 215 | try: 216 | annots = page.annots() 217 | if annots: 218 | print(f"Found {len(annots)} annotations:") 219 | for i, annot in enumerate(annots): 220 | print(f" Annotation {i}: {annot}") 221 | try: 222 | content = annot.info.get("content", "") 223 | if content: print(f" Content: {content}") 224 | except: 225 | pass 226 | else: 227 | print("No annotations found") 228 | except Exception as e: 229 | print(f"Annotation check failed: {e}") 230 | 231 | doc.close() 232 | 233 | except ImportError: 234 | print("PyMuPDF not available - install with: pip install PyMuPDF") 235 | except Exception as e: 236 | print(f"PyMuPDF failed: {e}") 237 | 238 | def test_pdf2image_ocr(pdf_path): 239 | """Test OCR extraction""" 240 | print("\\n" + "="*50) 241 | print("TESTING OCR (PDF2IMAGE + PYTESSERACT)") 242 | print("="*50) 243 | 244 | try: 245 | from pdf2image import convert_from_path 246 | import pytesseract 247 | 248 | print("Converting PDF to images...") 249 | images = convert_from_path(pdf_path, dpi=300) 250 | 251 | for i, image in enumerate(images, 1): 252 | print(f"\\n--- Page {i} with OCR ---") 253 | 254 | # Extract text using OCR 255 | ocr_text = pytesseract.image_to_string(image, lang='eng') 256 | print(f"OCR extraction: {len(ocr_text)} characters") 257 | if ocr_text: print(f"Preview: {ocr_text[:200]}...") 258 | 259 | except ImportError: 260 | print("OCR libraries not available - install with: pip install pdf2image pytesseract") 261 | except Exception as e: 262 | print(f"OCR failed: {e}") 263 | 264 | def main(): 265 | """Main test function""" 266 | if len(sys.argv) != 2: 267 | print("Usage: python pdf_text_extractor_test.py ") 268 | sys.exit(1) 269 | 270 | pdf_path = sys.argv[1] 271 | 272 | if not os.path.exists(pdf_path): 273 | print(f"Error: PDF file '{pdf_path}' not found") 274 | sys.exit(1) 275 | 276 | print(f"Testing PDF text extraction on: {pdf_path}") 277 | print(f"File size: {os.path.getsize(pdf_path)} bytes") 278 | print(f"Test started at: {datetime.now()}") 279 | 280 | # Test all methods 281 | test_pdfplumber(pdf_path) 282 | test_pypdf2(pdf_path) 283 | test_pymupdf(pdf_path) 284 | test_pdf2image_ocr(pdf_path) 285 | 286 | print("\\n" + "="*50) 287 | print("TESTING COMPLETE") 288 | print("="*50) 289 | print("Review the output above to see which method extracts the most text.") 290 | print("Look for the method that shows the highest character count and actual content.") 291 | 292 | if __name__ == "__main__": 293 | main() 294 | -------------------------------------------------------------------------------- /bank_statement_parser.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk, filedialog, messagebox, scrolledtext 3 | import threading 4 | import json 5 | import logging 6 | import os 7 | from datetime import datetime 8 | from dotenv import load_dotenv 9 | import pdfplumber 10 | import PyPDF2 11 | from openai import OpenAI 12 | try: 13 | from pdf2image import convert_from_path 14 | import pytesseract 15 | from PIL import Image 16 | OCR_AVAILABLE = True 17 | except ImportError: 18 | OCR_AVAILABLE = False 19 | 20 | try: 21 | import fitz # PyMuPDF 22 | PYMUPDF_AVAILABLE = True 23 | except ImportError: 24 | PYMUPDF_AVAILABLE = False 25 | 26 | # Load environment variables 27 | load_dotenv() 28 | 29 | class BankStatementParser: 30 | def __init__(self, root): 31 | self.root = root 32 | self.root.title("Bank Statement Parser") 33 | self.root.geometry("800x700") 34 | self.root.resizable(True, True) 35 | 36 | # Initialize variables 37 | self.selected_file = tk.StringVar() 38 | self.output_folder = tk.StringVar(value="results") 39 | self.is_processing = False 40 | 41 | # Setup OpenAI client 42 | self.setup_openai() 43 | 44 | # Setup logging 45 | self.setup_logging() 46 | 47 | # Create GUI 48 | self.create_gui() 49 | 50 | def setup_openai(self): 51 | """Initialize OpenAI client""" 52 | api_key = os.getenv('OPENAI_API_KEY') 53 | if not api_key: 54 | messagebox.showerror("Error", "Please set your OpenAI API key in the .env file") 55 | self.client = None 56 | return 57 | 58 | try: 59 | # Modern OpenAI v2.x API 60 | self.client = OpenAI(api_key=api_key) 61 | except Exception as e: 62 | messagebox.showerror("Error", f"Failed to initialize OpenAI client: {str(e)}") 63 | self.client = None 64 | 65 | def setup_logging(self): 66 | """Setup logging configuration""" 67 | log_format = '%(asctime)s - %(levelname)s - %(message)s' 68 | logging.basicConfig(level=logging.INFO, format=log_format) 69 | self.logger = logging.getLogger(__name__) 70 | 71 | def create_gui(self): 72 | """Create the main GUI interface""" 73 | # Configure window 74 | self.root.configure(bg='#f0f0f0') 75 | 76 | # Header frame with title and icon 77 | header_frame = tk.Frame(self.root, bg='#2c3e50', height=80) 78 | header_frame.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=0, pady=0) 79 | header_frame.grid_propagate(False) 80 | header_frame.columnconfigure(0, weight=1) 81 | 82 | title_label = tk.Label(header_frame, text="🏦 Bank Statement Parser", 83 | font=('Arial', 18, 'bold'), 84 | fg='white', bg='#2c3e50') 85 | title_label.grid(row=0, column=0, pady=20) 86 | 87 | subtitle_label = tk.Label(header_frame, text="Extract structured data from PDF bank statements using AI", 88 | font=('Arial', 10), 89 | fg='#bdc3c7', bg='#2c3e50') 90 | subtitle_label.grid(row=1, column=0, pady=(0, 10)) 91 | 92 | # Main content frame 93 | main_frame = ttk.Frame(self.root, padding="20") 94 | main_frame.grid(row=1, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=10, pady=10) 95 | 96 | # Configure grid weights 97 | self.root.columnconfigure(0, weight=1) 98 | self.root.rowconfigure(1, weight=1) 99 | main_frame.columnconfigure(1, weight=1) 100 | main_frame.rowconfigure(4, weight=1) 101 | 102 | # File selection section with improved styling 103 | file_section = ttk.LabelFrame(main_frame, text="📁 PDF Source File", padding="15") 104 | file_section.grid(row=0, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 15)) 105 | file_section.columnconfigure(0, weight=1) 106 | 107 | file_frame = ttk.Frame(file_section) 108 | file_frame.grid(row=0, column=0, sticky=(tk.W, tk.E)) 109 | file_frame.columnconfigure(0, weight=1) 110 | 111 | self.file_entry = ttk.Entry(file_frame, textvariable=self.selected_file, 112 | state='readonly', font=('Arial', 10)) 113 | self.file_entry.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=(0, 10)) 114 | 115 | browse_btn = ttk.Button(file_frame, text="📂 Browse", command=self.browse_file) 116 | browse_btn.grid(row=0, column=1) 117 | 118 | # Output folder section with improved styling 119 | output_section = ttk.LabelFrame(main_frame, text="📤 Output Folder", padding="15") 120 | output_section.grid(row=1, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 15)) 121 | output_section.columnconfigure(0, weight=1) 122 | 123 | output_frame = ttk.Frame(output_section) 124 | output_frame.grid(row=0, column=0, sticky=(tk.W, tk.E)) 125 | output_frame.columnconfigure(0, weight=1) 126 | 127 | self.output_entry = ttk.Entry(output_frame, textvariable=self.output_folder, 128 | font=('Arial', 10)) 129 | self.output_entry.grid(row=0, column=0, sticky=(tk.W, tk.E), padx=(0, 10)) 130 | 131 | output_browse_btn = ttk.Button(output_frame, text="📂 Browse", command=self.browse_output_folder) 132 | output_browse_btn.grid(row=0, column=1) 133 | 134 | # Process section with enhanced styling 135 | process_section = ttk.LabelFrame(main_frame, text="⚡ Processing", padding="15") 136 | process_section.grid(row=2, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=(0, 15)) 137 | process_section.columnconfigure(0, weight=1) 138 | 139 | button_frame = ttk.Frame(process_section) 140 | button_frame.grid(row=0, column=0) 141 | 142 | self.process_button = ttk.Button(button_frame, text="🚀 Process PDF", 143 | command=self.start_processing, 144 | width=15) 145 | self.process_button.grid(row=0, column=0, padx=(0, 15)) 146 | 147 | self.progress = ttk.Progressbar(button_frame, mode='indeterminate', length=200) 148 | self.progress.grid(row=0, column=1, padx=(0, 15)) 149 | 150 | self.status_label = ttk.Label(button_frame, text="✅ Ready", 151 | font=('Arial', 10, 'bold'), 152 | foreground='#27ae60') 153 | self.status_label.grid(row=0, column=2) 154 | 155 | # Log display section with improved styling 156 | log_section = ttk.LabelFrame(main_frame, text="📋 Processing Log", padding="15") 157 | log_section.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E, tk.N, tk.S), pady=(0, 0)) 158 | log_section.columnconfigure(0, weight=1) 159 | log_section.rowconfigure(0, weight=1) 160 | 161 | self.log_text = scrolledtext.ScrolledText(log_section, height=18, width=80, 162 | font=('Consolas', 9), 163 | bg='#2c3e50', 164 | fg='#ecf0f1', 165 | insertbackground='white', 166 | selectbackground='#34495e') 167 | self.log_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) 168 | 169 | def browse_file(self): 170 | """Open file dialog to select PDF file""" 171 | file_path = filedialog.askopenfilename( 172 | title="Select PDF File", 173 | filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")] 174 | ) 175 | if file_path: 176 | self.selected_file.set(file_path) 177 | self.log_message(f"Selected file: {file_path}") 178 | 179 | def browse_output_folder(self): 180 | """Open folder dialog to select output directory""" 181 | folder_path = filedialog.askdirectory(title="Select Output Folder") 182 | if folder_path: 183 | self.output_folder.set(folder_path) 184 | self.log_message(f"Output folder set to: {folder_path}") 185 | 186 | def log_message(self, message): 187 | """Add message to log display with color coding""" 188 | timestamp = datetime.now().strftime("%H:%M:%S") 189 | 190 | # Add color coding based on message type 191 | if "successfully" in message.lower() or "completed" in message.lower(): 192 | color_tag = "success" 193 | elif "error" in message.lower() or "failed" in message.lower(): 194 | color_tag = "error" 195 | elif "warning" in message.lower(): 196 | color_tag = "warning" 197 | elif "processing" in message.lower() or "extracting" in message.lower(): 198 | color_tag = "info" 199 | else: 200 | color_tag = "normal" 201 | 202 | log_entry = f"[{timestamp}] {message}\n" 203 | 204 | # Configure color tags if not already done 205 | if not hasattr(self, '_tags_configured'): 206 | self.log_text.tag_configure("success", foreground="#27ae60") 207 | self.log_text.tag_configure("error", foreground="#e74c3c") 208 | self.log_text.tag_configure("warning", foreground="#f39c12") 209 | self.log_text.tag_configure("info", foreground="#3498db") 210 | self.log_text.tag_configure("normal", foreground="#ecf0f1") 211 | self._tags_configured = True 212 | 213 | self.log_text.insert(tk.END, log_entry, color_tag) 214 | self.log_text.see(tk.END) 215 | self.root.update_idletasks() 216 | 217 | # Also log to console 218 | self.logger.info(message) 219 | 220 | def start_processing(self): 221 | """Start processing in a separate thread""" 222 | if self.is_processing: 223 | return 224 | 225 | if not self.selected_file.get(): 226 | messagebox.showerror("Error", "Please select a PDF file") 227 | return 228 | 229 | if not os.path.exists(self.selected_file.get()): 230 | messagebox.showerror("Error", "Selected file does not exist") 231 | return 232 | 233 | if not hasattr(self, 'client') or self.client is None: 234 | messagebox.showerror("Error", "OpenAI client not initialized. Please check your API key.") 235 | return 236 | 237 | # Start processing in background thread 238 | self.is_processing = True 239 | self.process_button.config(state='disabled') 240 | self.progress.start() 241 | self.status_label.config(text="🔄 Processing...", foreground='#f39c12') 242 | 243 | thread = threading.Thread(target=self.process_pdf) 244 | thread.daemon = True 245 | thread.start() 246 | 247 | def extract_text_from_pdf(self, pdf_path): 248 | """Extract text from PDF using PyPDF2 form field extraction (most effective method)""" 249 | try: 250 | self.log_message("Extracting text from PDF...") 251 | text_content = "" 252 | 253 | # Method 1: PyPDF2 Form Field Extraction (PRIORITY METHOD) 254 | self.log_message("Using PyPDF2 form field extraction (primary method)...") 255 | try: 256 | with open(pdf_path, 'rb') as file: 257 | pdf_reader = PyPDF2.PdfReader(file) 258 | self.log_message(f"PDF has {len(pdf_reader.pages)} pages") 259 | 260 | # Extract global form fields 261 | form_data = [] 262 | try: 263 | form_fields = pdf_reader.get_form_text_fields() 264 | if form_fields: 265 | self.log_message(f"Found {len(form_fields)} global form fields") 266 | for field_name, field_value in form_fields.items(): 267 | if field_value: 268 | form_data.append(f"{field_name}: {field_value}") 269 | except Exception as e: 270 | self.log_message(f"Global form field extraction failed: {str(e)}") 271 | 272 | # Extract page-specific annotations and form fields 273 | for page_num in range(len(pdf_reader.pages)): 274 | self.log_message(f"Processing page {page_num + 1}...") 275 | page_obj = pdf_reader.pages[page_num] 276 | 277 | # Get regular text 278 | regular_text = page_obj.extract_text() 279 | 280 | # Extract annotations (form fields) 281 | page_form_data = [] 282 | if '/Annots' in page_obj: 283 | annotations = page_obj['/Annots'] 284 | self.log_message(f"Found {len(annotations)} annotations on page {page_num + 1}") 285 | 286 | for annot_ref in annotations: 287 | try: 288 | annot = annot_ref.get_object() 289 | 290 | # Extract field name and value 291 | field_name = None 292 | field_value = None 293 | 294 | if '/T' in annot: # Field name 295 | field_name = str(annot['/T']) 296 | 297 | if '/V' in annot: # Field value 298 | field_value = str(annot['/V']) 299 | 300 | # Add to form data if we have a value 301 | if field_value: 302 | if field_name: 303 | page_form_data.append(f"{field_name}: {field_value}") 304 | else: 305 | page_form_data.append(field_value) 306 | 307 | except Exception as e: 308 | continue 309 | 310 | # Combine page text 311 | page_text = f"\\n--- Page {page_num + 1} ---\\n" 312 | if regular_text.strip(): 313 | page_text += regular_text + "\\n" 314 | 315 | if page_form_data: 316 | page_text += "\\n--- Form Fields ---\\n" 317 | page_text += "\\n".join(page_form_data) 318 | self.log_message(f"Extracted {len(page_form_data)} form fields from page {page_num + 1}") 319 | 320 | text_content += page_text 321 | 322 | # Add global form data if any 323 | if form_data: 324 | text_content += "\\n\\n--- Global Form Fields ---\\n" 325 | text_content += "\\n".join(form_data) 326 | 327 | self.log_message(f"PyPDF2 extracted {len(text_content)} total characters") 328 | 329 | except Exception as e: 330 | self.log_message(f"PyPDF2 form field extraction failed: {str(e)}") 331 | text_content = "" 332 | 333 | # Fallback Method 2: pdfplumber if PyPDF2 fails 334 | if not text_content or len(text_content.strip()) < 100: 335 | self.log_message("Falling back to pdfplumber extraction...") 336 | try: 337 | with pdfplumber.open(pdf_path) as pdf: 338 | for page_num, page in enumerate(pdf.pages, 1): 339 | self.log_message(f"pdfplumber processing page {page_num}...") 340 | 341 | # Standard text extraction 342 | page_text = page.extract_text() 343 | 344 | if page_text: 345 | text_content += f"\\n--- Page {page_num} ---\\n" 346 | text_content += page_text 347 | 348 | self.log_message(f"pdfplumber extracted {len(text_content)} characters") 349 | except Exception as e: 350 | self.log_message(f"pdfplumber extraction failed: {str(e)}") 351 | 352 | # Final check 353 | if not text_content or len(text_content.strip()) < 50: 354 | self.log_message("WARNING: Very little text extracted from PDF") 355 | text_content = "No significant text content found in PDF" 356 | 357 | self.log_message(f"Successfully extracted text from PDF") 358 | self.log_message(f"Total extracted content length: {len(text_content)} characters") 359 | 360 | return text_content 361 | 362 | except Exception as e: 363 | error_msg = f"Error extracting text from PDF: {str(e)}" 364 | self.log_message(error_msg) 365 | raise Exception(error_msg) 366 | 367 | def extract_structured_data(self, text_content): 368 | """Use OpenAI to extract structured data from text""" 369 | try: 370 | self.log_message("Sending text to OpenAI for structured extraction...") 371 | 372 | prompt = f"""Extract structured information from this bank statement text and return ONLY valid JSON. 373 | 374 | IMPORTANT: Return ONLY the JSON object, no explanatory text, no markdown formatting, no code blocks. 375 | 376 | Extract only information explicitly present in the text. Use null for missing fields. 377 | 378 | Required JSON structure: 379 | {{ 380 | "bank_information": {{ 381 | "bank_name": null, 382 | "bank_address": null, 383 | "bank_phone": null, 384 | "bank_website": null 385 | }}, 386 | "account_holder": {{ 387 | "name": null, 388 | "address": null, 389 | "phone": null, 390 | "email": null 391 | }}, 392 | "account_details": {{ 393 | "account_number": null, 394 | "account_type": null, 395 | "routing_number": null, 396 | "sort_code": null 397 | }}, 398 | "statement_period": {{ 399 | "start_date": null, 400 | "end_date": null, 401 | "statement_date": null 402 | }}, 403 | "balance_information": {{ 404 | "opening_balance": null, 405 | "closing_balance": null, 406 | "available_balance": null, 407 | "currency": null 408 | }}, 409 | "transactions": [] 410 | }} 411 | 412 | For transactions array, each transaction should have: 413 | - date, description, amount, balance_after_transaction, transaction_type, reference_number 414 | 415 | Bank statement text: 416 | {text_content}""" 417 | 418 | # Modern OpenAI v2.x API 419 | response = self.client.chat.completions.create( 420 | model="gpt-4", 421 | messages=[ 422 | {"role": "system", "content": "You are a precise data extraction assistant. Extract only information that is explicitly present in the provided text. Do not infer or add any information."}, 423 | {"role": "user", "content": prompt} 424 | ], 425 | temperature=0.1, 426 | max_tokens=4000 427 | ) 428 | 429 | extracted_data = response.choices[0].message.content 430 | self.log_message("Successfully received structured data from OpenAI") 431 | 432 | # Log the raw response for debugging 433 | self.log_message(f"Raw OpenAI response: {extracted_data[:200]}...") 434 | 435 | # Try to parse as JSON to validate 436 | try: 437 | json_data = json.loads(extracted_data) 438 | return json_data 439 | except json.JSONDecodeError: 440 | # If not valid JSON, try to extract JSON from the response 441 | import re 442 | # Look for JSON block in markdown code blocks 443 | json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', extracted_data, re.DOTALL) 444 | if json_match: 445 | try: 446 | return json.loads(json_match.group(1)) 447 | except json.JSONDecodeError: 448 | pass 449 | 450 | # Look for JSON object without code blocks 451 | json_match = re.search(r'(\{.*\})', extracted_data, re.DOTALL) 452 | if json_match: 453 | try: 454 | return json.loads(json_match.group(1)) 455 | except json.JSONDecodeError: 456 | pass 457 | 458 | # If still can't parse, create a structured response manually 459 | self.log_message("Could not parse JSON, creating manual structure...") 460 | return { 461 | "bank_information": { 462 | "bank_name": None, 463 | "bank_address": None, 464 | "bank_phone": None, 465 | "bank_website": None 466 | }, 467 | "account_holder": { 468 | "name": None, 469 | "address": None, 470 | "phone": None, 471 | "email": None 472 | }, 473 | "account_details": { 474 | "account_number": None, 475 | "account_type": None, 476 | "routing_number": None, 477 | "sort_code": None 478 | }, 479 | "statement_period": { 480 | "start_date": None, 481 | "end_date": None, 482 | "statement_date": None 483 | }, 484 | "balance_information": { 485 | "opening_balance": None, 486 | "closing_balance": None, 487 | "available_balance": None, 488 | "currency": None 489 | }, 490 | "transactions": [], 491 | "raw_response": extracted_data, 492 | "note": "Could not parse structured JSON, raw response included" 493 | } 494 | 495 | except Exception as e: 496 | error_msg = f"Error extracting structured data: {str(e)}" 497 | self.log_message(error_msg) 498 | raise Exception(error_msg) 499 | 500 | def save_results(self, structured_data, original_filename): 501 | """Save extracted data to JSON file""" 502 | try: 503 | # Create output directory if it doesn't exist 504 | output_dir = self.output_folder.get() 505 | os.makedirs(output_dir, exist_ok=True) 506 | 507 | # Generate output filename 508 | base_name = os.path.splitext(os.path.basename(original_filename))[0] 509 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 510 | output_filename = f"{base_name}_extracted_{timestamp}.json" 511 | output_path = os.path.join(output_dir, output_filename) 512 | 513 | # Save structured data 514 | with open(output_path, 'w', encoding='utf-8') as f: 515 | json.dump(structured_data, f, indent=2, ensure_ascii=False) 516 | 517 | self.log_message(f"Results saved to: {output_path}") 518 | 519 | # Also save raw response if it exists for debugging 520 | if 'raw_response' in structured_data: 521 | raw_filename = f"{base_name}_raw_response_{timestamp}.txt" 522 | raw_path = os.path.join(output_dir, raw_filename) 523 | with open(raw_path, 'w', encoding='utf-8') as f: 524 | f.write(structured_data['raw_response']) 525 | self.log_message(f"Raw response saved to: {raw_path}") 526 | 527 | return output_path 528 | 529 | except Exception as e: 530 | error_msg = f"Error saving results: {str(e)}" 531 | self.log_message(error_msg) 532 | raise Exception(error_msg) 533 | 534 | def process_pdf(self): 535 | """Main processing function""" 536 | try: 537 | pdf_path = self.selected_file.get() 538 | self.log_message(f"Starting processing of: {os.path.basename(pdf_path)}") 539 | 540 | # Extract text from PDF 541 | text_content = self.extract_text_from_pdf(pdf_path) 542 | 543 | if not text_content.strip(): 544 | raise Exception("No text content extracted from PDF") 545 | 546 | # Extract structured data using OpenAI 547 | structured_data = self.extract_structured_data(text_content) 548 | 549 | # Save results 550 | output_path = self.save_results(structured_data, pdf_path) 551 | 552 | self.log_message("Processing completed successfully!") 553 | self.log_message(f"Extracted data saved to: {output_path}") 554 | 555 | # Show success message 556 | self.status_label.config(text="✅ Completed", foreground='#27ae60') 557 | messagebox.showinfo("🎉 Success", f"Processing completed successfully!\\n\\n📁 Results saved to:\\n{output_path}") 558 | 559 | except Exception as e: 560 | self.log_message(f"Processing failed: {str(e)}") 561 | self.status_label.config(text="❌ Failed", foreground='#e74c3c') 562 | messagebox.showerror("❌ Error", f"Processing failed:\\n\\n{str(e)}") 563 | 564 | finally: 565 | # Reset UI state 566 | self.is_processing = False 567 | self.process_button.config(state='normal') 568 | self.progress.stop() 569 | if not hasattr(self, '_processing_completed'): 570 | self.status_label.config(text="✅ Ready", foreground='#27ae60') 571 | 572 | def main(): 573 | root = tk.Tk() 574 | app = BankStatementParser(root) 575 | root.mainloop() 576 | 577 | if __name__ == "__main__": 578 | main() 579 | --------------------------------------------------------------------------------