├── requirements.txt ├── sample_transactions.csv ├── .gitignore ├── SECURITY_CHECKLIST.md ├── README.md ├── clean_analysis.py ├── saas_analysis.py └── extract_pdf_data.py /requirements.txt: -------------------------------------------------------------------------------- 1 | PyPDF2==3.0.1 2 | pandas==2.1.4 3 | matplotlib==3.8.2 4 | seaborn==0.13.0 5 | numpy==1.25.2 -------------------------------------------------------------------------------- /sample_transactions.csv: -------------------------------------------------------------------------------- 1 | date,description,amount,category,amount_abs 2 | 04/22,CURSOR AI POWERED IDE,651.0,Technology/Software,651.0 3 | 04/29,ADOBE SOFTWARE,1900.0,Technology/Software,1900.0 4 | 04/29,OPENAI *CHATGPT SUBSCR,652.0,Technology/Software,652.0 5 | 04/30,FIGMA PRO SUBSCRIPTION,4357.0,Technology/Software,4357.0 6 | 05/05,HEYGEN VIDEO AI,936.0,Technology/Software,936.0 7 | 05/06,REPORTDASH ANALYTICS,9744.0,Technology/Software,9744.0 8 | 05/07,ANTHROPIC CLAUDE API,690.0,Technology/Software,690.0 9 | 05/13,CURSOR USAGE APR,378.0,Technology/Software,378.0 10 | 05/13,LEONARDO AI CREDITS,305.0,Technology/Software,305.0 11 | 05/19,GOOGLE CLOUD SERVICES,149.0,Technology/Software,149.0 12 | 04/22,ABC Restaurant,780.0,Food & Dining,780.0 13 | 04/25,Coffee Shop Downtown,165.0,Food & Dining,165.0 14 | 04/28,Italian Bistro,1914.0,Food & Dining,1914.0 15 | 05/02,Supermarket Chain,473.0,Food & Dining,473.0 16 | 05/07,Burger Restaurant,597.0,Food & Dining,597.0 17 | 05/11,Asian Cuisine,2288.0,Food & Dining,2288.0 18 | 04/22,Taxi Service,120.0,Transportation,120.0 19 | 05/06,Gas Station,840.0,Transportation,840.0 20 | 05/13,Public Transport,108.0,Transportation,108.0 21 | 05/19,Ride Share Service,120.0,Transportation,120.0 22 | 04/24,Electronics Store,1160.0,Shopping,1160.0 23 | 05/06,Hardware Store,120.0,Shopping,120.0 24 | 05/14,Home Goods Store,155.0,Shopping,155.0 25 | 05/16,Department Store,1843.0,Shopping,1843.0 26 | 04/20,Entertainment Center,9285.0,Entertainment,9285.0 27 | 05/03,Movie Theater,315.0,Entertainment,315.0 28 | 05/06,Gaming Platform,10470.0,Entertainment,10470.0 29 | 04/18,Bank Fee,94.0,Bills & Utilities,94.0 30 | 04/20,International Fee,5.0,Bills & Utilities,5.0 31 | 05/15,Service Charge,85.0,Bills & Utilities,85.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ================================================ 2 | # The Pocket Company by Accucrazy 3 | # Security .gitignore for Credit Card Analyzer 4 | # ================================================ 5 | 6 | # SENSITIVE FINANCIAL DATA - DO NOT COMMIT 7 | # ========================================== 8 | 9 | # Bank statements and financial documents 10 | *.pdf 11 | 第一銀行電子對帳單*.pdf 12 | 13 | # Real transaction data (explicitly listed to avoid blocking samples) 14 | transactions.csv 15 | clean_transactions.csv 16 | saas_transactions.csv 17 | 18 | # Analysis reports with real data 19 | *_report.txt 20 | *_analysis_report.txt 21 | clean_credit_card_report.txt 22 | saas_analysis_report.txt 23 | 24 | # Charts and visualizations with real data 25 | *.png 26 | !sample_*.png # Allow sample charts 27 | !demo_*.png # Allow demo charts 28 | 29 | # Backup files with sensitive data 30 | *.backup 31 | *.bak 32 | *_backup.* 33 | 34 | # Personal configuration files 35 | config.json 36 | settings.json 37 | .env 38 | .env.local 39 | .env.production 40 | 41 | # PYTHON DEVELOPMENT 42 | # 
=================== 43 | 44 | # Byte-compiled / optimized / DLL files 45 | __pycache__/ 46 | *.py[cod] 47 | *$py.class 48 | 49 | # C extensions 50 | *.so 51 | 52 | # Distribution / packaging 53 | .Python 54 | build/ 55 | develop-eggs/ 56 | dist/ 57 | downloads/ 58 | eggs/ 59 | .eggs/ 60 | lib/ 61 | lib64/ 62 | parts/ 63 | sdist/ 64 | var/ 65 | wheels/ 66 | share/python-wheels/ 67 | *.egg-info/ 68 | .installed.cfg 69 | *.egg 70 | MANIFEST 71 | 72 | # PyInstaller 73 | *.manifest 74 | *.spec 75 | 76 | # Installer logs 77 | pip-log.txt 78 | pip-delete-this-directory.txt 79 | 80 | # Unit test / coverage reports 81 | htmlcov/ 82 | .tox/ 83 | .nox/ 84 | .coverage 85 | .coverage.* 86 | .cache 87 | nosetests.xml 88 | coverage.xml 89 | *.cover 90 | *.py,cover 91 | .hypothesis/ 92 | .pytest_cache/ 93 | cover/ 94 | 95 | # Virtual environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # IDE and Editor files 105 | .vscode/ 106 | .idea/ 107 | *.swp 108 | *.swo 109 | *~ 110 | 111 | # OS generated files 112 | .DS_Store 113 | .DS_Store? 114 | ._* 115 | .Spotlight-V100 116 | .Trashes 117 | ehthumbs.db 118 | Thumbs.db 119 | 120 | # TEMPORARY FILES 121 | # ================ 122 | temp/ 123 | tmp/ 124 | *.tmp 125 | *.log 126 | 127 | # SAMPLE DATA (ALLOWED) 128 | # ===================== 129 | # These files are OK to commit as they contain demo data 130 | # Note: These are already allowed by the ! rules above 131 | demo_*.csv 132 | example_*.csv -------------------------------------------------------------------------------- /SECURITY_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # 🔒 Security Checklist for GitHub Upload 2 | **The Pocket Company by Accucrazy** 3 | 4 | --- 5 | 6 | ## ⚠️ **CRITICAL: Before Uploading to GitHub** 7 | 8 | This checklist ensures you don't accidentally upload sensitive financial data to a public repository. 9 | 10 | ### 🚫 **Files You MUST NOT Upload** 11 | 12 | #### **Real Financial Data** 13 | - [ ] ❌ **Bank statement PDFs** (any `.pdf` files with real bank data) 14 | - [ ] ❌ **Real transaction CSV files** (`transactions.csv`, `clean_transactions.csv`, etc.) 15 | - [ ] ❌ **Analysis reports with real data** (any `*_report.txt` files) 16 | - [ ] ❌ **Charts with real data** (`.png` files showing actual spending) 17 | 18 | #### **Sensitive Configuration** 19 | - [ ] ❌ **Hard-coded passwords** in Python files 20 | - [ ] ❌ **Real bank account information** 21 | - [ ] ❌ **Personal identification numbers** 22 | - [ ] ❌ **Credit card details** 23 | 24 | ### ✅ **Files That Are SAFE to Upload** 25 | 26 | #### **Code and Documentation** 27 | - [ ] ✅ **Python scripts** (with passwords removed) 28 | - [ ] ✅ **README.md** (updated with company branding) 29 | - [ ] ✅ **requirements.txt** 30 | - [ ] ✅ **.gitignore** (properly configured) 31 | 32 | #### **Sample/Demo Data** 33 | - [ ] ✅ **sample_transactions.csv** (anonymized demo data) 34 | - [ ] ✅ **demo_*.png** (sample charts with fake data) 35 | - [ ] ✅ **example_*.csv** (template files) 36 | 37 | --- 38 | 39 | ## 🔧 **Pre-Upload Security Steps** 40 | 41 | ### **1. Remove Hard-coded Secrets** 42 | ```bash 43 | # Check for hard-coded passwords 44 | grep -r "password.*=" *.py 45 | grep -r "09444722" *.py # Your specific password 46 | grep -r "第一銀行" *.py # Bank name 47 | 48 | # Should return no results! 49 | ``` 50 | 51 | ### **2. Verify .gitignore is Working** 52 | ```bash 53 | # Check what Git will track 54 | git status 55 | git add --dry-run . 
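# Optional extra check: `git check-ignore -v <file>` prints the .gitignore rule
# that excludes a file (and prints nothing if the file would be committed)
git check-ignore -v transactions.csv          # real data - expect a matching rule
git check-ignore -v sample_transactions.csv   # demo data - expect no match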
56 | 57 | # Ensure these files are NOT listed: 58 | # - *.pdf 59 | # - *transactions.csv (real data) 60 | # - *_report.txt (real reports) 61 | # - *.png (real charts) 62 | ``` 63 | 64 | ### **3. Environment Variable Setup** 65 | Ensure scripts use environment variables: 66 | ```python 67 | # ✅ GOOD (secure) 68 | password = os.getenv('PDF_PASSWORD') 69 | 70 | # ❌ BAD (insecure) 71 | password = "09444722" 72 | ``` 73 | 74 | ### **4. Test with Sample Data** 75 | ```bash 76 | # Run scripts with sample data to ensure they work 77 | python saas_analysis.py # Should use sample_transactions.csv 78 | ``` 79 | 80 | --- 81 | 82 | ## 📋 **Final Verification Checklist** 83 | 84 | ### **File Content Review** 85 | - [ ] All Python files use `os.getenv()` for sensitive data 86 | - [ ] No real merchant names in sample data 87 | - [ ] No real amounts that could identify spending patterns 88 | - [ ] No dates that match real transaction periods 89 | - [ ] All reports contain "The Pocket Company by Accucrazy" branding 90 | 91 | ### **Repository Structure** 92 | ``` 93 | ✅ SAFE TO UPLOAD: 94 | ├── README.md ✅ (updated with company info) 95 | ├── requirements.txt ✅ (dependencies only) 96 | ├── .gitignore ✅ (protects sensitive files) 97 | ├── SECURITY_CHECKLIST.md ✅ (this file) 98 | ├── extract_pdf_data.py ✅ (no hard-coded passwords) 99 | ├── saas_analysis.py ✅ (clean code) 100 | ├── clean_analysis.py ✅ (clean code) 101 | ├── sample_transactions.csv ✅ (demo data only) 102 | └── sample_analysis_chart.png ✅ (demo chart) 103 | 104 | ❌ NEVER UPLOAD: 105 | ├── *.pdf ❌ (real bank statements) 106 | ├── transactions.csv ❌ (real transaction data) 107 | ├── clean_transactions.csv ❌ (real processed data) 108 | ├── saas_transactions.csv ❌ (real SaaS data) 109 | ├── *_report.txt ❌ (real analysis reports) 110 | ├── *.png (with real data) ❌ (real spending charts) 111 | └── .env files ❌ (environment variables) 112 | ``` 113 | 114 | --- 115 | 116 | ## 🛡️ **Best Practices for Users** 117 | 118 | ### **For Repository Maintainers** 119 | 1. **Set Repository to Private** initially while testing 120 | 2. **Review all commits** before making public 121 | 3. **Use GitHub's secret scanning** features 122 | 4. **Add branch protection rules** 123 | 124 | ### **For End Users** 125 | 1. **Fork the repository** to your private account first 126 | 2. **Never commit real financial data** to any branch 127 | 3. **Use environment variables** for all sensitive configuration 128 | 4. **Regularly audit** your commit history 129 | 130 | ### **Environment Setup Template** 131 | Create a `.env.example` file (safe to upload): 132 | ```bash 133 | # Copy this to .env and fill in your values 134 | PDF_PASSWORD=your_pdf_password_here 135 | PDF_PATH=your_statement_file.pdf 136 | ``` 137 | 138 | --- 139 | 140 | ## 🚨 **Emergency: If You Accidentally Uploaded Sensitive Data** 141 | 142 | ### **Immediate Actions** 143 | 1. **Make repository private** immediately 144 | 2. **Contact GitHub support** to purge sensitive data 145 | 3. **Change any exposed passwords** or account numbers 146 | 4. **Review commit history** for other sensitive data 147 | 5. 
**Force push** to remove sensitive commits 148 | 149 | ### **GitHub Data Removal** 150 | ```bash 151 | # Remove sensitive files from Git history 152 | git filter-branch --force --index-filter \ 153 | "git rm --cached --ignore-unmatch sensitive_file.csv" \ 154 | --prune-empty --tag-name-filter cat -- --all 155 | 156 | # Force push to overwrite history 157 | git push origin --force --all 158 | ``` 159 | 160 | --- 161 | 162 | ## ✅ **Final Sign-off** 163 | 164 | Before uploading to GitHub, confirm: 165 | 166 | - [ ] ✅ I have reviewed all files for sensitive data 167 | - [ ] ✅ No real financial information is included 168 | - [ ] ✅ All scripts use environment variables for secrets 169 | - [ ] ✅ .gitignore is properly configured 170 | - [ ] ✅ Sample data is anonymized and safe 171 | - [ ] ✅ Repository includes proper company branding 172 | - [ ] ✅ README includes security warnings 173 | 174 | **Signed off by**: ________________ **Date**: ____________ 175 | 176 | --- 177 | 178 | **The Pocket Company by Accucrazy** 179 | *Committed to data security and privacy protection* 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Credit Card Statement PDF Analyzer 2 | **The Pocket Company by Accucrazy** 3 | 4 | --- 5 | 6 | ## 📊 **Enterprise SaaS Spending Analysis Tool** 7 | 8 | This comprehensive tool extracts and analyzes transaction data from password-protected PDF credit card statements, with a specialized focus on **SaaS service spending analysis** for modern tech companies. 9 | 10 | ## ✨ **Key Features** 11 | 12 | ### 🔐 **PDF Processing** 13 | - **PDF Text Extraction**: Decrypts and extracts text from password-protected PDF files 14 | - **Smart Transaction Parsing**: Automatically identifies and parses transaction data 15 | - **Multi-format Support**: Handles various bank statement formats 16 | 17 | ### 📈 **Advanced Analytics** 18 | - **Smart Categorization**: Categorizes transactions into spending categories (Food, Transportation, Shopping, SaaS, etc.) 19 | - **SaaS-Focused Analysis**: Specialized analysis for enterprise software subscriptions 20 | - **Comprehensive Insights**: Provides detailed spending statistics and optimization recommendations 21 | - **Visual Reports**: Creates professional charts and graphs for spending patterns 22 | 23 | ### 🎯 **SaaS Specialization** 24 | - **AI/ML Tools Tracking**: Monitors spending on Cursor, OpenAI, Anthropic, etc. 25 | - **Subscription Type Analysis**: Differentiates between usage-based vs monthly subscriptions 26 | - **Cost Optimization**: Provides specific recommendations for SaaS spending optimization 27 | - **Duplicate Detection**: Removes duplicate transactions for accurate analysis 28 | 29 | ### 💾 **Export Options** 30 | - **Multiple Formats**: Saves analysis results in CSV, PNG, TXT formats 31 | - **Professional Reports**: Generated with company branding 32 | - **Clean Data**: Deduplicated and categorized transaction data 33 | 34 | ## 🛠 **Requirements** 35 | 36 | - Python 3.7 or higher 37 | - Required packages (install using `pip install -r requirements.txt`) 38 | 39 | ## 📦 **Installation** 40 | 41 | 1. **Clone the repository**: 42 | ```bash 43 | git clone https://github.com/your-username/credit-card-analyzer.git 44 | cd credit-card-analyzer 45 | ``` 46 | 47 | 2. **Install dependencies**: 48 | ```bash 49 | pip install -r requirements.txt 50 | ``` 51 | 52 | 3. 
**Prepare your PDF file**: Place your PDF file in the project directory 53 | 54 | ## 🚀 **Usage** 55 | 56 | ### **Basic Analysis** 57 | ```bash 58 | python extract_pdf_data.py 59 | ``` 60 | 61 | ### **SaaS-Focused Analysis** 62 | ```bash 63 | python saas_analysis.py 64 | ``` 65 | 66 | ### **Clean Data Analysis** 67 | ```bash 68 | python clean_analysis.py 69 | ``` 70 | 71 | ### **Custom Configuration** 72 | Edit the script parameters: 73 | ```python 74 | pdf_path = "your_pdf_file.pdf" 75 | password = "your_password" # Use environment variable in production 76 | ``` 77 | 78 | ## 📁 **Output Files** 79 | 80 | ### **Comprehensive Analysis** 81 | - `saas_spending_analysis.png` - SaaS-focused visualization charts 82 | - `saas_analysis_report.txt` - Detailed SaaS spending report 83 | - `saas_transactions.csv` - Clean SaaS transaction data 84 | 85 | ### **General Analysis** 86 | - `credit_card_analysis.png` - General spending visualization 87 | - `credit_card_analysis_report.txt` - Complete spending analysis 88 | - `clean_transactions.csv` - Processed transaction data 89 | 90 | ## 🏷 **Spending Categories** 91 | 92 | ### **SaaS Categories** 93 | - **AI/ML Tools**: Cursor, OpenAI, Anthropic, Leonardo AI, HeyGen 94 | - **Design Tools**: Figma, Adobe Creative Suite 95 | - **Development Tools**: ReportDash, GitHub, hosting services 96 | - **Cloud Services**: Google Cloud, AWS, Colab 97 | - **Marketing Tools**: ManyChat, analytics platforms 98 | 99 | ### **General Categories** 100 | - **Food & Dining**: Restaurants, convenience stores, supermarkets 101 | - **Transportation**: Public transport, ride-sharing, fuel 102 | - **Entertainment**: Movies, games, recreation 103 | - **Shopping**: Retail, electronics, household items 104 | - **Bills & Utilities**: Utilities, insurance, banking fees 105 | 106 | ## ⚙ **Customization** 107 | 108 | ### **Adding SaaS Services** 109 | ```python 110 | saas_keywords = { 111 | 'Your Category': ['SERVICE_NAME', 'KEYWORD'], 112 | # Add new categories here 113 | } 114 | ``` 115 | 116 | ### **Transaction Pattern Matching** 117 | ```python 118 | patterns = [ 119 | r'(\d{2}/\d{2})\s+(.+?)\s+([\d,]+\.?\d*)', 120 | # Add custom patterns for your bank format 121 | ] 122 | ``` 123 | 124 | ## 📊 **Sample Analysis Report** 125 | 126 | ``` 127 | ================================================ 128 | 企業 SaaS 服務支出分析報告 129 | 第一銀行信用卡帳單 - 2025年5月 130 | The Pocket Company by Accucrazy 131 | ================================================ 132 | 133 | 總體概況: 134 | - SaaS 總支出:NT$ 30,476.00 135 | - 使用服務數量:12 個 136 | - 總交易次數:25 筆 137 | - 平均每筆交易:NT$ 1,219.04 138 | 139 | 前5大 SaaS 服務支出: 140 | 1. ReportDash Analytics: NT$ 9,744.00 (32.0%) 141 | 2. Cursor AI IDE: NT$ 5,591.00 (18.3%) 142 | 3. Figma Design: NT$ 4,357.00 (14.3%) 143 | 4. HeyGen Video AI: NT$ 3,331.00 (10.9%) 144 | 5. 
OpenAI (ChatGPT/API): NT$ 2,934.00 (9.6%) 145 | ``` 146 | 147 | ## 🔒 **Security & Privacy** 148 | 149 | ### **Data Protection** 150 | - All processing is done locally 151 | - No data sent to external servers 152 | - Sensitive files excluded from version control 153 | 154 | ### **Best Practices** 155 | - Use environment variables for passwords 156 | - Delete sensitive files after analysis 157 | - Review `.gitignore` before committing 158 | 159 | ## 🚫 **Files NOT to Upload to GitHub** 160 | 161 | ``` 162 | # Sensitive files - DO NOT COMMIT 163 | *.pdf # Bank statements 164 | *transactions.csv # Real transaction data 165 | *_report.txt # Reports with real data 166 | *.png # Charts with real data (sample charts OK) 167 | ``` 168 | 169 | ## ⚠ **Troubleshooting** 170 | 171 | ### **Common Issues** 172 | 1. **PDF Password Error**: Verify password or use environment variable 173 | 2. **No Transactions Found**: Check PDF format compatibility 174 | 3. **Encoding Issues**: Ensure UTF-8 system encoding 175 | 4. **Font Display**: Install Chinese fonts for proper chart rendering 176 | 177 | ### **Getting Help** 178 | 1. Check extracted text patterns in debug mode 179 | 2. Verify regex patterns match your bank format 180 | 3. Ensure all dependencies are installed 181 | 182 | ## 🤝 **Contributing** 183 | 184 | 1. Fork the repository 185 | 2. Create a feature branch 186 | 3. Add sample data (NOT real financial data) 187 | 4. Submit a pull request 188 | 189 | ## 📄 **License** 190 | 191 | This tool is provided for educational and personal use. Ensure compliance with your financial institution's terms of service. 192 | 193 | ## 🔗 **Company Information** 194 | 195 | **The Pocket Company by Accucrazy** 196 | - Specialized in enterprise SaaS spending optimization 197 | - AI-driven financial analysis tools 198 | - Modern tech company cost management solutions 199 | 200 | --- 201 | 202 | ⚠️ **Important**: This tool processes sensitive financial data. Always follow data protection best practices and never commit real financial information to version control. 
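For reference, a minimal sketch of a secure run using the `PDF_PATH` and `PDF_PASSWORD` environment variables read by `extract_pdf_data.py` (the same names used in the `.env.example` template in `SECURITY_CHECKLIST.md`; substitute your own statement file and password):

```bash
# Linux/Mac
export PDF_PATH=your_statement_file.pdf
export PDF_PASSWORD=your_pdf_password_here
python extract_pdf_data.py

# Windows (Command Prompt)
set PDF_PATH=your_statement_file.pdf
set PDF_PASSWORD=your_pdf_password_here
python extract_pdf_data.py
```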
-------------------------------------------------------------------------------- /clean_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Clean Credit Card Analysis 5 | Cleans the extracted data and provides accurate spending analysis 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from collections import defaultdict 12 | 13 | def clean_transactions(csv_path): 14 | """Clean transaction data by removing outliers and parsing errors""" 15 | df = pd.read_csv(csv_path, encoding='utf-8') 16 | 17 | print(f"Original transactions: {len(df)}") 18 | 19 | # Remove rows with invalid dates 20 | df = df[~df['date'].str.contains('0/0|14/05', na=False)] 21 | 22 | # Remove transactions with unrealistic amounts (likely parsing errors) 23 | # Most credit card transactions should be under NT$100,000 24 | df = df[df['amount_abs'] < 100000] 25 | 26 | # Remove rows with garbled text (cid: patterns) 27 | df = df[~df['description'].str.contains('cid:', na=False)] 28 | 29 | # Remove duplicate transactions (keeping first occurrence) 30 | df = df.drop_duplicates(subset=['date', 'description', 'amount'], keep='first') 31 | 32 | print(f"Cleaned transactions: {len(df)}") 33 | 34 | return df 35 | 36 | def improve_categorization(df): 37 | """Improve transaction categorization""" 38 | 39 | # Enhanced categories with better keywords 40 | categories = { 41 | 'Food & Dining': [ 42 | '餐廳', '食', '飲', '麥當勞', '星巴克', '便利商店', '超市', '7-11', '全家', 43 | '美食', '餐', '飯', '咖啡', 'cama', '杭州小籠包', '養心殿', '京星港式飲茶', 44 | '北村家', '吐司利亞', '優食', 'Subway', '燒肉', '創義麵', '湘川', 45 | '珍蜜咖啡', 'Fake Sober', 'J WOW', '全聯福利中心', '麥當勞' 46 | ], 47 | 'Transportation': [ 48 | '捷運', '公車', '計程車', '加油', '停車', 'UBER', '油站', '交通', 49 | '高鐵', '台鐵', '客運', '台灣大車隊', '優步', 'Taxi' 50 | ], 51 | 'Technology/Software': [ 52 | 'CURSOR', 'ADOBE', 'OPENAI', 'GOOGLE', 'FIGMA', 'HEYGEN', 'SEASALT', 53 | 'REPORTDASH', 'MANYCHAT', 'RSS.APP', 'PADDLE', 'LEONARDO', 'Colab', 54 | 'SPOTIFY', 'ANTHROPIC', 'Gandi', 'APIFY', 'SHOPIFY', 'PCHOME' 55 | ], 56 | 'Shopping': [ 57 | '百貨', '購物', '服飾', '電器', '網購', '商城', 'AMAZON', '買', '購', 58 | '商店', '市場', 'IKEA', '宜家家居', '永昇五金', '今華電子', '源達科技' 59 | ], 60 | 'Entertainment': [ 61 | '電影', '遊戲', '娛樂', 'KTV', '健身', '運動', '書店', '音樂', '錢櫃' 62 | ], 63 | 'Bills & Utilities': [ 64 | '電費', '水費', '瓦斯', '電信', '保險', '銀行', '費用', '帳單', '繳費', 65 | '手續費', '國外交易手續費', 'ATT' 66 | ], 67 | 'Cash/ATM': [ 68 | '提款', 'ATM', '現金', '轉帳', '匯款', '現金回饋', '自動扣繳' 69 | ], 70 | 'Business/Marketing': [ 71 | '全球商務科技', 'LINE Ads', '連加' 72 | ], 73 | 'Other': [] 74 | } 75 | 76 | # Recategorize transactions 77 | for idx, row in df.iterrows(): 78 | description = row['description'] 79 | categorized = False 80 | 81 | for category, keywords in categories.items(): 82 | if category == 'Other': 83 | continue 84 | for keyword in keywords: 85 | if keyword in description: 86 | df.at[idx, 'category'] = category 87 | categorized = True 88 | break 89 | if categorized: 90 | break 91 | 92 | if not categorized: 93 | df.at[idx, 'category'] = 'Other' 94 | 95 | return df 96 | 97 | def generate_clean_analysis(df): 98 | """Generate comprehensive analysis of cleaned data""" 99 | 100 | # Basic statistics 101 | total_spending = df['amount_abs'].sum() 102 | avg_transaction = df['amount_abs'].mean() 103 | median_transaction = df['amount_abs'].median() 104 | num_transactions = len(df) 105 | 106 | # Category analysis 107 | category_spending = 
df.groupby('category')['amount_abs'].sum().sort_values(ascending=False) 108 | category_counts = df['category'].value_counts() 109 | 110 | # Date analysis 111 | spending_by_date = df.groupby('date')['amount_abs'].sum().sort_values(ascending=False) 112 | 113 | # Top merchants 114 | merchant_spending = df.groupby('description')['amount_abs'].sum().sort_values(ascending=False).head(10) 115 | 116 | # Analysis dictionary 117 | analysis = { 118 | 'total_spending': total_spending, 119 | 'average_transaction': avg_transaction, 120 | 'median_transaction': median_transaction, 121 | 'number_of_transactions': num_transactions, 122 | 'category_breakdown': category_spending.to_dict(), 123 | 'category_counts': category_counts.to_dict(), 124 | 'top_transactions': df.nlargest(10, 'amount_abs')[['date', 'description', 'amount', 'category']].to_dict('records'), 125 | 'top_merchants': merchant_spending.to_dict(), 126 | 'spending_by_date': spending_by_date.head(10).to_dict() 127 | } 128 | 129 | return analysis 130 | 131 | def create_clean_visualizations(df, analysis): 132 | """Create improved visualizations""" 133 | plt.style.use('default') 134 | fig, axes = plt.subplots(2, 3, figsize=(18, 12)) 135 | fig.suptitle('Credit Card Spending Analysis - May 2025', fontsize=16, fontweight='bold') 136 | 137 | # 1. Category spending pie chart 138 | category_data = pd.Series(analysis['category_breakdown']) 139 | # Only show categories with more than 1% of total spending 140 | threshold = category_data.sum() * 0.01 141 | category_filtered = category_data[category_data > threshold] 142 | other_amount = category_data[category_data <= threshold].sum() 143 | if other_amount > 0: 144 | category_filtered['Other (Small)'] = other_amount 145 | 146 | colors = plt.cm.Set3(np.linspace(0, 1, len(category_filtered))) 147 | axes[0, 0].pie(category_filtered.values, labels=category_filtered.index, autopct='%1.1f%%', 148 | startangle=90, colors=colors) 149 | axes[0, 0].set_title('Spending by Category') 150 | 151 | # 2. Category spending bar chart 152 | category_data.plot(kind='bar', ax=axes[0, 1], color='skyblue') 153 | axes[0, 1].set_title('Amount by Category') 154 | axes[0, 1].set_xlabel('Category') 155 | axes[0, 1].set_ylabel('Amount (NT$)') 156 | axes[0, 1].tick_params(axis='x', rotation=45) 157 | 158 | # 3. Transaction count by category 159 | cat_counts = pd.Series(analysis['category_counts']) 160 | cat_counts.plot(kind='bar', ax=axes[0, 2], color='lightcoral') 161 | axes[0, 2].set_title('Transaction Count by Category') 162 | axes[0, 2].set_xlabel('Category') 163 | axes[0, 2].set_ylabel('Number of Transactions') 164 | axes[0, 2].tick_params(axis='x', rotation=45) 165 | 166 | # 4. Transaction amount distribution 167 | axes[1, 0].hist(df['amount_abs'], bins=30, edgecolor='black', alpha=0.7, color='lightgreen') 168 | axes[1, 0].set_title('Transaction Amount Distribution') 169 | axes[1, 0].set_xlabel('Amount (NT$)') 170 | axes[1, 0].set_ylabel('Frequency') 171 | 172 | # 5. Top merchants 173 | top_merchants = pd.Series(analysis['top_merchants']).head(8) 174 | top_merchants.plot(kind='barh', ax=axes[1, 1], color='orange') 175 | axes[1, 1].set_title('Top Merchants by Spending') 176 | axes[1, 1].set_xlabel('Amount (NT$)') 177 | 178 | # 6. 
Spending by amount ranges 179 | amount_ranges = ['<100', '100-500', '500-1000', '1000-5000', '5000+'] 180 | range_counts = [ 181 | len(df[df['amount_abs'] < 100]), 182 | len(df[(df['amount_abs'] >= 100) & (df['amount_abs'] < 500)]), 183 | len(df[(df['amount_abs'] >= 500) & (df['amount_abs'] < 1000)]), 184 | len(df[(df['amount_abs'] >= 1000) & (df['amount_abs'] < 5000)]), 185 | len(df[df['amount_abs'] >= 5000]) 186 | ] 187 | 188 | axes[1, 2].bar(amount_ranges, range_counts, color='purple', alpha=0.7) 189 | axes[1, 2].set_title('Transactions by Amount Range') 190 | axes[1, 2].set_xlabel('Amount Range (NT$)') 191 | axes[1, 2].set_ylabel('Number of Transactions') 192 | 193 | plt.tight_layout() 194 | plt.savefig('clean_credit_card_analysis.png', dpi=300, bbox_inches='tight') 195 | print("Clean visualization saved as 'clean_credit_card_analysis.png'") 196 | 197 | def generate_clean_report(analysis): 198 | """Generate a comprehensive cleaned analysis report""" 199 | report = f""" 200 | ==================================== 201 | CLEANED CREDIT CARD SPENDING ANALYSIS 202 | First Bank Statement - May 2025 203 | ==================================== 204 | 205 | SUMMARY STATISTICS: 206 | - Total Spending: NT$ {analysis['total_spending']:,.2f} 207 | - Number of Transactions: {analysis['number_of_transactions']} 208 | - Average Transaction: NT$ {analysis['average_transaction']:,.2f} 209 | - Median Transaction: NT$ {analysis['median_transaction']:,.2f} 210 | 211 | SPENDING BY CATEGORY: 212 | """ 213 | 214 | total = analysis['total_spending'] 215 | for category, amount in analysis['category_breakdown'].items(): 216 | percentage = (amount / total) * 100 217 | count = analysis['category_counts'].get(category, 0) 218 | avg_per_transaction = amount / count if count > 0 else 0 219 | report += f"- {category}: NT$ {amount:,.2f} ({percentage:.1f}%) - {count} transactions (avg: NT$ {avg_per_transaction:,.2f})\n" 220 | 221 | report += f""" 222 | 223 | TOP 10 TRANSACTIONS: 224 | """ 225 | 226 | for i, transaction in enumerate(analysis['top_transactions'], 1): 227 | report += f"{i:2d}. {transaction['date']} | {transaction['description'][:40]:<40} | NT$ {transaction['amount']:>8,.0f} | {transaction['category']}\n" 228 | 229 | report += f""" 230 | 231 | TOP MERCHANTS BY TOTAL SPENDING: 232 | """ 233 | 234 | for i, (merchant, amount) in enumerate(analysis['top_merchants'].items(), 1): 235 | report += f"{i:2d}. {merchant[:50]:<50} | NT$ {amount:>8,.2f}\n" 236 | 237 | report += f""" 238 | 239 | SPENDING INSIGHTS: 240 | - Highest spending category: {max(analysis['category_breakdown'], key=analysis['category_breakdown'].get)} 241 | - Most frequent transaction category: {max(analysis['category_counts'], key=analysis['category_counts'].get)} 242 | - Largest single transaction: NT$ {max(t['amount'] for t in analysis['top_transactions']):,.2f} 243 | - Technology/Software spending: NT$ {analysis['category_breakdown'].get('Technology/Software', 0):,.2f} 244 | - Food & Dining spending: NT$ {analysis['category_breakdown'].get('Food & Dining', 0):,.2f} 245 | 246 | RECOMMENDATIONS: 247 | 1. Monitor Technology/Software subscriptions - this appears to be a major expense category 248 | 2. Consider consolidating software subscriptions to save costs 249 | 3. Track food delivery and dining expenses for budgeting 250 | 4. Review recurring payments for optimization opportunities 251 | 5. 
Set up alerts for transactions over NT$ 5,000 252 | 253 | SPENDING PATTERNS: 254 | - Small transactions (NT$5,000): Focus on technology and entertainment expenses 257 | """ 258 | 259 | return report 260 | 261 | def main(): 262 | """Main function for clean analysis""" 263 | print("Starting Clean Credit Card Analysis...") 264 | 265 | # Load and clean data 266 | df = clean_transactions('transactions.csv') 267 | 268 | if len(df) == 0: 269 | print("No valid transactions found after cleaning") 270 | return 271 | 272 | # Improve categorization 273 | df = improve_categorization(df) 274 | 275 | # Generate analysis 276 | analysis = generate_clean_analysis(df) 277 | 278 | # Create visualizations 279 | create_clean_visualizations(df, analysis) 280 | 281 | # Generate report 282 | report = generate_clean_report(analysis) 283 | print(report) 284 | 285 | # Save files 286 | with open('clean_credit_card_report.txt', 'w', encoding='utf-8') as f: 287 | f.write(report) 288 | 289 | df.to_csv('clean_transactions.csv', index=False, encoding='utf-8') 290 | 291 | print("\n" + "="*60) 292 | print("CLEAN ANALYSIS COMPLETE!") 293 | print("="*60) 294 | print("Files saved:") 295 | print("- clean_credit_card_analysis.png (clean visualizations)") 296 | print("- clean_credit_card_report.txt (detailed clean report)") 297 | print("- clean_transactions.csv (cleaned transaction data)") 298 | print("="*60) 299 | 300 | if __name__ == "__main__": 301 | main() -------------------------------------------------------------------------------- /saas_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 企業 SaaS 服務支出分析 5 | 專門分析 Cursor、OpenAI 等技術工具的支出 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import matplotlib.font_manager as fm 11 | import numpy as np 12 | from collections import defaultdict 13 | 14 | def load_and_filter_saas_data(csv_path): 15 | """載入並篩選 SaaS 相關交易""" 16 | df = pd.read_csv(csv_path, encoding='utf-8') 17 | 18 | # SaaS 服務關鍵字 19 | saas_keywords = { 20 | 'AI/ML Tools': ['CURSOR', 'OPENAI', 'ANTHROPIC', 'LEONARDO', 'HEYGEN'], 21 | 'Design Tools': ['FIGMA', 'ADOBE'], 22 | 'Cloud Services': ['GOOGLE', 'Colab'], 23 | 'Development Tools': ['REPORTDASH', 'Gandi'], 24 | 'Marketing Tools': ['MANYCHAT', 'RSS.APP', 'SEASALT'], 25 | 'Media': ['SPOTIFY', 'PADDLE'] 26 | } 27 | 28 | # 篩選 SaaS 相關交易 29 | saas_transactions = [] 30 | 31 | for idx, row in df.iterrows(): 32 | description = row['description'].upper() 33 | for category, keywords in saas_keywords.items(): 34 | found = False 35 | for keyword in keywords: 36 | if keyword in description: 37 | row_copy = row.copy() 38 | row_copy['saas_category'] = category 39 | row_copy['saas_service'] = keyword.lower() 40 | saas_transactions.append(row_copy) 41 | found = True 42 | break 43 | if found: 44 | break 45 | 46 | if not saas_transactions: 47 | print("未找到 SaaS 相關交易") 48 | return pd.DataFrame() 49 | 50 | saas_df = pd.DataFrame(saas_transactions) 51 | 52 | # 去除重複交易 - 基於 description 和 amount 的組合 53 | print(f"去重前: {len(saas_df)} 筆交易") 54 | 55 | # 創建唯一標識符 56 | saas_df['unique_id'] = saas_df['description'].str.replace('^\d{2}/\d{2} ', '', regex=True) + '_' + saas_df['amount_abs'].astype(str) 57 | 58 | # 去除重複,保留第一筆 59 | saas_df_clean = saas_df.drop_duplicates(subset=['unique_id'], keep='first') 60 | saas_df_clean = saas_df_clean.drop('unique_id', axis=1) 61 | 62 | print(f"去重後: {len(saas_df_clean)} 筆 SaaS 相關交易") 63 | 64 | return saas_df_clean 65 | 66 | def 
extract_service_details(saas_df): 67 | """提取服務詳細信息""" 68 | 69 | service_mapping = { 70 | 'cursor': 'Cursor AI IDE', 71 | 'openai': 'OpenAI (ChatGPT/API)', 72 | 'anthropic': 'Anthropic Claude', 73 | 'leonardo': 'Leonardo AI', 74 | 'heygen': 'HeyGen Video AI', 75 | 'figma': 'Figma Design', 76 | 'adobe': 'Adobe Creative Suite', 77 | 'google': 'Google Cloud/Services', 78 | 'reportdash': 'ReportDash Analytics', 79 | 'gandi': 'Gandi Domain/Hosting', 80 | 'colab': 'Google Colab Pro', 81 | 'manychat': 'ManyChat Marketing', 82 | 'seasalt': 'Seasalt.AI', 83 | 'spotify': 'Spotify Premium', 84 | 'paddle': 'Paddle Payment' 85 | } 86 | 87 | saas_df['service_name'] = saas_df['saas_service'].map(service_mapping).fillna(saas_df['saas_service']) 88 | 89 | # 檢測訂閱類型 90 | def detect_subscription_type(description): 91 | desc_upper = description.upper() 92 | if 'USAGE' in desc_upper: 93 | return '按使用量計費' 94 | elif 'SUBSCR' in desc_upper or 'SUBSCRIPTION' in desc_upper: 95 | return '月度訂閱' 96 | elif any(word in desc_upper for word in ['PRO', 'PREMIUM', 'PLUS']): 97 | return '月度訂閱' 98 | else: 99 | return '一次性/其他' 100 | 101 | saas_df['subscription_type'] = saas_df['description'].apply(detect_subscription_type) 102 | 103 | return saas_df 104 | 105 | def analyze_saas_spending(saas_df): 106 | """分析 SaaS 支出""" 107 | 108 | total_saas_spending = saas_df['amount_abs'].sum() 109 | num_services = saas_df['service_name'].nunique() 110 | num_transactions = len(saas_df) 111 | avg_transaction = saas_df['amount_abs'].mean() 112 | 113 | # 按服務分類統計 114 | category_stats = saas_df.groupby('saas_category').agg({ 115 | 'amount_abs': ['sum', 'count', 'mean'], 116 | 'service_name': 'nunique' 117 | }).round(2) 118 | 119 | # 按具體服務統計 120 | service_stats = saas_df.groupby('service_name').agg({ 121 | 'amount_abs': ['sum', 'count', 'mean'] 122 | }).round(2) 123 | 124 | # 按訂閱類型統計 125 | subscription_stats = saas_df.groupby('subscription_type').agg({ 126 | 'amount_abs': ['sum', 'count', 'mean'] 127 | }).round(2) 128 | 129 | analysis = { 130 | 'total_spending': total_saas_spending, 131 | 'num_services': num_services, 132 | 'num_transactions': num_transactions, 133 | 'avg_transaction': avg_transaction, 134 | 'category_stats': category_stats, 135 | 'service_stats': service_stats, 136 | 'subscription_stats': subscription_stats 137 | } 138 | 139 | return analysis 140 | 141 | def create_saas_visualizations(saas_df, analysis): 142 | """創建 SaaS 支出可視化圖表""" 143 | 144 | # 嘗試找到可用的中文字體 145 | chinese_fonts = ['Microsoft YaHei', 'SimHei', 'KaiTi', 'FangSong', 'Microsoft JhengHei'] 146 | available_font = None 147 | 148 | for font_name in chinese_fonts: 149 | try: 150 | # 檢查字體是否可用 151 | font_files = fm.findSystemFonts() 152 | for font_file in font_files: 153 | try: 154 | font_prop = fm.FontProperties(fname=font_file) 155 | if font_name.lower() in font_prop.get_name().lower(): 156 | available_font = font_name 157 | break 158 | except: 159 | continue 160 | if available_font: 161 | break 162 | except: 163 | continue 164 | 165 | # 如果找不到中文字體,使用英文標題 166 | if available_font: 167 | plt.rcParams['font.sans-serif'] = [available_font, 'Arial', 'DejaVu Sans'] 168 | plt.rcParams['axes.unicode_minus'] = False 169 | use_chinese = True 170 | else: 171 | plt.rcParams['font.family'] = ['Arial', 'DejaVu Sans'] 172 | use_chinese = False 173 | 174 | fig, axes = plt.subplots(2, 3, figsize=(18, 12)) 175 | 176 | if use_chinese: 177 | fig.suptitle('企業 SaaS 服務支出分析 - 2025年5月\nThe Pocket Company by Accucrazy', fontsize=16, fontweight='bold') 178 | else: 179 | fig.suptitle('SaaS Service 
Spending Analysis - May 2025\nThe Pocket Company by Accucrazy', fontsize=16, fontweight='bold') 180 | 181 | # 1. 按服務類別的支出餅圖 182 | category_spending = saas_df.groupby('saas_category')['amount_abs'].sum() 183 | colors = plt.cm.Set3(np.linspace(0, 1, len(category_spending))) 184 | 185 | # 翻譯類別名稱 186 | if use_chinese: 187 | category_labels = { 188 | 'AI/ML Tools': 'AI/ML 工具', 189 | 'Cloud Services': '雲端服務', 190 | 'Design Tools': '設計工具', 191 | 'Development Tools': '開發工具', 192 | 'Marketing Tools': '行銷工具', 193 | 'Media': '媒體工具' 194 | } 195 | display_labels = [category_labels.get(cat, cat) for cat in category_spending.index] 196 | else: 197 | display_labels = category_spending.index 198 | 199 | axes[0, 0].pie(category_spending.values, 200 | labels=display_labels, 201 | autopct='%1.1f%%', 202 | colors=colors, 203 | startangle=90) 204 | 205 | if use_chinese: 206 | axes[0, 0].set_title('SaaS 支出按類別分布') 207 | else: 208 | axes[0, 0].set_title('Spending by SaaS Category') 209 | 210 | # 2. 前10大服務支出條形圖 211 | top_services = saas_df.groupby('service_name')['amount_abs'].sum().sort_values(ascending=True).tail(10) 212 | 213 | axes[0, 1].barh(range(len(top_services)), top_services.values, color='skyblue') 214 | axes[0, 1].set_yticks(range(len(top_services))) 215 | axes[0, 1].set_yticklabels(top_services.index) 216 | 217 | if use_chinese: 218 | axes[0, 1].set_xlabel('支出金額 (NT$)') 219 | axes[0, 1].set_title('前10大 SaaS 服務支出') 220 | else: 221 | axes[0, 1].set_xlabel('Amount (NT$)') 222 | axes[0, 1].set_title('Top 10 SaaS Services by Spending') 223 | 224 | # 3. 訂閱類型分布 225 | subscription_counts = saas_df['subscription_type'].value_counts() 226 | 227 | # 翻譯訂閱類型 228 | if use_chinese: 229 | subscription_labels = subscription_counts.index 230 | else: 231 | subscription_translation = { 232 | '按使用量計費': 'Usage-based', 233 | '月度訂閱': 'Monthly Subscription', 234 | '一次性/其他': 'One-time/Other' 235 | } 236 | subscription_labels = [subscription_translation.get(label, label) for label in subscription_counts.index] 237 | 238 | axes[0, 2].pie(subscription_counts.values, labels=subscription_labels, 239 | autopct='%1.1f%%', startangle=90) 240 | 241 | if use_chinese: 242 | axes[0, 2].set_title('訂閱類型分布') 243 | else: 244 | axes[0, 2].set_title('Subscription Type Distribution') 245 | 246 | # 4. 交易金額分布 247 | axes[1, 0].hist(saas_df['amount_abs'], bins=15, alpha=0.7, color='lightgreen', edgecolor='black') 248 | 249 | if use_chinese: 250 | axes[1, 0].set_xlabel('交易金額 (NT$)') 251 | axes[1, 0].set_ylabel('頻次') 252 | axes[1, 0].set_title('SaaS 交易金額分布') 253 | else: 254 | axes[1, 0].set_xlabel('Transaction Amount (NT$)') 255 | axes[1, 0].set_ylabel('Frequency') 256 | axes[1, 0].set_title('Transaction Amount Distribution') 257 | 258 | # 5. 
AI/ML 工具詳細分析 259 | ai_ml_data = saas_df[saas_df['saas_category'] == 'AI/ML Tools'] 260 | if not ai_ml_data.empty: 261 | ai_spending = ai_ml_data.groupby('service_name')['amount_abs'].sum().sort_values(ascending=True) 262 | 263 | axes[1, 1].barh(range(len(ai_spending)), ai_spending.values, color='orange') 264 | axes[1, 1].set_yticks(range(len(ai_spending))) 265 | axes[1, 1].set_yticklabels(ai_spending.index) 266 | 267 | if use_chinese: 268 | axes[1, 1].set_xlabel('支出金額 (NT$)') 269 | axes[1, 1].set_title('AI/ML 工具支出詳細') 270 | else: 271 | axes[1, 1].set_xlabel('Amount (NT$)') 272 | axes[1, 1].set_title('AI/ML Tools Spending Detail') 273 | else: 274 | no_data_text = '無 AI/ML 工具數據' if use_chinese else 'No AI/ML Tools Data' 275 | axes[1, 1].text(0.5, 0.5, no_data_text, ha='center', va='center', transform=axes[1, 1].transAxes) 276 | 277 | # 6. Cursor 專項分析 278 | cursor_data = saas_df[saas_df['saas_service'] == 'cursor'] 279 | if not cursor_data.empty: 280 | cursor_by_type = cursor_data.groupby('subscription_type')['amount_abs'].sum() 281 | 282 | axes[1, 2].bar(range(len(cursor_by_type)), cursor_by_type.values, color='purple', alpha=0.7) 283 | axes[1, 2].set_xticks(range(len(cursor_by_type))) 284 | 285 | if use_chinese: 286 | type_labels = cursor_by_type.index 287 | axes[1, 2].set_ylabel('支出金額 (NT$)') 288 | axes[1, 2].set_title('Cursor AI 支出按類型') 289 | else: 290 | type_translation = { 291 | '按使用量計費': 'Usage-based', 292 | '月度訂閱': 'Monthly Sub', 293 | '一次性/其他': 'One-time' 294 | } 295 | type_labels = [type_translation.get(label, label) for label in cursor_by_type.index] 296 | axes[1, 2].set_ylabel('Amount (NT$)') 297 | axes[1, 2].set_title('Cursor AI Spending by Type') 298 | 299 | axes[1, 2].set_xticklabels(type_labels, rotation=45) 300 | else: 301 | no_cursor_text = '無 Cursor 數據' if use_chinese else 'No Cursor Data' 302 | axes[1, 2].text(0.5, 0.5, no_cursor_text, ha='center', va='center', transform=axes[1, 2].transAxes) 303 | 304 | plt.tight_layout() 305 | 306 | # 在圖表底部添加公司標識 307 | fig.text(0.5, 0.02, 'The Pocket Company by Accucrazy', ha='center', va='bottom', 308 | fontsize=10, style='italic', alpha=0.7) 309 | 310 | plt.savefig('saas_spending_analysis.png', dpi=300, bbox_inches='tight', facecolor='white') 311 | print("SaaS 分析圖表已保存為 'saas_spending_analysis.png'") 312 | 313 | def generate_saas_report(saas_df, analysis): 314 | """生成 SaaS 支出分析報告""" 315 | 316 | report = f""" 317 | ================================================ 318 | 企業 SaaS 服務支出分析報告 319 | 第一銀行信用卡帳單 - 2025年5月 320 | The Pocket Company by Accucrazy 321 | ================================================ 322 | 323 | 總體概況: 324 | - SaaS 總支出:NT$ {analysis['total_spending']:,.2f} 325 | - 使用服務數量:{analysis['num_services']} 個 326 | - 總交易次數:{analysis['num_transactions']} 筆 327 | - 平均每筆交易:NT$ {analysis['avg_transaction']:,.2f} 328 | 329 | 按服務類別分析: 330 | """ 331 | 332 | for category in analysis['category_stats'].index: 333 | total = analysis['category_stats'].loc[category, ('amount_abs', 'sum')] 334 | count = analysis['category_stats'].loc[category, ('amount_abs', 'count')] 335 | percentage = (total / analysis['total_spending']) * 100 336 | 337 | report += f""" 338 | {category}: 339 | - 總支出:NT$ {total:,.2f} ({percentage:.1f}%) 340 | - 交易次數:{count} 筆 341 | """ 342 | 343 | report += "\n前10大 SaaS 服務支出:\n" 344 | 345 | top_services = analysis['service_stats'].sort_values(('amount_abs', 'sum'), ascending=False).head(10) 346 | 347 | for i, (service, data) in enumerate(top_services.iterrows(), 1): 348 | total = data[('amount_abs', 'sum')] 349 | count = 
data[('amount_abs', 'count')] 350 | percentage = (total / analysis['total_spending']) * 100 351 | 352 | report += f"{i:2d}. {service}: NT$ {total:,.2f} ({percentage:.1f}%) - {count}筆交易\n" 353 | 354 | # AI/ML 工具詳細分析 355 | ai_ml_tools = saas_df[saas_df['saas_category'] == 'AI/ML Tools'] 356 | if not ai_ml_tools.empty: 357 | ai_ml_total = ai_ml_tools['amount_abs'].sum() 358 | 359 | report += f""" 360 | AI/ML 工具詳細分析: 361 | 總支出:NT$ {ai_ml_total:,.2f} 362 | 363 | 具體工具: 364 | """ 365 | 366 | ai_tools_detail = ai_ml_tools.groupby('service_name')['amount_abs'].agg(['sum', 'count']) 367 | 368 | for tool in ai_tools_detail.index: 369 | tool_total = ai_tools_detail.loc[tool, 'sum'] 370 | tool_count = ai_tools_detail.loc[tool, 'count'] 371 | report += f"• {tool}:NT$ {tool_total:,.2f} ({tool_count}筆)\n" 372 | 373 | report += """ 374 | 成本優化建議: 375 | 376 | 1. Cursor AI 使用監控:檢查使用量計費是否合理 377 | 2. OpenAI API 成本控制:設置使用限額 378 | 3. 訂閱整合:檢查重複功能的工具 379 | 4. 定期評估:每月檢查 ROI 380 | 381 | ================================================ 382 | 報告生成:The Pocket Company by Accucrazy 383 | 分析日期:2025年 384 | ================================================ 385 | """ 386 | 387 | return report 388 | 389 | def main(): 390 | """主要執行函數""" 391 | print("開始 SaaS 服務支出分析...") 392 | 393 | saas_df = load_and_filter_saas_data('clean_transactions.csv') 394 | 395 | if saas_df.empty: 396 | print("沒有找到 SaaS 相關交易數據") 397 | return 398 | 399 | saas_df = extract_service_details(saas_df) 400 | analysis = analyze_saas_spending(saas_df) 401 | 402 | create_saas_visualizations(saas_df, analysis) 403 | 404 | report = generate_saas_report(saas_df, analysis) 405 | print(report) 406 | 407 | with open('saas_analysis_report.txt', 'w', encoding='utf-8') as f: 408 | f.write(report) 409 | 410 | saas_df.to_csv('saas_transactions.csv', index=False, encoding='utf-8') 411 | 412 | print("\n" + "="*60) 413 | print("SaaS 分析完成!") 414 | print("="*60) 415 | 416 | if __name__ == "__main__": 417 | main() -------------------------------------------------------------------------------- /extract_pdf_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | PDF Credit Card Statement Analyzer 5 | Extracts and analyzes data from password-protected PDF credit card statements 6 | """ 7 | 8 | import PyPDF2 9 | import pandas as pd 10 | import re 11 | from datetime import datetime 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | from collections import defaultdict 15 | import sys 16 | 17 | def extract_pdf_text(pdf_path, password): 18 | """Extract text from password-protected PDF with improved error handling""" 19 | try: 20 | with open(pdf_path, 'rb') as file: 21 | # Try different PDF readers 22 | try: 23 | pdf_reader = PyPDF2.PdfReader(file) 24 | print(f"PDF has {len(pdf_reader.pages)} pages") 25 | 26 | # Check if PDF is encrypted 27 | if pdf_reader.is_encrypted: 28 | print("PDF is encrypted, attempting to decrypt...") 29 | try: 30 | # Try to decrypt with the password 31 | result = pdf_reader.decrypt(password) 32 | print(f"Decryption result: {result}") 33 | if result == 0: 34 | print("Failed to decrypt PDF with provided password") 35 | return None 36 | except Exception as decrypt_error: 37 | print(f"Decryption error: {decrypt_error}") 38 | return None 39 | 40 | # Extract text from all pages 41 | text = "" 42 | for page_num, page in enumerate(pdf_reader.pages): 43 | try: 44 | page_text = page.extract_text() 45 | text += f"--- Page {page_num + 1} ---\n" 46 | text += 
page_text + "\n\n" 47 | print(f"Extracted {len(page_text)} characters from page {page_num + 1}") 48 | except Exception as page_error: 49 | print(f"Error extracting text from page {page_num + 1}: {page_error}") 50 | continue 51 | 52 | return text 53 | 54 | except Exception as reader_error: 55 | print(f"PyPDF2 error: {reader_error}") 56 | 57 | # Try alternative approach with different parameters 58 | try: 59 | print("Trying alternative PDF reading approach...") 60 | file.seek(0) # Reset file pointer 61 | pdf_reader = PyPDF2.PdfReader(file, strict=False) 62 | 63 | if pdf_reader.is_encrypted: 64 | pdf_reader.decrypt(password) 65 | 66 | text = "" 67 | for page in pdf_reader.pages: 68 | text += page.extract_text() + "\n" 69 | 70 | return text 71 | 72 | except Exception as alt_error: 73 | print(f"Alternative approach failed: {alt_error}") 74 | return None 75 | 76 | except FileNotFoundError: 77 | print(f"PDF file not found: {pdf_path}") 78 | return None 79 | except Exception as e: 80 | print(f"Unexpected error: {str(e)}") 81 | return None 82 | 83 | def debug_pdf_content(pdf_path, password): 84 | """Debug PDF content to understand structure""" 85 | try: 86 | import pdfplumber 87 | print("Trying with pdfplumber...") 88 | 89 | with pdfplumber.open(pdf_path, password=password) as pdf: 90 | print(f"PDF opened successfully with pdfplumber. Pages: {len(pdf.pages)}") 91 | 92 | text = "" 93 | for page_num, page in enumerate(pdf.pages): 94 | page_text = page.extract_text() 95 | if page_text: 96 | text += f"--- Page {page_num + 1} ---\n" 97 | text += page_text + "\n\n" 98 | print(f"Page {page_num + 1}: {len(page_text)} characters") 99 | 100 | return text 101 | 102 | except ImportError: 103 | print("pdfplumber not available, installing...") 104 | import subprocess 105 | subprocess.check_call([sys.executable, "-m", "pip", "install", "pdfplumber"]) 106 | 107 | # Try again after installation 108 | try: 109 | import pdfplumber 110 | with pdfplumber.open(pdf_path, password=password) as pdf: 111 | text = "" 112 | for page in pdf.pages: 113 | page_text = page.extract_text() 114 | if page_text: 115 | text += page_text + "\n" 116 | return text 117 | except Exception as e: 118 | print(f"pdfplumber also failed: {e}") 119 | return None 120 | 121 | except Exception as e: 122 | print(f"pdfplumber error: {e}") 123 | return None 124 | 125 | def parse_transactions(text): 126 | """Parse transactions from extracted text with enhanced patterns""" 127 | transactions = [] 128 | 129 | # Print first 2000 characters for debugging 130 | print("First 2000 characters of extracted text:") 131 | print("=" * 50) 132 | print(text[:2000]) 133 | print("=" * 50) 134 | 135 | # Enhanced patterns for Taiwanese bank statements 136 | patterns = [ 137 | # Pattern for MM/DD format with Chinese description 138 | r'(\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 139 | # Pattern for YYYY/MM/DD format 140 | r'(\d{4}/\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 141 | # Pattern with transaction codes 142 | r'(\d{2}/\d{2})\s+(\d+)\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 143 | # Pattern for transactions with negative amounts 144 | r'(\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+\-?([\d,]+\.?\d*)', 145 | # Generic pattern for any transaction-like data 146 | r'(\d{1,2}/\d{1,2})\s+(.+?)\s+([\d,]+\.?\d*)', 147 | # Pattern with extra spaces 148 | r'(\d{2}/\d{2})\s{2,}([\u4e00-\u9fff\w\s\-\*\.]+?)\s{2,}([\d,]+\.?\d*)', 149 | ] 150 | 151 | for pattern_num, pattern in enumerate(patterns, 1): 152 | print(f"Trying 
pattern {pattern_num}: {pattern}") 153 | matches = re.findall(pattern, text, re.MULTILINE | re.DOTALL) 154 | print(f"Found {len(matches)} matches with pattern {pattern_num}") 155 | 156 | for match in matches: 157 | try: 158 | if len(match) >= 3: 159 | date_str = match[0] 160 | description = match[-2].strip() # Second to last element 161 | amount_str = match[-1].strip() # Last element 162 | 163 | # Clean up amount string 164 | amount_str = re.sub(r'[^\d,.]', '', amount_str) 165 | if amount_str: 166 | amount = float(amount_str.replace(',', '')) 167 | 168 | transaction = { 169 | 'date': date_str, 170 | 'description': description, 171 | 'amount': amount 172 | } 173 | transactions.append(transaction) 174 | print(f"Added transaction: {date_str} | {description[:30]}... | {amount}") 175 | 176 | except (ValueError, IndexError) as e: 177 | print(f"Error parsing match {match}: {e}") 178 | continue 179 | 180 | # Remove duplicates 181 | unique_transactions = [] 182 | seen = set() 183 | for t in transactions: 184 | key = (t['date'], t['description'], t['amount']) 185 | if key not in seen: 186 | seen.add(key) 187 | unique_transactions.append(t) 188 | 189 | print(f"Total unique transactions found: {len(unique_transactions)}") 190 | return unique_transactions 191 | 192 | def categorize_transactions(transactions): 193 | """Categorize transactions based on description""" 194 | categories = { 195 | 'Food & Dining': ['餐廳', '食', '飲', '麥當勞', '星巴克', '便利商店', '超市', '7-11', '全家', '美食', '餐', '飯', '咖啡'], 196 | 'Transportation': ['捷運', '公車', '計程車', '加油', '停車', 'UBER', '油站', '交通', '高鐵', '台鐵', '客運'], 197 | 'Shopping': ['百貨', '購物', '服飾', '電器', '網購', '商城', 'AMAZON', '買', '購', '商店', '市場'], 198 | 'Entertainment': ['電影', '遊戲', '娛樂', 'KTV', '健身', '運動', '書店', '音樂'], 199 | 'Bills & Utilities': ['電費', '水費', '瓦斯', '電信', '保險', '銀行', '費用', '帳單', '繳費'], 200 | 'Healthcare': ['醫院', '診所', '藥局', '健康', '醫療', '牙科', '眼科'], 201 | 'Education': ['學校', '補習', '書店', '文具', '教育', '學費'], 202 | 'Travel': ['飯店', '機票', '旅遊', '住宿', '旅行', 'HOTEL'], 203 | 'Cash/ATM': ['提款', 'ATM', '現金', '轉帳', '匯款'], 204 | 'Other': [] 205 | } 206 | 207 | for transaction in transactions: 208 | description = transaction['description'] 209 | categorized = False 210 | 211 | for category, keywords in categories.items(): 212 | if category == 'Other': 213 | continue 214 | for keyword in keywords: 215 | if keyword in description: 216 | transaction['category'] = category 217 | categorized = True 218 | break 219 | if categorized: 220 | break 221 | 222 | if not categorized: 223 | transaction['category'] = 'Other' 224 | 225 | return transactions 226 | 227 | def analyze_spending(transactions): 228 | """Analyze spending patterns""" 229 | df = pd.DataFrame(transactions) 230 | 231 | if df.empty: 232 | print("No transactions found to analyze") 233 | return None, None 234 | 235 | print(f"Analyzing {len(df)} transactions...") 236 | 237 | # Convert amount to absolute value for spending analysis 238 | df['amount_abs'] = df['amount'].abs() 239 | 240 | # Basic statistics 241 | total_spending = df['amount_abs'].sum() 242 | avg_transaction = df['amount_abs'].mean() 243 | num_transactions = len(df) 244 | 245 | # Category analysis 246 | category_spending = df.groupby('category')['amount_abs'].sum().sort_values(ascending=False) 247 | 248 | # Create analysis report 249 | analysis = { 250 | 'total_spending': total_spending, 251 | 'average_transaction': avg_transaction, 252 | 'number_of_transactions': num_transactions, 253 | 'category_breakdown': category_spending.to_dict(), 254 | 'top_transactions': 
df.nlargest(5, 'amount_abs')[['description', 'amount', 'category']].to_dict('records'), 255 | 'transactions_by_category': df.groupby('category').size().to_dict() 256 | } 257 | 258 | return analysis, df 259 | 260 | def create_visualizations(df, analysis): 261 | """Create spending visualization charts""" 262 | try: 263 | plt.style.use('default') # Use default style instead of seaborn 264 | fig, axes = plt.subplots(2, 2, figsize=(15, 12)) 265 | 266 | # Category spending pie chart 267 | category_data = pd.Series(analysis['category_breakdown']) 268 | axes[0, 0].pie(category_data.values, labels=category_data.index, autopct='%1.1f%%', startangle=90) 269 | axes[0, 0].set_title('Spending by Category') 270 | 271 | # Category spending bar chart 272 | category_data.plot(kind='bar', ax=axes[0, 1]) 273 | axes[0, 1].set_title('Spending Amount by Category') 274 | axes[0, 1].set_xlabel('Category') 275 | axes[0, 1].set_ylabel('Amount (NT$)') 276 | axes[0, 1].tick_params(axis='x', rotation=45) 277 | 278 | # Transaction amount distribution 279 | axes[1, 0].hist(df['amount_abs'], bins=20, edgecolor='black', alpha=0.7) 280 | axes[1, 0].set_title('Transaction Amount Distribution') 281 | axes[1, 0].set_xlabel('Amount (NT$)') 282 | axes[1, 0].set_ylabel('Frequency') 283 | 284 | # Transaction count by category 285 | cat_counts = pd.Series(analysis['transactions_by_category']) 286 | cat_counts.plot(kind='bar', ax=axes[1, 1]) 287 | axes[1, 1].set_title('Number of Transactions by Category') 288 | axes[1, 1].set_xlabel('Category') 289 | axes[1, 1].set_ylabel('Number of Transactions') 290 | axes[1, 1].tick_params(axis='x', rotation=45) 291 | 292 | plt.tight_layout() 293 | plt.savefig('credit_card_analysis.png', dpi=300, bbox_inches='tight') 294 | print("Visualization saved as 'credit_card_analysis.png'") 295 | 296 | except Exception as e: 297 | print(f"Error creating visualizations: {e}") 298 | 299 | def generate_report(analysis): 300 | """Generate a comprehensive analysis report""" 301 | report = f""" 302 | ==================================== 303 | CREDIT CARD SPENDING ANALYSIS REPORT 304 | ==================================== 305 | 306 | SUMMARY STATISTICS: 307 | - Total Spending: NT$ {analysis['total_spending']:,.2f} 308 | - Number of Transactions: {analysis['number_of_transactions']} 309 | - Average Transaction Amount: NT$ {analysis['average_transaction']:,.2f} 310 | 311 | SPENDING BY CATEGORY: 312 | """ 313 | 314 | for category, amount in analysis['category_breakdown'].items(): 315 | percentage = (amount / analysis['total_spending']) * 100 316 | count = analysis['transactions_by_category'].get(category, 0) 317 | report += f"- {category}: NT$ {amount:,.2f} ({percentage:.1f}%) - {count} transactions\n" 318 | 319 | report += f""" 320 | TOP 5 TRANSACTIONS: 321 | """ 322 | 323 | for i, transaction in enumerate(analysis['top_transactions'], 1): 324 | report += f"{i}. 
{transaction['description']}: NT$ {transaction['amount']:,.2f} ({transaction['category']})\n" 325 | 326 | report += f""" 327 | 328 | INSIGHTS & RECOMMENDATIONS: 329 | - Largest spending category: {max(analysis['category_breakdown'], key=analysis['category_breakdown'].get)} 330 | - Most frequent transaction category: {max(analysis['transactions_by_category'], key=analysis['transactions_by_category'].get)} 331 | - Consider setting budgets for high-spending categories 332 | - Review recurring transactions for potential savings 333 | - Monitor transaction patterns for unusual activity 334 | 335 | SPENDING BREAKDOWN: 336 | """ 337 | 338 | for category in sorted(analysis['category_breakdown'].keys()): 339 | amount = analysis['category_breakdown'][category] 340 | count = analysis['transactions_by_category'].get(category, 0) 341 | avg_per_transaction = amount / count if count > 0 else 0 342 | report += f"- {category}: {count} transactions, avg NT$ {avg_per_transaction:.2f} per transaction\n" 343 | 344 | return report 345 | 346 | def main(): 347 | """Main function to run the analysis""" 348 | print("=" * 60) 349 | print("The Pocket Company by Accucrazy") 350 | print("Credit Card Statement PDF Analyzer") 351 | print("=" * 60) 352 | 353 | # SECURITY: Use environment variables for sensitive data 354 | import os 355 | 356 | pdf_path = os.getenv('PDF_PATH', 'your_bank_statement.pdf') 357 | password = os.getenv('PDF_PASSWORD') 358 | 359 | # Check if using default values (not secure for production) 360 | if not password: 361 | print("⚠️ ERROR: PDF_PASSWORD environment variable not set.") 362 | print("For security, set the environment variable:") 363 | print(" Windows: set PDF_PASSWORD=your_actual_password") 364 | print(" Linux/Mac: export PDF_PASSWORD=your_actual_password") 365 | print("\nExample usage:") 366 | print(" set PDF_PASSWORD=12345678") 367 | print(" python extract_pdf_data.py") 368 | return 369 | 370 | if pdf_path == 'your_bank_statement.pdf': 371 | print("⚠️ WARNING: Using default PDF path. Set PDF_PATH environment variable.") 372 | print(" Windows: set PDF_PATH=your_statement.pdf") 373 | print(" Linux/Mac: export PDF_PATH=your_statement.pdf") 374 | # Don't return here, use default filename 375 | pdf_path = "bank_statement.pdf" 376 | 377 | print(f"\n📄 PDF Path: {pdf_path}") 378 | print("🔐 Password: [PROTECTED]") 379 | print("Starting PDF Credit Card Analysis...") 380 | 381 | print("\nAttempting to extract text from PDF...") 382 | text = extract_pdf_text(pdf_path, password) 383 | 384 | if not text: 385 | print("PyPDF2 failed, trying alternative method...") 386 | text = debug_pdf_content(pdf_path, password) 387 | 388 | if not text: 389 | print("Failed to extract text from PDF using all available methods") 390 | print("Please check:") 391 | print("1. PDF file exists and is readable") 392 | print("2. Password is correct") 393 | print("3. PDF is not corrupted") 394 | return 395 | 396 | print(f"\nSuccessfully extracted {len(text)} characters from PDF") 397 | 398 | print("\nParsing transactions...") 399 | transactions = parse_transactions(text) 400 | 401 | if not transactions: 402 | print("No transactions found. 
The PDF format might not match expected patterns.") 403 | # Save extracted text for manual review 404 | with open('extracted_text.txt', 'w', encoding='utf-8') as f: 405 | f.write(text) 406 | print("Extracted text saved to 'extracted_text.txt' for manual review") 407 | return 408 | 409 | print(f"\nFound {len(transactions)} transactions") 410 | 411 | print("Categorizing transactions...") 412 | transactions = categorize_transactions(transactions) 413 | 414 | print("Analyzing spending patterns...") 415 | analysis, df = analyze_spending(transactions) 416 | 417 | if analysis: 418 | print("Generating visualizations...") 419 | create_visualizations(df, analysis) 420 | 421 | print("Generating report...") 422 | report = generate_report(analysis) 423 | print(report) 424 | 425 | # Save report to file 426 | with open('credit_card_analysis_report.txt', 'w', encoding='utf-8') as f: 427 | f.write(report) 428 | 429 | # Save transactions to CSV 430 | df.to_csv('transactions.csv', index=False, encoding='utf-8') 431 | 432 | print("\n" + "="*50) 433 | print("ANALYSIS COMPLETE!") 434 | print("="*50) 435 | print("Files saved:") 436 | print("- credit_card_analysis.png (visualization charts)") 437 | print("- credit_card_analysis_report.txt (detailed report)") 438 | print("- transactions.csv (transaction data)") 439 | print("="*50) 440 | else: 441 | print("Analysis failed - no valid transactions found") 442 | 443 | if __name__ == "__main__": 444 | main() --------------------------------------------------------------------------------