├── requirements.txt ├── sample_transactions.csv ├── .gitignore ├── SECURITY_CHECKLIST.md ├── README.md ├── clean_analysis.py ├── saas_analysis.py └── extract_pdf_data.py /requirements.txt: -------------------------------------------------------------------------------- 1 | PyPDF2==3.0.1 2 | pandas==2.1.4 3 | matplotlib==3.8.2 4 | seaborn==0.13.0 5 | numpy==1.25.2 -------------------------------------------------------------------------------- /sample_transactions.csv: -------------------------------------------------------------------------------- 1 | date,description,amount,category,amount_abs 2 | 04/22,CURSOR AI POWERED IDE,651.0,Technology/Software,651.0 3 | 04/29,ADOBE SOFTWARE,1900.0,Technology/Software,1900.0 4 | 04/29,OPENAI *CHATGPT SUBSCR,652.0,Technology/Software,652.0 5 | 04/30,FIGMA PRO SUBSCRIPTION,4357.0,Technology/Software,4357.0 6 | 05/05,HEYGEN VIDEO AI,936.0,Technology/Software,936.0 7 | 05/06,REPORTDASH ANALYTICS,9744.0,Technology/Software,9744.0 8 | 05/07,ANTHROPIC CLAUDE API,690.0,Technology/Software,690.0 9 | 05/13,CURSOR USAGE APR,378.0,Technology/Software,378.0 10 | 05/13,LEONARDO AI CREDITS,305.0,Technology/Software,305.0 11 | 05/19,GOOGLE CLOUD SERVICES,149.0,Technology/Software,149.0 12 | 04/22,ABC Restaurant,780.0,Food & Dining,780.0 13 | 04/25,Coffee Shop Downtown,165.0,Food & Dining,165.0 14 | 04/28,Italian Bistro,1914.0,Food & Dining,1914.0 15 | 05/02,Supermarket Chain,473.0,Food & Dining,473.0 16 | 05/07,Burger Restaurant,597.0,Food & Dining,597.0 17 | 05/11,Asian Cuisine,2288.0,Food & Dining,2288.0 18 | 04/22,Taxi Service,120.0,Transportation,120.0 19 | 05/06,Gas Station,840.0,Transportation,840.0 20 | 05/13,Public Transport,108.0,Transportation,108.0 21 | 05/19,Ride Share Service,120.0,Transportation,120.0 22 | 04/24,Electronics Store,1160.0,Shopping,1160.0 23 | 05/06,Hardware Store,120.0,Shopping,120.0 24 | 05/14,Home Goods Store,155.0,Shopping,155.0 25 | 05/16,Department Store,1843.0,Shopping,1843.0 26 | 04/20,Entertainment Center,9285.0,Entertainment,9285.0 27 | 05/03,Movie Theater,315.0,Entertainment,315.0 28 | 05/06,Gaming Platform,10470.0,Entertainment,10470.0 29 | 04/18,Bank Fee,94.0,Bills & Utilities,94.0 30 | 04/20,International Fee,5.0,Bills & Utilities,5.0 31 | 05/15,Service Charge,85.0,Bills & Utilities,85.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ================================================ 2 | # The Pocket Company by Accucrazy 3 | # Security .gitignore for Credit Card Analyzer 4 | # ================================================ 5 | 6 | # SENSITIVE FINANCIAL DATA - DO NOT COMMIT 7 | # ========================================== 8 | 9 | # Bank statements and financial documents 10 | *.pdf 11 | 第一銀行電子對帳單*.pdf 12 | 13 | # Real transaction data (explicitly listed to avoid blocking samples) 14 | transactions.csv 15 | clean_transactions.csv 16 | saas_transactions.csv 17 | 18 | # Analysis reports with real data 19 | *_report.txt 20 | *_analysis_report.txt 21 | clean_credit_card_report.txt 22 | saas_analysis_report.txt 23 | 24 | # Charts and visualizations with real data 25 | *.png 26 | !sample_*.png # Allow sample charts 27 | !demo_*.png # Allow demo charts 28 | 29 | # Backup files with sensitive data 30 | *.backup 31 | *.bak 32 | *_backup.* 33 | 34 | # Personal configuration files 35 | config.json 36 | settings.json 37 | .env 38 | .env.local 39 | .env.production 40 | 41 | # PYTHON DEVELOPMENT 42 | # 
=================== 43 | 44 | # Byte-compiled / optimized / DLL files 45 | __pycache__/ 46 | *.py[cod] 47 | *$py.class 48 | 49 | # C extensions 50 | *.so 51 | 52 | # Distribution / packaging 53 | .Python 54 | build/ 55 | develop-eggs/ 56 | dist/ 57 | downloads/ 58 | eggs/ 59 | .eggs/ 60 | lib/ 61 | lib64/ 62 | parts/ 63 | sdist/ 64 | var/ 65 | wheels/ 66 | share/python-wheels/ 67 | *.egg-info/ 68 | .installed.cfg 69 | *.egg 70 | MANIFEST 71 | 72 | # PyInstaller 73 | *.manifest 74 | *.spec 75 | 76 | # Installer logs 77 | pip-log.txt 78 | pip-delete-this-directory.txt 79 | 80 | # Unit test / coverage reports 81 | htmlcov/ 82 | .tox/ 83 | .nox/ 84 | .coverage 85 | .coverage.* 86 | .cache 87 | nosetests.xml 88 | coverage.xml 89 | *.cover 90 | *.py,cover 91 | .hypothesis/ 92 | .pytest_cache/ 93 | cover/ 94 | 95 | # Virtual environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # IDE and Editor files 105 | .vscode/ 106 | .idea/ 107 | *.swp 108 | *.swo 109 | *~ 110 | 111 | # OS generated files 112 | .DS_Store 113 | .DS_Store? 114 | ._* 115 | .Spotlight-V100 116 | .Trashes 117 | ehthumbs.db 118 | Thumbs.db 119 | 120 | # TEMPORARY FILES 121 | # ================ 122 | temp/ 123 | tmp/ 124 | *.tmp 125 | *.log 126 | 127 | # SAMPLE DATA (ALLOWED) 128 | # ===================== 129 | # These files are OK to commit as they contain demo data 130 | # Note: These are already allowed by the ! rules above 131 | demo_*.csv 132 | example_*.csv -------------------------------------------------------------------------------- /SECURITY_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # 🔒 Security Checklist for GitHub Upload 2 | **The Pocket Company by Accucrazy** 3 | 4 | --- 5 | 6 | ## ⚠️ **CRITICAL: Before Uploading to GitHub** 7 | 8 | This checklist ensures you don't accidentally upload sensitive financial data to a public repository. 9 | 10 | ### 🚫 **Files You MUST NOT Upload** 11 | 12 | #### **Real Financial Data** 13 | - [ ] ❌ **Bank statement PDFs** (any `.pdf` files with real bank data) 14 | - [ ] ❌ **Real transaction CSV files** (`transactions.csv`, `clean_transactions.csv`, etc.) 15 | - [ ] ❌ **Analysis reports with real data** (any `*_report.txt` files) 16 | - [ ] ❌ **Charts with real data** (`.png` files showing actual spending) 17 | 18 | #### **Sensitive Configuration** 19 | - [ ] ❌ **Hard-coded passwords** in Python files 20 | - [ ] ❌ **Real bank account information** 21 | - [ ] ❌ **Personal identification numbers** 22 | - [ ] ❌ **Credit card details** 23 | 24 | ### ✅ **Files That Are SAFE to Upload** 25 | 26 | #### **Code and Documentation** 27 | - [ ] ✅ **Python scripts** (with passwords removed) 28 | - [ ] ✅ **README.md** (updated with company branding) 29 | - [ ] ✅ **requirements.txt** 30 | - [ ] ✅ **.gitignore** (properly configured) 31 | 32 | #### **Sample/Demo Data** 33 | - [ ] ✅ **sample_transactions.csv** (anonymized demo data) 34 | - [ ] ✅ **demo_*.png** (sample charts with fake data) 35 | - [ ] ✅ **example_*.csv** (template files) 36 | 37 | --- 38 | 39 | ## 🔧 **Pre-Upload Security Steps** 40 | 41 | ### **1. Remove Hard-coded Secrets** 42 | ```bash 43 | # Check for hard-coded passwords 44 | grep -r "password.*=" *.py 45 | grep -r "09444722" *.py # Your specific password 46 | grep -r "第一銀行" *.py # Bank name 47 | 48 | # Should return no results! 49 | ``` 50 | 51 | ### **2. Verify .gitignore is Working** 52 | ```bash 53 | # Check what Git will track 54 | git status 55 | git add --dry-run . 
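# Optional extra check: `git check-ignore -v <file>` prints the .gitignore rule
# that excludes a file (and prints nothing if the file would be committed)
git check-ignore -v transactions.csv          # real data - expect a matching rule
git check-ignore -v sample_transactions.csv   # demo data - expect no match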
56 | 57 | # Ensure these files are NOT listed: 58 | # - *.pdf 59 | # - *transactions.csv (real data) 60 | # - *_report.txt (real reports) 61 | # - *.png (real charts) 62 | ``` 63 | 64 | ### **3. Environment Variable Setup** 65 | Ensure scripts use environment variables: 66 | ```python 67 | # ✅ GOOD (secure) 68 | password = os.getenv('PDF_PASSWORD') 69 | 70 | # ❌ BAD (insecure) 71 | password = "09444722" 72 | ``` 73 | 74 | ### **4. Test with Sample Data** 75 | ```bash 76 | # Run scripts with sample data to ensure they work 77 | python saas_analysis.py # Should use sample_transactions.csv 78 | ``` 79 | 80 | --- 81 | 82 | ## 📋 **Final Verification Checklist** 83 | 84 | ### **File Content Review** 85 | - [ ] All Python files use `os.getenv()` for sensitive data 86 | - [ ] No real merchant names in sample data 87 | - [ ] No real amounts that could identify spending patterns 88 | - [ ] No dates that match real transaction periods 89 | - [ ] All reports contain "The Pocket Company by Accucrazy" branding 90 | 91 | ### **Repository Structure** 92 | ``` 93 | ✅ SAFE TO UPLOAD: 94 | ├── README.md ✅ (updated with company info) 95 | ├── requirements.txt ✅ (dependencies only) 96 | ├── .gitignore ✅ (protects sensitive files) 97 | ├── SECURITY_CHECKLIST.md ✅ (this file) 98 | ├── extract_pdf_data.py ✅ (no hard-coded passwords) 99 | ├── saas_analysis.py ✅ (clean code) 100 | ├── clean_analysis.py ✅ (clean code) 101 | ├── sample_transactions.csv ✅ (demo data only) 102 | └── sample_analysis_chart.png ✅ (demo chart) 103 | 104 | ❌ NEVER UPLOAD: 105 | ├── *.pdf ❌ (real bank statements) 106 | ├── transactions.csv ❌ (real transaction data) 107 | ├── clean_transactions.csv ❌ (real processed data) 108 | ├── saas_transactions.csv ❌ (real SaaS data) 109 | ├── *_report.txt ❌ (real analysis reports) 110 | ├── *.png (with real data) ❌ (real spending charts) 111 | └── .env files ❌ (environment variables) 112 | ``` 113 | 114 | --- 115 | 116 | ## 🛡️ **Best Practices for Users** 117 | 118 | ### **For Repository Maintainers** 119 | 1. **Set Repository to Private** initially while testing 120 | 2. **Review all commits** before making public 121 | 3. **Use GitHub's secret scanning** features 122 | 4. **Add branch protection rules** 123 | 124 | ### **For End Users** 125 | 1. **Fork the repository** to your private account first 126 | 2. **Never commit real financial data** to any branch 127 | 3. **Use environment variables** for all sensitive configuration 128 | 4. **Regularly audit** your commit history 129 | 130 | ### **Environment Setup Template** 131 | Create a `.env.example` file (safe to upload): 132 | ```bash 133 | # Copy this to .env and fill in your values 134 | PDF_PASSWORD=your_pdf_password_here 135 | PDF_PATH=your_statement_file.pdf 136 | ``` 137 | 138 | --- 139 | 140 | ## 🚨 **Emergency: If You Accidentally Uploaded Sensitive Data** 141 | 142 | ### **Immediate Actions** 143 | 1. **Make repository private** immediately 144 | 2. **Contact GitHub support** to purge sensitive data 145 | 3. **Change any exposed passwords** or account numbers 146 | 4. **Review commit history** for other sensitive data 147 | 5. 
**Force push** to remove sensitive commits 148 | 149 | ### **GitHub Data Removal** 150 | ```bash 151 | # Remove sensitive files from Git history 152 | git filter-branch --force --index-filter \ 153 | "git rm --cached --ignore-unmatch sensitive_file.csv" \ 154 | --prune-empty --tag-name-filter cat -- --all 155 | 156 | # Force push to overwrite history 157 | git push origin --force --all 158 | ``` 159 | 160 | --- 161 | 162 | ## ✅ **Final Sign-off** 163 | 164 | Before uploading to GitHub, confirm: 165 | 166 | - [ ] ✅ I have reviewed all files for sensitive data 167 | - [ ] ✅ No real financial information is included 168 | - [ ] ✅ All scripts use environment variables for secrets 169 | - [ ] ✅ .gitignore is properly configured 170 | - [ ] ✅ Sample data is anonymized and safe 171 | - [ ] ✅ Repository includes proper company branding 172 | - [ ] ✅ README includes security warnings 173 | 174 | **Signed off by**: ________________ **Date**: ____________ 175 | 176 | --- 177 | 178 | **The Pocket Company by Accucrazy** 179 | *Committed to data security and privacy protection* 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Credit Card Statement PDF Analyzer 2 | **The Pocket Company by Accucrazy** 3 | 4 | --- 5 | 6 | ## 📊 **Enterprise SaaS Spending Analysis Tool** 7 | 8 | This comprehensive tool extracts and analyzes transaction data from password-protected PDF credit card statements, with a specialized focus on **SaaS service spending analysis** for modern tech companies. 9 | 10 | ## ✨ **Key Features** 11 | 12 | ### 🔐 **PDF Processing** 13 | - **PDF Text Extraction**: Decrypts and extracts text from password-protected PDF files 14 | - **Smart Transaction Parsing**: Automatically identifies and parses transaction data 15 | - **Multi-format Support**: Handles various bank statement formats 16 | 17 | ### 📈 **Advanced Analytics** 18 | - **Smart Categorization**: Categorizes transactions into spending categories (Food, Transportation, Shopping, SaaS, etc.) 19 | - **SaaS-Focused Analysis**: Specialized analysis for enterprise software subscriptions 20 | - **Comprehensive Insights**: Provides detailed spending statistics and optimization recommendations 21 | - **Visual Reports**: Creates professional charts and graphs for spending patterns 22 | 23 | ### 🎯 **SaaS Specialization** 24 | - **AI/ML Tools Tracking**: Monitors spending on Cursor, OpenAI, Anthropic, etc. 25 | - **Subscription Type Analysis**: Differentiates between usage-based vs monthly subscriptions 26 | - **Cost Optimization**: Provides specific recommendations for SaaS spending optimization 27 | - **Duplicate Detection**: Removes duplicate transactions for accurate analysis 28 | 29 | ### 💾 **Export Options** 30 | - **Multiple Formats**: Saves analysis results in CSV, PNG, TXT formats 31 | - **Professional Reports**: Generated with company branding 32 | - **Clean Data**: Deduplicated and categorized transaction data 33 | 34 | ## 🛠 **Requirements** 35 | 36 | - Python 3.7 or higher 37 | - Required packages (install using `pip install -r requirements.txt`) 38 | 39 | ## 📦 **Installation** 40 | 41 | 1. **Clone the repository**: 42 | ```bash 43 | git clone https://github.com/your-username/credit-card-analyzer.git 44 | cd credit-card-analyzer 45 | ``` 46 | 47 | 2. **Install dependencies**: 48 | ```bash 49 | pip install -r requirements.txt 50 | ``` 51 | 52 | 3. 
**Prepare your PDF file**: Place your PDF file in the project directory 53 | 54 | ## 🚀 **Usage** 55 | 56 | ### **Basic Analysis** 57 | ```bash 58 | python extract_pdf_data.py 59 | ``` 60 | 61 | ### **SaaS-Focused Analysis** 62 | ```bash 63 | python saas_analysis.py 64 | ``` 65 | 66 | ### **Clean Data Analysis** 67 | ```bash 68 | python clean_analysis.py 69 | ``` 70 | 71 | ### **Custom Configuration** 72 | Edit the script parameters: 73 | ```python 74 | pdf_path = "your_pdf_file.pdf" 75 | password = "your_password" # Use environment variable in production 76 | ``` 77 | 78 | ## 📁 **Output Files** 79 | 80 | ### **Comprehensive Analysis** 81 | - `saas_spending_analysis.png` - SaaS-focused visualization charts 82 | - `saas_analysis_report.txt` - Detailed SaaS spending report 83 | - `saas_transactions.csv` - Clean SaaS transaction data 84 | 85 | ### **General Analysis** 86 | - `credit_card_analysis.png` - General spending visualization 87 | - `credit_card_analysis_report.txt` - Complete spending analysis 88 | - `clean_transactions.csv` - Processed transaction data 89 | 90 | ## 🏷 **Spending Categories** 91 | 92 | ### **SaaS Categories** 93 | - **AI/ML Tools**: Cursor, OpenAI, Anthropic, Leonardo AI, HeyGen 94 | - **Design Tools**: Figma, Adobe Creative Suite 95 | - **Development Tools**: ReportDash, GitHub, hosting services 96 | - **Cloud Services**: Google Cloud, AWS, Colab 97 | - **Marketing Tools**: ManyChat, analytics platforms 98 | 99 | ### **General Categories** 100 | - **Food & Dining**: Restaurants, convenience stores, supermarkets 101 | - **Transportation**: Public transport, ride-sharing, fuel 102 | - **Entertainment**: Movies, games, recreation 103 | - **Shopping**: Retail, electronics, household items 104 | - **Bills & Utilities**: Utilities, insurance, banking fees 105 | 106 | ## ⚙ **Customization** 107 | 108 | ### **Adding SaaS Services** 109 | ```python 110 | saas_keywords = { 111 | 'Your Category': ['SERVICE_NAME', 'KEYWORD'], 112 | # Add new categories here 113 | } 114 | ``` 115 | 116 | ### **Transaction Pattern Matching** 117 | ```python 118 | patterns = [ 119 | r'(\d{2}/\d{2})\s+(.+?)\s+([\d,]+\.?\d*)', 120 | # Add custom patterns for your bank format 121 | ] 122 | ``` 123 | 124 | ## 📊 **Sample Analysis Report** 125 | 126 | ``` 127 | ================================================ 128 | 企業 SaaS 服務支出分析報告 129 | 第一銀行信用卡帳單 - 2025年5月 130 | The Pocket Company by Accucrazy 131 | ================================================ 132 | 133 | 總體概況: 134 | - SaaS 總支出:NT$ 30,476.00 135 | - 使用服務數量:12 個 136 | - 總交易次數:25 筆 137 | - 平均每筆交易:NT$ 1,219.04 138 | 139 | 前5大 SaaS 服務支出: 140 | 1. ReportDash Analytics: NT$ 9,744.00 (32.0%) 141 | 2. Cursor AI IDE: NT$ 5,591.00 (18.3%) 142 | 3. Figma Design: NT$ 4,357.00 (14.3%) 143 | 4. HeyGen Video AI: NT$ 3,331.00 (10.9%) 144 | 5. 
OpenAI (ChatGPT/API): NT$ 2,934.00 (9.6%) 145 | ``` 146 | 147 | ## 🔒 **Security & Privacy** 148 | 149 | ### **Data Protection** 150 | - All processing is done locally 151 | - No data sent to external servers 152 | - Sensitive files excluded from version control 153 | 154 | ### **Best Practices** 155 | - Use environment variables for passwords 156 | - Delete sensitive files after analysis 157 | - Review `.gitignore` before committing 158 | 159 | ## 🚫 **Files NOT to Upload to GitHub** 160 | 161 | ``` 162 | # Sensitive files - DO NOT COMMIT 163 | *.pdf # Bank statements 164 | *transactions.csv # Real transaction data 165 | *_report.txt # Reports with real data 166 | *.png # Charts with real data (sample charts OK) 167 | ``` 168 | 169 | ## ⚠ **Troubleshooting** 170 | 171 | ### **Common Issues** 172 | 1. **PDF Password Error**: Verify password or use environment variable 173 | 2. **No Transactions Found**: Check PDF format compatibility 174 | 3. **Encoding Issues**: Ensure UTF-8 system encoding 175 | 4. **Font Display**: Install Chinese fonts for proper chart rendering 176 | 177 | ### **Getting Help** 178 | 1. Check extracted text patterns in debug mode 179 | 2. Verify regex patterns match your bank format 180 | 3. Ensure all dependencies are installed 181 | 182 | ## 🤝 **Contributing** 183 | 184 | 1. Fork the repository 185 | 2. Create a feature branch 186 | 3. Add sample data (NOT real financial data) 187 | 4. Submit a pull request 188 | 189 | ## 📄 **License** 190 | 191 | This tool is provided for educational and personal use. Ensure compliance with your financial institution's terms of service. 192 | 193 | ## 🔗 **Company Information** 194 | 195 | **The Pocket Company by Accucrazy** 196 | - Specialized in enterprise SaaS spending optimization 197 | - AI-driven financial analysis tools 198 | - Modern tech company cost management solutions 199 | 200 | --- 201 | 202 | ⚠️ **Important**: This tool processes sensitive financial data. Always follow data protection best practices and never commit real financial information to version control. 
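For reference, a minimal sketch of a secure run using the `PDF_PATH` and `PDF_PASSWORD` environment variables read by `extract_pdf_data.py` (the same names used in the `.env.example` template in `SECURITY_CHECKLIST.md`; substitute your own statement file and password):

```bash
# Linux/Mac
export PDF_PATH=your_statement_file.pdf
export PDF_PASSWORD=your_pdf_password_here
python extract_pdf_data.py

# Windows (Command Prompt)
set PDF_PATH=your_statement_file.pdf
set PDF_PASSWORD=your_pdf_password_here
python extract_pdf_data.py
```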
-------------------------------------------------------------------------------- /clean_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Clean Credit Card Analysis 5 | Cleans the extracted data and provides accurate spending analysis 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from collections import defaultdict 12 | 13 | def clean_transactions(csv_path): 14 | """Clean transaction data by removing outliers and parsing errors""" 15 | df = pd.read_csv(csv_path, encoding='utf-8') 16 | 17 | print(f"Original transactions: {len(df)}") 18 | 19 | # Remove rows with invalid dates 20 | df = df[~df['date'].str.contains('0/0|14/05', na=False)] 21 | 22 | # Remove transactions with unrealistic amounts (likely parsing errors) 23 | # Most credit card transactions should be under NT$100,000 24 | df = df[df['amount_abs'] < 100000] 25 | 26 | # Remove rows with garbled text (cid: patterns) 27 | df = df[~df['description'].str.contains('cid:', na=False)] 28 | 29 | # Remove duplicate transactions (keeping first occurrence) 30 | df = df.drop_duplicates(subset=['date', 'description', 'amount'], keep='first') 31 | 32 | print(f"Cleaned transactions: {len(df)}") 33 | 34 | return df 35 | 36 | def improve_categorization(df): 37 | """Improve transaction categorization""" 38 | 39 | # Enhanced categories with better keywords 40 | categories = { 41 | 'Food & Dining': [ 42 | '餐廳', '食', '飲', '麥當勞', '星巴克', '便利商店', '超市', '7-11', '全家', 43 | '美食', '餐', '飯', '咖啡', 'cama', '杭州小籠包', '養心殿', '京星港式飲茶', 44 | '北村家', '吐司利亞', '優食', 'Subway', '燒肉', '創義麵', '湘川', 45 | '珍蜜咖啡', 'Fake Sober', 'J WOW', '全聯福利中心', '麥當勞' 46 | ], 47 | 'Transportation': [ 48 | '捷運', '公車', '計程車', '加油', '停車', 'UBER', '油站', '交通', 49 | '高鐵', '台鐵', '客運', '台灣大車隊', '優步', 'Taxi' 50 | ], 51 | 'Technology/Software': [ 52 | 'CURSOR', 'ADOBE', 'OPENAI', 'GOOGLE', 'FIGMA', 'HEYGEN', 'SEASALT', 53 | 'REPORTDASH', 'MANYCHAT', 'RSS.APP', 'PADDLE', 'LEONARDO', 'Colab', 54 | 'SPOTIFY', 'ANTHROPIC', 'Gandi', 'APIFY', 'SHOPIFY', 'PCHOME' 55 | ], 56 | 'Shopping': [ 57 | '百貨', '購物', '服飾', '電器', '網購', '商城', 'AMAZON', '買', '購', 58 | '商店', '市場', 'IKEA', '宜家家居', '永昇五金', '今華電子', '源達科技' 59 | ], 60 | 'Entertainment': [ 61 | '電影', '遊戲', '娛樂', 'KTV', '健身', '運動', '書店', '音樂', '錢櫃' 62 | ], 63 | 'Bills & Utilities': [ 64 | '電費', '水費', '瓦斯', '電信', '保險', '銀行', '費用', '帳單', '繳費', 65 | '手續費', '國外交易手續費', 'ATT' 66 | ], 67 | 'Cash/ATM': [ 68 | '提款', 'ATM', '現金', '轉帳', '匯款', '現金回饋', '自動扣繳' 69 | ], 70 | 'Business/Marketing': [ 71 | '全球商務科技', 'LINE Ads', '連加' 72 | ], 73 | 'Other': [] 74 | } 75 | 76 | # Recategorize transactions 77 | for idx, row in df.iterrows(): 78 | description = row['description'] 79 | categorized = False 80 | 81 | for category, keywords in categories.items(): 82 | if category == 'Other': 83 | continue 84 | for keyword in keywords: 85 | if keyword in description: 86 | df.at[idx, 'category'] = category 87 | categorized = True 88 | break 89 | if categorized: 90 | break 91 | 92 | if not categorized: 93 | df.at[idx, 'category'] = 'Other' 94 | 95 | return df 96 | 97 | def generate_clean_analysis(df): 98 | """Generate comprehensive analysis of cleaned data""" 99 | 100 | # Basic statistics 101 | total_spending = df['amount_abs'].sum() 102 | avg_transaction = df['amount_abs'].mean() 103 | median_transaction = df['amount_abs'].median() 104 | num_transactions = len(df) 105 | 106 | # Category analysis 107 | category_spending = 
df.groupby('category')['amount_abs'].sum().sort_values(ascending=False) 108 | category_counts = df['category'].value_counts() 109 | 110 | # Date analysis 111 | spending_by_date = df.groupby('date')['amount_abs'].sum().sort_values(ascending=False) 112 | 113 | # Top merchants 114 | merchant_spending = df.groupby('description')['amount_abs'].sum().sort_values(ascending=False).head(10) 115 | 116 | # Analysis dictionary 117 | analysis = { 118 | 'total_spending': total_spending, 119 | 'average_transaction': avg_transaction, 120 | 'median_transaction': median_transaction, 121 | 'number_of_transactions': num_transactions, 122 | 'category_breakdown': category_spending.to_dict(), 123 | 'category_counts': category_counts.to_dict(), 124 | 'top_transactions': df.nlargest(10, 'amount_abs')[['date', 'description', 'amount', 'category']].to_dict('records'), 125 | 'top_merchants': merchant_spending.to_dict(), 126 | 'spending_by_date': spending_by_date.head(10).to_dict() 127 | } 128 | 129 | return analysis 130 | 131 | def create_clean_visualizations(df, analysis): 132 | """Create improved visualizations""" 133 | plt.style.use('default') 134 | fig, axes = plt.subplots(2, 3, figsize=(18, 12)) 135 | fig.suptitle('Credit Card Spending Analysis - May 2025', fontsize=16, fontweight='bold') 136 | 137 | # 1. Category spending pie chart 138 | category_data = pd.Series(analysis['category_breakdown']) 139 | # Only show categories with more than 1% of total spending 140 | threshold = category_data.sum() * 0.01 141 | category_filtered = category_data[category_data > threshold] 142 | other_amount = category_data[category_data <= threshold].sum() 143 | if other_amount > 0: 144 | category_filtered['Other (Small)'] = other_amount 145 | 146 | colors = plt.cm.Set3(np.linspace(0, 1, len(category_filtered))) 147 | axes[0, 0].pie(category_filtered.values, labels=category_filtered.index, autopct='%1.1f%%', 148 | startangle=90, colors=colors) 149 | axes[0, 0].set_title('Spending by Category') 150 | 151 | # 2. Category spending bar chart 152 | category_data.plot(kind='bar', ax=axes[0, 1], color='skyblue') 153 | axes[0, 1].set_title('Amount by Category') 154 | axes[0, 1].set_xlabel('Category') 155 | axes[0, 1].set_ylabel('Amount (NT$)') 156 | axes[0, 1].tick_params(axis='x', rotation=45) 157 | 158 | # 3. Transaction count by category 159 | cat_counts = pd.Series(analysis['category_counts']) 160 | cat_counts.plot(kind='bar', ax=axes[0, 2], color='lightcoral') 161 | axes[0, 2].set_title('Transaction Count by Category') 162 | axes[0, 2].set_xlabel('Category') 163 | axes[0, 2].set_ylabel('Number of Transactions') 164 | axes[0, 2].tick_params(axis='x', rotation=45) 165 | 166 | # 4. Transaction amount distribution 167 | axes[1, 0].hist(df['amount_abs'], bins=30, edgecolor='black', alpha=0.7, color='lightgreen') 168 | axes[1, 0].set_title('Transaction Amount Distribution') 169 | axes[1, 0].set_xlabel('Amount (NT$)') 170 | axes[1, 0].set_ylabel('Frequency') 171 | 172 | # 5. Top merchants 173 | top_merchants = pd.Series(analysis['top_merchants']).head(8) 174 | top_merchants.plot(kind='barh', ax=axes[1, 1], color='orange') 175 | axes[1, 1].set_title('Top Merchants by Spending') 176 | axes[1, 1].set_xlabel('Amount (NT$)') 177 | 178 | # 6. 
Spending by amount ranges 179 | amount_ranges = ['<100', '100-500', '500-1000', '1000-5000', '5000+'] 180 | range_counts = [ 181 | len(df[df['amount_abs'] < 100]), 182 | len(df[(df['amount_abs'] >= 100) & (df['amount_abs'] < 500)]), 183 | len(df[(df['amount_abs'] >= 500) & (df['amount_abs'] < 1000)]), 184 | len(df[(df['amount_abs'] >= 1000) & (df['amount_abs'] < 5000)]), 185 | len(df[df['amount_abs'] >= 5000]) 186 | ] 187 | 188 | axes[1, 2].bar(amount_ranges, range_counts, color='purple', alpha=0.7) 189 | axes[1, 2].set_title('Transactions by Amount Range') 190 | axes[1, 2].set_xlabel('Amount Range (NT$)') 191 | axes[1, 2].set_ylabel('Number of Transactions') 192 | 193 | plt.tight_layout() 194 | plt.savefig('clean_credit_card_analysis.png', dpi=300, bbox_inches='tight') 195 | print("Clean visualization saved as 'clean_credit_card_analysis.png'") 196 | 197 | def generate_clean_report(analysis): 198 | """Generate a comprehensive cleaned analysis report""" 199 | report = f""" 200 | ==================================== 201 | CLEANED CREDIT CARD SPENDING ANALYSIS 202 | First Bank Statement - May 2025 203 | ==================================== 204 | 205 | SUMMARY STATISTICS: 206 | - Total Spending: NT$ {analysis['total_spending']:,.2f} 207 | - Number of Transactions: {analysis['number_of_transactions']} 208 | - Average Transaction: NT$ {analysis['average_transaction']:,.2f} 209 | - Median Transaction: NT$ {analysis['median_transaction']:,.2f} 210 | 211 | SPENDING BY CATEGORY: 212 | """ 213 | 214 | total = analysis['total_spending'] 215 | for category, amount in analysis['category_breakdown'].items(): 216 | percentage = (amount / total) * 100 217 | count = analysis['category_counts'].get(category, 0) 218 | avg_per_transaction = amount / count if count > 0 else 0 219 | report += f"- {category}: NT$ {amount:,.2f} ({percentage:.1f}%) - {count} transactions (avg: NT$ {avg_per_transaction:,.2f})\n" 220 | 221 | report += f""" 222 | 223 | TOP 10 TRANSACTIONS: 224 | """ 225 | 226 | for i, transaction in enumerate(analysis['top_transactions'], 1): 227 | report += f"{i:2d}. {transaction['date']} | {transaction['description'][:40]:<40} | NT$ {transaction['amount']:>8,.0f} | {transaction['category']}\n" 228 | 229 | report += f""" 230 | 231 | TOP MERCHANTS BY TOTAL SPENDING: 232 | """ 233 | 234 | for i, (merchant, amount) in enumerate(analysis['top_merchants'].items(), 1): 235 | report += f"{i:2d}. {merchant[:50]:<50} | NT$ {amount:>8,.2f}\n" 236 | 237 | report += f""" 238 | 239 | SPENDING INSIGHTS: 240 | - Highest spending category: {max(analysis['category_breakdown'], key=analysis['category_breakdown'].get)} 241 | - Most frequent transaction category: {max(analysis['category_counts'], key=analysis['category_counts'].get)} 242 | - Largest single transaction: NT$ {max(t['amount'] for t in analysis['top_transactions']):,.2f} 243 | - Technology/Software spending: NT$ {analysis['category_breakdown'].get('Technology/Software', 0):,.2f} 244 | - Food & Dining spending: NT$ {analysis['category_breakdown'].get('Food & Dining', 0):,.2f} 245 | 246 | RECOMMENDATIONS: 247 | 1. Monitor Technology/Software subscriptions - this appears to be a major expense category 248 | 2. Consider consolidating software subscriptions to save costs 249 | 3. Track food delivery and dining expenses for budgeting 250 | 4. Review recurring payments for optimization opportunities 251 | 5. 
Set up alerts for transactions over NT$ 5,000 252 | 253 | SPENDING PATTERNS: 254 | - Small transactions (NT$5,000): Focus on technology and entertainment expenses 257 | """ 258 | 259 | return report 260 | 261 | def main(): 262 | """Main function for clean analysis""" 263 | print("Starting Clean Credit Card Analysis...") 264 | 265 | # Load and clean data 266 | df = clean_transactions('transactions.csv') 267 | 268 | if len(df) == 0: 269 | print("No valid transactions found after cleaning") 270 | return 271 | 272 | # Improve categorization 273 | df = improve_categorization(df) 274 | 275 | # Generate analysis 276 | analysis = generate_clean_analysis(df) 277 | 278 | # Create visualizations 279 | create_clean_visualizations(df, analysis) 280 | 281 | # Generate report 282 | report = generate_clean_report(analysis) 283 | print(report) 284 | 285 | # Save files 286 | with open('clean_credit_card_report.txt', 'w', encoding='utf-8') as f: 287 | f.write(report) 288 | 289 | df.to_csv('clean_transactions.csv', index=False, encoding='utf-8') 290 | 291 | print("\n" + "="*60) 292 | print("CLEAN ANALYSIS COMPLETE!") 293 | print("="*60) 294 | print("Files saved:") 295 | print("- clean_credit_card_analysis.png (clean visualizations)") 296 | print("- clean_credit_card_report.txt (detailed clean report)") 297 | print("- clean_transactions.csv (cleaned transaction data)") 298 | print("="*60) 299 | 300 | if __name__ == "__main__": 301 | main() -------------------------------------------------------------------------------- /saas_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 企業 SaaS 服務支出分析 5 | 專門分析 Cursor、OpenAI 等技術工具的支出 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import matplotlib.font_manager as fm 11 | import numpy as np 12 | from collections import defaultdict 13 | 14 | def load_and_filter_saas_data(csv_path): 15 | """載入並篩選 SaaS 相關交易""" 16 | df = pd.read_csv(csv_path, encoding='utf-8') 17 | 18 | # SaaS 服務關鍵字 19 | saas_keywords = { 20 | 'AI/ML Tools': ['CURSOR', 'OPENAI', 'ANTHROPIC', 'LEONARDO', 'HEYGEN'], 21 | 'Design Tools': ['FIGMA', 'ADOBE'], 22 | 'Cloud Services': ['GOOGLE', 'Colab'], 23 | 'Development Tools': ['REPORTDASH', 'Gandi'], 24 | 'Marketing Tools': ['MANYCHAT', 'RSS.APP', 'SEASALT'], 25 | 'Media': ['SPOTIFY', 'PADDLE'] 26 | } 27 | 28 | # 篩選 SaaS 相關交易 29 | saas_transactions = [] 30 | 31 | for idx, row in df.iterrows(): 32 | description = row['description'].upper() 33 | for category, keywords in saas_keywords.items(): 34 | found = False 35 | for keyword in keywords: 36 | if keyword in description: 37 | row_copy = row.copy() 38 | row_copy['saas_category'] = category 39 | row_copy['saas_service'] = keyword.lower() 40 | saas_transactions.append(row_copy) 41 | found = True 42 | break 43 | if found: 44 | break 45 | 46 | if not saas_transactions: 47 | print("未找到 SaaS 相關交易") 48 | return pd.DataFrame() 49 | 50 | saas_df = pd.DataFrame(saas_transactions) 51 | 52 | # 去除重複交易 - 基於 description 和 amount 的組合 53 | print(f"去重前: {len(saas_df)} 筆交易") 54 | 55 | # 創建唯一標識符 56 | saas_df['unique_id'] = saas_df['description'].str.replace('^\d{2}/\d{2} ', '', regex=True) + '_' + saas_df['amount_abs'].astype(str) 57 | 58 | # 去除重複,保留第一筆 59 | saas_df_clean = saas_df.drop_duplicates(subset=['unique_id'], keep='first') 60 | saas_df_clean = saas_df_clean.drop('unique_id', axis=1) 61 | 62 | print(f"去重後: {len(saas_df_clean)} 筆 SaaS 相關交易") 63 | 64 | return saas_df_clean 65 | 66 | def 
extract_service_details(saas_df): 67 | """提取服務詳細信息""" 68 | 69 | service_mapping = { 70 | 'cursor': 'Cursor AI IDE', 71 | 'openai': 'OpenAI (ChatGPT/API)', 72 | 'anthropic': 'Anthropic Claude', 73 | 'leonardo': 'Leonardo AI', 74 | 'heygen': 'HeyGen Video AI', 75 | 'figma': 'Figma Design', 76 | 'adobe': 'Adobe Creative Suite', 77 | 'google': 'Google Cloud/Services', 78 | 'reportdash': 'ReportDash Analytics', 79 | 'gandi': 'Gandi Domain/Hosting', 80 | 'colab': 'Google Colab Pro', 81 | 'manychat': 'ManyChat Marketing', 82 | 'seasalt': 'Seasalt.AI', 83 | 'spotify': 'Spotify Premium', 84 | 'paddle': 'Paddle Payment' 85 | } 86 | 87 | saas_df['service_name'] = saas_df['saas_service'].map(service_mapping).fillna(saas_df['saas_service']) 88 | 89 | # 檢測訂閱類型 90 | def detect_subscription_type(description): 91 | desc_upper = description.upper() 92 | if 'USAGE' in desc_upper: 93 | return '按使用量計費' 94 | elif 'SUBSCR' in desc_upper or 'SUBSCRIPTION' in desc_upper: 95 | return '月度訂閱' 96 | elif any(word in desc_upper for word in ['PRO', 'PREMIUM', 'PLUS']): 97 | return '月度訂閱' 98 | else: 99 | return '一次性/其他' 100 | 101 | saas_df['subscription_type'] = saas_df['description'].apply(detect_subscription_type) 102 | 103 | return saas_df 104 | 105 | def analyze_saas_spending(saas_df): 106 | """分析 SaaS 支出""" 107 | 108 | total_saas_spending = saas_df['amount_abs'].sum() 109 | num_services = saas_df['service_name'].nunique() 110 | num_transactions = len(saas_df) 111 | avg_transaction = saas_df['amount_abs'].mean() 112 | 113 | # 按服務分類統計 114 | category_stats = saas_df.groupby('saas_category').agg({ 115 | 'amount_abs': ['sum', 'count', 'mean'], 116 | 'service_name': 'nunique' 117 | }).round(2) 118 | 119 | # 按具體服務統計 120 | service_stats = saas_df.groupby('service_name').agg({ 121 | 'amount_abs': ['sum', 'count', 'mean'] 122 | }).round(2) 123 | 124 | # 按訂閱類型統計 125 | subscription_stats = saas_df.groupby('subscription_type').agg({ 126 | 'amount_abs': ['sum', 'count', 'mean'] 127 | }).round(2) 128 | 129 | analysis = { 130 | 'total_spending': total_saas_spending, 131 | 'num_services': num_services, 132 | 'num_transactions': num_transactions, 133 | 'avg_transaction': avg_transaction, 134 | 'category_stats': category_stats, 135 | 'service_stats': service_stats, 136 | 'subscription_stats': subscription_stats 137 | } 138 | 139 | return analysis 140 | 141 | def create_saas_visualizations(saas_df, analysis): 142 | """創建 SaaS 支出可視化圖表""" 143 | 144 | # 嘗試找到可用的中文字體 145 | chinese_fonts = ['Microsoft YaHei', 'SimHei', 'KaiTi', 'FangSong', 'Microsoft JhengHei'] 146 | available_font = None 147 | 148 | for font_name in chinese_fonts: 149 | try: 150 | # 檢查字體是否可用 151 | font_files = fm.findSystemFonts() 152 | for font_file in font_files: 153 | try: 154 | font_prop = fm.FontProperties(fname=font_file) 155 | if font_name.lower() in font_prop.get_name().lower(): 156 | available_font = font_name 157 | break 158 | except: 159 | continue 160 | if available_font: 161 | break 162 | except: 163 | continue 164 | 165 | # 如果找不到中文字體,使用英文標題 166 | if available_font: 167 | plt.rcParams['font.sans-serif'] = [available_font, 'Arial', 'DejaVu Sans'] 168 | plt.rcParams['axes.unicode_minus'] = False 169 | use_chinese = True 170 | else: 171 | plt.rcParams['font.family'] = ['Arial', 'DejaVu Sans'] 172 | use_chinese = False 173 | 174 | fig, axes = plt.subplots(2, 3, figsize=(18, 12)) 175 | 176 | if use_chinese: 177 | fig.suptitle('企業 SaaS 服務支出分析 - 2025年5月\nThe Pocket Company by Accucrazy', fontsize=16, fontweight='bold') 178 | else: 179 | fig.suptitle('SaaS Service 
Spending Analysis - May 2025\nThe Pocket Company by Accucrazy', fontsize=16, fontweight='bold') 180 | 181 | # 1. 按服務類別的支出餅圖 182 | category_spending = saas_df.groupby('saas_category')['amount_abs'].sum() 183 | colors = plt.cm.Set3(np.linspace(0, 1, len(category_spending))) 184 | 185 | # 翻譯類別名稱 186 | if use_chinese: 187 | category_labels = { 188 | 'AI/ML Tools': 'AI/ML 工具', 189 | 'Cloud Services': '雲端服務', 190 | 'Design Tools': '設計工具', 191 | 'Development Tools': '開發工具', 192 | 'Marketing Tools': '行銷工具', 193 | 'Media': '媒體工具' 194 | } 195 | display_labels = [category_labels.get(cat, cat) for cat in category_spending.index] 196 | else: 197 | display_labels = category_spending.index 198 | 199 | axes[0, 0].pie(category_spending.values, 200 | labels=display_labels, 201 | autopct='%1.1f%%', 202 | colors=colors, 203 | startangle=90) 204 | 205 | if use_chinese: 206 | axes[0, 0].set_title('SaaS 支出按類別分布') 207 | else: 208 | axes[0, 0].set_title('Spending by SaaS Category') 209 | 210 | # 2. 前10大服務支出條形圖 211 | top_services = saas_df.groupby('service_name')['amount_abs'].sum().sort_values(ascending=True).tail(10) 212 | 213 | axes[0, 1].barh(range(len(top_services)), top_services.values, color='skyblue') 214 | axes[0, 1].set_yticks(range(len(top_services))) 215 | axes[0, 1].set_yticklabels(top_services.index) 216 | 217 | if use_chinese: 218 | axes[0, 1].set_xlabel('支出金額 (NT$)') 219 | axes[0, 1].set_title('前10大 SaaS 服務支出') 220 | else: 221 | axes[0, 1].set_xlabel('Amount (NT$)') 222 | axes[0, 1].set_title('Top 10 SaaS Services by Spending') 223 | 224 | # 3. 訂閱類型分布 225 | subscription_counts = saas_df['subscription_type'].value_counts() 226 | 227 | # 翻譯訂閱類型 228 | if use_chinese: 229 | subscription_labels = subscription_counts.index 230 | else: 231 | subscription_translation = { 232 | '按使用量計費': 'Usage-based', 233 | '月度訂閱': 'Monthly Subscription', 234 | '一次性/其他': 'One-time/Other' 235 | } 236 | subscription_labels = [subscription_translation.get(label, label) for label in subscription_counts.index] 237 | 238 | axes[0, 2].pie(subscription_counts.values, labels=subscription_labels, 239 | autopct='%1.1f%%', startangle=90) 240 | 241 | if use_chinese: 242 | axes[0, 2].set_title('訂閱類型分布') 243 | else: 244 | axes[0, 2].set_title('Subscription Type Distribution') 245 | 246 | # 4. 交易金額分布 247 | axes[1, 0].hist(saas_df['amount_abs'], bins=15, alpha=0.7, color='lightgreen', edgecolor='black') 248 | 249 | if use_chinese: 250 | axes[1, 0].set_xlabel('交易金額 (NT$)') 251 | axes[1, 0].set_ylabel('頻次') 252 | axes[1, 0].set_title('SaaS 交易金額分布') 253 | else: 254 | axes[1, 0].set_xlabel('Transaction Amount (NT$)') 255 | axes[1, 0].set_ylabel('Frequency') 256 | axes[1, 0].set_title('Transaction Amount Distribution') 257 | 258 | # 5. 
AI/ML 工具詳細分析 259 | ai_ml_data = saas_df[saas_df['saas_category'] == 'AI/ML Tools'] 260 | if not ai_ml_data.empty: 261 | ai_spending = ai_ml_data.groupby('service_name')['amount_abs'].sum().sort_values(ascending=True) 262 | 263 | axes[1, 1].barh(range(len(ai_spending)), ai_spending.values, color='orange') 264 | axes[1, 1].set_yticks(range(len(ai_spending))) 265 | axes[1, 1].set_yticklabels(ai_spending.index) 266 | 267 | if use_chinese: 268 | axes[1, 1].set_xlabel('支出金額 (NT$)') 269 | axes[1, 1].set_title('AI/ML 工具支出詳細') 270 | else: 271 | axes[1, 1].set_xlabel('Amount (NT$)') 272 | axes[1, 1].set_title('AI/ML Tools Spending Detail') 273 | else: 274 | no_data_text = '無 AI/ML 工具數據' if use_chinese else 'No AI/ML Tools Data' 275 | axes[1, 1].text(0.5, 0.5, no_data_text, ha='center', va='center', transform=axes[1, 1].transAxes) 276 | 277 | # 6. Cursor 專項分析 278 | cursor_data = saas_df[saas_df['saas_service'] == 'cursor'] 279 | if not cursor_data.empty: 280 | cursor_by_type = cursor_data.groupby('subscription_type')['amount_abs'].sum() 281 | 282 | axes[1, 2].bar(range(len(cursor_by_type)), cursor_by_type.values, color='purple', alpha=0.7) 283 | axes[1, 2].set_xticks(range(len(cursor_by_type))) 284 | 285 | if use_chinese: 286 | type_labels = cursor_by_type.index 287 | axes[1, 2].set_ylabel('支出金額 (NT$)') 288 | axes[1, 2].set_title('Cursor AI 支出按類型') 289 | else: 290 | type_translation = { 291 | '按使用量計費': 'Usage-based', 292 | '月度訂閱': 'Monthly Sub', 293 | '一次性/其他': 'One-time' 294 | } 295 | type_labels = [type_translation.get(label, label) for label in cursor_by_type.index] 296 | axes[1, 2].set_ylabel('Amount (NT$)') 297 | axes[1, 2].set_title('Cursor AI Spending by Type') 298 | 299 | axes[1, 2].set_xticklabels(type_labels, rotation=45) 300 | else: 301 | no_cursor_text = '無 Cursor 數據' if use_chinese else 'No Cursor Data' 302 | axes[1, 2].text(0.5, 0.5, no_cursor_text, ha='center', va='center', transform=axes[1, 2].transAxes) 303 | 304 | plt.tight_layout() 305 | 306 | # 在圖表底部添加公司標識 307 | fig.text(0.5, 0.02, 'The Pocket Company by Accucrazy', ha='center', va='bottom', 308 | fontsize=10, style='italic', alpha=0.7) 309 | 310 | plt.savefig('saas_spending_analysis.png', dpi=300, bbox_inches='tight', facecolor='white') 311 | print("SaaS 分析圖表已保存為 'saas_spending_analysis.png'") 312 | 313 | def generate_saas_report(saas_df, analysis): 314 | """生成 SaaS 支出分析報告""" 315 | 316 | report = f""" 317 | ================================================ 318 | 企業 SaaS 服務支出分析報告 319 | 第一銀行信用卡帳單 - 2025年5月 320 | The Pocket Company by Accucrazy 321 | ================================================ 322 | 323 | 總體概況: 324 | - SaaS 總支出:NT$ {analysis['total_spending']:,.2f} 325 | - 使用服務數量:{analysis['num_services']} 個 326 | - 總交易次數:{analysis['num_transactions']} 筆 327 | - 平均每筆交易:NT$ {analysis['avg_transaction']:,.2f} 328 | 329 | 按服務類別分析: 330 | """ 331 | 332 | for category in analysis['category_stats'].index: 333 | total = analysis['category_stats'].loc[category, ('amount_abs', 'sum')] 334 | count = analysis['category_stats'].loc[category, ('amount_abs', 'count')] 335 | percentage = (total / analysis['total_spending']) * 100 336 | 337 | report += f""" 338 | {category}: 339 | - 總支出:NT$ {total:,.2f} ({percentage:.1f}%) 340 | - 交易次數:{count} 筆 341 | """ 342 | 343 | report += "\n前10大 SaaS 服務支出:\n" 344 | 345 | top_services = analysis['service_stats'].sort_values(('amount_abs', 'sum'), ascending=False).head(10) 346 | 347 | for i, (service, data) in enumerate(top_services.iterrows(), 1): 348 | total = data[('amount_abs', 'sum')] 349 | count = 
data[('amount_abs', 'count')] 350 | percentage = (total / analysis['total_spending']) * 100 351 | 352 | report += f"{i:2d}. {service}: NT$ {total:,.2f} ({percentage:.1f}%) - {count}筆交易\n" 353 | 354 | # AI/ML 工具詳細分析 355 | ai_ml_tools = saas_df[saas_df['saas_category'] == 'AI/ML Tools'] 356 | if not ai_ml_tools.empty: 357 | ai_ml_total = ai_ml_tools['amount_abs'].sum() 358 | 359 | report += f""" 360 | AI/ML 工具詳細分析: 361 | 總支出:NT$ {ai_ml_total:,.2f} 362 | 363 | 具體工具: 364 | """ 365 | 366 | ai_tools_detail = ai_ml_tools.groupby('service_name')['amount_abs'].agg(['sum', 'count']) 367 | 368 | for tool in ai_tools_detail.index: 369 | tool_total = ai_tools_detail.loc[tool, 'sum'] 370 | tool_count = ai_tools_detail.loc[tool, 'count'] 371 | report += f"• {tool}:NT$ {tool_total:,.2f} ({tool_count}筆)\n" 372 | 373 | report += """ 374 | 成本優化建議: 375 | 376 | 1. Cursor AI 使用監控:檢查使用量計費是否合理 377 | 2. OpenAI API 成本控制:設置使用限額 378 | 3. 訂閱整合:檢查重複功能的工具 379 | 4. 定期評估:每月檢查 ROI 380 | 381 | ================================================ 382 | 報告生成:The Pocket Company by Accucrazy 383 | 分析日期:2025年 384 | ================================================ 385 | """ 386 | 387 | return report 388 | 389 | def main(): 390 | """主要執行函數""" 391 | print("開始 SaaS 服務支出分析...") 392 | 393 | saas_df = load_and_filter_saas_data('clean_transactions.csv') 394 | 395 | if saas_df.empty: 396 | print("沒有找到 SaaS 相關交易數據") 397 | return 398 | 399 | saas_df = extract_service_details(saas_df) 400 | analysis = analyze_saas_spending(saas_df) 401 | 402 | create_saas_visualizations(saas_df, analysis) 403 | 404 | report = generate_saas_report(saas_df, analysis) 405 | print(report) 406 | 407 | with open('saas_analysis_report.txt', 'w', encoding='utf-8') as f: 408 | f.write(report) 409 | 410 | saas_df.to_csv('saas_transactions.csv', index=False, encoding='utf-8') 411 | 412 | print("\n" + "="*60) 413 | print("SaaS 分析完成!") 414 | print("="*60) 415 | 416 | if __name__ == "__main__": 417 | main() -------------------------------------------------------------------------------- /extract_pdf_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | PDF Credit Card Statement Analyzer 5 | Extracts and analyzes data from password-protected PDF credit card statements 6 | """ 7 | 8 | import PyPDF2 9 | import pandas as pd 10 | import re 11 | from datetime import datetime 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | from collections import defaultdict 15 | import sys 16 | 17 | def extract_pdf_text(pdf_path, password): 18 | """Extract text from password-protected PDF with improved error handling""" 19 | try: 20 | with open(pdf_path, 'rb') as file: 21 | # Try different PDF readers 22 | try: 23 | pdf_reader = PyPDF2.PdfReader(file) 24 | print(f"PDF has {len(pdf_reader.pages)} pages") 25 | 26 | # Check if PDF is encrypted 27 | if pdf_reader.is_encrypted: 28 | print("PDF is encrypted, attempting to decrypt...") 29 | try: 30 | # Try to decrypt with the password 31 | result = pdf_reader.decrypt(password) 32 | print(f"Decryption result: {result}") 33 | if result == 0: 34 | print("Failed to decrypt PDF with provided password") 35 | return None 36 | except Exception as decrypt_error: 37 | print(f"Decryption error: {decrypt_error}") 38 | return None 39 | 40 | # Extract text from all pages 41 | text = "" 42 | for page_num, page in enumerate(pdf_reader.pages): 43 | try: 44 | page_text = page.extract_text() 45 | text += f"--- Page {page_num + 1} ---\n" 46 | text += 
page_text + "\n\n" 47 | print(f"Extracted {len(page_text)} characters from page {page_num + 1}") 48 | except Exception as page_error: 49 | print(f"Error extracting text from page {page_num + 1}: {page_error}") 50 | continue 51 | 52 | return text 53 | 54 | except Exception as reader_error: 55 | print(f"PyPDF2 error: {reader_error}") 56 | 57 | # Try alternative approach with different parameters 58 | try: 59 | print("Trying alternative PDF reading approach...") 60 | file.seek(0) # Reset file pointer 61 | pdf_reader = PyPDF2.PdfReader(file, strict=False) 62 | 63 | if pdf_reader.is_encrypted: 64 | pdf_reader.decrypt(password) 65 | 66 | text = "" 67 | for page in pdf_reader.pages: 68 | text += page.extract_text() + "\n" 69 | 70 | return text 71 | 72 | except Exception as alt_error: 73 | print(f"Alternative approach failed: {alt_error}") 74 | return None 75 | 76 | except FileNotFoundError: 77 | print(f"PDF file not found: {pdf_path}") 78 | return None 79 | except Exception as e: 80 | print(f"Unexpected error: {str(e)}") 81 | return None 82 | 83 | def debug_pdf_content(pdf_path, password): 84 | """Debug PDF content to understand structure""" 85 | try: 86 | import pdfplumber 87 | print("Trying with pdfplumber...") 88 | 89 | with pdfplumber.open(pdf_path, password=password) as pdf: 90 | print(f"PDF opened successfully with pdfplumber. Pages: {len(pdf.pages)}") 91 | 92 | text = "" 93 | for page_num, page in enumerate(pdf.pages): 94 | page_text = page.extract_text() 95 | if page_text: 96 | text += f"--- Page {page_num + 1} ---\n" 97 | text += page_text + "\n\n" 98 | print(f"Page {page_num + 1}: {len(page_text)} characters") 99 | 100 | return text 101 | 102 | except ImportError: 103 | print("pdfplumber not available, installing...") 104 | import subprocess 105 | subprocess.check_call([sys.executable, "-m", "pip", "install", "pdfplumber"]) 106 | 107 | # Try again after installation 108 | try: 109 | import pdfplumber 110 | with pdfplumber.open(pdf_path, password=password) as pdf: 111 | text = "" 112 | for page in pdf.pages: 113 | page_text = page.extract_text() 114 | if page_text: 115 | text += page_text + "\n" 116 | return text 117 | except Exception as e: 118 | print(f"pdfplumber also failed: {e}") 119 | return None 120 | 121 | except Exception as e: 122 | print(f"pdfplumber error: {e}") 123 | return None 124 | 125 | def parse_transactions(text): 126 | """Parse transactions from extracted text with enhanced patterns""" 127 | transactions = [] 128 | 129 | # Print first 2000 characters for debugging 130 | print("First 2000 characters of extracted text:") 131 | print("=" * 50) 132 | print(text[:2000]) 133 | print("=" * 50) 134 | 135 | # Enhanced patterns for Taiwanese bank statements 136 | patterns = [ 137 | # Pattern for MM/DD format with Chinese description 138 | r'(\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 139 | # Pattern for YYYY/MM/DD format 140 | r'(\d{4}/\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 141 | # Pattern with transaction codes 142 | r'(\d{2}/\d{2})\s+(\d+)\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+([\d,]+\.?\d*)', 143 | # Pattern for transactions with negative amounts 144 | r'(\d{2}/\d{2})\s+([\u4e00-\u9fff\w\s\-\*\.]+?)\s+\-?([\d,]+\.?\d*)', 145 | # Generic pattern for any transaction-like data 146 | r'(\d{1,2}/\d{1,2})\s+(.+?)\s+([\d,]+\.?\d*)', 147 | # Pattern with extra spaces 148 | r'(\d{2}/\d{2})\s{2,}([\u4e00-\u9fff\w\s\-\*\.]+?)\s{2,}([\d,]+\.?\d*)', 149 | ] 150 | 151 | for pattern_num, pattern in enumerate(patterns, 1): 152 | print(f"Trying 
pattern {pattern_num}: {pattern}") 153 | matches = re.findall(pattern, text, re.MULTILINE | re.DOTALL) 154 | print(f"Found {len(matches)} matches with pattern {pattern_num}") 155 | 156 | for match in matches: 157 | try: 158 | if len(match) >= 3: 159 | date_str = match[0] 160 | description = match[-2].strip() # Second to last element 161 | amount_str = match[-1].strip() # Last element 162 | 163 | # Clean up amount string 164 | amount_str = re.sub(r'[^\d,.]', '', amount_str) 165 | if amount_str: 166 | amount = float(amount_str.replace(',', '')) 167 | 168 | transaction = { 169 | 'date': date_str, 170 | 'description': description, 171 | 'amount': amount 172 | } 173 | transactions.append(transaction) 174 | print(f"Added transaction: {date_str} | {description[:30]}... | {amount}") 175 | 176 | except (ValueError, IndexError) as e: 177 | print(f"Error parsing match {match}: {e}") 178 | continue 179 | 180 | # Remove duplicates 181 | unique_transactions = [] 182 | seen = set() 183 | for t in transactions: 184 | key = (t['date'], t['description'], t['amount']) 185 | if key not in seen: 186 | seen.add(key) 187 | unique_transactions.append(t) 188 | 189 | print(f"Total unique transactions found: {len(unique_transactions)}") 190 | return unique_transactions 191 | 192 | def categorize_transactions(transactions): 193 | """Categorize transactions based on description""" 194 | categories = { 195 | 'Food & Dining': ['餐廳', '食', '飲', '麥當勞', '星巴克', '便利商店', '超市', '7-11', '全家', '美食', '餐', '飯', '咖啡'], 196 | 'Transportation': ['捷運', '公車', '計程車', '加油', '停車', 'UBER', '油站', '交通', '高鐵', '台鐵', '客運'], 197 | 'Shopping': ['百貨', '購物', '服飾', '電器', '網購', '商城', 'AMAZON', '買', '購', '商店', '市場'], 198 | 'Entertainment': ['電影', '遊戲', '娛樂', 'KTV', '健身', '運動', '書店', '音樂'], 199 | 'Bills & Utilities': ['電費', '水費', '瓦斯', '電信', '保險', '銀行', '費用', '帳單', '繳費'], 200 | 'Healthcare': ['醫院', '診所', '藥局', '健康', '醫療', '牙科', '眼科'], 201 | 'Education': ['學校', '補習', '書店', '文具', '教育', '學費'], 202 | 'Travel': ['飯店', '機票', '旅遊', '住宿', '旅行', 'HOTEL'], 203 | 'Cash/ATM': ['提款', 'ATM', '現金', '轉帳', '匯款'], 204 | 'Other': [] 205 | } 206 | 207 | for transaction in transactions: 208 | description = transaction['description'] 209 | categorized = False 210 | 211 | for category, keywords in categories.items(): 212 | if category == 'Other': 213 | continue 214 | for keyword in keywords: 215 | if keyword in description: 216 | transaction['category'] = category 217 | categorized = True 218 | break 219 | if categorized: 220 | break 221 | 222 | if not categorized: 223 | transaction['category'] = 'Other' 224 | 225 | return transactions 226 | 227 | def analyze_spending(transactions): 228 | """Analyze spending patterns""" 229 | df = pd.DataFrame(transactions) 230 | 231 | if df.empty: 232 | print("No transactions found to analyze") 233 | return None, None 234 | 235 | print(f"Analyzing {len(df)} transactions...") 236 | 237 | # Convert amount to absolute value for spending analysis 238 | df['amount_abs'] = df['amount'].abs() 239 | 240 | # Basic statistics 241 | total_spending = df['amount_abs'].sum() 242 | avg_transaction = df['amount_abs'].mean() 243 | num_transactions = len(df) 244 | 245 | # Category analysis 246 | category_spending = df.groupby('category')['amount_abs'].sum().sort_values(ascending=False) 247 | 248 | # Create analysis report 249 | analysis = { 250 | 'total_spending': total_spending, 251 | 'average_transaction': avg_transaction, 252 | 'number_of_transactions': num_transactions, 253 | 'category_breakdown': category_spending.to_dict(), 254 | 'top_transactions': 
df.nlargest(5, 'amount_abs')[['description', 'amount', 'category']].to_dict('records'), 255 | 'transactions_by_category': df.groupby('category').size().to_dict() 256 | } 257 | 258 | return analysis, df 259 | 260 | def create_visualizations(df, analysis): 261 | """Create spending visualization charts""" 262 | try: 263 | plt.style.use('default') # Use default style instead of seaborn 264 | fig, axes = plt.subplots(2, 2, figsize=(15, 12)) 265 | 266 | # Category spending pie chart 267 | category_data = pd.Series(analysis['category_breakdown']) 268 | axes[0, 0].pie(category_data.values, labels=category_data.index, autopct='%1.1f%%', startangle=90) 269 | axes[0, 0].set_title('Spending by Category') 270 | 271 | # Category spending bar chart 272 | category_data.plot(kind='bar', ax=axes[0, 1]) 273 | axes[0, 1].set_title('Spending Amount by Category') 274 | axes[0, 1].set_xlabel('Category') 275 | axes[0, 1].set_ylabel('Amount (NT$)') 276 | axes[0, 1].tick_params(axis='x', rotation=45) 277 | 278 | # Transaction amount distribution 279 | axes[1, 0].hist(df['amount_abs'], bins=20, edgecolor='black', alpha=0.7) 280 | axes[1, 0].set_title('Transaction Amount Distribution') 281 | axes[1, 0].set_xlabel('Amount (NT$)') 282 | axes[1, 0].set_ylabel('Frequency') 283 | 284 | # Transaction count by category 285 | cat_counts = pd.Series(analysis['transactions_by_category']) 286 | cat_counts.plot(kind='bar', ax=axes[1, 1]) 287 | axes[1, 1].set_title('Number of Transactions by Category') 288 | axes[1, 1].set_xlabel('Category') 289 | axes[1, 1].set_ylabel('Number of Transactions') 290 | axes[1, 1].tick_params(axis='x', rotation=45) 291 | 292 | plt.tight_layout() 293 | plt.savefig('credit_card_analysis.png', dpi=300, bbox_inches='tight') 294 | print("Visualization saved as 'credit_card_analysis.png'") 295 | 296 | except Exception as e: 297 | print(f"Error creating visualizations: {e}") 298 | 299 | def generate_report(analysis): 300 | """Generate a comprehensive analysis report""" 301 | report = f""" 302 | ==================================== 303 | CREDIT CARD SPENDING ANALYSIS REPORT 304 | ==================================== 305 | 306 | SUMMARY STATISTICS: 307 | - Total Spending: NT$ {analysis['total_spending']:,.2f} 308 | - Number of Transactions: {analysis['number_of_transactions']} 309 | - Average Transaction Amount: NT$ {analysis['average_transaction']:,.2f} 310 | 311 | SPENDING BY CATEGORY: 312 | """ 313 | 314 | for category, amount in analysis['category_breakdown'].items(): 315 | percentage = (amount / analysis['total_spending']) * 100 316 | count = analysis['transactions_by_category'].get(category, 0) 317 | report += f"- {category}: NT$ {amount:,.2f} ({percentage:.1f}%) - {count} transactions\n" 318 | 319 | report += f""" 320 | TOP 5 TRANSACTIONS: 321 | """ 322 | 323 | for i, transaction in enumerate(analysis['top_transactions'], 1): 324 | report += f"{i}. 
{transaction['description']}: NT$ {transaction['amount']:,.2f} ({transaction['category']})\n" 325 | 326 | report += f""" 327 | 328 | INSIGHTS & RECOMMENDATIONS: 329 | - Largest spending category: {max(analysis['category_breakdown'], key=analysis['category_breakdown'].get)} 330 | - Most frequent transaction category: {max(analysis['transactions_by_category'], key=analysis['transactions_by_category'].get)} 331 | - Consider setting budgets for high-spending categories 332 | - Review recurring transactions for potential savings 333 | - Monitor transaction patterns for unusual activity 334 | 335 | SPENDING BREAKDOWN: 336 | """ 337 | 338 | for category in sorted(analysis['category_breakdown'].keys()): 339 | amount = analysis['category_breakdown'][category] 340 | count = analysis['transactions_by_category'].get(category, 0) 341 | avg_per_transaction = amount / count if count > 0 else 0 342 | report += f"- {category}: {count} transactions, avg NT$ {avg_per_transaction:.2f} per transaction\n" 343 | 344 | return report 345 | 346 | def main(): 347 | """Main function to run the analysis""" 348 | print("=" * 60) 349 | print("The Pocket Company by Accucrazy") 350 | print("Credit Card Statement PDF Analyzer") 351 | print("=" * 60) 352 | 353 | # SECURITY: Use environment variables for sensitive data 354 | import os 355 | 356 | pdf_path = os.getenv('PDF_PATH', 'your_bank_statement.pdf') 357 | password = os.getenv('PDF_PASSWORD') 358 | 359 | # Check if using default values (not secure for production) 360 | if not password: 361 | print("⚠️ ERROR: PDF_PASSWORD environment variable not set.") 362 | print("For security, set the environment variable:") 363 | print(" Windows: set PDF_PASSWORD=your_actual_password") 364 | print(" Linux/Mac: export PDF_PASSWORD=your_actual_password") 365 | print("\nExample usage:") 366 | print(" set PDF_PASSWORD=12345678") 367 | print(" python extract_pdf_data.py") 368 | return 369 | 370 | if pdf_path == 'your_bank_statement.pdf': 371 | print("⚠️ WARNING: Using default PDF path. Set PDF_PATH environment variable.") 372 | print(" Windows: set PDF_PATH=your_statement.pdf") 373 | print(" Linux/Mac: export PDF_PATH=your_statement.pdf") 374 | # Don't return here, use default filename 375 | pdf_path = "bank_statement.pdf" 376 | 377 | print(f"\n📄 PDF Path: {pdf_path}") 378 | print("🔐 Password: [PROTECTED]") 379 | print("Starting PDF Credit Card Analysis...") 380 | 381 | print("\nAttempting to extract text from PDF...") 382 | text = extract_pdf_text(pdf_path, password) 383 | 384 | if not text: 385 | print("PyPDF2 failed, trying alternative method...") 386 | text = debug_pdf_content(pdf_path, password) 387 | 388 | if not text: 389 | print("Failed to extract text from PDF using all available methods") 390 | print("Please check:") 391 | print("1. PDF file exists and is readable") 392 | print("2. Password is correct") 393 | print("3. PDF is not corrupted") 394 | return 395 | 396 | print(f"\nSuccessfully extracted {len(text)} characters from PDF") 397 | 398 | print("\nParsing transactions...") 399 | transactions = parse_transactions(text) 400 | 401 | if not transactions: 402 | print("No transactions found. 
The PDF format might not match expected patterns.") 403 | # Save extracted text for manual review 404 | with open('extracted_text.txt', 'w', encoding='utf-8') as f: 405 | f.write(text) 406 | print("Extracted text saved to 'extracted_text.txt' for manual review") 407 | return 408 | 409 | print(f"\nFound {len(transactions)} transactions") 410 | 411 | print("Categorizing transactions...") 412 | transactions = categorize_transactions(transactions) 413 | 414 | print("Analyzing spending patterns...") 415 | analysis, df = analyze_spending(transactions) 416 | 417 | if analysis: 418 | print("Generating visualizations...") 419 | create_visualizations(df, analysis) 420 | 421 | print("Generating report...") 422 | report = generate_report(analysis) 423 | print(report) 424 | 425 | # Save report to file 426 | with open('credit_card_analysis_report.txt', 'w', encoding='utf-8') as f: 427 | f.write(report) 428 | 429 | # Save transactions to CSV 430 | df.to_csv('transactions.csv', index=False, encoding='utf-8') 431 | 432 | print("\n" + "="*50) 433 | print("ANALYSIS COMPLETE!") 434 | print("="*50) 435 | print("Files saved:") 436 | print("- credit_card_analysis.png (visualization charts)") 437 | print("- credit_card_analysis_report.txt (detailed report)") 438 | print("- transactions.csv (transaction data)") 439 | print("="*50) 440 | else: 441 | print("Analysis failed - no valid transactions found") 442 | 443 | if __name__ == "__main__": 444 | main() --------------------------------------------------------------------------------