├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── gmail-to-sqlite ├── gmail_to_sqlite ├── __init__.py ├── __main__.py ├── auth.py ├── constants.py ├── db.py ├── main.py ├── message.py ├── migrations.py ├── schema_migrations │ ├── __init__.py │ └── v1_add_is_deleted_column.py └── sync.py ├── main.py ├── pyproject.toml ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── test_db.py ├── test_message.py └── test_migrations.py └── uv.lock /.gitignore: -------------------------------------------------------------------------------- 1 | # Application specific 2 | credentials.json 3 | token.json 4 | *.db 5 | *.sqlite 6 | *.sqlite3 7 | data/ 8 | 9 | # Python 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | *.so 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # Virtual environments 33 | .venv/ 34 | venv/ 35 | .env 36 | 37 | # IDE 38 | .vscode/ 39 | .idea/ 40 | *.swp 41 | *.swo 42 | 43 | # Testing 44 | .pytest_cache/ 45 | .coverage 46 | htmlcov/ 47 | .tox/ 48 | .cache 49 | 50 | # OS specific 51 | .DS_Store 52 | Thumbs.db 53 | 54 | # mypy 55 | .mypy_cache/ 56 | .dmypy.json 57 | dmypy.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2023 Marc Boeker 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and 
this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include pyproject.toml 4 | include uv.lock 5 | recursive-include gmail_to_sqlite *.py 6 | recursive-include tests *.py 7 | global-exclude *.pyc 8 | global-exclude __pycache__ 9 | global-exclude .pytest_cache 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help install install-dev test lint format clean build upload 2 | 3 | help: ## Show this help message 4 | @echo "Available commands:" 5 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' 6 | 7 | install: ## Install the package 8 | uv sync 9 | 10 | install-dev: ## Install the package with development dependencies 11 | uv sync --dev 12 | 13 | test: ## Run tests 14 | uv run pytest tests/ -v 15 | 16 | test-cov: ## Run tests with coverage 17 | uv run pytest tests/ --cov=gmail_to_sqlite --cov-report=html --cov-report=term 18 | 19 | lint: ## Run linting 20 | uv run flake8 gmail_to_sqlite tests 21 | uv run mypy gmail_to_sqlite 22 | 23 | format: ## Format code 24 | uv run black gmail_to_sqlite tests 25 | 26 | 
format-check: ## Check code formatting 27 | uv run black --check gmail_to_sqlite tests 28 | 29 | clean: ## Clean build artifacts 30 | rm -rf build/ 31 | rm -rf dist/ 32 | rm -rf *.egg-info/ 33 | find . -type d -name __pycache__ -exec rm -rf {} + 34 | find . -type f -name "*.pyc" -delete 35 | 36 | build: ## Build the package 37 | uv build 38 | 39 | upload: ## Upload to PyPI (requires authentication) 40 | uv publish 41 | 42 | dev-setup: ## Set up development environment 43 | uv sync --dev 44 | uv run pre-commit install 45 | 46 | run: ## Run the application (requires credentials.json) 47 | uv run python -m gmail_to_sqlite 48 | 49 | run-cli: ## Run via installed CLI command 50 | uv run gmail-to-sqlite 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gmail to SQLite 2 | 3 | A robust Python application that syncs Gmail messages to a local SQLite database for analysis and archival purposes. 4 | 5 | ## Features 6 | 7 | - **Incremental Sync**: Only downloads new messages by default 8 | - **Full Sync**: Option to download all messages and detect deletions 9 | - **Parallel Processing**: Multi-threaded message fetching for improved performance 10 | - **Robust Error Handling**: Automatic retries with exponential backoff 11 | - **Graceful Shutdown**: Handles interruption signals cleanly 12 | - **Type Safety**: Comprehensive type hints throughout the codebase 13 | 14 | ## Installation 15 | 16 | ### Prerequisites 17 | 18 | - Python 3.8 or higher 19 | - Google Cloud Project with Gmail API enabled 20 | - OAuth 2.0 credentials file (`credentials.json`) 21 | 22 | ### Setup 23 | 24 | 1. **Clone the repository:** 25 | 26 | ```bash 27 | git clone https://github.com/marcboeker/gmail-to-sqlite.git 28 | cd gmail-to-sqlite 29 | ``` 30 | 31 | 2. **Install dependencies:** 32 | 33 | ```bash 34 | # Using uv 35 | uv sync 36 | ``` 37 | 38 | 3. 
**Set up Gmail API credentials:** 39 | - Go to the [Google Cloud Console](https://console.cloud.google.com/) 40 | - Create a new project or select an existing one 41 | - Enable the Gmail API 42 | - Create OAuth 2.0 credentials (Desktop application) 43 | - Download the credentials file and save it as `credentials.json` in the project root 44 | 45 | ## Usage 46 | 47 | ### Basic Commands 48 | 49 | ```bash 50 | # Incremental sync (default) 51 | python main.py sync --data-dir ./data 52 | 53 | # Full sync with deletion detection 54 | python main.py sync --data-dir ./data --full-sync 55 | 56 | # Sync a specific message 57 | python main.py sync-message --data-dir ./data --message-id MESSAGE_ID 58 | 59 | # Detect and mark deleted messages only 60 | python main.py sync-deleted-messages --data-dir ./data 61 | 62 | # Use custom number of worker threads 63 | python main.py sync --data-dir ./data --workers 8 64 | ``` 65 | 66 | ### Command Line Arguments 67 | 68 | - `command`: Required. One of `sync`, `sync-message`, or `sync-deleted-messages` 69 | - `--data-dir`: Required. Directory where the SQLite database will be stored 70 | - `--full-sync`: Optional. Forces a complete sync of all messages 71 | - `--message-id`: Required for `sync-message`. The ID of a specific message to sync 72 | - `--workers`: Optional. Number of worker threads (default: number of CPU cores) 73 | 74 | ### Graceful Shutdown 75 | 76 | The application supports graceful shutdown when you press CTRL+C: 77 | 78 | 1. Stops accepting new tasks 79 | 2. Waits for currently running tasks to complete 80 | 3. Saves progress of completed work 81 | 4. Exits cleanly 82 | 83 | Pressing CTRL+C a second time will force an immediate exit. 
84 | 85 | ## Database Schema 86 | 87 | The application creates a SQLite database with the following schema: 88 | 89 | | Field | Type | Description | 90 | | ------------ | -------- | -------------------------------- | 91 | | message_id | TEXT | Unique Gmail message ID | 92 | | thread_id | TEXT | Gmail thread ID | 93 | | sender | JSON | Sender information (name, email) | 94 | | recipients | JSON | Recipients by type (to, cc, bcc) | 95 | | labels | JSON | Array of Gmail labels | 96 | | subject | TEXT | Message subject | 97 | | body | TEXT | Message body (plain text) | 98 | | size | INTEGER | Message size in bytes | 99 | | timestamp | DATETIME | Message timestamp | 100 | | is_read | BOOLEAN | Read status | 101 | | is_outgoing | BOOLEAN | Whether sent by user | 102 | | is_deleted | BOOLEAN | Whether deleted from Gmail | 103 | | last_indexed | DATETIME | Last sync timestamp | 104 | 105 | ## Example queries 106 | 107 | ### Get the number of emails per sender 108 | 109 | ```sql 110 | SELECT sender->>'$.email', COUNT(*) AS count 111 | FROM messages 112 | GROUP BY sender->>'$.email' 113 | ORDER BY count DESC 114 | ``` 115 | 116 | ### Show the number of unread emails by sender 117 | 118 | This is great to determine who is spamming you the most with uninteresting emails. 
119 | 120 | ```sql 121 | SELECT sender->>'$.email', COUNT(*) AS count 122 | FROM messages 123 | WHERE is_read = 0 124 | GROUP BY sender->>'$.email' 125 | ORDER BY count DESC 126 | ``` 127 | 128 | ### Get the number of emails for a specific period 129 | 130 | - For years: `strftime('%Y', timestamp)` 131 | - For months in a year: `strftime('%m', timestamp)` 132 | - For days in a month: `strftime('%d', timestamp)` 133 | - For weekdays: `strftime('%w', timestamp)` 134 | - For hours in a day: `strftime('%H', timestamp)` 135 | 136 | ```sql 137 | SELECT strftime('%Y', timestamp) AS period, COUNT(*) AS count 138 | FROM messages 139 | GROUP BY period 140 | ORDER BY count DESC 141 | ``` 142 | 143 | ### Find all newsletters and group them by sender 144 | 145 | This is an amateurish way to find all newsletters and group them by sender. It's not perfect, but it's a start. You could also use 146 | 147 | ```sql 148 | SELECT sender->>'$.email', COUNT(*) AS count 149 | FROM messages 150 | WHERE body LIKE '%newsletter%' OR body LIKE '%unsubscribe%' 151 | GROUP BY sender->>'$.email' 152 | ORDER BY count DESC 153 | ``` 154 | 155 | ### Show who has sent the largest emails in MB 156 | 157 | ```sql 158 | SELECT sender->>'$.email', sum(size)/1024/1024 AS size 159 | FROM messages 160 | GROUP BY sender->>'$.email' 161 | ORDER BY size DESC 162 | ``` 163 | 164 | ### Count the number of emails that I have sent to myself 165 | 166 | ```sql 167 | SELECT count(*) 168 | FROM messages 169 | WHERE EXISTS ( 170 | SELECT 1 171 | FROM json_each(messages.recipients->'$.to') 172 | WHERE json_extract(value, '$.email') = 'foo@example.com' 173 | ) 174 | AND sender->>'$.email' = 'foo@example.com' 175 | ``` 176 | 177 | ### List the senders who have sent me the largest total volume of emails in megabytes 178 | 179 | ```sql 180 | SELECT sender->>'$.email', sum(size)/1024/1024 as total_size 181 | FROM messages 182 | WHERE is_outgoing=false 183 | GROUP BY sender->>'$.email' 184 | ORDER BY total_size DESC 185 | 
``` 186 | 187 | ### Find all deleted messages 188 | 189 | ```sql 190 | SELECT message_id, subject, timestamp 191 | FROM messages 192 | WHERE is_deleted=1 193 | ORDER BY timestamp DESC 194 | ``` 195 | -------------------------------------------------------------------------------- /gmail-to-sqlite: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Gmail to SQLite CLI entry point. 4 | 5 | This script provides a command-line interface for the gmail-to-sqlite package. 6 | """ 7 | 8 | import sys 9 | from gmail_to_sqlite.main import main 10 | 11 | if __name__ == "__main__": 12 | sys.exit(main()) 13 | -------------------------------------------------------------------------------- /gmail_to_sqlite/__init__.py: -------------------------------------------------------------------------------- 1 | """Gmail to SQLite package. 2 | 3 | A robust Python application that syncs Gmail messages to a local SQLite database 4 | for analysis and archival purposes. 
5 | """ 6 | 7 | __version__ = "0.2.0" 8 | 9 | from .auth import get_credentials 10 | from .db import init, Message, create_message, get_all_message_ids 11 | from .sync import all_messages, single_message, get_labels 12 | 13 | __all__ = [ 14 | "get_credentials", 15 | "init", 16 | "Message", 17 | "create_message", 18 | "get_all_message_ids", 19 | "all_messages", 20 | "single_message", 21 | "get_labels", 22 | ] 23 | -------------------------------------------------------------------------------- /gmail_to_sqlite/__main__.py: -------------------------------------------------------------------------------- 1 | """CLI entry point for gmail-to-sqlite when run as a module.""" 2 | 3 | from gmail_to_sqlite.main import main 4 | 5 | if __name__ == "__main__": 6 | main() 7 | -------------------------------------------------------------------------------- /gmail_to_sqlite/auth.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Optional 3 | 4 | from google.auth.transport.requests import Request 5 | from google.oauth2.credentials import Credentials 6 | from google_auth_oauthlib.flow import InstalledAppFlow 7 | 8 | from .constants import GMAIL_SCOPES, OAUTH2_CREDENTIALS_FILE, TOKEN_FILE_NAME 9 | 10 | 11 | class AuthenticationError(Exception): 12 | """Custom exception for authentication-related errors.""" 13 | 14 | pass 15 | 16 | 17 | def get_credentials(data_dir: str) -> Any: 18 | """ 19 | Retrieves the authentication credentials for the specified data_dir by either loading 20 | them from the token file or by running the authentication flow. 21 | 22 | Args: 23 | data_dir (str): The path where to store data. 24 | 25 | Returns: 26 | Any: The authentication credentials (compatible with Google API clients). 27 | 28 | Raises: 29 | AuthenticationError: If credentials cannot be obtained or are invalid. 30 | FileNotFoundError: If the OAuth2 credentials file is not found. 
31 | """ 32 | if not os.path.exists(OAUTH2_CREDENTIALS_FILE): 33 | raise FileNotFoundError(f"{OAUTH2_CREDENTIALS_FILE} not found") 34 | 35 | token_file_path = os.path.join(data_dir, TOKEN_FILE_NAME) 36 | creds: Optional[Any] = None 37 | 38 | # Load existing credentials if available 39 | if os.path.exists(token_file_path): 40 | try: 41 | creds = Credentials.from_authorized_user_file(token_file_path, GMAIL_SCOPES) 42 | except Exception as e: 43 | raise AuthenticationError(f"Failed to load existing credentials: {e}") 44 | 45 | # Refresh or obtain new credentials if needed 46 | if not creds or not creds.valid: 47 | if creds and creds.expired and creds.refresh_token: 48 | try: 49 | creds.refresh(Request()) 50 | except Exception as e: 51 | raise AuthenticationError(f"Failed to refresh credentials: {e}") 52 | else: 53 | try: 54 | flow = InstalledAppFlow.from_client_secrets_file( 55 | OAUTH2_CREDENTIALS_FILE, GMAIL_SCOPES 56 | ) 57 | # The flow returns credentials that may be of different types 58 | # but all are compatible with the API usage 59 | flow_creds = flow.run_local_server(port=0) 60 | creds = flow_creds 61 | except Exception as e: 62 | raise AuthenticationError(f"Failed to obtain new credentials: {e}") 63 | 64 | # Save credentials for future use 65 | if creds: 66 | try: 67 | with open(token_file_path, "w") as token: 68 | token.write(creds.to_json()) 69 | except Exception as e: 70 | raise AuthenticationError(f"Failed to save credentials: {e}") 71 | 72 | if not creds: 73 | raise AuthenticationError("Failed to obtain valid credentials") 74 | 75 | return creds 76 | -------------------------------------------------------------------------------- /gmail_to_sqlite/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants and configuration values for the Gmail to SQLite application. 
3 | """ 4 | 5 | from typing import List 6 | 7 | # API Configuration 8 | GMAIL_API_VERSION: str = "v1" 9 | GMAIL_SCOPES: List[str] = ["https://www.googleapis.com/auth/gmail.readonly"] 10 | OAUTH2_CREDENTIALS_FILE: str = "credentials.json" 11 | TOKEN_FILE_NAME: str = "token.json" 12 | DATABASE_FILE_NAME: str = "messages.db" 13 | 14 | # Sync Configuration 15 | MAX_RESULTS_PER_PAGE: int = 500 16 | DEFAULT_WORKERS: int = 4 17 | MAX_RETRY_ATTEMPTS: int = 3 18 | RETRY_DELAY_SECONDS: int = 5 19 | 20 | # MIME Types for email body extraction 21 | SUPPORTED_MIME_TYPES: List[str] = [ 22 | "text/html", 23 | "text/plain", 24 | "multipart/related", 25 | "multipart/alternative", 26 | ] 27 | 28 | # Logging Configuration 29 | LOG_FORMAT: str = "%(asctime)s - %(levelname)s: %(message)s" 30 | PROGRESS_LOG_INTERVAL: int = 50 31 | COLLECTION_LOG_INTERVAL: int = 100 32 | -------------------------------------------------------------------------------- /gmail_to_sqlite/db.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | from typing import Any, List, Optional 4 | 5 | from peewee import ( 6 | BooleanField, 7 | DateTimeField, 8 | IntegerField, 9 | Model, 10 | Proxy, 11 | TextField, 12 | SQL, 13 | ) 14 | from playhouse.sqlite_ext import JSONField, SqliteDatabase 15 | 16 | from .constants import DATABASE_FILE_NAME 17 | 18 | database_proxy = Proxy() 19 | 20 | 21 | class DatabaseError(Exception): 22 | """Custom exception for database-related errors.""" 23 | 24 | pass 25 | 26 | 27 | class SchemaVersion(Model): 28 | """ 29 | Represents the database schema version. 30 | 31 | Attributes: 32 | version (IntegerField): The current schema version number. 33 | 34 | Meta: 35 | database (Database): The database connection to use. 36 | db_table (str): The name of the database table for storing schema version. 
37 | """ 38 | 39 | version = IntegerField() 40 | 41 | class Meta: 42 | database = database_proxy 43 | db_table = "schema_version" 44 | 45 | 46 | class Message(Model): 47 | """ 48 | Represents an email message. 49 | 50 | Attributes: 51 | message_id (TextField): The unique identifier of the message. 52 | thread_id (TextField): The unique identifier of the thread. 53 | sender (JSONField): The sender of the message. 54 | recipients (JSONField): The recipients of the message. 55 | labels (JSONField): The labels of the message. 56 | subject (TextField): The subject of the message. 57 | body (TextField): The last messages sent or received without all other replies to the thread. 58 | size (IntegerField): The size of the message. 59 | timestamp (DateTimeField): The timestamp of the message. 60 | is_read (BooleanField): Indicates whether the message has been read. 61 | is_outgoing BooleanField(): Indicates whether the message was sent by the user. 62 | is_deleted (BooleanField): Indicates whether the message has been deleted from Gmail. 63 | last_indexed (DateTimeField): The timestamp when the message was last indexed. 64 | 65 | Meta: 66 | database (Database): The database connection to use. 67 | db_table (str): The name of the database table for storing messages. 68 | """ 69 | 70 | message_id = TextField(unique=True) 71 | thread_id = TextField() 72 | sender = JSONField() 73 | recipients = JSONField() 74 | labels = JSONField() 75 | subject = TextField(null=True) 76 | body = TextField(null=True) 77 | size = IntegerField() 78 | timestamp = DateTimeField() 79 | is_read = BooleanField() 80 | is_outgoing = BooleanField() 81 | is_deleted = BooleanField(default=False) 82 | last_indexed = DateTimeField() 83 | 84 | class Meta: 85 | database = database_proxy 86 | db_table = "messages" 87 | 88 | 89 | def init(data_dir: str, enable_logging: bool = False) -> SqliteDatabase: 90 | """ 91 | Initialize the database for the given data_dir. 
def init(data_dir: str, enable_logging: bool = False) -> SqliteDatabase:
    """
    Open (and create if needed) the SQLite database under data_dir.

    Binds the database proxy, creates the tables, optionally enables
    peewee debug logging, and applies any pending schema migrations.

    Args:
        data_dir (str): The path where to store the data.
        enable_logging (bool, optional): Whether to enable logging. Defaults to False.

    Returns:
        SqliteDatabase: The initialized database object.

    Raises:
        DatabaseError: If database initialization or migration fails.
    """
    try:
        database = SqliteDatabase(f"{data_dir}/{DATABASE_FILE_NAME}")
        database_proxy.initialize(database)
        database.create_tables([Message, SchemaVersion])

        if enable_logging:
            peewee_logger = logging.getLogger("peewee")
            peewee_logger.setLevel(logging.DEBUG)
            peewee_logger.addHandler(logging.StreamHandler())

        # Imported lazily to avoid a circular import at module load time.
        from .migrations import run_migrations

        if not run_migrations():
            raise DatabaseError("Failed to run database migrations")

        return database
    except Exception as e:
        raise DatabaseError(f"Failed to initialize database: {e}")
133 | """ 134 | try: 135 | last_indexed = datetime.now() 136 | Message.insert( 137 | message_id=msg.id, 138 | thread_id=msg.thread_id, 139 | sender=msg.sender, 140 | recipients=msg.recipients, 141 | labels=msg.labels, 142 | subject=msg.subject, 143 | body=msg.body, 144 | size=msg.size, 145 | timestamp=msg.timestamp, 146 | is_read=msg.is_read, 147 | is_outgoing=msg.is_outgoing, 148 | is_deleted=False, 149 | last_indexed=last_indexed, 150 | ).on_conflict( 151 | conflict_target=[Message.message_id], 152 | update={ 153 | Message.is_read: msg.is_read, 154 | Message.last_indexed: last_indexed, 155 | Message.labels: msg.labels, 156 | Message.is_deleted: False, 157 | }, 158 | ).execute() 159 | except Exception as e: 160 | raise DatabaseError(f"Failed to save message {msg.id}: {e}") 161 | 162 | 163 | def last_indexed() -> Optional[datetime]: 164 | """ 165 | Returns the timestamp of the last indexed message. 166 | 167 | Returns: 168 | Optional[datetime]: The timestamp of the last indexed message, or None if no messages exist. 169 | """ 170 | 171 | msg = Message.select().order_by(Message.timestamp.desc()).first() 172 | if msg: 173 | timestamp: Optional[datetime] = msg.timestamp 174 | return timestamp 175 | else: 176 | return None 177 | 178 | 179 | def first_indexed() -> Optional[datetime]: 180 | """ 181 | Returns the timestamp of the first indexed message. 182 | 183 | Returns: 184 | Optional[datetime]: The timestamp of the first indexed message, or None if no messages exist. 185 | """ 186 | 187 | msg = Message.select().order_by(Message.timestamp.asc()).first() 188 | if msg: 189 | timestamp: Optional[datetime] = msg.timestamp 190 | return timestamp 191 | else: 192 | return None 193 | 194 | 195 | def mark_messages_as_deleted(message_ids: List[str]) -> None: 196 | """ 197 | Mark messages as deleted in the database. 198 | 199 | Args: 200 | message_ids (List[str]): List of message IDs to mark as deleted. 201 | 202 | Raises: 203 | DatabaseError: If the operation fails. 
204 | """ 205 | if not message_ids: 206 | return 207 | 208 | try: 209 | if not message_ids: 210 | return 211 | 212 | # Use the SQL IN clause with proper parameter binding 213 | batch_size = 100 214 | for i in range(0, len(message_ids), batch_size): 215 | batch = message_ids[i : i + batch_size] 216 | placeholders = ",".join(["?" for _ in batch]) 217 | query = Message.update(is_deleted=True, last_indexed=datetime.now()) 218 | query = query.where(SQL(f"message_id IN ({placeholders})", batch)) 219 | query.execute() 220 | except Exception as e: 221 | raise DatabaseError(f"Failed to mark messages as deleted: {e}") 222 | 223 | 224 | def get_all_message_ids() -> List[str]: 225 | """ 226 | Returns all message IDs stored in the database. 227 | 228 | Returns: 229 | List[str]: List of message IDs. 230 | 231 | Raises: 232 | DatabaseError: If the query fails. 233 | """ 234 | try: 235 | return [message.message_id for message in Message.select(Message.message_id)] 236 | except Exception as e: 237 | raise DatabaseError(f"Failed to retrieve message IDs: {e}") 238 | 239 | 240 | def get_deleted_message_ids() -> List[str]: 241 | """ 242 | Returns all message IDs that are already marked as deleted. 243 | 244 | Returns: 245 | List[str]: List of deleted message IDs. 246 | 247 | Raises: 248 | DatabaseError: If the query fails. 249 | """ 250 | try: 251 | return [ 252 | message.message_id 253 | for message in Message.select(Message.message_id).where( 254 | Message.is_deleted == True 255 | ) 256 | ] 257 | except Exception as e: 258 | raise DatabaseError(f"Failed to retrieve deleted message IDs: {e}") 259 | -------------------------------------------------------------------------------- /gmail_to_sqlite/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import signal 5 | import sys 6 | from typing import Any, Callable, List, Optional 7 | 8 | from . 
def prepare_data_dir(data_dir: str) -> None:
    """
    Ensure the data directory exists, creating it when necessary.

    Args:
        data_dir (str): The path where to store data.

    Raises:
        ApplicationError: If directory creation fails.
    """
    try:
        # Nothing to do when the directory is already present.
        if os.path.exists(data_dir):
            return
        os.makedirs(data_dir)
    except Exception as e:
        raise ApplicationError(f"Failed to create data directory {data_dir}: {e}")
def create_argument_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for the sync tool."""
    usage_epilog = """
Commands:
  sync                    Sync all messages (incremental by default)
  sync-message            Sync a single message by ID
  sync-deleted-messages   Detect and mark deleted messages

Examples:
  %(prog)s sync --data-dir ./data
  %(prog)s sync --data-dir ./data --full-sync
  %(prog)s sync-message --data-dir ./data --message-id abc123
"""
    arg_parser = argparse.ArgumentParser(
        description="Gmail to SQLite synchronization tool",
        # Raw formatting keeps the epilog's hand-written layout intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_epilog,
    )

    arg_parser.add_argument(
        "command",
        choices=["sync", "sync-message", "sync-deleted-messages"],
        help="The command to run",
    )
    arg_parser.add_argument(
        "--data-dir", required=True, help="The path where the data should be stored"
    )
    arg_parser.add_argument(
        "--full-sync",
        action="store_true",
        help="Force a full sync of all messages and detect deleted messages",
    )
    arg_parser.add_argument(
        "--message-id",
        help="The ID of the message to sync (required for sync-message command)",
    )
    arg_parser.add_argument(
        "--workers",
        type=int,
        default=DEFAULT_WORKERS,
        help=f"Number of worker threads for parallel fetching (default: {DEFAULT_WORKERS})",
    )

    return arg_parser
def main() -> None:
    """Main application entry point.

    Parses CLI arguments, prepares the data directory and credentials,
    installs a SIGINT handler for graceful shutdown, dispatches the
    requested command, and maps known failures to a logged non-zero exit
    instead of a raw traceback.
    """
    setup_logging()

    try:
        parser = create_argument_parser()
        args = parser.parse_args()

        # Validate command-specific arguments
        if args.command == "sync-message" and not args.message_id:
            parser.error("--message-id is required for sync-message command")

        # Setup failures (data dir, OAuth) happen before the main try block
        # below; catch them here so the user gets a clean error, not a
        # traceback.
        try:
            prepare_data_dir(args.data_dir)
            credentials = auth.get_credentials(args.data_dir)
        except (ApplicationError, auth.AuthenticationError, FileNotFoundError) as e:
            logging.error(f"Operation failed: {e}")
            sys.exit(1)

        # Set up shutdown handling (mutable cell shared with the handler).
        shutdown_state = [False]

        def check_shutdown() -> bool:
            return shutdown_state[0]

        original_sigint_handler = setup_signal_handler(
            shutdown_requested=shutdown_state
        )

        try:
            db_conn = db.init(args.data_dir)

            if args.command == "sync":
                sync.all_messages(
                    credentials,
                    full_sync=args.full_sync,
                    num_workers=args.workers,
                    check_shutdown=check_shutdown,
                )
            elif args.command == "sync-message":
                sync.single_message(
                    credentials, args.message_id, check_shutdown=check_shutdown
                )
            elif args.command == "sync-deleted-messages":
                sync.sync_deleted_messages(credentials, check_shutdown=check_shutdown)

            db_conn.close()
            logging.info("Operation completed successfully")

        except (auth.AuthenticationError, db.DatabaseError, sync.SyncError) as e:
            logging.error(f"Operation failed: {e}")
            sys.exit(1)
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            sys.exit(1)
        finally:
            # Always restore the original SIGINT handler.
            signal.signal(signal.SIGINT, original_sigint_handler)

    except KeyboardInterrupt:
        logging.info("Operation cancelled by user")
        sys.exit(0)
from typing import Dict, List, Optional 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | from .constants import SUPPORTED_MIME_TYPES 9 | 10 | 11 | class MessageParsingError(Exception): 12 | """Custom exception for message parsing errors.""" 13 | 14 | pass 15 | 16 | 17 | class Message: 18 | """ 19 | Represents a Gmail message with all its attributes and parsing capabilities. 20 | 21 | Attributes: 22 | id (Optional[str]): Message ID 23 | thread_id (Optional[str]): Thread ID 24 | sender (Dict): Sender information with name and email 25 | recipients (Dict): Recipients organized by type (to, cc, bcc) 26 | labels (List[str]): List of label names 27 | subject (Optional[str]): Message subject 28 | body (Optional[str]): Message body text 29 | size (int): Message size in bytes 30 | timestamp (Optional[datetime]): Message timestamp 31 | is_read (bool): Whether message has been read 32 | is_outgoing (bool): Whether message was sent by user 33 | """ 34 | 35 | def __init__(self) -> None: 36 | self.id: Optional[str] = None 37 | self.thread_id: Optional[str] = None 38 | self.sender: Dict[str, str] = {} 39 | self.recipients: Dict[str, List[Dict[str, str]]] = {} 40 | self.labels: List[str] = [] 41 | self.subject: Optional[str] = None 42 | self.body: Optional[str] = None 43 | self.size: int = 0 44 | self.timestamp: Optional[datetime] = None 45 | self.is_read: bool = False 46 | self.is_outgoing: bool = False 47 | 48 | @classmethod 49 | def from_raw(cls, raw: Dict, labels: Dict[str, str]) -> "Message": 50 | """ 51 | Create a Message object from a raw Gmail API response. 52 | 53 | Args: 54 | raw (Dict): The raw message data from Gmail API. 55 | labels (Dict[str, str]): Mapping of label IDs to label names. 56 | 57 | Returns: 58 | Message: The parsed Message object. 59 | 60 | Raises: 61 | MessageParsingError: If message parsing fails. 
62 | """ 63 | try: 64 | msg = cls() 65 | msg.parse(raw, labels) 66 | return msg 67 | except Exception as e: 68 | raise MessageParsingError(f"Failed to parse message: {e}") 69 | 70 | def parse_addresses(self, addresses: str) -> List[Dict[str, str]]: 71 | """ 72 | Parse a comma-separated list of email addresses. 73 | 74 | Args: 75 | addresses (str): The comma-separated email addresses. 76 | 77 | Returns: 78 | List[Dict[str, str]]: List of parsed addresses with 'name' and 'email' keys. 79 | """ 80 | parsed_addresses: List[Dict[str, str]] = [] 81 | if not addresses: 82 | return parsed_addresses 83 | 84 | for address in addresses.split(","): 85 | name, email = parseaddr(address.strip()) 86 | if email: 87 | parsed_addresses.append( 88 | {"email": email.lower(), "name": name.strip() if name else ""} 89 | ) 90 | 91 | return parsed_addresses 92 | 93 | def decode_body(self, part: Dict) -> str: 94 | """ 95 | Recursively decode the body of a message part. 96 | 97 | Args: 98 | part (Dict): The message part to decode. 99 | 100 | Returns: 101 | str: The decoded body text, or empty string if not found. 102 | """ 103 | try: 104 | if "data" in part.get("body", {}): 105 | return base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8") 106 | elif "parts" in part: 107 | for subpart in part["parts"]: 108 | decoded_body = self.decode_body(subpart) 109 | if decoded_body: 110 | return decoded_body 111 | except Exception: 112 | # If decoding fails, return empty string 113 | pass 114 | 115 | return "" 116 | 117 | def html2text(self, html: str) -> str: 118 | """ 119 | Convert HTML content to plain text. 120 | 121 | Args: 122 | html (str): The HTML content to convert. 123 | 124 | Returns: 125 | str: The plain text content. 
126 | """ 127 | if not html: 128 | return "" 129 | 130 | try: 131 | soup = BeautifulSoup(html, features="html.parser") 132 | text_content: str = soup.get_text() 133 | return text_content 134 | except Exception: 135 | # If HTML parsing fails, return the original text 136 | return html 137 | 138 | def parse(self, msg: Dict, labels: Dict[str, str]) -> None: 139 | """ 140 | Parses a raw Gmail message and populates the Message object. 141 | 142 | Args: 143 | msg (Dict): The raw message data from Gmail API. 144 | labels (Dict[str, str]): Mapping of label IDs to label names. 145 | 146 | Raises: 147 | MessageParsingError: If critical message data cannot be parsed. 148 | """ 149 | try: 150 | # Basic message info 151 | self.id = msg["id"] 152 | self.thread_id = msg["threadId"] 153 | self.size = msg.get("sizeEstimate", 0) 154 | 155 | # Parse timestamp - prefer internal date 156 | if "internalDate" in msg: 157 | internal_date_secs = int(msg["internalDate"]) / 1000 158 | self.timestamp = datetime.fromtimestamp(internal_date_secs) 159 | 160 | # Parse headers 161 | headers = msg.get("payload", {}).get("headers", []) 162 | for header in headers: 163 | name = header["name"].lower() 164 | value = header["value"] 165 | 166 | if name == "from": 167 | addr = parseaddr(value) 168 | self.sender = {"name": addr[0], "email": addr[1]} 169 | elif name == "to": 170 | self.recipients["to"] = self.parse_addresses(value) 171 | elif name == "cc": 172 | self.recipients["cc"] = self.parse_addresses(value) 173 | elif name == "bcc": 174 | self.recipients["bcc"] = self.parse_addresses(value) 175 | elif name == "subject": 176 | self.subject = value 177 | elif name == "date" and self.timestamp is None: 178 | try: 179 | self.timestamp = parsedate_to_datetime(value) if value else None 180 | except Exception: 181 | # If date parsing fails, leave timestamp as None 182 | pass 183 | 184 | # Parse labels 185 | if "labelIds" in msg: 186 | for label_id in msg["labelIds"]: 187 | if label_id in labels: 188 | 
self.labels.append(labels[label_id]) 189 | 190 | self.is_read = "UNREAD" not in msg["labelIds"] 191 | self.is_outgoing = "SENT" in msg["labelIds"] 192 | 193 | # Extract message body 194 | self._extract_body(msg.get("payload", {})) 195 | 196 | except Exception as e: 197 | raise MessageParsingError( 198 | f"Failed to parse message {msg.get('id', 'unknown')}: {e}" 199 | ) 200 | 201 | def _extract_body(self, payload: Dict) -> None: 202 | """ 203 | Extract the body text from message payload. 204 | 205 | Args: 206 | payload (Dict): The message payload from Gmail API. 207 | """ 208 | # For non-multipart messages 209 | if "body" in payload and "data" in payload["body"]: 210 | try: 211 | self.body = base64.urlsafe_b64decode(payload["body"]["data"]).decode( 212 | "utf-8" 213 | ) 214 | self.body = self.html2text(self.body) 215 | return 216 | except Exception: 217 | pass 218 | 219 | # For multipart messages 220 | if "parts" in payload and self.body is None: 221 | for part in payload["parts"]: 222 | mime_type = part.get("mimeType", "") 223 | if mime_type in SUPPORTED_MIME_TYPES: 224 | body_text = self.decode_body(part) 225 | if body_text: 226 | self.body = self.html2text(body_text) 227 | break 228 | -------------------------------------------------------------------------------- /gmail_to_sqlite/migrations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Database migrations for gmail-to-sqlite. 3 | 4 | This module contains migration functions to update the database schema 5 | when new features are added or existing schema needs to be modified. 
6 | """ 7 | 8 | import logging 9 | from typing import Optional 10 | 11 | from peewee import BooleanField, SQL 12 | from playhouse.migrate import SqliteMigrator, migrate 13 | 14 | from .db import database_proxy, Message, SchemaVersion 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def column_exists(table_name: str, column_name: str) -> bool: 21 | """ 22 | Check if a column exists in a table. 23 | 24 | Args: 25 | table_name (str): The name of the table. 26 | column_name (str): The name of the column to check. 27 | 28 | Returns: 29 | bool: True if the column exists, False otherwise. 30 | """ 31 | try: 32 | # PRAGMA table_info doesn't support parameter binding for table names 33 | cursor = database_proxy.obj.execute_sql(f"PRAGMA table_info({table_name})") 34 | columns = [row[1] for row in cursor.fetchall()] 35 | return column_name in columns 36 | except Exception as e: 37 | logger.error(f"Error checking if column {column_name} exists: {e}") 38 | return False 39 | 40 | 41 | def get_schema_version() -> int: 42 | """ 43 | Get the current database schema version. 44 | 45 | Returns: 46 | int: The current schema version, or 0 if no version is tracked. 47 | """ 48 | try: 49 | version_record = SchemaVersion.select().first() 50 | return version_record.version if version_record else 0 51 | except Exception as e: 52 | logger.debug(f"Schema version table doesn't exist or error occurred: {e}") 53 | return 0 54 | 55 | 56 | def set_schema_version(version: int) -> bool: 57 | """ 58 | Set the database schema version. 59 | 60 | Args: 61 | version (int): The schema version to set. 62 | 63 | Returns: 64 | bool: True if successful, False otherwise. 
65 | """ 66 | try: 67 | database_proxy.obj.create_tables([SchemaVersion], safe=True) 68 | 69 | # Delete existing version record and insert new one 70 | SchemaVersion.delete().execute() 71 | SchemaVersion.create(version=version) 72 | logger.info(f"Schema version set to {version}") 73 | return True 74 | except Exception as e: 75 | logger.error(f"Failed to set schema version to {version}: {e}") 76 | return False 77 | 78 | 79 | def run_migrations() -> bool: 80 | """ 81 | Run all necessary migrations for the database. 82 | 83 | Returns: 84 | bool: True if all migrations were successful, False otherwise. 85 | """ 86 | logger.info("Running database migrations...") 87 | 88 | try: 89 | current_version = get_schema_version() 90 | logger.info(f"Current schema version: {current_version}") 91 | 92 | if current_version == 0: 93 | logger.info("Running migration v1: add is_deleted column") 94 | from .schema_migrations.v1_add_is_deleted_column import run 95 | 96 | if run(): 97 | if set_schema_version(1): 98 | logger.info("Migration v1 completed successfully, version set to 1") 99 | else: 100 | logger.error("Failed to set schema version to 1") 101 | return False 102 | else: 103 | logger.error("Migration v1 failed") 104 | return False 105 | elif current_version >= 1: 106 | logger.info( 107 | f"Database already at version {current_version}, no migrations needed" 108 | ) 109 | else: 110 | logger.warning(f"Unexpected schema version {current_version}") 111 | 112 | logger.info("All migrations completed successfully") 113 | return True 114 | 115 | except Exception as e: 116 | logger.error(f"Error during migrations: {e}") 117 | return False 118 | -------------------------------------------------------------------------------- /gmail_to_sqlite/schema_migrations/__init__.py: -------------------------------------------------------------------------------- 1 | """Database migrations package.""" 2 | -------------------------------------------------------------------------------- 
/gmail_to_sqlite/schema_migrations/v1_add_is_deleted_column.py: -------------------------------------------------------------------------------- 1 | """ 2 | Migration v1: Add is_deleted column to messages table. 3 | 4 | This migration adds a BooleanField with default value False to track 5 | whether messages have been deleted from Gmail. 6 | """ 7 | 8 | import logging 9 | from peewee import BooleanField 10 | from playhouse.migrate import SqliteMigrator, migrate 11 | 12 | from ..db import database_proxy 13 | from ..migrations import column_exists 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def run() -> bool: 20 | """ 21 | Add the is_deleted column to the messages table if it doesn't exist. 22 | 23 | This migration adds a BooleanField with default value False to track 24 | whether messages have been deleted from Gmail. 25 | 26 | Returns: 27 | bool: True if the migration was successful or column already exists, 28 | False if the migration failed. 29 | """ 30 | table_name = "messages" 31 | column_name = "is_deleted" 32 | 33 | try: 34 | if column_exists(table_name, column_name): 35 | logger.info(f"Column {column_name} already exists in {table_name} table") 36 | return True 37 | 38 | logger.info(f"Adding {column_name} column to {table_name} table") 39 | 40 | migrator = SqliteMigrator(database_proxy.obj) 41 | is_deleted_field = BooleanField(default=False) 42 | 43 | migrate(migrator.add_column(table_name, column_name, is_deleted_field)) 44 | database_proxy.obj.execute_sql( 45 | f"UPDATE {table_name} SET {column_name} = ? 
def _retry_pause(check_interrupt: Optional[Callable[[], bool]]) -> None:
    """Pause between retry attempts, honoring an interrupt request first.

    Raises:
        InterruptedError: If the interrupt callback reports shutdown.
    """
    if check_interrupt and check_interrupt():
        raise InterruptedError("Process was interrupted")
    time.sleep(RETRY_DELAY_SECONDS)


def _fetch_message(
    service: Any,
    message_id: str,
    labels: Dict[str, str],
    check_interrupt: Optional[Callable[[], bool]] = None,
) -> message.Message:
    """
    Fetches a single message from Gmail API with retry logic and robust error handling.

    Server errors (HTTP 5xx), timeouts and unexpected exceptions are retried up
    to MAX_RETRY_ATTEMPTS times with a fixed delay between attempts; other HTTP
    errors fail immediately. The previously triplicated interrupt-check/sleep
    boilerplate lives in _retry_pause.

    Args:
        service: The Gmail API service object.
        message_id: The ID of the message to fetch.
        labels: Dictionary mapping label IDs to label names.
        check_interrupt: Optional callback that returns True if process should be interrupted.

    Returns:
        Message: The parsed message object.

    Raises:
        InterruptedError: If the process was interrupted.
        SyncError: If the message cannot be fetched after all retries.
    """
    for attempt in range(MAX_RETRY_ATTEMPTS):
        if check_interrupt and check_interrupt():
            raise InterruptedError("Process was interrupted")

        retries_left = attempt < MAX_RETRY_ATTEMPTS - 1

        try:
            raw_msg = (
                service.users().messages().get(userId="me", id=message_id).execute()
            )
            return message.Message.from_raw(raw_msg, labels)

        except HttpError as e:
            # Only server-side (5xx) errors are treated as transient.
            if e.resp.status >= 500 and retries_left:
                logging.warning(
                    f"Attempt {attempt + 1}/{MAX_RETRY_ATTEMPTS} failed for message {message_id} "
                    f"due to server error {e.resp.status}. Retrying in {RETRY_DELAY_SECONDS}s..."
                )
                _retry_pause(check_interrupt)
            else:
                error_msg = (
                    f"Failed to fetch message {message_id} after {attempt + 1} attempts "
                    f"due to HttpError {e.resp.status}: {str(e)}"
                )
                logging.error(error_msg)
                raise SyncError(error_msg)

        except (TimeoutError, socket.timeout) as e:
            if retries_left:
                logging.warning(
                    f"Attempt {attempt + 1}/{MAX_RETRY_ATTEMPTS} failed for message {message_id} "
                    f"due to timeout. Retrying in {RETRY_DELAY_SECONDS}s..."
                )
                _retry_pause(check_interrupt)
            else:
                error_msg = (
                    f"Failed to fetch message {message_id} after {attempt + 1} attempts "
                    f"due to timeout: {str(e)}"
                )
                logging.error(error_msg)
                raise SyncError(error_msg)

        except Exception as e:
            logging.error(
                f"Unexpected error processing message {message_id} on attempt {attempt + 1}: {str(e)}"
            )
            if retries_left:
                _retry_pause(check_interrupt)
            else:
                error_msg = f"Failed to fetch message {message_id} after {MAX_RETRY_ATTEMPTS} attempts"
                logging.error(error_msg)
                raise SyncError(error_msg)

    # Defensive: the loop always returns or raises before falling through.
    raise SyncError(f"Unexpected error: failed to fetch message {message_id}")
147 | """ 148 | try: 149 | return build("gmail", GMAIL_API_VERSION, credentials=credentials) 150 | except Exception as e: 151 | raise SyncError(f"Failed to create Gmail service: {e}") 152 | 153 | 154 | def get_message_ids_from_gmail( 155 | service: Any, 156 | query: Optional[List[str]] = None, 157 | check_shutdown: Optional[Callable[[], bool]] = None, 158 | ) -> List[str]: 159 | """ 160 | Fetches all message IDs from Gmail matching the query. 161 | 162 | Args: 163 | service: The Gmail API service object. 164 | query: Optional list of query strings to filter messages. 165 | check_shutdown: Callback that returns True if shutdown is requested. 166 | 167 | Returns: 168 | List[str]: List of message IDs from Gmail. 169 | 170 | Raises: 171 | SyncError: If message ID collection fails. 172 | """ 173 | all_message_ids = [] 174 | page_token = None 175 | collected_count = 0 176 | 177 | logging.info("Collecting all message IDs from Gmail...") 178 | 179 | try: 180 | while not (check_shutdown and check_shutdown()): 181 | list_params = { 182 | "userId": "me", 183 | "maxResults": MAX_RESULTS_PER_PAGE, 184 | } 185 | 186 | if page_token: 187 | list_params["pageToken"] = page_token 188 | 189 | if query: 190 | list_params["q"] = " | ".join(query) 191 | 192 | results = service.users().messages().list(**list_params).execute() 193 | messages_page = results.get("messages", []) 194 | 195 | for m_info in messages_page: 196 | all_message_ids.append(m_info["id"]) 197 | collected_count += 1 198 | 199 | if collected_count % COLLECTION_LOG_INTERVAL == 0: 200 | logging.info( 201 | f"Collected {collected_count} message IDs from Gmail..." 
202 | ) 203 | 204 | page_token = results.get("nextPageToken") 205 | if not page_token: 206 | break 207 | 208 | except KeyboardInterrupt: 209 | logging.info("Message ID collection interrupted by user") 210 | except Exception as e: 211 | raise SyncError(f"Failed to collect message IDs: {e}") 212 | 213 | if check_shutdown and check_shutdown(): 214 | logging.info( 215 | "Shutdown requested during message ID collection. Exiting gracefully." 216 | ) 217 | return [] 218 | 219 | logging.info(f"Collected {len(all_message_ids)} message IDs from Gmail") 220 | return all_message_ids 221 | 222 | 223 | def _detect_and_mark_deleted_messages( 224 | gmail_message_ids: List[str], check_shutdown: Optional[Callable[[], bool]] = None 225 | ) -> Optional[int]: 226 | """ 227 | Helper function to detect and mark deleted messages based on comparison 228 | between Gmail message IDs and database message IDs. 229 | 230 | Args: 231 | gmail_message_ids (list): List of message IDs from Gmail. 232 | check_shutdown (callable): A callback function that returns True if shutdown is requested. 233 | 234 | Returns: 235 | int: Number of messages newly marked as deleted, or None if no action taken. 236 | """ 237 | try: 238 | db_message_ids = set(db.get_all_message_ids()) 239 | logging.info( 240 | f"Retrieved {len(db_message_ids)} message IDs from database for deletion detection" 241 | ) 242 | 243 | if not db_message_ids: 244 | logging.info("No messages in database to check for deletion") 245 | return None 246 | 247 | if check_shutdown and check_shutdown(): 248 | logging.info( 249 | "Shutdown requested during deletion detection. Exiting gracefully." 
def all_messages(
    credentials: Any,
    full_sync: bool = False,
    num_workers: int = 4,
    check_shutdown: Optional[Callable[[], bool]] = None,
) -> int:
    """
    Fetches messages from the Gmail API using the provided credentials, in parallel.
    Also detects and marks deleted messages when doing a full sync.

    Args:
        credentials (object): The credentials object used to authenticate the API request.
        full_sync (bool): Whether to do a full sync or not.
        num_workers (int): Number of worker threads for parallel fetching.
        check_shutdown (callable): A callback function that returns True if shutdown is requested.

    Returns:
        int: The number of messages successfully synced.
    """
    query = []
    if not full_sync:
        # Incremental sync: only request messages newer than the last indexed
        # timestamp or older than the first indexed one.
        last = db.last_indexed()
        if last:
            query.append(f"after:{int(last.timestamp())}")
        first = db.first_indexed()
        if first:
            query.append(f"before:{int(first.timestamp())}")

    service = _create_service(credentials)
    labels = get_labels(service)

    all_message_ids = get_message_ids_from_gmail(service, query, check_shutdown)

    if check_shutdown and check_shutdown():
        logging.info(
            "Shutdown requested during message ID collection. Exiting gracefully."
        )
        return 0

    if full_sync:
        _detect_and_mark_deleted_messages(all_message_ids, check_shutdown)

    logging.info(f"Found {len(all_message_ids)} messages to sync.")

    total_synced_count = 0
    processed_count = 0

    def thread_worker(message_id: str) -> bool:
        """Fetch one message and store it; returns True on success."""
        if check_shutdown and check_shutdown():
            return False

        # Each worker builds its own service object rather than sharing one
        # across threads.
        service = _create_service(credentials)

        try:
            msg = _fetch_message(
                service,
                message_id,
                labels,
                check_interrupt=check_shutdown,
            )
            try:
                db.create_message(msg)
                logging.info(
                    f"Successfully synced message {msg.id} (Original ID: {message_id}) from {msg.timestamp}"
                )
                return True
            except IntegrityError as e:
                logging.error(
                    f"Could not process message {message_id} due to integrity error: {str(e)}"
                )
                return False
        except InterruptedError:
            logging.info(f"Message fetch for {message_id} was interrupted")
            return False
        except Exception as e:
            logging.error(f"Failed to fetch message {message_id}: {str(e)}")
            return False

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_id = {
            executor.submit(thread_worker, msg_id): msg_id
            for msg_id in all_message_ids
        }

        for future in concurrent.futures.as_completed(future_to_id):
            # Skip bookkeeping for futures that are not running once a
            # shutdown has been requested.
            if check_shutdown and check_shutdown() and not future.running():
                continue

            message_id = future_to_id[future]
            processed_count += 1
            try:
                if not future.cancelled():
                    if future.result():
                        total_synced_count += 1
                    if (
                        processed_count % PROGRESS_LOG_INTERVAL == 0
                        or processed_count == len(all_message_ids)
                    ):
                        logging.info(
                            f"Processed {processed_count}/{len(all_message_ids)} messages..."
                        )
            except concurrent.futures.CancelledError:
                logging.info(
                    f"Task for message {message_id} was cancelled due to shutdown"
                )
            except Exception as exc:
                # Log once (the original emitted this message twice due to a
                # copy/paste duplication).
                logging.error(
                    f"Message ID {message_id} generated an exception during future processing: {exc}"
                )

    if check_shutdown and check_shutdown():
        logging.info("Sync process was interrupted. Partial results saved.")
    else:
        logging.info(
            f"Total messages successfully synced: {total_synced_count} out of {len(all_message_ids)}"
        )
    return total_synced_count
425 | 426 | Returns: 427 | int: Number of messages marked as deleted. 428 | """ 429 | try: 430 | service = _create_service(credentials) 431 | gmail_message_ids = get_message_ids_from_gmail( 432 | service, check_shutdown=check_shutdown 433 | ) 434 | 435 | if check_shutdown and check_shutdown(): 436 | logging.info( 437 | "Shutdown requested during message ID collection. Exiting gracefully." 438 | ) 439 | return None 440 | 441 | _detect_and_mark_deleted_messages(gmail_message_ids, check_shutdown) 442 | except Exception as e: 443 | logging.error(f"Error during deletion sync: {str(e)}") 444 | return None 445 | 446 | 447 | def single_message( 448 | credentials: Any, 449 | message_id: str, 450 | check_shutdown: Optional[Callable[[], bool]] = None, 451 | ) -> None: 452 | """ 453 | Syncs a single message from Gmail using the provided credentials and message ID. 454 | 455 | Args: 456 | credentials: The credentials used to authenticate the Gmail API. 457 | message_id: The ID of the message to fetch. 458 | check_shutdown (callable): A callback function that returns True if shutdown is requested. 459 | 460 | Returns: 461 | None 462 | """ 463 | try: 464 | service = _create_service(credentials) 465 | labels = get_labels(service) 466 | 467 | if check_shutdown and check_shutdown(): 468 | logging.info("Shutdown requested. Exiting gracefully.") 469 | return None 470 | 471 | try: 472 | msg = _fetch_message( 473 | service, 474 | message_id, 475 | labels, 476 | check_interrupt=check_shutdown, 477 | ) 478 | if check_shutdown and check_shutdown(): 479 | logging.info( 480 | "Shutdown requested after message fetch. Exiting gracefully." 
def prepare_data_dir(data_dir: str) -> None:
    """
    Create the data directory (and any missing parents) if it doesn't exist.

    Uses os.makedirs(..., exist_ok=True) instead of a check-then-create
    sequence, avoiding the race where the directory appears between the
    existence check and the creation call.

    Args:
        data_dir (str): The path where to store data.

    Raises:
        ApplicationError: If directory creation fails.
    """
    try:
        os.makedirs(data_dir, exist_ok=True)
    except Exception as e:
        raise ApplicationError(f"Failed to create data directory {data_dir}: {e}")
def setup_logging() -> None:
    """Configure root logging: INFO level, project-standard format, stream output."""
    stream_handler = logging.StreamHandler()
    logging.basicConfig(
        level=logging.INFO,
        format=LOG_FORMAT,
        handlers=[stream_handler],
    )
parser.add_argument( 112 | "--data-dir", required=True, help="The path where the data should be stored" 113 | ) 114 | parser.add_argument( 115 | "--full-sync", 116 | action="store_true", 117 | help="Force a full sync of all messages and detect deleted messages", 118 | ) 119 | parser.add_argument( 120 | "--message-id", 121 | help="The ID of the message to sync (required for sync-message command)", 122 | ) 123 | parser.add_argument( 124 | "--workers", 125 | type=int, 126 | default=DEFAULT_WORKERS, 127 | help=f"Number of worker threads for parallel fetching (default: {DEFAULT_WORKERS})", 128 | ) 129 | 130 | return parser 131 | 132 | 133 | def main() -> None: 134 | """Main application entry point.""" 135 | setup_logging() 136 | 137 | try: 138 | parser = create_argument_parser() 139 | args = parser.parse_args() 140 | 141 | # Validate command-specific arguments 142 | if args.command == "sync-message" and not args.message_id: 143 | parser.error("--message-id is required for sync-message command") 144 | 145 | prepare_data_dir(args.data_dir) 146 | credentials = auth.get_credentials(args.data_dir) 147 | 148 | # Set up shutdown handling 149 | shutdown_state = [False] 150 | 151 | def check_shutdown() -> bool: 152 | return shutdown_state[0] 153 | 154 | original_sigint_handler = setup_signal_handler( 155 | shutdown_requested=shutdown_state 156 | ) 157 | 158 | try: 159 | db_conn = db.init(args.data_dir) 160 | 161 | if args.command == "sync": 162 | sync.all_messages( 163 | credentials, 164 | full_sync=args.full_sync, 165 | num_workers=args.workers, 166 | check_shutdown=check_shutdown, 167 | ) 168 | elif args.command == "sync-message": 169 | sync.single_message( 170 | credentials, args.message_id, check_shutdown=check_shutdown 171 | ) 172 | elif args.command == "sync-deleted-messages": 173 | sync.sync_deleted_messages(credentials, check_shutdown=check_shutdown) 174 | 175 | db_conn.close() 176 | logging.info("Operation completed successfully") 177 | 178 | except 
(auth.AuthenticationError, db.DatabaseError, sync.SyncError) as e: 179 | logging.error(f"Operation failed: {e}") 180 | sys.exit(1) 181 | except Exception as e: 182 | logging.error(f"Unexpected error: {e}") 183 | sys.exit(1) 184 | finally: 185 | signal.signal(signal.SIGINT, original_sigint_handler) 186 | 187 | except KeyboardInterrupt: 188 | logging.info("Operation cancelled by user") 189 | sys.exit(0) 190 | 191 | 192 | if __name__ == "__main__": 193 | main() 194 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "gmail-to-sqlite" 3 | version = "0.2.0" 4 | description = "A robust Python application that syncs Gmail messages to a local SQLite database for analysis and archival purposes." 5 | readme = "README.md" 6 | requires-python = ">=3.8.1" 7 | license = { file = "LICENSE" } 8 | keywords = ["gmail", "sqlite", "email", "sync", "backup", "archive"] 9 | 10 | dependencies = [ 11 | "beautifulsoup4>=4.12.0", 12 | "google-api-python-client>=2.100.0", 13 | "google-auth>=2.20.0", 14 | "google-auth-oauthlib>=1.0.0", 15 | "google-auth-httplib2>=0.2.0", 16 | "peewee>=3.15.0", 17 | ] 18 | 19 | [project.optional-dependencies] 20 | dev = [ 21 | "black>=23.0.0", 22 | "flake8>=6.0.0", 23 | "mypy>=1.0.0", 24 | "pytest>=7.0.0", 25 | "pytest-cov>=4.0.0", 26 | "pre-commit>=3.0.0", 27 | ] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/marcboeker/gmail-to-sqlite" 31 | "Bug Tracker" = "https://github.com/marcboeker/gmail-to-sqlite/issues" 32 | "Documentation" = "https://github.com/marcboeker/gmail-to-sqlite#readme" 33 | "Source Code" = "https://github.com/marcboeker/gmail-to-sqlite" 34 | 35 | [project.scripts] 36 | gmail-to-sqlite = "gmail_to_sqlite.main:main" 37 | 38 | [build-system] 39 | requires = ["hatchling"] 40 | build-backend = "hatchling.build" 41 | 42 | [tool.hatch.build.targets.wheel] 43 | packages = 
["gmail_to_sqlite"] 44 | 45 | [tool.black] 46 | line-length = 88 47 | target-version = ['py38'] 48 | include = '\.pyi?$' 49 | extend-exclude = ''' 50 | /( 51 | # directories 52 | \.eggs 53 | | \.git 54 | | \.hg 55 | | \.mypy_cache 56 | | \.tox 57 | | \.venv 58 | | __pycache__ 59 | )/ 60 | ''' 61 | 62 | [tool.mypy] 63 | python_version = "3.8" 64 | warn_return_any = true 65 | warn_unused_configs = true 66 | disallow_untyped_defs = true 67 | disallow_incomplete_defs = true 68 | check_untyped_defs = true 69 | disallow_untyped_decorators = true 70 | no_implicit_optional = true 71 | warn_redundant_casts = true 72 | warn_unused_ignores = true 73 | warn_no_return = true 74 | warn_unreachable = true 75 | strict_equality = true 76 | 77 | [tool.pytest.ini_options] 78 | testpaths = ["tests"] 79 | python_files = ["test_*.py"] 80 | python_classes = ["Test*"] 81 | python_functions = ["test_*"] 82 | addopts = ["--strict-markers", "--disable-warnings", "-v"] 83 | 84 | [tool.coverage.run] 85 | source = ["gmail_to_sqlite"] 86 | omit = ["tests/*", "venv/*", ".venv/*", "*/site-packages/*"] 87 | 88 | [tool.coverage.report] 89 | exclude_lines = [ 90 | "pragma: no cover", 91 | "def __repr__", 92 | "raise AssertionError", 93 | "raise NotImplementedError", 94 | "if __name__ == .__main__.:", 95 | ] 96 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup script for gmail-to-sqlite. 2 | 3 | This file is for backward compatibility and development installs. 4 | The actual package configuration is in pyproject.toml. 
5 | """ 6 | 7 | from setuptools import setup 8 | 9 | if __name__ == "__main__": 10 | setup() 11 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the gmail-to-sqlite package.""" 2 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures.""" 2 | 3 | import pytest 4 | import tempfile 5 | import os 6 | from pathlib import Path 7 | 8 | 9 | @pytest.fixture 10 | def temp_dir(): 11 | """Create a temporary directory for testing.""" 12 | with tempfile.TemporaryDirectory() as tmpdir: 13 | yield Path(tmpdir) 14 | 15 | 16 | @pytest.fixture 17 | def mock_credentials_file(temp_dir): 18 | """Create a mock credentials file for testing.""" 19 | creds_file = temp_dir / "credentials.json" 20 | creds_file.write_text('{"test": "credentials"}') 21 | return creds_file 22 | -------------------------------------------------------------------------------- /tests/test_db.py: -------------------------------------------------------------------------------- 1 | """Tests for database functionality.""" 2 | 3 | import pytest 4 | from gmail_to_sqlite.db import init, Message 5 | 6 | 7 | class TestDatabase: 8 | """Test database operations.""" 9 | 10 | def test_initialize_database(self, temp_dir): 11 | """Test database initialization.""" 12 | db_path = str(temp_dir) 13 | db = init(db_path) 14 | assert db is not None 15 | 16 | def test_message_model(self): 17 | """Test Message model creation.""" 18 | # Test that the model exists and has required fields 19 | assert Message is not None 20 | assert hasattr(Message, "message_id") 21 | assert hasattr(Message, "subject") 22 | assert hasattr(Message, "thread_id") 23 | assert hasattr(Message, "sender") 24 | assert hasattr(Message, "recipients") 25 | 
-------------------------------------------------------------------------------- /tests/test_message.py: -------------------------------------------------------------------------------- 1 | """Tests for message parsing functionality.""" 2 | 3 | import pytest 4 | from gmail_to_sqlite.message import Message, MessageParsingError 5 | 6 | 7 | class TestMessageParsing: 8 | """Test message parsing operations.""" 9 | 10 | def test_message_creation(self): 11 | """Test basic message creation.""" 12 | message = Message() 13 | assert message is not None 14 | 15 | def test_from_raw_empty(self): 16 | """Test parsing with empty message data.""" 17 | with pytest.raises((MessageParsingError, KeyError, AttributeError)): 18 | Message.from_raw({}, {}) 19 | 20 | def test_from_raw_minimal(self): 21 | """Test parsing with minimal valid message data.""" 22 | message_data = { 23 | "id": "test123", 24 | "threadId": "thread123", 25 | "payload": { 26 | "headers": [ 27 | {"name": "Subject", "value": "Test Subject"}, 28 | {"name": "From", "value": "test@example.com"}, 29 | {"name": "Date", "value": "Mon, 1 Jan 2024 12:00:00 +0000"}, 30 | ] 31 | }, 32 | "sizeEstimate": 1000, 33 | } 34 | 35 | labels = {"INBOX": "INBOX"} 36 | 37 | try: 38 | message = Message.from_raw(message_data, labels) 39 | assert message.id == "test123" 40 | assert message.thread_id == "thread123" 41 | assert message.subject == "Test Subject" 42 | except Exception as e: 43 | # Some fields might be missing for this minimal test 44 | pytest.skip(f"Minimal test data insufficient: {e}") 45 | 46 | def test_parse_addresses(self): 47 | """Test address parsing.""" 48 | message = Message() 49 | addresses = "test@example.com, John Doe " 50 | parsed = message.parse_addresses(addresses) 51 | assert isinstance(parsed, list) 52 | assert len(parsed) >= 1 53 | -------------------------------------------------------------------------------- /tests/test_migrations.py: 
-------------------------------------------------------------------------------- 1 | """Tests for database migrations functionality.""" 2 | 3 | import pytest 4 | import tempfile 5 | import os 6 | from gmail_to_sqlite.db import database_proxy, SchemaVersion, Message 7 | from gmail_to_sqlite.migrations import ( 8 | get_schema_version, 9 | set_schema_version, 10 | run_migrations, 11 | column_exists, 12 | ) 13 | from gmail_to_sqlite.schema_migrations.v1_add_is_deleted_column import ( 14 | run as migration_v1_run, 15 | ) 16 | from peewee import SqliteDatabase 17 | 18 | 19 | class TestMigrations: 20 | """Test migration operations.""" 21 | 22 | def setup_method(self): 23 | """Set up test database for each test.""" 24 | self.temp_dir = tempfile.mkdtemp() 25 | self.db_path = os.path.join(self.temp_dir, "test.db") 26 | self.db = SqliteDatabase(self.db_path) 27 | database_proxy.initialize(self.db) 28 | 29 | def teardown_method(self): 30 | """Clean up after each test.""" 31 | if hasattr(self, "db") and self.db: 32 | self.db.close() 33 | 34 | def test_schema_version_functions(self): 35 | """Test schema version tracking functions.""" 36 | # Create schema version table 37 | self.db.create_tables([SchemaVersion]) 38 | 39 | # Test initial version (should be 0) 40 | version = get_schema_version() 41 | assert version == 0 42 | 43 | # Test setting version 44 | success = set_schema_version(5) 45 | assert success is True 46 | 47 | # Test getting version again 48 | version = get_schema_version() 49 | assert version == 5 50 | 51 | def test_run_migrations_from_scratch(self): 52 | """Test running migrations from a fresh database.""" 53 | # Also need to create Message table for the migration to work 54 | self.db.create_tables([Message]) 55 | 56 | # Run migrations 57 | success = run_migrations() 58 | assert success is True 59 | 60 | # Check that schema version is set to 1 61 | version = get_schema_version() 62 | assert version == 1 63 | 64 | # Check that is_deleted column was added 65 | 
assert column_exists("messages", "is_deleted") is True 66 | 67 | def test_run_migrations_already_up_to_date(self): 68 | """Test running migrations when database is already up to date.""" 69 | # Create tables and set version to 1 70 | self.db.create_tables([SchemaVersion, Message]) 71 | set_schema_version(1) 72 | 73 | # Run migrations 74 | success = run_migrations() 75 | assert success is True 76 | 77 | # Version should still be 1 78 | version = get_schema_version() 79 | assert version == 1 80 | 81 | def test_migration_v1_add_is_deleted_column(self): 82 | """Test migration v1 directly.""" 83 | # Create Message table 84 | self.db.create_tables([Message]) 85 | 86 | # Run the migration 87 | success = migration_v1_run() 88 | assert success is True 89 | 90 | # Check that is_deleted column was added 91 | assert column_exists("messages", "is_deleted") is True 92 | 93 | # Running again should still succeed (idempotent) 94 | success = migration_v1_run() 95 | assert success is True 96 | --------------------------------------------------------------------------------