├── .gitattributes ├── .gitignore ├── organize.py ├── README.md └── gmail_downer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | .vscode/ 8 | 9 | # Recycle Bin used on file shares 10 | $RECYCLE.BIN/ 11 | 12 | # Windows Installer files 13 | *.cab 14 | *.msi 15 | *.msm 16 | *.msp 17 | 18 | # Windows shortcuts 19 | *.lnk 20 | 21 | # ========================= 22 | # Operating System Files 23 | # ========================= 24 | 25 | # OSX 26 | # ========================= 27 | 28 | .DS_Store 29 | .AppleDouble 30 | .LSOverride 31 | 32 | # Thumbnails 33 | ._* 34 | 35 | # Files that might appear on external disk 36 | .Spotlight-V100 37 | .Trashes 38 | 39 | # Directories potentially created on remote AFP share 40 | .AppleDB 41 | .AppleDesktop 42 | Network Trash Folder 43 | Temporary Items 44 | .apdisk 45 | *.name 46 | *.xml 47 | .idea/Gmail-Attachment-Downloader.iml 48 | attachments 49 | attachments/* 50 | __pycache__/ 51 | 52 | resume.txt 53 | -------------------------------------------------------------------------------- /organize.py: -------------------------------------------------------------------------------- 1 | # Description: This module provides functions to organize downloaded email attachments based on various criteria such as size, type, date, sender, or domain. 2 | 3 | from pathlib import Path 4 | from enum import Enum 5 | from email.utils import parsedate_to_datetime 6 | import mimetypes 7 | import os 8 | 9 | 10 | class SizeCategoryEnum(Enum): 11 | TINY = "tiny" 12 | SMALL = "small" 13 | MEDIUM = "medium" 14 | LARGE = "large" 15 | HUGE = "huge" 16 | 17 | 18 | def build_and_return_directory(directory: Path) -> Path: 19 | """ 20 | Builds the directory if it does not exist and returns the directory path. 21 | 22 | Args: 23 | directory (Path): The path to the directory to be created. 24 | 25 | Returns: 26 | Path: The path to the created (or existing) directory. 27 | """ 28 | directory.mkdir(parents=True, exist_ok=True) 29 | return directory 30 | 31 | 32 | def by_size(size: int) -> str: 33 | """ 34 | Categorizes a file based on its size. 35 | 36 | Args: 37 | size (int): Size of the file in bytes. 38 | 39 | Returns: 40 | str: The size category (e.g., "tiny", "small"). 41 | """ 42 | if size < 10240: 43 | return SizeCategoryEnum.TINY.value 44 | elif size < 102400: 45 | return SizeCategoryEnum.SMALL.value 46 | elif size < 1024000: 47 | return SizeCategoryEnum.MEDIUM.value 48 | elif size < 10240000: 49 | return SizeCategoryEnum.LARGE.value 50 | else: 51 | return SizeCategoryEnum.HUGE.value 52 | 53 | 54 | def by_mime_type(extension: str) -> Path: 55 | """ 56 | Determines the general type of a file based on its MIME type. 57 | 58 | Args: 59 | extension (str): The file extension (e.g., ".jpg", ".pdf"). 60 | 61 | Returns: 62 | Path: The path segment for the file type (e.g., Path("image/jpeg"), Path("other")). 63 | """ 64 | mime_type, _ = mimetypes.guess_type(extension) 65 | if mime_type: 66 | main_type, sub_type = mime_type.split("/") 67 | return Path(main_type) / sub_type 68 | return Path("other") 69 | 70 | 71 | def by_date(save_folder: Path, date: str | None) -> Path: 72 | """ 73 | Creates a folder structure based on the date of an email, falling back to 74 | an `unknown_date` folder when parsing fails. 75 | 76 | Args: 77 | save_folder (Path): The base folder where the attachments will be saved. 78 | date (str | None): The date string from the email header. 79 | 80 | Returns: 81 | Path: The path to the created directory. 82 | """ 83 | try: 84 | parsed = parsedate_to_datetime(date) if date else None 85 | except Exception: 86 | parsed = None 87 | 88 | if not parsed: 89 | return build_and_return_directory(save_folder / "unknown_date") 90 | 91 | parsed_date = parsed.date() 92 | year = str(parsed_date.year) 93 | month = parsed_date.strftime("%b") # Use month abbreviation for readability 94 | day = str(parsed_date.day) 95 | return build_and_return_directory(save_folder / year / month / day) 96 | 97 | 98 | def by_sender_email(save_folder: Path, sender: str | None) -> Path: 99 | """ 100 | Creates a folder structure based on the sender's email address. 101 | 102 | Args: 103 | save_folder (Path): The base folder where the attachments will be saved. 104 | sender (str | None): The sender's email address. 105 | 106 | Returns: 107 | Path: The path to the created directory. 108 | """ 109 | if not sender: 110 | return build_and_return_directory(save_folder / "unknown_sender" / "unknown_sender") 111 | 112 | domain = (sender.split("@")[-1]).replace(">", "") if "@" in sender else "unknown_sender" 113 | if "<" in sender: 114 | sender = sender.split("<")[1].split(">")[0] 115 | sender = sender or "unknown_sender" 116 | return build_and_return_directory(save_folder / domain / sender) 117 | 118 | 119 | if __name__ == "__main__": 120 | pass 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gmail Attachment Manager 2 | 3 | ## Description 4 | 5 | This script connects to a Gmail account via IMAP, identifies all emails containing attachments, and downloads those attachments. Users can specify a directory to save the attachments and choose from various sorting methods, such as by file extension, size, type, sender, or date. The script also supports session recovery to handle interruptions and avoid redownloading previously processed emails. 6 | 7 | ## Features 8 | 9 | - **IMAP Connection**: Connects securely to Gmail via IMAP to access emails. 10 | - **Attachment Download**: Downloads all attachments found in the mailbox. 11 | - **Sorting Methods**: Attachments can be sorted by: 12 | - File Extension 13 | - File Size 14 | - File MIME Type 15 | - Date Year -> Month -> Day 16 | - Sender Domain -> sender 17 | - **Session Recovery**: Supports resuming the download process to avoid redownloading attachments. 18 | - **Environment Variable Support**: Optionally set your password via the `EMAIL_PASSWORD` environment variable. 19 | 20 | ## Requirements 21 | 22 | - **Python 3.6+** 23 | - **Required Libraries**: 24 | - `email` (Standard Library) 25 | - `hashlib` (Standard Library) 26 | - `getpass` (Standard Library) 27 | - `imaplib` (Standard Library) 28 | - `os` (Standard Library) 29 | - `logging` (Standard Library) 30 | - `collections` (Standard Library) 31 | - `pathlib` (Standard Library) 32 | - `enum` (Standard Library) 33 | - `mimetypes` (Standard Library) 34 | - `organize` (Included Custom Module) 35 | 36 | ## Additional requirements 37 | 38 | - **Make sure you have IMAP enabled in your GMail settings.** 39 | - https://support.google.com/mail/troubleshooter/1668960?hl=en 40 | 41 | - **If you are using 2 step verification you may need an APP Password.** 42 | - https://support.google.com/accounts/answer/185833 43 | - https://myaccount.google.com/apppasswords 44 | 45 | - **Reference information for GMail IMAP extension can be found here.** 46 | - https://developers.google.com/gmail/imap_extensions 47 | 48 | ## Installation 49 | 50 | 1. **Clone the Repository** 51 | 52 | ```sh 53 | git clone https://github.com/mjseeley/Gmail-Attachment-Downloader.git 54 | cd Gmail-Attachment-Downloader 55 | ``` 56 | 57 | 2. **Verify Python Version** 58 | 59 | Make sure you have Python 3.6 or later installed: 60 | 61 | ```sh 62 | python --version 63 | ``` 64 | 65 | ## Usage 66 | 67 | 1. **Run the Script** 68 | 69 | ```sh 70 | python gmail_downer.py 71 | ``` 72 | 73 | 2. **User Prompts** 74 | 75 | - The script will prompt for your Gmail credentials (or use the `EMAIL_PASSWORD` environment variable). 76 | - You will be asked to specify a directory where attachments should be saved. 77 | - You can select a sorting method to organize the attachments. 78 | 79 | 3. **Resuming Sessions** 80 | - If interrupted, the script can resume from where it left off by recovering from saved state files (`resume.txt` and `processed_ids.txt`). 81 | 82 | ### Using Environment Variables 83 | 84 | You can set your password as an environment variable to avoid entering it each time: 85 | 86 | ```sh 87 | # Linux/macOS 88 | export EMAIL_PASSWORD="your-app-password" 89 | 90 | # Windows (PowerShell) 91 | $env:EMAIL_PASSWORD="your-app-password" 92 | 93 | # Windows (Command Prompt) 94 | set EMAIL_PASSWORD=your-app-password 95 | ``` 96 | 97 | ## Sorting Methods Explained 98 | 99 | - **Extension**: Organizes files by their extension (e.g., `.pdf`, `.jpg`). 100 | - **Size**: Organizes files by their size into categories: 101 | - Tiny: < 10 KB 102 | - Small: 10 KB - 100 KB 103 | - Medium: 100 KB - 1 MB 104 | - Large: 1 MB - 10 MB 105 | - Huge: > 10 MB 106 | - **MIMEType**: Organizes files by general type (e.g., image, text, video). 107 | - **Date**: Organizes files into folders based on the email date (Year/Month/Day). 108 | - **Sender**: Organizes files based on the sender's domain and email address. 109 | 110 | ## Security Note 111 | 112 | The script uses `getpass` to securely input your Gmail password. You will need to set up an [App Password](https://support.google.com/accounts/answer/185833?hl=en) instead of using your regular main Gmail password. 113 | 114 | ## Troubleshooting 115 | 116 | - **Login failed**: Ensure IMAP is enabled in your Gmail settings and you are using an App Password if 2FA is enabled. 117 | - **FileNotFoundError**: Make sure the destination directory path is valid and you have write permissions. 118 | - **Connection errors**: Check your internet connection and firewall settings. Gmail IMAP uses port 993. 119 | - **Empty folders created**: This can occur when sorting by date if attachments were already saved elsewhere and matched by hash (duplicate detection). 120 | 121 | ## Contributing 122 | 123 | Feel free to open issues or submit pull requests with improvements or bug fixes. 124 | 125 | ## License 126 | 127 | This project is licensed under the MIT License. 128 | -------------------------------------------------------------------------------- /gmail_downer.py: -------------------------------------------------------------------------------- 1 | # Description: This script connects to a Gmail account via IMAP, identifies all emails containing attachments, and downloads those attachments. Users can specify a directory to save the attachments and choose from various sorting methods, such as by file extension, size, type, sender, or date. The script also supports session recovery to handle interruptions and avoid redownloading previously processed emails. 2 | 3 | from email import message, header, message_from_bytes 4 | from hashlib import md5 5 | from getpass import getpass 6 | import imaplib 7 | import os 8 | import json 9 | import logging 10 | from collections import defaultdict, Counter 11 | from pathlib import Path 12 | from enum import Enum 13 | import organize 14 | 15 | # Setting up logging 16 | logging.basicConfig( 17 | level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s" 18 | ) 19 | 20 | # TODO: prevent overwriting of files on resume since the count has been reset (tag the files with the message id?) 21 | 22 | 23 | class SortMethod(Enum): 24 | EXTENSION = 1 # working 25 | SIZE = 2 # working 26 | MIMETYPE = 3 # working 27 | DATE = 4 # working but creates empty folders may be due to files already being saved elsewhere and matching the hash. 28 | SENDER = 5 # working 29 | 30 | 31 | IMAP_SERVER = "imap.gmail.com" 32 | MANIFEST_FILE = Path("file_manifest.json") 33 | 34 | 35 | def recover( 36 | resume_file: Path, processed_id_file: Path 37 | ) -> tuple[str, Path, SortMethod, set]: 38 | """ 39 | Recovers the last state of the script from saved files if available. 40 | 41 | Args: 42 | resume_file (Path): The file containing saved state information. 43 | processed_id_file (Path): The file containing IDs of processed messages. 44 | 45 | Returns: 46 | tuple: A tuple containing user_name, save_path, sort_by, and processed_msg_ids. 47 | """ 48 | user_name, save_path, sort_by = None, None, None 49 | processed_msg_ids = set() 50 | if resume_file.exists(): 51 | recover = input( 52 | "Recovery files found. Would you like to recover the last state? (y/n) " 53 | ).lower() 54 | if recover == "y": 55 | logging.info("Recovering last state...") 56 | if processed_id_file.exists(): 57 | with processed_id_file.open() as f: 58 | processed_ids = f.read() 59 | processed_msg_ids = set(filter(None, processed_ids.split(","))) 60 | processed_msg_ids = {msg_id.strip() for msg_id in processed_msg_ids} 61 | with resume_file.open() as f: 62 | last_state = f.read().splitlines() 63 | user_name = last_state[0].split(" = ")[1] 64 | save_path = Path(last_state[1].split(" = ")[1]) 65 | sort_by = SortMethod[last_state[2].split(" = ")[1]] 66 | else: 67 | logging.info( 68 | "Recovery file found but not recovered. Deleting recovery files..." 69 | ) 70 | processed_id_file.unlink(missing_ok=True) 71 | resume_file.unlink(missing_ok=True) 72 | else: 73 | logging.info("No Recovery file found.") 74 | resume_file.touch() 75 | processed_id_file.touch() 76 | return user_name, save_path, sort_by, processed_msg_ids 77 | 78 | 79 | def save_state(resume_file: Path, user_name: str, save_path: Path, sort_by: SortMethod): 80 | """ 81 | Saves the current state of the script for recovery in case of interruption. 82 | 83 | Args: 84 | resume_file (Path): The file to save the state information. 85 | user_name (str): The Gmail username. 86 | save_path (Path): The directory path where attachments are saved. 87 | sort_by (SortMethod): The sorting method used for organizing attachments. 88 | """ 89 | with resume_file.open("w") as f: 90 | f.write(f"user_name = {user_name}\n") 91 | f.write(f"save_path = {save_path}\n") 92 | f.write(f"sort_by = {sort_by.name}\n") 93 | 94 | 95 | def load_manifest(manifest_path: Path) -> tuple[Counter, defaultdict]: 96 | """Load persisted counters and hashes to avoid overwriting on resume.""" 97 | if manifest_path.exists(): 98 | try: 99 | data = json.loads(manifest_path.read_text()) 100 | counters = Counter(data.get("counters", {})) 101 | hashes_raw = data.get("hashes", {}) 102 | hashes = defaultdict(set, {k: set(v) for k, v in hashes_raw.items()}) 103 | return counters, hashes 104 | except Exception: 105 | logging.warning("Manifest file unreadable; starting fresh.") 106 | return Counter(), defaultdict(set) 107 | 108 | 109 | def save_manifest(manifest_path: Path, counters: Counter, hashes: defaultdict): 110 | data = { 111 | "counters": counters, 112 | "hashes": {k: list(v) for k, v in hashes.items()}, 113 | } 114 | manifest_path.write_text(json.dumps(data, indent=2)) 115 | 116 | 117 | def decode_mime_words(s: str) -> str: 118 | """ 119 | Decodes MIME-encoded words in an email header to a UTF-8 string. 120 | 121 | Args: 122 | s (str): The MIME-encoded string. 123 | 124 | Returns: 125 | str: The decoded string. 126 | """ 127 | decoded_words = header.decode_header(s) 128 | return "".join( 129 | word.decode(encoding or "utf-8") if isinstance(word, bytes) else word 130 | for word, encoding in decoded_words 131 | ) 132 | 133 | 134 | def generate_mail_messages( 135 | gmail_user_name: str, 136 | password: str, 137 | processed_id_file: Path, 138 | processed_ids: set, 139 | max_attempts: int = 3, 140 | ): 141 | """ 142 | Generates email messages from the Gmail account that have attachments. 143 | 144 | Args: 145 | gmail_user_name (str): The Gmail username. 146 | password (str): The Gmail password. 147 | processed_id_file (Path): The file containing IDs of processed messages. 148 | processed_ids (set): The set of processed message IDs. 149 | max_attempts (int, optional): Maximum attempts to fetch a message. Defaults to 3. 150 | 151 | Yields: 152 | message.Message: The email message with attachments. 153 | """ 154 | with imaplib.IMAP4_SSL(IMAP_SERVER) as imap_session: 155 | try: 156 | imap_session.login(gmail_user_name, password) 157 | logging.info("Login successful.") 158 | except imaplib.IMAP4.error: 159 | logging.error("Login failed. Please check your credentials.") 160 | return 161 | imap_session.select('"[Gmail]/All Mail"') 162 | session_typ, data = imap_session.search(None, '(X-GM-RAW "has:attachment")') 163 | if session_typ != "OK": 164 | raise Exception("Error searching Inbox.") 165 | for msg_id in data[0].split(): 166 | msg_id_str = msg_id.decode() 167 | if msg_id_str not in processed_ids: 168 | for attempt in range(max_attempts): 169 | msg_typ, message_parts = imap_session.fetch(msg_id, "(RFC822)") 170 | if msg_typ == "OK": 171 | yield message_from_bytes(message_parts[0][1]) 172 | processed_ids.add(msg_id_str) 173 | with processed_id_file.open("a") as resume: 174 | resume.write(f"{msg_id_str},") 175 | break 176 | else: 177 | logging.warning( 178 | f"Error fetching mail {msg_id_str}, attempt {attempt + 1}/{max_attempts}" 179 | ) 180 | else: 181 | logging.error( 182 | f"Failed to fetch mail {msg_id_str} after {max_attempts} attempts." 183 | ) 184 | 185 | 186 | def save_attachments( 187 | message: message.Message, 188 | directory: Path, 189 | sort_by: SortMethod, 190 | file_name_counter: Counter, 191 | file_name_hashes: defaultdict, 192 | manifest_path: Path, 193 | ): 194 | """ 195 | Saves attachments from an email message to the specified directory. 196 | 197 | Args: 198 | message (message.Message): The email message containing attachments. 199 | directory (Path): The base directory where attachments will be saved. 200 | sort_by (SortMethod): The sorting method used for organizing attachments. 201 | file_name_counter (Counter): A counter to manage duplicate file names. 202 | file_name_hashes (defaultdict): A dictionary to track unique attachments by hash. 203 | """ 204 | msg_from = message.get("From") or "unknown_sender" 205 | msg_date = message.get("Date") 206 | 207 | if sort_by == SortMethod.DATE: 208 | directory = organize.by_date(directory, msg_date) 209 | elif sort_by == SortMethod.SENDER: 210 | directory = organize.by_sender_email(directory, msg_from) 211 | 212 | for part in message.walk(): 213 | if ( 214 | part.get_content_maintype() == "multipart" 215 | or part.get("Content-Disposition") is None 216 | ): 217 | continue 218 | file_name = part.get_filename() 219 | if not file_name: 220 | # Fallback for nameless attachments 221 | file_name_counter["__unnamed__"] += 1 222 | subtype = part.get_content_subtype() or "bin" 223 | file_name = f"attachment{file_name_counter['__unnamed__']}.{subtype}" 224 | 225 | logging.debug(f"Original file name: {file_name}") 226 | file_name = ( 227 | decode_mime_words(file_name) 228 | .replace("/", "_") 229 | .replace("\\", "_") 230 | .replace(":", "_") 231 | .replace("*", "_") 232 | .replace("?", "_") 233 | .replace('"', "_") 234 | .replace("<", "_") 235 | .replace(">", "_") 236 | .replace("|", "_") 237 | .replace("\n", "") 238 | .replace("\r", "") 239 | ) 240 | logging.debug(f"Sanitized file name: {file_name}") 241 | # Limit the file name length to avoid exceeding the Windows path length limit 242 | max_length = 150 243 | if len(file_name) > max_length: 244 | file_name = file_name[:max_length] + os.path.splitext(file_name)[1] 245 | payload = part.get_payload(decode=True) 246 | if not payload: 247 | logging.info("Skipped attachment with empty payload") 248 | continue 249 | 250 | x_hash = md5(payload).hexdigest() 251 | if x_hash not in file_name_hashes[file_name]: 252 | file_name_counter[file_name] += 1 253 | file_str, file_extension = os.path.splitext(file_name) 254 | new_file_name = ( 255 | f"{file_str}(v.{file_name_counter[file_name]}){file_extension}" 256 | if file_name_counter[file_name] > 1 257 | else file_name 258 | ) 259 | file_name_hashes[file_name].add(x_hash) 260 | if sort_by == SortMethod.EXTENSION: 261 | file_dir = directory / ( 262 | file_extension.lower().strip(".") 263 | if file_extension 264 | else "other" 265 | ) 266 | elif sort_by == SortMethod.SIZE: 267 | file_dir = directory / organize.by_size( 268 | len(payload) 269 | ) 270 | elif sort_by == SortMethod.MIMETYPE: 271 | file_dir = directory / ( 272 | organize.by_mime_type(file_name) 273 | ) 274 | else: 275 | file_dir = directory 276 | file_dir = organize.build_and_return_directory(file_dir) 277 | file_path = (file_dir / new_file_name).resolve() 278 | # Use Windows extended-length path prefix for long paths on Windows only 279 | if os.name == 'nt' and len(str(file_path)) > 260: 280 | file_path = Path(rf"\\?\{file_path}") 281 | if not file_path.exists(): 282 | with file_path.open("wb") as fp: 283 | fp.write(payload) 284 | save_manifest(manifest_path, file_name_counter, file_name_hashes) 285 | else: 286 | logging.info(f"\tExists in destination: {new_file_name}") 287 | 288 | 289 | def main(): 290 | """ 291 | Main function that drives the script, handling user input, downloading attachments, and organizing them. 292 | """ 293 | file_name_counter, file_name_hashes = load_manifest(MANIFEST_FILE) 294 | resume_file = Path("resume.txt") 295 | processed_id_file = Path("processed_ids.txt") 296 | 297 | user_name, save_path, sort_by, processed_ids = recover( 298 | resume_file, processed_id_file 299 | ) 300 | if user_name is not None: 301 | password = os.getenv("EMAIL_PASSWORD") or getpass( 302 | f"Enter the password for {user_name}: " 303 | ) 304 | else: 305 | user_name = input("Enter your Gmail username: ") 306 | password = os.getenv("EMAIL_PASSWORD") or getpass("Enter your password: ") 307 | save_path = Path(input("Enter Destination path: ")) 308 | save_path.mkdir(parents=True, exist_ok=True) 309 | while True: 310 | try: 311 | sort_choice = int( 312 | input( 313 | """Enter sort method [1-5]: 314 | 1. Extension 315 | 2. Size 316 | 3. MIME Type 317 | 4. Date Year -> Month -> Day 318 | 5. Domain -> Sender 319 | """ 320 | ).strip() 321 | ) 322 | sort_by = SortMethod(sort_choice) 323 | break 324 | except (ValueError, KeyError): 325 | print("Please enter a number between 1 and 5.") 326 | save_state(resume_file, user_name, save_path, sort_by) 327 | 328 | for msg in generate_mail_messages( 329 | user_name, password, processed_id_file, processed_ids 330 | ): 331 | save_attachments( 332 | msg, 333 | save_path, 334 | sort_by, 335 | file_name_counter, 336 | file_name_hashes, 337 | MANIFEST_FILE, 338 | ) 339 | 340 | processed_id_file.unlink(missing_ok=True) 341 | resume_file.unlink(missing_ok=True) 342 | save_manifest(MANIFEST_FILE, file_name_counter, file_name_hashes) 343 | 344 | 345 | if __name__ == "__main__": 346 | main() 347 | --------------------------------------------------------------------------------