├── LICENSE ├── README.md ├── email_stats_by_people.py └── email_stats_by_domain.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Yohei Nakajima 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Quick Email Scripts 2 | 3 | A collection of simple scripts for email analysis and management. 4 | 5 | ## Current Scripts 6 | 7 | ### 1. `email_stats_by_domain.py` 8 | 9 | Analyzes your Gmail account and provides statistics on email interactions by domain. 10 | 11 | Features: 12 | - Counts emails sent and received per domain 13 | - Identifies most frequent contacts within each domain 14 | - Generates a CSV report with detailed statistics 15 | 16 | ### 2. 
`email_stats_by_people.py` 17 | 18 | Analyzes your Gmail account and provides statistics on email interactions with specific email addresses. 19 | 20 | Features: 21 | - Counts emails sent and received per email address 22 | - Tracks first and last email dates for each contact 23 | - Generates a CSV report with detailed statistics 24 | 25 | ## Usage 26 | 27 | 1. Clone the repository 28 | 2. Set up your Gmail credentials as environment variables: 29 | - `GMAIL_ADDRESS`: Your Gmail address 30 | - `GMAIL_APP_PASSWORD`: Your Gmail app password 31 | 3. Update the script you want to use: 32 | - For `email_stats_by_domain.py`: Edit the `domains` list in the `main()` function 33 | - For `email_stats_by_people.py`: Edit the `email_addresses` list in the `main()` function 34 | ```python 35 | email_addresses = [ 36 | "example1@example.com", 37 | "example2@example.com" 38 | # Add or modify email addresses as needed 39 | ] 40 | ``` 41 | 4. Run the desired script: 42 | ``` 43 | python email_stats_by_domain.py 44 | ``` 45 | or 46 | ``` 47 | python email_stats_by_people.py 48 | ``` 49 | 5. Check the generated `email_stats.csv` file for results 50 | 51 | ## Note 52 | 53 | This is a personal project shared for convenience. While you're welcome to use and modify the scripts, I'm not actively seeking contributions or maintaining this as a collaborative project. 54 | 55 | ## License 56 | 57 | This project is open source and available under the MIT License. 58 | 59 | ## Background 60 | 61 | Created to quickly analyze email interactions with various contacts, organizations, and domains. Inspired by the need to efficiently assess communication history with potential business partners or investors. 
# --------------------------------------------------------------------------------
# /email_stats_by_people.py
# --------------------------------------------------------------------------------
"""Report how many Gmail messages were exchanged with specific addresses.

For every address listed in ``main()`` the script searches the Gmail
"All Mail" folder over IMAP, downloads only the From/To/Date headers and
tallies the messages, then prints a summary and writes ``email_stats.csv``.

Credentials are read from the ``GMAIL_ADDRESS`` and ``GMAIL_APP_PASSWORD``
environment variables.
"""
import imaplib
import email
from email.header import decode_header
import re
import os
from collections import defaultdict
from datetime import datetime, timezone
from email.utils import getaddresses, parsedate_to_datetime

import pandas as pd

# Email credentials (fetched from environment variables).  ``.get`` keeps the
# module importable without them; connect_imap() reports the problem instead.
EMAIL = os.environ.get('GMAIL_ADDRESS')
PASSWORD = os.environ.get('GMAIL_APP_PASSWORD')
IMAP_SERVER = "imap.gmail.com"

# Limit for how many emails to fetch at a time
MAX_EMAILS_TO_FETCH = 100000
BATCH_SIZE = 50  # Number of emails to fetch at once

# Fallback pattern used only when the RFC 5322 parser finds no address.
_ADDRESS_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def connect_imap():
    """Open an authenticated IMAP-over-SSL session to Gmail.

    Returns:
        A logged-in ``imaplib.IMAP4_SSL`` object, or ``None`` when the
        credentials are missing or the server rejects the login.
    """
    if not EMAIL or not PASSWORD:
        print("Missing credentials. Set the GMAIL_ADDRESS and "
              "GMAIL_APP_PASSWORD environment variables.")
        return None

    print("Connecting to Gmail IMAP server...")
    mail = imaplib.IMAP4_SSL(IMAP_SERVER)
    try:
        mail.login(EMAIL, PASSWORD)
    except imaplib.IMAP4.error as e:
        print(f"Login failed. Check your credentials. Error: {e}")
        # Close the socket instead of leaking it on a failed login.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
        return None
    print("Logged in successfully.")
    return mail


def _new_contact_stats():
    """Return a zeroed statistics record for one contact."""
    return {
        'first_email_date': None,
        'last_email_date': None,
        'total_sent': 0,
        'total_received': 0,
        'total_emails': 0,
    }


def _header_addresses(header_value):
    """Return the distinct, lower-cased addresses found in a From/To header."""
    if header_value is None:
        return []
    text = str(header_value)
    found = [addr for _name, addr in getaddresses([text]) if "@" in addr]
    if not found:
        # The strict parser rejects some malformed headers outright.
        found = _ADDRESS_RE.findall(text)
    return list(dict.fromkeys(addr.lower() for addr in found))


def _message_date(msg):
    """Return the message's Date header as an aware datetime, or ``None``."""
    raw = msg.get("Date")
    if raw is None:
        return None
    try:
        parsed = parsedate_to_datetime(str(raw))
    except (TypeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # "-0000" yields a naive UTC value; make it aware so that it can be
        # compared with the aware datetimes produced by every other offset.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _most_recent(ids, limit):
    """Return the last ``limit`` ids (``None`` = no limit, <= 0 = none)."""
    if limit is None:
        return ids
    if limit <= 0:
        return []
    return ids[-limit:]


def _record_message(entry, wanted, msg):
    """Fold one parsed message into the statistics record ``entry``."""
    when = _message_date(msg)
    if when is not None:
        if entry['first_email_date'] is None or when < entry['first_email_date']:
            entry['first_email_date'] = when
        if entry['last_email_date'] is None or when > entry['last_email_date']:
            entry['last_email_date'] = when

    # Contact appears as sender.
    if wanted in _header_addresses(msg.get("From")):
        entry['total_sent'] += 1
        entry['total_emails'] += 1

    # Contact appears as recipient.
    if wanted in _header_addresses(msg.get("To")):
        entry['total_received'] += 1
        entry['total_emails'] += 1


def fetch_emails_by_address(mail, email_addresses, max_emails):
    """Collect per-address message statistics from Gmail's "All Mail".

    Args:
        mail: Logged-in IMAP connection (as returned by ``connect_imap``).
        email_addresses: Addresses to look up; matching is case-insensitive.
        max_emails: Keep only this many of the most recent search hits per
            address (``None`` means no limit).

    Returns:
        A ``defaultdict`` keyed by the address as given, holding
        ``first_email_date``, ``last_email_date``, ``total_sent`` (address
        found in From), ``total_received`` (address found in To) and
        ``total_emails``.  Addresses without any processed message have no
        entry.  NOTE(review): "sent"/"received" are from the contact's point
        of view here, the reverse of email_stats_by_domain.py.
    """
    email_stats = defaultdict(_new_contact_stats)

    try:
        # readonly=True (EXAMINE) guarantees the analysis never marks
        # messages as read.
        status, _ = mail.select('"[Gmail]/All Mail"', readonly=True)
    except imaplib.IMAP4.error as e:
        print(f"Error selecting All Mail folder: {e}")
        return email_stats
    if status != 'OK':
        print(f"Error selecting All Mail folder: server answered {status}")
        return email_stats

    for email_address in email_addresses:
        print(f"\nSearching for emails to/from: {email_address}")
        wanted = email_address.lower()

        # Define search criteria for TO or FROM the email address
        criteria = '(OR TO "{}" FROM "{}")'.format(email_address, email_address)

        try:
            status, messages = mail.search(None, criteria)
        except imaplib.IMAP4.error as e:
            print(f"Error searching emails for address {email_address}: {e}")
            continue

        if status != 'OK':
            print(f"No emails found for {email_address}.")
            continue

        email_ids = messages[0].split() if messages and messages[0] else []
        email_ids = _most_recent(email_ids, max_emails)

        print(f"Found {len(email_ids)} emails for {email_address}. Processing now in batches...")

        for start in range(0, len(email_ids), BATCH_SIZE):
            batch = email_ids[start:start + BATCH_SIZE]
            batch_str = ",".join(msg_id.decode() for msg_id in batch)

            try:
                # Fetch only the headers (FROM, TO, DATE); PEEK leaves flags alone.
                status, msg_data = mail.fetch(
                    batch_str, "(BODY.PEEK[HEADER.FIELDS (FROM TO DATE)])")
            except imaplib.IMAP4.error as e:
                print(f"Error fetching batch starting with email ID {batch[0].decode()}: {e}")
                continue
            if status != 'OK':
                print(f"Failed to fetch batch starting with ID {batch[0].decode()}")
                continue

            for response_part in msg_data:
                # Header payloads arrive as (envelope, bytes) tuples; the
                # plain b")" separators between them carry no data.
                if not isinstance(response_part, tuple):
                    continue
                try:
                    msg = email.message_from_bytes(response_part[1])
                    _record_message(email_stats[email_address], wanted, msg)
                except Exception as e:
                    # Best effort: one malformed message must not abort the run.
                    print(f"Error processing email in batch: {e}")
                    continue

    return email_stats


def summarize_stats(email_stats):
    """Print one summary per address, busiest contact first."""
    sorted_emails = sorted(email_stats.items(), key=lambda x: x[1]['total_emails'], reverse=True)
    for email_address, stats in sorted_emails:
        first = stats['first_email_date']
        last = stats['last_email_date']
        print(f"\nSummary for email: {email_address}")
        print(f"  Total emails sent: {stats['total_sent']}")
        print(f"  Total emails received: {stats['total_received']}")
        print(f"  Total emails exchanged: {stats['total_emails']}")
        print(f"  First email: {first.strftime('%Y-%m-%d') if first else 'N/A'}")
        print(f"  Last email: {last.strftime('%Y-%m-%d') if last else 'N/A'}")


def save_stats_to_csv(email_stats, filename="email_stats.csv"):
    """Write one CSV row per address to ``filename``.

    The header row is written even when ``email_stats`` is empty.
    """
    columns = ["Email Address", "Sent", "Received", "Total", "First Email", "Last Email"]
    data = []
    for email_address, stats in email_stats.items():
        first = stats['first_email_date']
        last = stats['last_email_date']
        data.append({
            "Email Address": email_address,
            "Sent": stats['total_sent'],
            "Received": stats['total_received'],
            "Total": stats['total_emails'],
            "First Email": first.strftime('%Y-%m-%d') if first else None,
            "Last Email": last.strftime('%Y-%m-%d') if last else None,
        })

    # Create a pandas DataFrame (explicit columns keep the header for empty data)
    df = pd.DataFrame(data, columns=columns)

    # Save to a CSV file
    df.to_csv(filename, index=False)
    print(f"CSV file saved to {filename}")


def main():
    """Run the report for the addresses configured below."""
    email_addresses = ["example@email.com", "example2@email.com"]

    mail = connect_imap()
    if mail is None:
        # connect_imap() already explained what went wrong.
        return

    try:
        email_stats = fetch_emails_by_address(mail, email_addresses, MAX_EMAILS_TO_FETCH)
    finally:
        mail.logout()

    # Summarize and save stats
    summarize_stats(email_stats)
    save_stats_to_csv(email_stats, filename="email_stats.csv")


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /email_stats_by_domain.py
# --------------------------------------------------------------------------------
"""Report how many Gmail messages were exchanged with specific domains.

For every domain listed in ``main()`` the script searches the Gmail
"All Mail" folder over IMAP, downloads only the From/To/Date headers and
tallies the messages per domain and per person, then prints a summary and
writes ``email_stats.csv``.

Credentials are read from the ``GMAIL_ADDRESS`` and ``GMAIL_APP_PASSWORD``
environment variables.
"""
import imaplib
import email
from email.header import decode_header
import re
import os
from collections import defaultdict
from datetime import datetime, timezone
from email.utils import getaddresses, parsedate_to_datetime

import pandas as pd

# Email credentials (fetched from environment variables).  ``.get`` keeps the
# module importable without them; connect_imap() reports the problem instead.
EMAIL = os.environ.get('GMAIL_ADDRESS')
PASSWORD = os.environ.get('GMAIL_APP_PASSWORD')
IMAP_SERVER = "imap.gmail.com"

# Limit for how many emails to fetch at a time (if you want to limit results, adjust this)
MAX_EMAILS_TO_FETCH = 100000  # Adjust as needed
BATCH_SIZE = 50  # Number of emails to fetch at once

# Fallback pattern used only when the RFC 5322 parser finds no address.
_ADDRESS_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def connect_imap():
    """Open an authenticated IMAP-over-SSL session to Gmail.

    Returns:
        A logged-in ``imaplib.IMAP4_SSL`` object, or ``None`` when the
        credentials are missing or the server rejects the login.
    """
    if not EMAIL or not PASSWORD:
        print("Missing credentials. Set the GMAIL_ADDRESS and "
              "GMAIL_APP_PASSWORD environment variables.")
        return None

    print("Connecting to Gmail IMAP server...")
    mail = imaplib.IMAP4_SSL(IMAP_SERVER)
    try:
        mail.login(EMAIL, PASSWORD)
    except imaplib.IMAP4.error as e:
        print(f"Login failed. Check your credentials. Error: {e}")
        # Close the socket instead of leaking it on a failed login.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
        return None
    print("Logged in successfully.")
    return mail


def _new_person_stats():
    """Return a zeroed per-person counter record."""
    return {'to': 0, 'from': 0, 'total': 0}


def _new_domain_stats():
    """Return a zeroed statistics record for one domain."""
    return {
        'people': defaultdict(_new_person_stats),
        'first_email_date': None,
        'last_email_date': None,
        'total_sent': 0,
        'total_received': 0,
        'total_emails': 0,
    }


def _header_addresses(header_value):
    """Return the distinct, lower-cased addresses found in a From/To header."""
    if header_value is None:
        return []
    text = str(header_value)
    found = [addr for _name, addr in getaddresses([text]) if "@" in addr]
    if not found:
        # The strict parser rejects some malformed headers outright.
        found = _ADDRESS_RE.findall(text)
    return list(dict.fromkeys(addr.lower() for addr in found))


def _message_date(msg):
    """Return the message's Date header as an aware datetime, or ``None``."""
    raw = msg.get("Date")
    if raw is None:
        return None
    try:
        parsed = parsedate_to_datetime(str(raw))
    except (TypeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # "-0000" yields a naive UTC value; make it aware so that it can be
        # compared with the aware datetimes produced by every other offset.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _most_recent(ids, limit):
    """Return the last ``limit`` ids (``None`` = no limit, <= 0 = none)."""
    if limit is None:
        return ids
    if limit <= 0:
        return []
    return ids[-limit:]


def _record_message(entry, wanted_domain, msg):
    """Fold one parsed message into the domain record ``entry``."""
    when = _message_date(msg)
    if when is not None:
        if entry['first_email_date'] is None or when < entry['first_email_date']:
            entry['first_email_date'] = when
        if entry['last_email_date'] is None or when > entry['last_email_date']:
            entry['last_email_date'] = when

    # Compare the part after "@" exactly: a substring test would also accept
    # "x@notexample.com" or "x@example.com.au" for "example.com".
    # Process the FROM field (mail received from the domain)
    for addr in _header_addresses(msg.get("From")):
        if addr.rpartition("@")[2] == wanted_domain:
            person = entry['people'][addr]
            person['from'] += 1
            person['total'] += 1
            entry['total_received'] += 1
            entry['total_emails'] += 1

    # Process the TO field (mail sent to the domain)
    for addr in _header_addresses(msg.get("To")):
        if addr.rpartition("@")[2] == wanted_domain:
            person = entry['people'][addr]
            person['to'] += 1
            person['total'] += 1
            entry['total_sent'] += 1
            entry['total_emails'] += 1


def fetch_emails_by_domain(mail, domains, max_emails):
    """Collect per-domain and per-person statistics from Gmail's "All Mail".

    Args:
        mail: Logged-in IMAP connection (as returned by ``connect_imap``).
        domains: Domain names to look up; matching is case-insensitive and
            requires the address's domain part to equal the name exactly.
        max_emails: Keep only this many of the most recent search hits per
            domain (``None`` means no limit).

    Returns:
        A ``defaultdict`` keyed by the domain as given.  Each record holds
        ``people`` (lower-cased address -> ``to``/``from``/``total``
        counters), ``first_email_date``, ``last_email_date``, ``total_sent``
        (domain address found in To), ``total_received`` (domain address
        found in From) and ``total_emails``.  Counters advance once per
        matching address, so one message to three people at the domain adds
        three.  Domains without any processed message have no entry.
    """
    domain_stats = defaultdict(_new_domain_stats)

    try:
        # readonly=True (EXAMINE) guarantees the analysis never marks
        # messages as read.
        status, _ = mail.select('"[Gmail]/All Mail"', readonly=True)
    except imaplib.IMAP4.error as e:
        print(f"Error selecting All Mail folder: {e}")
        return domain_stats
    if status != 'OK':
        print(f"Error selecting All Mail folder: server answered {status}")
        return domain_stats

    for domain in domains:
        print(f"\nSearching for emails to/from domain: {domain}")
        wanted_domain = domain.lower()

        # Define search criteria for TO or FROM the domain
        criteria = '(OR TO "@{}" FROM "@{}")'.format(domain, domain)

        try:
            status, messages = mail.search(None, criteria)
        except imaplib.IMAP4.error as e:
            print(f"Error searching emails for domain {domain}: {e}")
            continue

        if status != 'OK':
            print(f"No emails found for {domain}.")
            continue

        email_ids = messages[0].split() if messages and messages[0] else []
        email_ids = _most_recent(email_ids, max_emails)

        print(f"Found {len(email_ids)} emails for domain {domain}. Processing now in batches...")

        for start in range(0, len(email_ids), BATCH_SIZE):
            batch = email_ids[start:start + BATCH_SIZE]
            batch_str = ",".join(msg_id.decode() for msg_id in batch)

            try:
                # Fetch only the headers (FROM, TO, DATE); PEEK leaves flags alone.
                status, msg_data = mail.fetch(
                    batch_str, "(BODY.PEEK[HEADER.FIELDS (FROM TO DATE)])")
            except imaplib.IMAP4.error as e:
                print(f"Error fetching batch starting with email ID {batch[0].decode()}: {e}")
                continue
            if status != 'OK':
                print(f"Failed to fetch batch starting with ID {batch[0].decode()}")
                continue

            for response_part in msg_data:
                # Header payloads arrive as (envelope, bytes) tuples; the
                # plain b")" separators between them carry no data.
                if not isinstance(response_part, tuple):
                    continue
                try:
                    msg = email.message_from_bytes(response_part[1])
                    _record_message(domain_stats[domain], wanted_domain, msg)
                except Exception as e:
                    # Best effort: one malformed message must not abort the run.
                    print(f"Error processing email ID in batch: {e}")
                    continue

    return domain_stats


def summarize_stats(domain_stats):
    """Print one summary per domain (busiest first) with its people."""
    sorted_domains = sorted(domain_stats.items(), key=lambda x: x[1]['total_emails'], reverse=True)
    for domain, stats in sorted_domains:
        first = stats['first_email_date']
        last = stats['last_email_date']
        print(f"\nSummary for domain: {domain}")
        print(f"  Total emails sent: {stats['total_sent']}")
        print(f"  Total emails received: {stats['total_received']}")
        print(f"  Total emails exchanged: {stats['total_emails']}")
        # Dates stay None when no processed message had a usable Date header.
        print(f"  First email: {first.strftime('%Y-%m-%d') if first else 'N/A'}")
        print(f"  Last email: {last.strftime('%Y-%m-%d') if last else 'N/A'}")

        sorted_people = sorted(stats['people'].items(), key=lambda x: x[1]['total'], reverse=True)
        for person, person_stats in sorted_people:
            print(f"    {person} - sent: {person_stats['to']}, received: {person_stats['from']}, total: {person_stats['total']}")


def save_stats_to_spreadsheet(domain_stats, filename="email_stats.csv"):
    """Write one CSV row per (domain, person) pair to ``filename``.

    Domain-level totals and dates are repeated on every row of that domain.
    The header row is written even when there is nothing to report.
    """
    columns = [
        "Domain",
        "Person",
        "Sent",
        "Received",
        "Total",
        "First Email",
        "Last Email",
        "Total Emails Sent (Domain)",
        "Total Emails Received (Domain)",
        "Total Emails Exchanged (Domain)",
    ]
    data = []
    for domain, stats in domain_stats.items():
        first = stats['first_email_date']
        last = stats['last_email_date']
        for person, person_stats in stats['people'].items():
            data.append({
                "Domain": domain,
                "Person": person,
                "Sent": person_stats['to'],
                "Received": person_stats['from'],
                "Total": person_stats['total'],
                "First Email": first.strftime('%Y-%m-%d') if first else None,
                "Last Email": last.strftime('%Y-%m-%d') if last else None,
                "Total Emails Sent (Domain)": stats['total_sent'],
                "Total Emails Received (Domain)": stats['total_received'],
                "Total Emails Exchanged (Domain)": stats['total_emails'],
            })

    # Create a pandas DataFrame (explicit columns keep the header for empty data)
    df = pd.DataFrame(data, columns=columns)

    # Save to a CSV file
    df.to_csv(filename, index=False)
    print(f"CSV file saved to {filename}")


def main():
    """Run the report for the domains configured below."""
    domains = [
        "example.com",
        "example2.com",
    ]

    mail = connect_imap()
    if mail is None:
        # connect_imap() already explained what went wrong.
        return

    try:
        domain_stats = fetch_emails_by_domain(mail, domains, MAX_EMAILS_TO_FETCH)
    finally:
        mail.logout()

    summarize_stats(domain_stats)
    save_stats_to_spreadsheet(domain_stats, filename="email_stats.csv")


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------