├── .coverage ├── .coveragerc ├── .github └── workflows │ ├── ci.yml │ └── python-package.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── assets ├── graph1.png ├── graph2.png ├── graph3.png ├── graph4.png ├── graph5.png ├── histo.png └── summaryplot.png ├── build └── lib │ └── emailnetwork │ ├── __init__.py │ ├── emails.py │ ├── extract.py │ ├── graph.py │ ├── header.py │ ├── network.py │ ├── summary.py │ ├── tests │ ├── __init__.py │ ├── test.mbox │ ├── test_extract.py │ ├── test_graph.py │ ├── test_summary.py │ └── test_utils.py │ ├── utils.py │ └── version.py ├── coverage.xml ├── dist ├── emailnetwork-0.0.1-py3-none-any.whl ├── emailnetwork-0.0.1.tar.gz ├── emailnetwork-0.0.2-py3-none-any.whl └── emailnetwork-0.0.2.tar.gz ├── emailnetwork.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── not-zip-safe ├── requires.txt └── top_level.txt ├── emailnetwork ├── .coverage ├── __init__.py ├── emails.py ├── extract.py ├── graph.py ├── header.py ├── network.py ├── summary.py ├── tests │ ├── __init__.py │ ├── test.mbox │ ├── test_extract.py │ ├── test_graph.py │ ├── test_summary.py │ └── test_utils.py ├── utils.py └── version.py ├── requirements.txt └── setup.py /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/.coverage -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */test* -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Run Python Tests 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | build: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | 
matrix: 15 | os: [ubuntu-latest, macos-latest, windows-latest] 16 | python-version: ["3.7", "3.8", "3.9"] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Install Python 3 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt 27 | - name: Test with pytest 28 | run: | 29 | pip install pytest-cov 30 | pytest --cov 31 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.7, 3.8, 3.9] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | .vscode 4 | htmlcov/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Supertype Pte Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include emailnetwork/tests/*.mbox -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Supertype](https://img.shields.io/badge/supertype.ai-incubate-b1976b)](https://supertype.ai/incubate) 3 | [![PyPI Version](https://img.shields.io/pypi/v/emailnetwork)](https://pypi.org/project/emailnetwork/) 4 | [![Colab Notebook](https://img.shields.io/badge/notebook-colab%20notebook-orange)](https://colab.research.google.com/drive/1mSKbt9-dTTtQq296QUkMZlZpAMybgmpV?usp=sharing) 5 | [![Downloads](https://pepy.tech/badge/emailnetwork)](https://pepy.tech/project/emailnetwork) 6 | 7 | # Email Network 8 | 9 | ## Description 10 | Network graphing utilities for email/mailbox data. 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | For the social scientists, creating social networks from your mailbox data and among other things: 19 | * Discover subgroups within your organization (whether the different task forces established were as cohesive as it seems on the outside) 20 | * Study social actors (most emails from Marketing involve Peter and Andy) and their relative influence 21 | * Identify the key social groups (Sales team hangs out a lot, but the IT / product division less so) 22 | * Key account managers of the company (Despite being with the company only recently, Margaretha is connected to more key clients than her peers) 23 | * Compare distributions and patterns of email behaviors and aggregated statistics between groups of employees 24 | 25 | 26 | If you're a graph theorist and looking for something more statistical: 27 | * Support directed and undirected graphs (**already implemented in version 0.0.2**, see below) 28 | * Also output statistical measurements such as centrality 
distribution (**planned for version 0.0.3**) 29 | * Betweenness, closeness, hubness, distance histograms plotting (**planned for version 0.0.3**) 30 | * Exports to `.graphml` format for use in other graphing software (**already implemented in version 0.0.2**, see below) 31 | 32 | ## Dependencies 33 | * Python 3.7+ 34 | * The only dependencies are NetworkX and Matplotlib 35 | 36 | ## Example Usage 37 | To install `emailnetwork`: 38 | ``` 39 | pip install emailnetwork 40 | ``` 41 | 42 | A sample `.mbox` file is provided, but you can also export your own mailbox from your email service provider. If you use Google (Gmail), you can [use the Google Takeout service](https://takeout.google.com/settings/takeout) to export your mail data. 43 | 44 | 45 | ```python 46 | from emailnetwork.extract import MBoxReader 47 | reader = MBoxReader('path-to-mbox.mbox') 48 | print(f'{len(reader)} emails in the sample mbox.') 49 | 50 | # extract a specific email 51 | from emailnetwork.extract import extract_meta 52 | email = reader.mbox[5] 53 | emailmsg = extract_meta(email) 54 | 55 | # filter emails by a certain date 56 | thisyearmails = reader.filter_emails(dateoperator='>=', datestring='2021-01-05') 57 | 58 | # print email domains of recipients 59 | print(emailmsg.recipients) 60 | print(emailmsg.recipients[0].domain) 61 | 62 | # extract all emails 63 | emails = reader.extract() 64 | ``` 65 | 66 | For graph visualization: 67 | ```py 68 | from emailnetwork.extract import MBoxReader 69 | from emailnetwork.graph import plot_directed, plot_undirected, plot_single_directed, plot_single_undirected 70 | 71 | # Read from .mbox 72 | MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox' 73 | reader = MBoxReader(MBOX_PATH) 74 | 75 | # Try the following: 76 | # plot a single directed graph of the email at index 3 77 | plot_single_directed(reader,3) 78 | 79 | # plot a single undirected graph of the email at index 1, showing the title in the plot 80 | plot_single_undirected(reader, 1, showtitle=True) 81 | 82 | 
# plot a directed graph, optionally specifying a layout style 83 | plot_directed(reader) 84 | plot_directed(reader, 'shell') 85 | # optionally export a .graphml to your working directory for use 86 | # in other network / graphing software 87 | plot_undirected(reader, 'spring', graphml=True) 88 | ``` 89 | 90 | #### Email Header Analysis 91 | 92 | To obtain a histogram: 93 | 94 | ```py 95 | from emailnetwork.extract import MBoxReader 96 | reader = MBoxReader('path-to-mbox') 97 | headers = HeaderCounter(reader) 98 | headers.histogram() 99 | # to show only top 10 header, set an optional n parameter 100 | # headers.histogram(n=10) 101 | ``` 102 | Because `HeaderCounter` is a subclass of Python's `Counter`, you can also perform operations such as `headers.most_common(8)` to get the 8 most-common headers from the `mbox` file. 103 | 104 | If you want to find all email headers with the word "spam" in it (e.g spam score, other antispam mechanism), you can use Python's `filter()` function: 105 | ```python 106 | reader = MBoxReader('path-to-mbox') 107 | headers = HeaderCounter(reader) 108 | spamheaders = list(filter(lambda v: "spam" in v.lower(), headers.keys())) 109 | # return: 110 | # ['X-Spam-Checked-In-Group', 'X-Microsoft-Antispam-PRVS', 'X-Microsoft-Antispam-Untrusted', 'X-Microsoft-Antispam-Message-Info-Original', 'X-Forefront-Antispam-Report-Untrusted', 'x-ms-exchange-antispam-messagedata', 'X-Microsoft-Antispam', 'X-Microsoft-Antispam-Message-Info', 'X-Forefront-Antispam-Report', 'X-Mimecast-Spam-Score', 'x-microsoft-antispam-prvs', 'x-microsoft-antispam', 'x-microsoft-antispam-message-info', 'x-forefront-antispam-report'] 111 | ``` 112 | 113 | #### Mailbox Summary 114 | 115 | To get a simple barchart on the distribution of email domains in your `.mbox`, you can create a `DomainSummary` object and call the `.plot()` function: 116 | 117 | 118 | 119 | ```python 120 | from emailnetwork.summary import DomainSummary 121 | summary = DomainSummary(reader) 122 | 
summary.plot() 123 | ``` 124 | 125 | You can also return a `Counter()` (a subclass of `dict`) instead of a plot: 126 | 127 | ```python 128 | summary.summary 129 | # return: 130 | # Counter({'supertype.ai': 203, 'hubspot.com': 115, 'gmail.com': 75, 'google.com': 53, 'adcolony.com': 38, 'fbworkmail.com': 35, 'elementor.com': 29, 'payoneer.com': 15, 'gogame.net': 14, 'zoomd.com': 13, 'am.atlassian.com': 10, 'theafternaut.com': 6, 'alegrium.com': 5, 'accounts.google.com': 4, 'e.atlassian.com': 4, 'tnbaura.com': 4, 'support.lazada.sg': 4, '3kraters.com': 3, 'go.facebookmail.com': 2, 'docs.google.com': 2, 'mail.hellosign.com': 2, 'algorit.ma': 2, 'supertype.atlassian.net': 2, 'ucdconnect.ie': 2, 'mc.facebookmail.com': 1, 'inplacesoftware.com': 1, 'aura.co': 1, 'atlassian.com': 1, 'greenhouse.io': 1}) 131 | ``` 132 | ##### Why Python 3.7+? 133 | Python 3.7+ is required because the package is written to take advantage of many features of Python 3.7 and above. 134 | 135 | Examples of features that were used extensively in the creation of this package: 136 | * [Dataclasses, new in Python 3.7](https://www.youtube.com/watch?v=sH_jLQvnpBo) 137 | * [Insertion-ordered Dictionaries, new in Python 3.7](https://www.youtube.com/watch?v=h-DBWPjpqWY) 138 | * [Typing (Type hints), new in Python 3.5](https://docs.python.org/3/library/typing.html) 139 | * [Formatted string literal, new in Python 3.6](https://docs.python.org/3/reference/lexical_analysis.html#f-strings) 140 | ## Testing 141 | Git clone, and run `pytest`. You can also run pytest with coverage: 142 | ``` 143 | pytest --cov 144 | 145 | ......... 
146 | 147 | Name Stmts Miss Cover 148 | ---------------------------------------------- 149 | emailnetwork/__init__.py 2 0 100% 150 | emailnetwork/emails.py 39 1 97% 151 | emailnetwork/extract.py 94 15 84% 152 | emailnetwork/graph.py 120 12 90% 153 | emailnetwork/header.py 39 24 38% 154 | emailnetwork/network.py 13 1 92% 155 | emailnetwork/summary.py 73 22 70% 156 | emailnetwork/utils.py 30 9 70% 157 | emailnetwork/version.py 1 0 100% 158 | ---------------------------------------------- 159 | TOTAL 411 84 80% 160 | 161 | 162 | =============== 17 passed in 2.85s ============== 163 | ``` 164 | 165 | All tests are located in the `/tests/` directory. 166 | 167 | ## Email Network Demo 168 | 169 | [Aurellia Christie](https://github.com/AurelliaChristie) has created a Colab Notebook: [Email Network Walkthrough](https://colab.research.google.com/drive/1mSKbt9-dTTtQq296QUkMZlZpAMybgmpV?usp=sharing) to walk you through the most common functionalities of Email Network 170 | 171 | ## Authors and Copyright 172 | 173 | Samuel Chan, [Supertype](https://supertype.ai) 174 | - Github: [onlyphantom](https://github.com/onlyphantom) 175 | 176 | Vincentius Christopher Calvin, [Supertype](https://supertype.ai) 177 | - Github: [vccalvin33](https://github.com/vccalvin33) 178 | 179 | If you find the code useful in your project, please link to this repository in your citation. 
180 | 181 | ##### The MIT License (MIT) 182 | 183 | Copyright (c) 2021 Supertype Pte Ltd 184 | 185 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 186 | 187 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 188 | 189 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
190 | -------------------------------------------------------------------------------- /assets/graph1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph1.png -------------------------------------------------------------------------------- /assets/graph2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph2.png -------------------------------------------------------------------------------- /assets/graph3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph3.png -------------------------------------------------------------------------------- /assets/graph4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph4.png -------------------------------------------------------------------------------- /assets/graph5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph5.png -------------------------------------------------------------------------------- /assets/histo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/histo.png -------------------------------------------------------------------------------- /assets/summaryplot.png: -------------------------------------------------------------------------------- 
@dataclass
class EmailMeta:
    """
    Metadata of a single email: sender, subject, date, recipients and cc.

    After construction, `origin_domain` is filled from `sender.domain` and
    `date` is replaced by the result of `parse_date(date)`.

    Instances compare against `datetime` objects by calendar day only
    (`==`, `>=`, `<=`); the time of day is ignored on both sides.

    Also Refer to:
    https://www.iana.org/assignments/message-headers/message-headers.xhtml

    """

    sender: str
    subject: str
    date: str
    recipients: list
    cc: list
    origin_domain: str=None

    def __post_init__(self):
        # `sender` is expected to expose a `.domain` attribute
        self.origin_domain = self.sender.domain
        self.date = parse_date(self.date)

    def __eq__(self, targetdate):
        """True when the email was dated on the same calendar day as `targetdate`."""
        if not isinstance(targetdate, datetime):
            # Let Python fall back to its default handling instead of
            # silently answering None for unsupported operands.
            return NotImplemented
        return self.date.date() == targetdate.date()

    def __ge__(self, targetdate):
        """True when the email was dated on or after the day of `targetdate`."""
        if not isinstance(targetdate, datetime):
            return NotImplemented
        return self.date.date() >= targetdate.date()

    def __le__(self, targetdate):
        """True when the email was dated on or before the day of `targetdate`."""
        if not isinstance(targetdate, datetime):
            return NotImplemented
        return self.date.date() <= targetdate.date()
@dataclass
class EmailAddress:
    """
    A display name together with a lower-cased email address.

    Args:
        string: a `(name, email)` pair, e.g. one element of the list
            returned by `email.utils.getaddresses`.
    """
    name: str=None
    email: str=None

    def __init__(self, string):
        self.name, self.email = string
        # Addresses are stored lower-cased so that comparisons elsewhere
        # do not depend on how the header was capitalised.
        self.email = self.email.lower()

    def __getitem__(self, index=None):
        """
        Return the `(name, email)` pair, or one element of it.

        Args:
            index: `None` (default) returns the whole pair, which keeps a
                direct `addr.__getitem__()` call working. An int or slice
                selects from the pair, so `addr[0]` is the name and
                `addr[1]` the email.

        Raises:
            IndexError: if an int `index` is outside the two-element pair.
        """
        pair = (self.name, self.email)
        if index is None:
            return pair
        return pair[index]

    @property
    def domain(self):
        """Text after the last '@' of the email (the whole string when there is no '@'), or None if that text is empty."""
        return self.email.split('@')[-1] or None
40 | 41 | Usage: 42 | reader = MboxReader('path-to-mbox.mbox') 43 | 44 | Args: 45 | object ([type]): Instantiate this class by specifying a path to an `.mbox` object 46 | """ 47 | 48 | def __init__(self, path) -> None: 49 | super().__init__() 50 | self.path = path 51 | self.mbox = mbox(path) 52 | 53 | def __iter__(self): 54 | for msg in self.mbox: 55 | yield msg 56 | 57 | def __len__(self): 58 | return self.count() 59 | 60 | def count(self): 61 | """ 62 | Count the number of emails in the mbox instance. 63 | Helper function to implement __len__ 64 | """ 65 | return self.mbox.keys()[-1]+1 66 | # return len(self.mbox.keys()) 67 | 68 | def extract(self): 69 | """ 70 | Extract the meta data from the Mbox instance 71 | """ 72 | for email in self: 73 | try: 74 | emailmeta = extract_meta(email) 75 | if emailmeta is not None: 76 | yield emailmeta 77 | 78 | except Exception as e: 79 | print(e) 80 | continue 81 | 82 | def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="): 83 | if emailaddress != None: 84 | if type(emailaddress) != str: 85 | raise ValueError( 86 | "Please use a valid string representing an email address") 87 | 88 | if dateoperator not in ['>=', '==', '<=']: 89 | raise ValueError("Please use one of ['>=', '==', '<=']") 90 | 91 | if datestring != None: 92 | try: 93 | targetdate = datetime.strptime(datestring, "%Y-%m-%d") 94 | except ValueError: 95 | print(ValueError) 96 | return "Please use the ISO format for comparison: YYYY-MM-DD" 97 | 98 | val = [] 99 | if emailaddress == None and datestring == None: 100 | for email in self.mbox: 101 | emailmeta = extract_meta(email) 102 | val.append(emailmeta) 103 | elif emailaddress != None and datestring == None: 104 | for email in self.mbox: 105 | emailmeta = extract_meta(email) 106 | checkers = [emailmeta.sender.email] + [recipient.email for recipient in emailmeta.recipients] 107 | if emailaddress in checkers: 108 | val.append(emailmeta) 109 | elif emailaddress == None and datestring != None: 
110 | for email in self.mbox: 111 | emailmeta = extract_meta(email) 112 | if dateoperator == '>=': 113 | if emailmeta >= targetdate: 114 | val.append(emailmeta) 115 | elif dateoperator == '==': 116 | if emailmeta == targetdate: 117 | val.append(emailmeta) 118 | elif dateoperator == '<=': 119 | if emailmeta <= targetdate: 120 | val.append(emailmeta) 121 | else: 122 | for email in self.mbox: 123 | emailmeta = extract_meta(email) 124 | checkers = [emailmeta.sender.email] + [recipient.email for recipient in emailmeta.recipients] 125 | if emailaddress in checkers: 126 | if dateoperator == '>=': 127 | if emailmeta >= targetdate: 128 | val.append(emailmeta) 129 | elif dateoperator == '==': 130 | if emailmeta == targetdate: 131 | val.append(emailmeta) 132 | elif dateoperator == '<=': 133 | if emailmeta <= targetdate: 134 | val.append(emailmeta) 135 | 136 | return val 137 | 138 | 139 | if __name__ == '__main__': 140 | reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox') 141 | # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox') 142 | headers = HeaderCounter(reader) 143 | k = headers.keys() 144 | spamheaders = list(filter(lambda v: "spam" in v.lower(), k)) 145 | 146 | summary = DomainSummary(reader) 147 | 148 | email = reader.mbox[1] 149 | emailmsg = extract_meta(email) 150 | emailbody = extract_body(email) 151 | mails = reader.filter_emails(datestring='2020-12-31', dateoperator="==") -------------------------------------------------------------------------------- /build/lib/emailnetwork/graph.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math 3 | import os 4 | import uuid 5 | import matplotlib.pyplot as plt 6 | import networkx as nx 7 | import textwrap 8 | 9 | from emailnetwork.extract import MBoxReader, extract_meta 10 | from emailnetwork.network import PeopleCombination 11 | 12 | 13 | def 
def plot_single_directed(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot a directed graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with the email subject and date as title.

    Edges run from the sender to every recipient (labelled with the
    subject) and to every cc address (labelled 'cc').

    Usage:
        reader = MboxReader('path-to-mbox.mbox')
        plot_single_directed(reader, 300) plots the email at index 300 of the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): index of the email in `reader.mbox`. When left at
            its default (`False`) the last email, at index `len(reader) - 1`,
            is plotted. `0` selects the first email.
        showtitle (bool, optional): If `True`, render the plot with the email subject and date as title. Defaults to False.
    """

    # Only the sentinel values mean "not given"; a plain truthiness test
    # would also treat the valid index 0 as missing.
    if id is False or id is None:
        email = reader.mbox[len(reader) - 1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # extract_meta() stores None for an empty subject; textwrap.fill() needs a str
    subject = textwrap.fill(emailmsg.subject or '', 40)
    # Fall back to the local part of the address when there is no display name
    sender = emailmsg.sender.name or emailmsg.sender.email.split('@')[0]

    plt.figure(figsize=(9, 6))
    G = nx.DiGraph(name='Single Email Flow')

    for recipient in emailmsg.recipients:
        G.add_edge(sender, recipient.name or recipient.email,
                   message=subject, color='darkorchid', weight=3)

    for cc in emailmsg.cc:
        G.add_edge(sender, cc.name or cc.email,
                   message='cc', color='lightsteelblue', weight=2)

    colors = nx.get_edge_attributes(G, 'color').values()
    weights = nx.get_edge_attributes(G, 'weight').values()
    edge_labels = nx.get_edge_attributes(G, 'message')

    pos = nx.planar_layout(G)

    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6, label_pos=0.5)
    nx.draw_planar(G, node_size=0, alpha=1, edge_color=colors, width=list(weights), font_size=8, font_weight='bold', with_labels=True, verticalalignment='bottom')

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        plt.title(subject + '\n Delivery date: ' + emailmsg.date.strftime("%m/%d/%Y"), fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
def plot_directed(reader:MBoxReader, layout:str='shell', graphml:bool=False) -> None:
    """
    Plot a directed social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Every email adds one edge from its sender to each recipient and cc
    address. Nodes are labelled with the display name, or with the local
    part of the email address when no display name is present.

    Usage:
        reader = MboxReader('path-to-mbox.mbox')
        plot_directed(reader)
    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): 'shell' or 'spring'; any other value uses the spiral layout. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    def label(address):
        # A missing display name may be '' or None; both fall back to the
        # local part of the email address.
        return address.name or address.email.split('@')[0]

    emails = reader.extract()
    plt.figure(figsize=(12,12))
    G = nx.MultiDiGraph(name='Email Social Network')
    for email in emails:
        source_addr = label(email.sender)
        # extract_meta() stores None for an empty subject, and the GraphML
        # writer rejects None attribute values, so store '' instead.
        message = email.subject or ''

        for target in email.recipients + email.cc:
            G.add_edge(source_addr, label(target), message=message)

    if graphml:
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        # Report the generated name, as plot_undirected() does
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        pos = nx.spring_layout(G)
    else:
        pos = nx.spiral_layout(G)
    nx.draw(G, pos, node_size=0, alpha=0.4, edge_color='cadetblue', font_size=7, with_labels=True)
    ax = plt.gca()
    ax.margins(0.08)
    plt.show()
172 | """ 173 | 174 | emails = reader.extract() 175 | G = nx.Graph(name='Email Social Network') 176 | plt.figure(figsize=(12,12)) 177 | counter = Counter() 178 | for email in emails: 179 | ng = PeopleCombination(email) 180 | 181 | for combo in ng.combo: 182 | counter[combo] += 1 183 | 184 | total_combos = sum(counter.values()) 185 | by_freq = {k: v/total_combos for k, v in counter.most_common()} 186 | for rel in counter.keys(): 187 | G.add_edge(*rel, weight=by_freq[rel], count=counter[rel]) 188 | 189 | if graphml: 190 | fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml' 191 | nx.write_graphml(G, fileName) 192 | print(f"Graphml exported as {fileName}") 193 | 194 | if layout == 'shell': 195 | pos = nx.shell_layout(G) 196 | elif layout == 'spring': 197 | k = 1/math.sqrt(G.order()) * 2 198 | pos = nx.spring_layout(G, k=k) 199 | else: 200 | pos = nx.spiral_layout(G) 201 | 202 | deg = [v*50 for _, v in G.degree()] 203 | nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.60) 204 | nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', edge_color='cadetblue', alpha=0.6) 205 | nx.draw_networkx_labels(G, pos, {n: n.split('@')[0] for n in G.nodes}, font_size=8, font_color='darkorchid') 206 | 207 | plt.axis('off') 208 | plt.show() 209 | 210 | if __name__ == '__main__': 211 | MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox' 212 | 213 | # reader = MBoxReader('/Users/samuel/Footprints/emailnetwork/emailnetwork/tests/test.mbox') 214 | reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox') 215 | # reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox') 216 | # plot_single_directed(reader,300) 217 | # plot_single_directed(reader, 1, True) 218 | # plot_directed(reader) 219 | # plot_directed(reader, "shell") 220 | plot_undirected(reader, 'spring') -------------------------------------------------------------------------------- /build/lib/emailnetwork/header.py: 
-------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from email.header import decode_header 3 | from emailnetwork.utils import clean_subject 4 | 5 | 6 | class HeaderCounter(Counter): 7 | """[summary] 8 | 9 | Args: 10 | Counter ([type]): [description] 11 | """ 12 | 13 | def __init__(self, reader): 14 | super().__init__() 15 | self = self.build_from(reader) 16 | 17 | def __str__(self): 18 | return f'{self.most_common()}' 19 | 20 | def build_from(self, reader): 21 | for email in reader: 22 | for k in email.keys(): 23 | self[k] += 1 24 | 25 | return self 26 | 27 | def histogram(self, n=25): 28 | from matplotlib import pyplot as plt 29 | plt.style.use('fivethirtyeight') 30 | k, v = (list(self.keys())[:n], list(self.values())[:n]) 31 | fig = plt.figure(figsize=(7, 10)) 32 | ax = fig.add_subplot(111) 33 | y_pos = [i for i in range(n)] 34 | ax.barh(y_pos, v, color='plum') 35 | ax.set_yticks(y_pos) 36 | ax.set_yticklabels(k) 37 | ax.invert_yaxis() 38 | ax.set_xlabel('Frequency') 39 | ax.set_title('Email Header Analysis') 40 | plt.tight_layout() 41 | plt.show() 42 | 43 | 44 | if __name__ == '__main__': 45 | from emailnetwork.extract import MBoxReader 46 | reader = MBoxReader('/Users/samuel/EmailNetwork/samuel-supertype.mbox') 47 | # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox') 48 | headers = HeaderCounter(reader) 49 | 50 | k = headers.keys() 51 | 52 | containspam = list(filter(lambda v: "spam" in v.lower(), k)) 53 | 54 | for email in reader: 55 | for key in email.keys(): 56 | if key in containspam: 57 | print({key: decode_header(email[key])}) 58 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/network.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | 3 | class PeopleCombination: 4 | """ 5 | Usage: 6 | email = 
EmailMeta(...) 7 | PeopleCombination(email) 8 | """ 9 | 10 | def __init__(self, email): 11 | people = [email.sender] + email.recipients + email.cc 12 | people = filter(lambda p: p is not None, people) 13 | people = set(addr.email for addr in people if addr.email) 14 | self.people = sorted(people) 15 | 16 | def __repr__(self): 17 | return str(self.people) 18 | 19 | 20 | @property 21 | def combo(self): 22 | for combination in combinations(self.people, 2): 23 | yield combination 24 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/summary.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from datetime import datetime 3 | 4 | 5 | class DomainSummary(): 6 | 7 | def __init__(self, reader): 8 | self.emailmetas = reader.extract() 9 | self.summary = self.get_summary() 10 | 11 | def get_summary(self): 12 | domains = {} 13 | for email in self.emailmetas: 14 | domains[email.origin_domain] = domains.get( 15 | email.origin_domain, 0) + 1 16 | 17 | return Counter(domains) 18 | 19 | def plot(self): 20 | import matplotlib.pyplot as plt 21 | plt.style.use('seaborn') 22 | 23 | fig = plt.figure(figsize=(10, 5)) 24 | 25 | domains = list(self.summary.keys()) 26 | freqs = list(self.summary.values()) 27 | 28 | plt.barh(domains, freqs, color='cadetblue') 29 | [i.set_color("darkorchid") for i in plt.gca().get_xticklabels()] 30 | [i.set_color("darkorchid") for i in plt.gca().get_yticklabels()] 31 | plt.title("Sender's Domain Occurences", fontdict={ 32 | "fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 12}) 33 | plt.show() 34 | 35 | def __str__(self): 36 | coba = [] 37 | for keys in self.summary: 38 | coba.append(f"{keys:<25s}: {str(self.summary[keys]):<3s}") 39 | return "\n".join(coba) 40 | 41 | 42 | class IncomingOutgoingSummary(): 43 | 44 | def __init__(self, reader): 45 | self.reader = reader 46 | self.emailmetas = reader.extract() 47 
| self.user_email = self.get_user_email() 48 | self.summary = self.get_summary() 49 | 50 | def get_user_email(self): 51 | from emailnetwork.extract import extract_meta 52 | 53 | email_addresses = {} 54 | for i in range(10): 55 | email = self.reader.mbox[i] 56 | emailmsg = extract_meta(email) 57 | for recipient in emailmsg.recipients: 58 | email_addresses[recipient.email] = email_addresses.get( 59 | recipient.email, 0) + 1 60 | email_addresses[emailmsg.sender.email] = email_addresses.get( 61 | emailmsg.sender.email, 0) + 1 62 | return sorted(email_addresses.items(), key=lambda k: k[1], reverse=True)[0][0] 63 | 64 | def get_summary(self): 65 | date = {} 66 | for email in self.emailmetas: 67 | if email.date.strftime('%B %Y') not in date: 68 | date[email.date.strftime('%B %Y')] = { 69 | 'Incoming': 0, 'Outgoing': 0} 70 | if email.sender.email == self.user_email: 71 | date[email.date.strftime('%B %Y')]['Outgoing'] += 1 72 | else: 73 | date[email.date.strftime('%B %Y')]['Incoming'] += 1 74 | 75 | date = sorted( 76 | date.items(), key=lambda items: datetime.strptime(items[0], '%B %Y')) 77 | return dict(date) 78 | 79 | def plot(self): 80 | import matplotlib.pyplot as plt 81 | plt.style.use('seaborn') 82 | 83 | dates = list(self.summary.keys()) 84 | incoming = list(item[1]['Incoming'] for item in self.summary.items()) 85 | outgoing = list(item[1]['Outgoing'] for item in self.summary.items()) 86 | 87 | fig, ax = plt.subplots() 88 | 89 | ax.bar(dates, incoming, 0.4, label='Incoming', color='cadetblue') 90 | ax.bar(dates, outgoing, 0.4, bottom=incoming, label='Outgoing') 91 | 92 | [i.set_color("darkorchid") for i in plt.gca().get_xticklabels()] 93 | [i.set_color("darkorchid") for i in plt.gca().get_yticklabels()] 94 | 95 | plt.xticks(rotation=45) 96 | 97 | ax.set_ylabel('Counts') 98 | ax.set_title('Number of Incoming and Outgoing Emails per Month', fontdict={"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 12}) 99 | ax.legend() 100 | 101 | plt.show() 
102 | 103 | 104 | if __name__ == "__main__": 105 | from emailnetwork.extract import MBoxReader 106 | reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox') 107 | summary = DomainSummary(reader) 108 | summary_2 = IncomingOutgoingSummary(reader) 109 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/build/lib/emailnetwork/tests/__init__.py -------------------------------------------------------------------------------- /build/lib/emailnetwork/tests/test_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from unittest import TestCase 4 | 5 | from emailnetwork.extract import MBoxReader, extract_meta, extract_body 6 | from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody 7 | 8 | """ 9 | Demo mbox is generated from Benjamin Bengfort's Tribe tool 10 | with person names modified for anonymity 11 | """ 12 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox' 13 | 14 | class TestExtract(TestCase): 15 | def setUp(self): 16 | self.reader = MBoxReader(MBOX_PATH) 17 | self.emails = self.reader.extract() 18 | 19 | def tearDown(self): 20 | self.reader = None 21 | 22 | def test_read_mbox(self): 23 | self.assertTrue(isinstance(self.reader, MBoxReader)) 24 | 25 | def test_length_mbox(self): 26 | self.assertEqual(len(self.reader), 140) 27 | 28 | def test_extract(self): 29 | # self.assertTrue(isinstance(next(self.emails), EmailMeta)) 30 | firstemail = next(self.emails) 31 | self.assertIsInstance(firstemail, EmailMeta) 32 | self.assertIsInstance(firstemail.subject, str) 33 | self.assertIsInstance(firstemail.date, datetime.datetime) 34 | 35 | for msg in self.emails: 36 | 
self.assertGreaterEqual(len(msg.recipients), 1) 37 | self.assertIsInstance(msg.cc, list) 38 | 39 | def test_email_address(self): 40 | firstemail = next(self.emails) 41 | self.assertIsInstance(firstemail.sender, EmailAddress) 42 | self.assertIsInstance(firstemail.sender.name, str) 43 | self.assertIsInstance(firstemail.sender.email, str) 44 | 45 | def test_filter_emails(self): 46 | newmails = self.reader.filter_emails(datestring="2020-01-01", dateoperator=">=") 47 | self.assertEqual(len(newmails), 4) 48 | 49 | for email in newmails: 50 | self.assertGreater(email.date, datetime.datetime(2020,1,1)) 51 | self.assertLess(email.date, datetime.datetime.now()) 52 | 53 | oldmails = self.reader.filter_emails(datestring="2019-12-31", dateoperator="<=") 54 | self.assertEqual(len(oldmails), 136) 55 | 56 | exactmails = self.reader.filter_emails(datestring="2020-04-17", dateoperator="==") 57 | self.assertEqual(len(exactmails), 1) 58 | self.assertEqual(exactmails[0].date.date(), datetime.date(2020, 4, 17)) 59 | 60 | namedmails = self.reader.filter_emails(emailaddress='samuelchan@gmail.com') 61 | 62 | for email in namedmails: 63 | checkers = [email.sender.email] + [recipient.email for recipient in email.recipients] 64 | self.assertTrue('samuelchan@gmail.com' in checkers) 65 | 66 | fullfilteredmails = self.reader.filter_emails(emailaddress='samuelchan@gmail.com', datestring="2020-01-01", dateoperator=">=") 67 | 68 | for email in fullfilteredmails: 69 | checkers = [email.sender.email] + [recipient.email for recipient in email.recipients] 70 | self.assertTrue('samuelchan@gmail.com' in checkers) 71 | self.assertGreater(email.date, datetime.datetime(2020,1,1)) 72 | 73 | # also need tests to fail with expected exception when datetime operator not in [==, <=, >=], emailaddress and datetime in wrong format. 
74 | def test_afunction_throws_exception(self): 75 | self.assertRaises(ValueError, self.reader.filter_emails, 20, "2019-12-31", "<") 76 | 77 | def test_extract_meta_single(self): 78 | for email in self.reader.mbox: 79 | self.assertIsInstance(email['Subject'], (bytes, str)) 80 | emailmsg = extract_meta(email) 81 | self.assertIsInstance(emailmsg, EmailMeta) 82 | self.assertIsInstance(emailmsg.origin_domain, str) 83 | self.assertIsInstance(emailmsg.subject, str) 84 | 85 | def test_extract_body_single(self): 86 | for email in self.reader.mbox: 87 | emailbody = extract_body(email) 88 | self.assertIsInstance(emailbody, EmailBody) 89 | self.assertIsInstance(emailbody.subject, str) 90 | self.assertIsInstance(emailbody.body, str) 91 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/tests/test_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase, mock 3 | 4 | from emailnetwork.extract import MBoxReader 5 | # from emailnetwork.graph import plot_single_email 6 | import emailnetwork.graph as graph 7 | 8 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox' 9 | 10 | @mock.patch(f"{__name__}.graph.plt") 11 | def test_plot_single_directed(mock_plt): 12 | reader = MBoxReader(MBOX_PATH) 13 | graph.plot_single_directed(reader, 1, True) 14 | mock_plt.title.assert_called_once_with("Three tips to get the most out of Gmail\n Delivery date: 04/17/2020", fontdict={'fontname': 'Helvetica', 'color': 'k', 'fontweight': 'bold', 'fontsize': 8}) 15 | assert mock_plt.figure.called 16 | 17 | 18 | class TestGraph(TestCase): 19 | def setUp(self): 20 | self.reader = MBoxReader(MBOX_PATH) 21 | self.emails = self.reader.extract() 22 | 23 | def test_single_graph(self): 24 | # TODO: to be implemented later 25 | pass 26 | 27 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/tests/test_summary.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | from collections import Counter 4 | from datetime import datetime 5 | 6 | from emailnetwork.extract import MBoxReader 7 | from emailnetwork.summary import DomainSummary, IncomingOutgoingSummary 8 | 9 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox' 10 | 11 | 12 | class TestSummary(TestCase): 13 | def setUp(self): 14 | self.reader = MBoxReader(MBOX_PATH) 15 | self.domain_summary = DomainSummary(self.reader) 16 | self.incoming_outgoing_summary = IncomingOutgoingSummary(self.reader) 17 | 18 | def tearDown(self): 19 | self.domain_summary = None 20 | self.incoming_outgoing_summary = None 21 | 22 | def test_summary_instance(self): 23 | self.assertTrue(isinstance(self.domain_summary, DomainSummary)) 24 | self.assertTrue(isinstance(self.domain_summary.summary, Counter)) 25 | self.assertTrue(isinstance( 26 | self.incoming_outgoing_summary, IncomingOutgoingSummary)) 27 | self.assertTrue(isinstance( 28 | self.incoming_outgoing_summary.summary, dict)) 29 | 30 | def test_one_summary(self): 31 | for summary in self.domain_summary.summary: 32 | self.assertTrue(isinstance(summary, str)) 33 | self.assertTrue(isinstance( 34 | self.domain_summary.summary[summary], int)) 35 | self.assertGreater(self.domain_summary.summary[summary], 0) 36 | 37 | for summary in self.incoming_outgoing_summary.summary: 38 | self.assertTrue(isinstance(summary, str)) 39 | self.assertTrue(isinstance( 40 | self.incoming_outgoing_summary.summary[summary], dict)) 41 | for keys in self.incoming_outgoing_summary.summary[summary]: 42 | self.assertIn(keys, ('Incoming', 'Outgoing')) 43 | self.assertIsInstance( 44 | self.incoming_outgoing_summary.summary[summary][keys], int) 45 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/tests/test_utils.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | from emailnetwork.extract import MBoxReader 5 | 6 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox' 7 | 8 | class TestNetwork(TestCase): 9 | def setUp(self): 10 | # self.reader = MBoxReader(MBOX_PATH) 11 | # self.emails = self.reader.extract() 12 | pass 13 | 14 | # test PeopleCombination 15 | 16 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from dateutil import parser 4 | from dateutil.tz import tzlocal, tzutc 5 | from email.utils import parsedate_tz, mktime_tz 6 | from email.header import decode_header 7 | 8 | 9 | def parse_date(datestring: str): 10 | """[summary] 11 | Usage: 12 | Primarily used for extract_meta(email): parse_date(email['Date']) 13 | parse_date('Sat, 19 Sep 2020 12:01:38 +0800') 14 | Args: 15 | datestring (str): [description] 16 | """ 17 | try: 18 | dt = parsedate_tz(datestring) 19 | if dt is not None: 20 | return datetime.utcfromtimestamp(mktime_tz(dt)) 21 | 22 | return parser.parse(datestring) 23 | except Exception: 24 | return None 25 | 26 | 27 | def clean_subject(subject): 28 | """[summary] 29 | Usage: 30 | 31 | Args: 32 | subject (byte or str) 33 | """ 34 | subject, encoding = decode_header(subject)[0] 35 | if isinstance(subject, bytes): 36 | try: 37 | return subject.decode(encoding).strip() 38 | except: 39 | return subject.decode('utf-8').strip() 40 | else: 41 | return subject.strip().replace('\r\n', '') 42 | 43 | 44 | def clean_body(email): 45 | if email.is_multipart(): 46 | for part in email.walk(): 47 | ctype = part.get_content_type() 48 | cdispo = str(part.get('Content-Disposition')) 49 | 50 | # skip any text/plain (txt) attachments 51 | if ctype == 'text/plain' and 'attachment' not in cdispo: 52 | return 
part.get_payload(decode=True).decode() # decode 53 | break 54 | # not multipart - i.e. plain text, no attachments, keeping fingers crossed 55 | else: 56 | return email.get_payload(decode=True).decode() 57 | -------------------------------------------------------------------------------- /build/lib/emailnetwork/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" -------------------------------------------------------------------------------- /coverage.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | /Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/emailnetwork 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 
239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | -------------------------------------------------------------------------------- /dist/emailnetwork-0.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/emailnetwork-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.1.tar.gz -------------------------------------------------------------------------------- /dist/emailnetwork-0.0.2-py3-none-any.whl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.2-py3-none-any.whl -------------------------------------------------------------------------------- /dist/emailnetwork-0.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.2.tar.gz -------------------------------------------------------------------------------- /emailnetwork.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: emailnetwork 3 | Version: 0.0.2 4 | Summary: Network graphing utilities for email/mailbox (.mbox) data 5 | Home-page: http://github.com/onlyphantom/emailnetwork 6 | Author: Samuel Chan 7 | Author-email: s@supertype.ai 8 | License: MIT 9 | Description: # Email Network 10 | 11 | ## Description 12 | Network graphing utilities for email/mailbox data. 
13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | For social scientists, creating social networks from your mailbox data lets you, among other things: 21 | * Discover subgroups within your organization (whether the different task forces established are as cohesive as they seem on the outside) 22 | * Study social actors (most emails from Marketing involve Peter and Andy) and their relative influence 23 | * Identify the key social groups (Sales team hangs out a lot, but the IT / product division less so) 24 | * Identify the key account managers of the company (despite being with the company only recently, Margaretha is connected to more key clients than her peers) 25 | 26 | 27 | If you're a graph theorist looking for something more statistical: 28 | * Support directed and undirected graphs (**implemented in version 0.0.2**, see below) 29 | * Also output statistical measurements such as centrality distribution (**planned for version 0.0.3**) 30 | * Betweenness, closeness, hubness, distance histograms plotting (**planned for version 0.0.3**) 31 | * Exports to `.graphml` format for use in other graphing software (**implemented in version 0.0.2**) 32 | 33 | ## Dependencies 34 | * Python 3.7+ 35 | * The only dependencies are NetworkX and Matplotlib 36 | 37 | ## Example Usage 38 | To install `emailnetwork`: 39 | ``` 40 | pip install emailnetwork 41 | ``` 42 | 43 | A sample `.mbox` file is provided to you, but you can also export your own mailbox from your email service provider. If you use Google (Gmail), you can [use the Google Takeout service](https://takeout.google.com/settings/takeout) to export your mail data. 
44 | 45 | 46 | ```python 47 | from emailnetwork.extract import MBoxReader, extract_meta 48 | reader = MBoxReader('path-to-mbox.mbox') 49 | print(f'{len(reader)} emails in the sample mbox.') 50 | 51 | # extract a specific email 52 | email = reader.mbox[5] 53 | emailmsg = extract_meta(email) 54 | 55 | # filter emails by a certain date 56 | thisyearmails = reader.filter_by_date('>=', '2021-01-05') 57 | 58 | # print email domains of recipients 59 | print(emailmsg.recipients) 60 | print(emailmsg.recipients[0].domain) 61 | 62 | # extract all emails 63 | emails = reader.extract() 64 | ``` 65 | 66 | For graph visualization: 67 | ```py 68 | from emailnetwork.extract import MBoxReader 69 | # Read from .mbox 70 | MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox' 71 | reader = MBoxReader(MBOX_PATH) 72 | 73 | # Try the following: 74 | # plot a single directed graph of the email at index 3 75 | plot_single_directed(reader,3) 76 | 77 | # plot a single undirected graph of the email at index 1, showing the title in the plot 78 | plot_single_undirected(reader, 1, showtitle=True) 79 | 80 | # plot a directed graph, optionally specifying a layout style 81 | plot_directed(reader) 82 | plot_directed(reader, 'shell') 83 | # optionally export a .graphml to your working directory for use 84 | # in other network / graphing software 85 | plot_undirected(reader, 'spring', graphml=True) 86 | ``` 87 | 88 | #### Email Header Analysis 89 | 90 | To obtain a histogram: 91 | 92 | ```py 93 | from emailnetwork.extract import MBoxReader 94 | reader = MBoxReader('path-to-mbox') 95 | headers = HeaderCounter(reader) 96 | headers.histogram() 97 | # to show only the top 10 headers, set the optional n parameter 98 | # headers.histogram(n=10) 99 | ``` 100 | Because `HeaderCounter` is a subclass of Python's `Counter`, you can also perform operations such as `headers.most_common(8)` to get the 8 most-common headers from the `mbox` file. 
101 | 102 | If you want to find all email headers with the word "spam" in them (e.g. a spam score or another antispam mechanism), you can use Python's `filter()` function: 103 | ```python 104 | reader = MBoxReader('path-to-mbox') 105 | headers = HeaderCounter(reader) 106 | spamheaders = list(filter(lambda v: "spam" in v.lower(), headers.keys())) 107 | # return: 108 | # ['X-Spam-Checked-In-Group', 'X-Microsoft-Antispam-PRVS', 'X-Microsoft-Antispam-Untrusted', 'X-Microsoft-Antispam-Message-Info-Original', 'X-Forefront-Antispam-Report-Untrusted', 'x-ms-exchange-antispam-messagedata', 'X-Microsoft-Antispam', 'X-Microsoft-Antispam-Message-Info', 'X-Forefront-Antispam-Report', 'X-Mimecast-Spam-Score', 'x-microsoft-antispam-prvs', 'x-microsoft-antispam', 'x-microsoft-antispam-message-info', 'x-forefront-antispam-report'] 109 | ``` 110 | 111 | #### Mailbox Summary 112 | 113 | To get a simple bar chart of the distribution of email domains in your `.mbox`, you can create a `DomainSummary` object and call its `.plot()` method: 114 | 115 | 116 | 117 | ```python 118 | from emailnetwork.summary import DomainSummary 119 | summary = DomainSummary(reader) 120 | summary.plot() 121 | ``` 122 | 123 | You can also return a `Counter()` (a subclass of `dict`) instead of a plot: 124 | 125 | ```python 126 | summary.summary 127 | # return: 128 | # Counter({'supertype.ai': 203, 'hubspot.com': 115, 'gmail.com': 75, 'google.com': 53, 'adcolony.com': 38, 'fbworkmail.com': 35, 'elementor.com': 29, 'payoneer.com': 15, 'gogame.net': 14, 'zoomd.com': 13, 'am.atlassian.com': 10, 'theafternaut.com': 6, 'alegrium.com': 5, 'accounts.google.com': 4, 'e.atlassian.com': 4, 'tnbaura.com': 4, 'support.lazada.sg': 4, '3kraters.com': 3, 'go.facebookmail.com': 2, 'docs.google.com': 2, 'mail.hellosign.com': 2, 'algorit.ma': 2, 'supertype.atlassian.net': 2, 'ucdconnect.ie': 2, 'mc.facebookmail.com': 1, 'inplacesoftware.com': 1, 'aura.co': 1, 'atlassian.com': 1, 'greenhouse.io': 1}) 129 | ``` 130 | ##### Why Python 3.7+? 
131 | Python 3.7+ is required because the package is written to take advantage of many features of Python 3.7 and above. 132 | 133 | Examples of features that were used extensively in the creation of this package: 134 | * [Dataclasses, new in Python 3.7](https://www.youtube.com/watch?v=sH_jLQvnpBo) 135 | * [Insertion-ordered Dictionaries, new in Python 3.7](https://www.youtube.com/watch?v=h-DBWPjpqWY) 136 | * [Typing (Type hints), new in Python 3.5](https://docs.python.org/3/library/typing.html) 137 | * [Formatted string literal, new in Python 3.6](https://docs.python.org/3/reference/lexical_analysis.html#f-strings) 138 | ## Testing 139 | Git clone, and run `nosetests`. You can also run nosetests with coverage: 140 | ``` 141 | nosetests --with-coverage --cover-package=emailnetwork 142 | 143 | ......... 144 | Name Stmts Miss Cover 145 | ---------------------------------------------- 146 | emailnetwork/__init__.py 2 0 100% 147 | emailnetwork/emails.py 55 11 80% 148 | emailnetwork/extract.py 54 11 80% 149 | emailnetwork/graph.py 120 82 32% 150 | emailnetwork/network.py 13 7 46% 151 | emailnetwork/utils.py 32 17 47% 152 | emailnetwork/version.py 1 0 100% 153 | ---------------------------------------------- 154 | TOTAL 277 128 54% 155 | ---------------------------------------------------------------------- 156 | Ran 9 tests in 3.226s 157 | 158 | OK 159 | ``` 160 | 161 | All tests are located in the `/tests/` directory. 162 | 163 | 164 | ## Authors and Copyright 165 | 166 | Samuel Chan, Supertype [Supertype](https://supertype.ai) 167 | 168 | Vincentius Christopher Calvin, Supertype [https://supertype.ai](https://supertype.ai) 169 | 170 | If you find the code useful in your project, please link to this repository in your citation. 
171 | 172 | ##### The MIT License (MIT) 173 | 174 | Copyright (c) 2021 Supertype Pte Ltd 175 | 176 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 177 | 178 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 179 | 180 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
181 | Platform: UNKNOWN 182 | Classifier: License :: OSI Approved :: MIT License 183 | Classifier: Programming Language :: Python :: 3 184 | Classifier: Programming Language :: Python :: 3.7 185 | Classifier: Operating System :: OS Independent 186 | Requires-Python: >=3.7 187 | Description-Content-Type: text/markdown 188 | -------------------------------------------------------------------------------- /emailnetwork.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.md 3 | setup.py 4 | emailnetwork/__init__.py 5 | emailnetwork/emails.py 6 | emailnetwork/extract.py 7 | emailnetwork/graph.py 8 | emailnetwork/header.py 9 | emailnetwork/network.py 10 | emailnetwork/summary.py 11 | emailnetwork/utils.py 12 | emailnetwork/version.py 13 | emailnetwork.egg-info/PKG-INFO 14 | emailnetwork.egg-info/SOURCES.txt 15 | emailnetwork.egg-info/dependency_links.txt 16 | emailnetwork.egg-info/not-zip-safe 17 | emailnetwork.egg-info/requires.txt 18 | emailnetwork.egg-info/top_level.txt 19 | emailnetwork/tests/__init__.py 20 | emailnetwork/tests/test.mbox 21 | emailnetwork/tests/test_extract.py 22 | emailnetwork/tests/test_graph.py 23 | emailnetwork/tests/test_summary.py 24 | emailnetwork/tests/test_utils.py -------------------------------------------------------------------------------- /emailnetwork.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /emailnetwork.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /emailnetwork.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | networkx 3 | 
@dataclass
class EmailMeta:
    """
    Metadata extracted from a single email message.

    Also refer to:
    https://www.iana.org/assignments/message-headers/message-headers.xhtml

    `date` is supplied as the raw `Date` header value and is replaced by the
    result of `parse_date` in `__post_init__`; `origin_domain` is derived
    from the sender's address. Instances compare against `datetime` objects
    by calendar day (the time of day is ignored).
    """

    sender: str
    subject: str
    date: str
    recipients: list
    cc: list
    origin_domain: str=None

    def __post_init__(self):
        self.origin_domain = self.sender.domain
        self.date = parse_date(self.date)

    # The comparison methods below return NotImplemented for operands that
    # are not `datetime` instances, so Python applies its normal fallback
    # instead of silently yielding None. An email whose date could not be
    # parsed (`self.date is None`) never matches any date comparison.

    def __eq__(self, targetdate):
        if not isinstance(targetdate, datetime):
            return NotImplemented
        return self.date is not None and self.date.date() == targetdate.date()

    def __ge__(self, targetdate):
        if not isinstance(targetdate, datetime):
            return NotImplemented
        return self.date is not None and self.date.date() >= targetdate.date()

    def __le__(self, targetdate):
        if not isinstance(targetdate, datetime):
            return NotImplemented
        return self.date is not None and self.date.date() <= targetdate.date()


@dataclass
class EmailAddress:
    """
    A display name / address pair.

    Args:
        string: A 2-item `(name, email)` sequence, e.g. one element of the
            list returned by `email.utils.getaddresses`. The address is
            stored lower-cased.
    """

    name: str=None
    email: str=None

    def __init__(self, string):
        self.name, self.email = string
        self.email = self.email.lower()

    def __getitem__(self, index=None):
        """
        Return `(name, email)[index]`, or the whole pair when called
        explicitly without an index. Accepting an index is what makes
        `addr[0]`, `addr[1]` and `name, email = addr` work.
        """
        pair = (self.name, self.email)
        return pair if index is None else pair[index]

    @property
    def domain(self):
        """Text after the last '@' of the address, or None when it is empty."""
        return self.email.split('@')[-1] or None


@dataclass
class EmailBody:
    """Subject and body text of a single email."""

    subject: str = None
    body: str = None
class MBoxReader(object):
    """ A thin wrapper around python's `mailbox.mbox` that adds length,
    metadata extraction and filtering by address and date. A key component of
    many `emailnetwork`'s operations.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')

    Args:
        path: Path to an `.mbox` file; it is opened with `mailbox.mbox`.
    """

    # Supported `dateoperator` values for `filter_emails`, mapped to the
    # comparison applied between an EmailMeta and the target datetime.
    _DATE_OPERATORS = {
        '>=': lambda emailmeta, targetdate: emailmeta >= targetdate,
        '==': lambda emailmeta, targetdate: emailmeta == targetdate,
        '<=': lambda emailmeta, targetdate: emailmeta <= targetdate,
    }

    def __init__(self, path) -> None:
        super().__init__()
        self.path = path
        self.mbox = mbox(path)

    def __iter__(self):
        for msg in self.mbox:
            yield msg

    def __len__(self):
        return self.count()

    def count(self):
        """
        Count the number of emails in the mbox instance.
        Helper function to implement __len__

        Uses the mailbox's own length rather than "last key + 1", which
        raised IndexError on an empty mailbox and over-counted once a
        message had been removed.
        """
        return len(self.mbox)

    def extract(self):
        """
        Extract the meta data from the Mbox instance

        Yields one `EmailMeta` per message. This is deliberately
        best-effort: a message whose metadata cannot be extracted is
        reported with `print` and skipped.
        """
        for email in self:
            try:
                emailmeta = extract_meta(email)
                if emailmeta is not None:
                    yield emailmeta

            except Exception as e:
                print(e)
                continue

    def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="):
        """
        Return the `EmailMeta` of every message matching the given filters.

        Args:
            emailaddress (str, optional): Keep only emails whose sender or
                one of whose recipients has this address. Compared
                case-insensitively, since `EmailAddress` stores addresses
                lower-cased.
            datestring (str, optional): Target day in ISO format, YYYY-MM-DD.
            dateoperator (str, optional): One of '>=', '==', '<='; how each
                email's date is compared with `datestring`. Defaults to '=='.

        Returns:
            list: The matching `EmailMeta` objects, in mailbox order.

        Raises:
            ValueError: If `emailaddress` is not a string, `dateoperator` is
                not supported, or `datestring` is not in YYYY-MM-DD format.
                (A malformed date used to be reported by returning the
                message string in place of the result list.)
        """
        if emailaddress is not None and not isinstance(emailaddress, str):
            raise ValueError(
                "Please use a valid string representing an email address")

        if dateoperator not in self._DATE_OPERATORS:
            raise ValueError("Please use one of ['>=', '==', '<=']")

        targetdate = None
        if datestring is not None:
            try:
                targetdate = datetime.strptime(datestring, "%Y-%m-%d")
            except (TypeError, ValueError) as err:
                raise ValueError(
                    "Please use the ISO format for comparison: YYYY-MM-DD") from err

        wanted = emailaddress.lower() if emailaddress is not None else None
        matches_date = self._DATE_OPERATORS[dateoperator]

        val = []
        for email in self.mbox:
            emailmeta = extract_meta(email)

            if wanted is not None:
                checkers = [emailmeta.sender.email] + [recipient.email for recipient in emailmeta.recipients]
                if wanted not in checkers:
                    continue

            if targetdate is not None and not matches_date(emailmeta, targetdate):
                continue

            val.append(emailmeta)

        return val
uuid 5 | import matplotlib.pyplot as plt 6 | import networkx as nx 7 | import textwrap 8 | 9 | from emailnetwork.extract import MBoxReader, extract_meta 10 | from emailnetwork.network import PeopleCombination 11 | 12 | 13 | def plot_single_directed(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None: 14 | """ 15 | Plot a directed graph from a single email, as determined by `id`. 16 | If `showtitle` is `True`, render the plot with a email subject and date as title. 17 | 18 | Usage: 19 | reader = MboxReader('path-to-mbox.mbox') 20 | plot_single_directed(reader, 300) plots the 300th email from the mbox 21 | Args: 22 | reader (MBoxReader): A `MBoxReader` object 23 | id (int, optional): `id` of the email in the `MBoxReader`. Defaults to False. 24 | showtitle (bool, optional): If `True`, render the plot with a email subject and date as title. Defaults to False. 25 | """ 26 | 27 | len_reader = len(reader) 28 | if not id: 29 | email = reader.mbox[len_reader-1] 30 | else: 31 | email = reader.mbox[id] 32 | emailmsg = extract_meta(email) 33 | 34 | subject = textwrap.fill(emailmsg.subject, 40) 35 | sender = emailmsg.sender.name if len(emailmsg.sender.name) != 0 else emailmsg.sender.email.split('@')[0] 36 | 37 | plt.figure(figsize=(9, 6)) 38 | G = nx.DiGraph(name='Single Email Flow') 39 | 40 | for recipient in emailmsg.recipients: 41 | rec = recipient.name 42 | G.add_edge(sender, rec if len(rec) != 0 else recipient.email, 43 | message=subject, color='darkorchid', weight=3) 44 | 45 | for cc in emailmsg.cc: 46 | ccc = cc.name 47 | G.add_edge(sender, ccc if len(ccc) != 0 else cc.email, 48 | message='cc', color='lightsteelblue', weight=2) 49 | 50 | colors = nx.get_edge_attributes(G,'color').values() 51 | weights = nx.get_edge_attributes(G,'weight').values() 52 | edge_labels = nx.get_edge_attributes(G, 'message') 53 | 54 | pos = nx.planar_layout(G) 55 | 56 | # nx.draw_spectral(G,node_size=0, alpha=0.8, edge_color=colors, width=list(weights), font_size=8, with_labels=True) 
def plot_single_undirected(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot an undirected social network graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with the email subject and date as title.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_single_undirected(reader, 300) plots the email at index 300 of the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): Index of the email in the `MBoxReader`. When omitted
            (the default, False) the last email is plotted. Index 0 selects the first email.
        showtitle (bool, optional): If `True`, render the plot with the email subject and date as title. Defaults to False.
    """

    # `id` defaults to False, so test identity: a plain truthiness check
    # would also treat index 0 (the first email) as "not given".
    if id is False or id is None:
        email = reader.mbox[len(reader) - 1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # extract_meta yields None for an empty subject; textwrap needs a str.
    subject = textwrap.fill(emailmsg.subject or '', 40)
    ng = PeopleCombination(emailmsg)
    G = nx.Graph(
        name='Single Email Social Network')
    counter = Counter(ng.combo)

    total_combos = sum(counter.values())
    for rel, count in counter.items():
        G.add_edge(*rel, weight=count/total_combos, count=count)

    # An email with a single participant produces no pairs and therefore an
    # empty graph; fall back to spring_layout's default spacing rather than
    # dividing by zero.
    order = G.order()
    k = 1/math.sqrt(order) * 2 if order else None
    pos = nx.spring_layout(G, k=k)
    deg = [v for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.90)
    nx.draw_networkx_edges(G, pos, edge_color="steelblue", width=1.0, style='dashed', alpha=0.75)
    nx.draw_networkx_labels(G, pos, {n: n for n in G.nodes}, font_size=8, verticalalignment="bottom")

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        # The parsed date is None when the Date header could not be parsed.
        delivered = emailmsg.date.strftime("%m/%d/%Y") if emailmsg.date else 'unknown'
        plt.title(subject + '\n Delivery date: ' + delivered, fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
def plot_undirected(reader:MBoxReader, layout:str='shell', graphml:bool=False):
    """Plot an undirected social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_undirected(reader)

    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): Can be one of 'shell', 'spring' or 'spiral'. Any other
            value is treated as 'spiral'. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    emails = reader.extract()
    G = nx.Graph(name='Email Social Network')
    plt.figure(figsize=(12,12))

    # Count how often each pair of people appears together on an email.
    counter = Counter()
    for email in emails:
        counter.update(PeopleCombination(email).combo)

    total_combos = sum(counter.values())
    for rel, count in counter.items():
        G.add_edge(*rel, weight=count/total_combos, count=count)

    if graphml:
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        # A mailbox that yields no pairs gives an empty graph; use
        # spring_layout's default spacing rather than dividing by zero.
        order = G.order()
        k = 1/math.sqrt(order) * 2 if order else None
        pos = nx.spring_layout(G, k=k)
    else:
        pos = nx.spiral_layout(G)

    # Scale node size with degree so well-connected people stand out.
    deg = [v*50 for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.60)
    nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', edge_color='cadetblue', alpha=0.6)
    nx.draw_networkx_labels(G, pos, {n: n.split('@')[0] for n in G.nodes}, font_size=8, font_color='darkorchid')

    plt.axis('off')
    plt.show()
class HeaderCounter(Counter):
    """Counter of email header names across a mailbox.

    Each key is a header name and each value is the number of times that
    header occurs over all emails yielded by `reader`.

    Args:
        reader: An iterable of email messages, each exposing `.keys()` that
            returns its header names (for example a `MBoxReader`). A mapping
            of existing counts is also accepted, which is what
            `Counter.copy()` passes when it re-creates an instance. When
            omitted, the counter starts empty.
    """

    def __init__(self, reader=None):
        super().__init__()
        if isinstance(reader, dict):
            # Re-creation from existing counts (e.g. via Counter.copy()).
            self.update(reader)
        elif reader is not None:
            self.build_from(reader)

    def __str__(self):
        return f'{self.most_common()}'

    def build_from(self, reader):
        """Add the header names of every email in `reader`; returns `self`."""
        for email in reader:
            for k in email.keys():
                self[k] += 1

        return self

    def histogram(self, n=25):
        """Show a horizontal bar chart of the `n` most frequent headers.

        Plots at most `n` bars; when fewer than `n` distinct headers exist,
        only those are drawn.
        """
        from matplotlib import pyplot as plt
        plt.style.use('fivethirtyeight')
        top = self.most_common(n)
        k = [header for header, _ in top]
        v = [count for _, count in top]
        fig = plt.figure(figsize=(7, 10))
        ax = fig.add_subplot(111)
        # One position per bar actually drawn, so labels and values line up.
        y_pos = list(range(len(k)))
        ax.barh(y_pos, v, color='plum')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(k)
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_title('Email Header Analysis')
        plt.tight_layout()
        plt.show()
def _use_seaborn_style():
    """Apply Matplotlib's bundled seaborn style under whichever name exists.

    Newer Matplotlib releases ship the style as "seaborn-v0_8" and no longer
    accept the plain "seaborn" name; older releases only know "seaborn".
    """
    try:
        plt.style.use("seaborn-v0_8")
    except OSError:
        plt.style.use("seaborn")


class DomainSummary:
    """Number of emails per sender domain.

    Args:
        reader: Object whose `extract()` yields email metadata objects with
            an `origin_domain` attribute (e.g. a `MBoxReader`).
    """

    def __init__(self, reader):
        self.emailmetas = reader.extract()
        self.summary = self.get_summary()

    def get_summary(self):
        """Return a Counter mapping each sender domain to its email count."""
        return Counter(email.origin_domain for email in self.emailmetas)

    def plot(self):
        """Show a horizontal bar chart of email counts per sender domain."""

        _use_seaborn_style()

        plt.figure(figsize=(10, 5))

        domains = list(self.summary.keys())
        freqs = list(self.summary.values())

        plt.barh(domains, freqs, color="cadetblue")
        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")
        plt.title(
            "Sender's Domain Occurences",
            fontdict={
                "fontname": "Helvetica",
                "color": "k",
                "fontweight": "bold",
                "fontsize": 12,
            },
        )
        plt.show()

    def __str__(self):
        # str() the key as well: a sender without a domain is stored as
        # None, which does not support the "<25s" format spec.
        lines = []
        for domain, count in self.summary.items():
            lines.append(f"{str(domain):<25s}: {str(count):<3s}")
        return "\n".join(lines)


class IncomingOutgoingSummary:
    """Monthly count of incoming and outgoing emails for the mailbox owner.

    Args:
        reader: A `MBoxReader`-like object providing `extract()` and `mbox`.
    """

    # How many leading messages are sampled to guess the owner's address.
    _SAMPLE_SIZE = 10

    def __init__(self, reader):
        self.reader = reader
        self.emailmetas = reader.extract()
        self.user_email = self.get_user_email()
        self.summary = self.get_summary()

    def get_user_email(self):
        """Guess the mailbox owner's address.

        Returns the address that occurs most often as sender or recipient
        among the first `_SAMPLE_SIZE` messages (ties go to the address seen
        first), or None when the mailbox is empty. Mailboxes with fewer than
        `_SAMPLE_SIZE` messages are sampled in full.
        """
        from emailnetwork.extract import extract_meta

        email_addresses = {}
        for position, email in enumerate(self.reader.mbox):
            if position >= self._SAMPLE_SIZE:
                break
            emailmsg = extract_meta(email)
            for recipient in emailmsg.recipients:
                email_addresses[recipient.email] = (
                    email_addresses.get(recipient.email, 0) + 1
                )
            email_addresses[emailmsg.sender.email] = (
                email_addresses.get(emailmsg.sender.email, 0) + 1
            )

        if not email_addresses:
            return None
        return max(email_addresses.items(), key=lambda item: item[1])[0]

    def get_summary(self):
        """Return {"Month YYYY": {"Incoming": n, "Outgoing": m}} in date order.

        An email counts as outgoing when its sender is `self.user_email`.
        Emails whose date could not be parsed (`date is None`) are skipped.
        """
        date = {}
        for email in self.emailmetas:
            if email.date is None:
                continue
            month = email.date.strftime("%B %Y")
            counts = date.setdefault(month, {"Incoming": 0, "Outgoing": 0})
            if email.sender.email == self.user_email:
                counts["Outgoing"] += 1
            else:
                counts["Incoming"] += 1

        ordered = sorted(
            date.items(), key=lambda items: datetime.strptime(items[0], "%B %Y")
        )
        return dict(ordered)

    def plot(self):
        """Show a stacked bar chart of incoming/outgoing emails per month."""

        _use_seaborn_style()

        dates = list(self.summary.keys())
        incoming = [counts["Incoming"] for counts in self.summary.values()]
        outgoing = [counts["Outgoing"] for counts in self.summary.values()]

        fig, ax = plt.subplots()

        ax.bar(dates, incoming, 0.4, label="Incoming", color="cadetblue")
        ax.bar(dates, outgoing, 0.4, bottom=incoming, label="Outgoing")

        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")

        plt.xticks(rotation=45)

        ax.set_ylabel("Counts")
        ax.set_title(
            "Number of Incoming and Outgoing Emails per Month",
            fontdict={
                "fontname": "Helvetica",
                "color": "k",
                "fontweight": "bold",
                "fontsize": 12,
            },
        )
        ax.legend()

        plt.show()
datetime.datetime) 36 | 37 | for msg in self.emails: 38 | self.assertGreaterEqual(len(msg.recipients), 1) 39 | self.assertIsInstance(msg.cc, list) 40 | 41 | def test_email_address(self): 42 | firstemail = next(self.emails) 43 | self.assertIsInstance(firstemail.sender, EmailAddress) 44 | self.assertIsInstance(firstemail.sender.name, str) 45 | self.assertIsInstance(firstemail.sender.email, str) 46 | 47 | mail = extract_meta(self.emailAdress) 48 | self.assertIsInstance(mail.recipients[0].name, str) 49 | self.assertIsInstance(mail.recipients[0].email, str) 50 | 51 | def test_filter_emails(self): 52 | newmails = self.reader.filter_emails(datestring="2020-01-01", dateoperator=">=") 53 | self.assertEqual(len(newmails), 4) 54 | 55 | for email in newmails: 56 | self.assertGreater(email.date, datetime.datetime(2020, 1, 1)) 57 | self.assertLess(email.date, datetime.datetime.now()) 58 | 59 | oldmails = self.reader.filter_emails(datestring="2019-12-31", dateoperator="<=") 60 | self.assertEqual(len(oldmails), 136) 61 | 62 | exactmails = self.reader.filter_emails( 63 | datestring="2020-04-17", dateoperator="==" 64 | ) 65 | self.assertEqual(len(exactmails), 1) 66 | self.assertEqual(exactmails[0].date.date(), datetime.date(2020, 4, 17)) 67 | 68 | namedmails = self.reader.filter_emails(emailaddress="samuelchan@gmail.com") 69 | 70 | for email in namedmails: 71 | checkers = [email.sender.email] + [ 72 | recipient.email for recipient in email.recipients 73 | ] 74 | self.assertTrue("samuelchan@gmail.com" in checkers) 75 | 76 | fullfilteredmails = self.reader.filter_emails( 77 | emailaddress="samuelchan@gmail.com", 78 | datestring="2020-01-01", 79 | dateoperator=">=", 80 | ) 81 | 82 | for email in fullfilteredmails: 83 | checkers = [email.sender.email] + [ 84 | recipient.email for recipient in email.recipients 85 | ] 86 | self.assertTrue("samuelchan@gmail.com" in checkers) 87 | self.assertGreater(email.date, datetime.datetime(2020, 1, 1)) 88 | 89 | fullfilteredmailsequal = 
self.reader.filter_emails( 90 | emailaddress="samuelchan@gmail.com", 91 | datestring="2020-04-17", 92 | dateoperator="==", 93 | ) 94 | 95 | for email in fullfilteredmailsequal: 96 | checkers = [email.sender.email] + [ 97 | recipient.email for recipient in email.recipients 98 | ] 99 | self.assertTrue("samuelchan@gmail.com" in checkers) 100 | self.assertEqual( 101 | fullfilteredmailsequal[0].date.date(), datetime.date(2020, 4, 17) 102 | ) 103 | 104 | fullfilteremailless = self.reader.filter_emails( 105 | emailaddress="samuelchan@gmail.com", 106 | datestring="2019-12-31", 107 | dateoperator="<=", 108 | ) 109 | 110 | for email in fullfilteremailless: 111 | checkers = [email.sender.email] + [ 112 | recipient.email for recipient in email.recipients 113 | ] 114 | self.assertTrue("samuelchan@gmail.com" in checkers) 115 | 116 | mailswithoutfilter = self.reader.filter_emails() 117 | 118 | for email in mailswithoutfilter: 119 | self.assertIsInstance(email, EmailMeta) 120 | 121 | # also need tests to fail with expected exception when datetime operator not in [==, <=, >=], emailaddress and datetime in wrong format. 
122 | def test_afunction_throws_exception(self): 123 | self.assertRaises(ValueError, self.reader.filter_emails, 20, "2019-12-31", "<") 124 | 125 | def test_extract_meta_single(self): 126 | for email in self.reader.mbox: 127 | self.assertIsInstance(email["Subject"], (bytes, str)) 128 | emailmsg = extract_meta(email) 129 | self.assertIsInstance(emailmsg, EmailMeta) 130 | self.assertIsInstance(emailmsg.origin_domain, str) 131 | self.assertIsInstance(emailmsg.subject, str) 132 | 133 | def test_extract_body_single(self): 134 | for email in self.reader.mbox: 135 | emailbody = extract_body(email) 136 | self.assertIsInstance(emailbody, EmailBody) 137 | self.assertIsInstance(emailbody.subject, str) 138 | self.assertIsInstance(emailbody.body, str) 139 | -------------------------------------------------------------------------------- /emailnetwork/tests/test_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase, mock 3 | 4 | from emailnetwork.extract import MBoxReader 5 | # from emailnetwork.graph import plot_single_email 6 | import emailnetwork.graph as graph 7 | 8 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox' 9 | 10 | 11 | class TestGraph(TestCase): 12 | def setUp(self): 13 | self.reader = MBoxReader(MBOX_PATH) 14 | self.emails = self.reader.extract() 15 | self.layout = ['shell', 'spring', 'spiral'] 16 | 17 | def test_single_graph(self): 18 | # TODO: to be implemented later 19 | pass 20 | 21 | @mock.patch(f"{__name__}.graph.plt") 22 | def test_plot_single_directed(self, mock_plt): 23 | graph.plot_single_directed(self.reader, 1, True) 24 | mock_plt.title.assert_called_once_with("Three tips to get the most out of Gmail\n Delivery date: 04/17/2020", fontdict={'fontname': 'Helvetica', 'color': 'k', 'fontweight': 'bold', 'fontsize': 8}) 25 | assert mock_plt.figure.called 26 | 27 | 28 | def test_plot_single_undirected(self): 29 | with mock.patch("%s.graph.plt" % __name__) as patch, 
class TestSummary(TestCase):
    """Checks for DomainSummary, IncomingOutgoingSummary and HeaderCounter
    built from the bundled test.mbox fixture."""

    def setUp(self):
        # Every test gets fresh summary objects built from the same reader.
        reader = MBoxReader(MBOX_PATH)
        self.reader = reader
        self.domain_summary = DomainSummary(reader)
        self.incoming_outgoing_summary = IncomingOutgoingSummary(reader)
        self.headers = HeaderCounter(reader)

    def tearDown(self):
        # Drop the two summary objects between tests.
        self.domain_summary = self.incoming_outgoing_summary = None

    def test_summary_instance(self):
        """Both summary objects have the expected type and container."""
        expectations = (
            (self.domain_summary, DomainSummary),
            (self.domain_summary.summary, Counter),
            (self.incoming_outgoing_summary, IncomingOutgoingSummary),
            (self.incoming_outgoing_summary.summary, dict),
        )
        for obj, expected_type in expectations:
            self.assertIsInstance(obj, expected_type)

    def test_one_summary(self):
        """Each entry of both summaries has the expected key/value types."""
        domain_counts = self.domain_summary.summary
        for domain in domain_counts:
            count = domain_counts[domain]
            self.assertIsInstance(domain, str)
            self.assertIsInstance(count, int)
            self.assertGreater(count, 0)

        directions = self.incoming_outgoing_summary.summary
        for period in directions:
            per_period = directions[period]
            self.assertIsInstance(period, str)
            self.assertIsInstance(per_period, dict)
            for direction in per_period:
                self.assertIn(direction, ("Incoming", "Outgoing"))
                self.assertIsInstance(per_period[direction], int)

    def test_header(self):
        """The header counter built in setUp is a HeaderCounter."""
        self.assertIsInstance(self.headers, HeaderCounter)

    def test_mock_plot(self):
        """DomainSummary.plot() sets the expected title and opens a figure."""
        title_font = {
            "fontname": "Helvetica",
            "color": "k",
            "fontweight": "bold",
            "fontsize": 12,
        }
        # Replace pyplot as seen by the summary module so nothing is drawn.
        with mock.patch(f"{__name__}.summary.plt") as mock_plt:
            DomainSummary(reader=MBoxReader(MBOX_PATH)).plot()
            mock_plt.title.assert_called_once_with(
                "Sender's Domain Occurences",
                fontdict=title_font,
            )
            assert mock_plt.figure.called
def parse_date(datestring: str):
    """Parse an email ``Date`` header into a ``datetime``.

    Usage:
        Primarily used for extract_meta(email): parse_date(email['Date'])
        parse_date('Sat, 19 Sep 2020 12:01:38 +0800')
    Args:
        datestring (str): raw value of the ``Date`` header.
    Returns:
        A naive ``datetime`` expressed in UTC when the value is a valid
        RFC 2822 date; otherwise whatever ``dateutil.parser.parse`` returns
        for the string; ``None`` when neither parser can handle it.
    """
    # Function-scope import: the module only imports the `datetime` class.
    from datetime import timezone

    try:
        parsed = parsedate_tz(datestring)
        if parsed is not None:
            # Equivalent to the deprecated datetime.utcfromtimestamp():
            # build an aware UTC datetime, then drop tzinfo to stay naive.
            return datetime.fromtimestamp(
                mktime_tz(parsed), tz=timezone.utc
            ).replace(tzinfo=None)

        # Not RFC 2822: fall back to dateutil's more lenient parser.
        return parser.parse(datestring)
    except Exception:
        # Best effort by design: an unparseable date must not abort extraction.
        return None


def _decode_bytes(raw, charset):
    """Decode ``raw`` with ``charset``; fall back to lossy UTF-8 on failure."""
    try:
        return raw.decode(charset or 'utf-8')
    except (LookupError, UnicodeDecodeError):
        # Unknown charset name or bytes that do not match the declared one.
        return raw.decode('utf-8', errors='replace')


def clean_subject(subject):
    """Decode an RFC 2047 encoded ``Subject`` header into clean text.

    Usage:
        clean_subject(email['Subject'])
        clean_subject('Re: =?utf-8?b?w6k=?=')  # -> 'Re: é'
    Args:
        subject (byte or str): raw header value; ``None`` (header missing)
            is accepted and yields an empty string.
    Returns:
        str: the decoded subject with surrounding whitespace and folded
        line breaks (``\\r\\n``) removed.
    """
    if subject is None:
        return ''
    if isinstance(subject, bytes):
        # decode_header() only accepts str (or Header) input.
        subject = subject.decode('utf-8', errors='replace')

    # A header may consist of several chunks with different charsets
    # (e.g. a plain "Re: " prefix followed by an encoded word); decode
    # every chunk instead of only the first one.
    pieces = []
    for chunk, charset in decode_header(subject):
        if isinstance(chunk, bytes):
            chunk = _decode_bytes(chunk, charset)
        pieces.append(chunk)

    return ''.join(pieces).strip().replace('\r\n', '')


def _decode_payload(part):
    """Return the transfer-decoded payload of ``part`` as text."""
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None for multipart containers.
        return ''
    # Honour the charset declared in Content-Type; default to UTF-8.
    return _decode_bytes(payload, part.get_content_charset())


def clean_body(email):
    """Extract the plain-text body of an email message.

    Args:
        email: an ``email.message.Message`` (or subclass) instance.
    Returns:
        str: the decoded text of the first ``text/plain`` part that is not
        an attachment, or of the whole payload for non-multipart messages.
        ``None`` when a multipart message has no such part.
    """
    if not email.is_multipart():
        # not multipart - i.e. plain text, no attachments
        return _decode_payload(email)

    for part in email.walk():
        # skip any text/plain (txt) attachments
        if (part.get_content_type() == 'text/plain'
                and part.get_content_disposition() != 'attachment'):
            return _decode_payload(part)

    return None
install_requires=['matplotlib', 'networkx'], 17 | classifiers=[ 18 | "License :: OSI Approved :: MIT License", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.7", 21 | "Operating System :: OS Independent", 22 | ], 23 | zip_safe=False, 24 | python_requires='>=3.7') --------------------------------------------------------------------------------