├── .coverage
├── .coveragerc
├── .github
└── workflows
│ ├── ci.yml
│ └── python-package.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── assets
├── graph1.png
├── graph2.png
├── graph3.png
├── graph4.png
├── graph5.png
├── histo.png
└── summaryplot.png
├── build
└── lib
│ └── emailnetwork
│ ├── __init__.py
│ ├── emails.py
│ ├── extract.py
│ ├── graph.py
│ ├── header.py
│ ├── network.py
│ ├── summary.py
│ ├── tests
│ ├── __init__.py
│ ├── test.mbox
│ ├── test_extract.py
│ ├── test_graph.py
│ ├── test_summary.py
│ └── test_utils.py
│ ├── utils.py
│ └── version.py
├── coverage.xml
├── dist
├── emailnetwork-0.0.1-py3-none-any.whl
├── emailnetwork-0.0.1.tar.gz
├── emailnetwork-0.0.2-py3-none-any.whl
└── emailnetwork-0.0.2.tar.gz
├── emailnetwork.egg-info
├── PKG-INFO
├── SOURCES.txt
├── dependency_links.txt
├── not-zip-safe
├── requires.txt
└── top_level.txt
├── emailnetwork
├── .coverage
├── __init__.py
├── emails.py
├── extract.py
├── graph.py
├── header.py
├── network.py
├── summary.py
├── tests
│ ├── __init__.py
│ ├── test.mbox
│ ├── test_extract.py
│ ├── test_graph.py
│ ├── test_summary.py
│ └── test_utils.py
├── utils.py
└── version.py
├── requirements.txt
└── setup.py
/.coverage:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/.coverage
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | */test*
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
# Runs the test suite with coverage on every push / pull request to `main`,
# across three operating systems and three Python versions.
name: Run Python Tests
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        # Versions stay quoted so YAML keeps them as strings, not floats.
        python-version: ["3.7", "3.8", "3.9"]
    steps:
      - uses: actions/checkout@v2
      - name: Install Python 3
        # Same major version as the sibling python-package.yml workflow.
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Test with pytest
        run: |
          pip install pytest-cov
          pytest --cov
31 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted on purpose: unquoted versions are parsed as floats, so a
        # future "3.10" entry would silently become 3.1.
        python-version: ["3.7", "3.8", "3.9"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 | .vscode
4 | htmlcov/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2021 Supertype Pte Ltd
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include emailnetwork/tests/*.mbox
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://supertype.ai/incubate)
3 | [](https://pypi.org/project/emailnetwork/)
4 | [](https://colab.research.google.com/drive/1mSKbt9-dTTtQq296QUkMZlZpAMybgmpV?usp=sharing)
5 | [](https://pepy.tech/project/emailnetwork)
6 |
7 | # Email Network
8 |
9 | ## Description
10 | Network graphing utilities for email/mailbox data.
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | For social scientists, create social networks from your mailbox data and, among other things:
19 | * Discover subgroups within your organization (whether the different task forces established were as cohesive as it seems on the outside)
20 | * Study social actors (most emails from Marketing involve Peter and Andy) and their relative influence
21 | * Identify the key social groups (Sales team hangs out a lot, but the IT / product division less so)
22 | * Key account managers of the company (Despite being with the company only recently, Margaretha is connected to more key clients than her peers)
23 | * Compare distributions and patterns of email behaviors and aggregated statistics between groups of employees
24 |
25 |
26 | If you're a graph theorist and looking for something more statistical:
27 | * Support directed and undirected graphs (**already implemented in version 0.0.2**, see below)
28 | * Also output statistical measurements such as centrality distribution (**planned for version 0.0.3**)
29 | * Betweenness, closeness, hubness, distance histograms plotting (**planned for version 0.0.3**)
30 | * Exports to `.graphml` format for use in other graphing software (**already implemented in version 0.0.2**, see below)
31 |
32 | ## Dependencies
33 | * Python 3.7+
34 | * Only dependencies are NetworkX and Matplotlib
35 |
36 | ## Example Usage
37 | To install `emailnetwork`:
38 | ```
39 | pip install emailnetwork
40 | ```
41 |
42 | A sample `.mbox` file is provided to you, but you can also export your own mailbox from your email service provider. If you use Google (Gmail), you can [use the Google Takeout service](https://takeout.google.com/settings/takeout) to export your mail data.
43 |
44 |
45 | ```python
46 | from emailnetwork.extract import MBoxReader
47 | reader = MBoxReader('path-to-mbox.mbox')
48 | print(f'{len(reader)} emails in the sample mbox.')
49 |
50 | # extract a specific email
51 | from emailnetwork.extract import extract_meta
52 | email = reader.mbox[5]
53 | emailmsg = extract_meta(email)
54 |
55 | # filter emails by certain date
56 | thisyearmails = reader.filter_emails(dateoperator='>=', datestring='2021-01-05')
57 |
58 | # print email domains of recipients
59 | print(emailmsg.recipients)
60 | print(emailmsg.recipients[0].domain)
61 |
62 | # extract all emails
63 | emails = reader.extract()
64 | ```
65 |
66 | For graph visualization:
67 | ```py
68 | from emailnetwork.extract import MBoxReader
69 | from emailnetwork.graph import plot_directed, plot_undirected, plot_single_directed, plot_single_undirected
70 |
71 | # Read from .mbox
72 | MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox'
73 | reader = MBoxReader(MBOX_PATH)
74 |
75 | # Try the following:
76 | # plot a single directed graph for the email at index 3
77 | plot_single_directed(reader,3)
78 |
79 | # plot a single undirected graph for the email at index 1, show title in plot
80 | plot_single_undirected(reader, 1, showtitle=True)
81 |
82 | # plot a directed graph, optionally specifying a layout style
83 | plot_directed(reader)
84 | plot_directed(reader, 'shell')
85 | # optionally export a .graphml to your working directory for use
86 | # in other network / graphing software
87 | plot_undirected(reader, 'spring', graphml=True)
88 | ```
89 |
90 | #### Email Header Analysis
91 |
92 | To obtain a histogram:
93 |
94 | ```py
95 | from emailnetwork.extract import MBoxReader
96 | reader = MBoxReader('path-to-mbox')
97 | headers = HeaderCounter(reader)
98 | headers.histogram()
99 | # to show only the top 10 headers, set the optional n parameter
100 | # headers.histogram(n=10)
101 | ```
102 | Because `HeaderCounter` is a subclass of Python's `Counter`, you can also perform operations such as `headers.most_common(8)` to get the 8 most-common headers from the `mbox` file.
103 |
104 | If you want to find all email headers with the word "spam" in them (e.g. spam score or other anti-spam mechanisms), you can use Python's `filter()` function:
105 | ```python
106 | reader = MBoxReader('path-to-mbox')
107 | headers = HeaderCounter(reader)
108 | spamheaders = list(filter(lambda v: "spam" in v.lower(), headers.keys()))
109 | # return:
110 | # ['X-Spam-Checked-In-Group', 'X-Microsoft-Antispam-PRVS', 'X-Microsoft-Antispam-Untrusted', 'X-Microsoft-Antispam-Message-Info-Original', 'X-Forefront-Antispam-Report-Untrusted', 'x-ms-exchange-antispam-messagedata', 'X-Microsoft-Antispam', 'X-Microsoft-Antispam-Message-Info', 'X-Forefront-Antispam-Report', 'X-Mimecast-Spam-Score', 'x-microsoft-antispam-prvs', 'x-microsoft-antispam', 'x-microsoft-antispam-message-info', 'x-forefront-antispam-report']
111 | ```
112 |
113 | #### Mailbox Summary
114 |
115 | To get a simple barchart on the distribution of email domains in your `.mbox`, you can create a `DomainSummary` object and call the `.plot()` function:
116 |
117 |
118 |
119 | ```python
120 | from emailnetwork.summary import DomainSummary
121 | summary = DomainSummary(reader)
122 | summary.plot()
123 | ```
124 |
125 | You can also return a `Counter()` (a subclass of `dict`) instead of a plot:
126 |
127 | ```python
128 | summary.summary
129 | # return:
130 | # Counter({'supertype.ai': 203, 'hubspot.com': 115, 'gmail.com': 75, 'google.com': 53, 'adcolony.com': 38, 'fbworkmail.com': 35, 'elementor.com': 29, 'payoneer.com': 15, 'gogame.net': 14, 'zoomd.com': 13, 'am.atlassian.com': 10, 'theafternaut.com': 6, 'alegrium.com': 5, 'accounts.google.com': 4, 'e.atlassian.com': 4, 'tnbaura.com': 4, 'support.lazada.sg': 4, '3kraters.com': 3, 'go.facebookmail.com': 2, 'docs.google.com': 2, 'mail.hellosign.com': 2, 'algorit.ma': 2, 'supertype.atlassian.net': 2, 'ucdconnect.ie': 2, 'mc.facebookmail.com': 1, 'inplacesoftware.com': 1, 'aura.co': 1, 'atlassian.com': 1, 'greenhouse.io': 1})
131 | ```
132 | ##### Why Python 3.7+?
133 | Python 3.7+ is required because the package is written to take advantage of many features of Python 3.7 and above.
134 |
135 | Examples of features that were used extensively in the creation of this package:
136 | * [Dataclasses, new in Python 3.7](https://www.youtube.com/watch?v=sH_jLQvnpBo)
137 | * [Insertion-ordered Dictionaries, new in Python 3.7](https://www.youtube.com/watch?v=h-DBWPjpqWY)
138 | * [Typing (Type hints), new in Python 3.5](https://docs.python.org/3/library/typing.html)
139 | * [Formatted string literal, new in Python 3.6](https://docs.python.org/3/reference/lexical_analysis.html#f-strings)
140 | ## Testing
141 | Git clone, and run `pytest`. You can also run pytest with coverage:
142 | ```
143 | pytest --cov
144 |
145 | .........
146 |
147 | Name Stmts Miss Cover
148 | ----------------------------------------------
149 | emailnetwork/__init__.py 2 0 100%
150 | emailnetwork/emails.py 39 1 97%
151 | emailnetwork/extract.py 94 15 84%
152 | emailnetwork/graph.py 120 12 90%
153 | emailnetwork/header.py 39 24 38%
154 | emailnetwork/network.py 13 1 92%
155 | emailnetwork/summary.py 73 22 70%
156 | emailnetwork/utils.py 30 9 70%
157 | emailnetwork/version.py 1 0 100%
158 | ----------------------------------------------
159 | TOTAL 411 84 80%
160 |
161 |
162 | =============== 17 passed in 2.85s ==============
163 | ```
164 |
165 | All tests are located in the `/tests/` directory.
166 |
167 | ## Email Network Demo
168 |
169 | [Aurellia Christie](https://github.com/AurelliaChristie) has created a Colab Notebook: [Email Network Walkthrough](https://colab.research.google.com/drive/1mSKbt9-dTTtQq296QUkMZlZpAMybgmpV?usp=sharing) to walk you through the most common functionalities of Email Network
170 |
171 | ## Authors and Copyright
172 |
173 | Samuel Chan, [Supertype](https://supertype.ai)
174 | - Github: [onlyphantom](https://github.com/onlyphantom)
175 |
176 | Vincentius Christopher Calvin, [Supertype](https://supertype.ai)
177 | - Github: [vccalvin33](https://github.com/vccalvin33)
178 |
179 | If you find the code useful in your project, please link to this repository in your citation.
180 |
181 | ##### The MIT License (MIT)
182 |
183 | Copyright (c) 2021 Supertype Pte Ltd
184 |
185 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
186 |
187 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
188 |
189 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
190 |
--------------------------------------------------------------------------------
/assets/graph1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph1.png
--------------------------------------------------------------------------------
/assets/graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph2.png
--------------------------------------------------------------------------------
/assets/graph3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph3.png
--------------------------------------------------------------------------------
/assets/graph4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph4.png
--------------------------------------------------------------------------------
/assets/graph5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/graph5.png
--------------------------------------------------------------------------------
/assets/histo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/histo.png
--------------------------------------------------------------------------------
/assets/summaryplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/assets/summaryplot.png
--------------------------------------------------------------------------------
/build/lib/emailnetwork/__init__.py:
--------------------------------------------------------------------------------
1 | # Email Network
2 | # Network graphing utilities for email/mailbox (.mbox) data
3 | #
4 | # Author: Samuel Chan
5 | # Credits: Benjamin Bengfort
6 | # Created: Jan 2021
7 | #
8 | # Copyright (c) 2021 Supertype Pte Ltd
9 | #
10 |
11 | """
12 | Network graphing utilities for email/mailbox (.mbox) data
13 | """
14 |
15 | ##########################################################################
16 | ## Package Version
17 | ##########################################################################
18 |
19 | from .version import __version__
20 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/emails.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from datetime import datetime
3 | from emailnetwork.utils import parse_date
4 |
@dataclass
class EmailMeta:
    """
    Metadata of a single email: sender, recipients, cc, subject and date.

    `date` is passed in as the raw header value and is replaced in
    `__post_init__` by whatever `parse_date` returns. The comparison methods
    call `.date()` on it, so it is expected to be a `datetime` afterwards.
    `origin_domain` is always overwritten with `sender.domain`.

    Comparing an `EmailMeta` with a `datetime` using `==`, `>=` or `<=`
    compares calendar days only; the time of day is ignored. Any other
    operand type is reported as unsupported via `NotImplemented`.

    Also Refer to:
    https://www.iana.org/assignments/message-headers/message-headers.xhtml

    """

    # `sender` must expose a `.domain` attribute (see `__post_init__`).
    sender: str
    subject: str
    # Raw date value on construction; parsed in `__post_init__`.
    date: str
    recipients: list
    cc: list
    origin_domain: str = None

    def __post_init__(self):
        self.origin_domain = self.sender.domain
        self.date = parse_date(self.date)

    def __eq__(self, targetdate):
        """Same calendar day as `targetdate` (a `datetime`)."""
        if isinstance(targetdate, datetime):
            return self.date.date() == targetdate.date()
        # Let Python fall back to its default handling instead of returning None.
        return NotImplemented

    def __ge__(self, targetdate):
        """Sent on or after the calendar day of `targetdate` (a `datetime`)."""
        if isinstance(targetdate, datetime):
            return self.date.date() >= targetdate.date()
        return NotImplemented

    def __le__(self, targetdate):
        """Sent on or before the calendar day of `targetdate` (a `datetime`)."""
        if isinstance(targetdate, datetime):
            return self.date.date() <= targetdate.date()
        return NotImplemented
35 |
@dataclass
class EmailAddress:
    """
    A display name plus an email address.

    Built from a two-item `(name, email)` pair, e.g. one element of the list
    returned by `email.utils.getaddresses`. The address is lower-cased on
    construction.
    """

    name: str = None
    email: str = None

    def __init__(self, string):
        # `string` is a (name, email) pair despite its name.
        self.name, self.email = string
        self.email = self.email.lower()

    def __getitem__(self, index=None):
        """
        Return `(name, email)`, or a single item of that pair.

        The original signature took no index, so `address[0]` raised
        `TypeError`. Calling `__getitem__()` with no argument still returns
        the whole pair.
        """
        pair = (self.name, self.email)
        return pair if index is None else pair[index]

    @property
    def domain(self):
        """Text after the last '@' of the address, or None when that text is empty."""
        return self.email.split('@')[-1] or None
51 |
@dataclass
class EmailBody:
    """Subject and body text of one email; both fields default to None."""

    subject: str = None
    body: str = None
--------------------------------------------------------------------------------
/build/lib/emailnetwork/extract.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from email.utils import getaddresses
3 | from mailbox import mbox
4 |
5 | from mailbox import mboxMessage
6 |
7 | from emailnetwork.utils import clean_subject, clean_body
8 | from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody
9 | from emailnetwork.summary import DomainSummary
10 |
11 | from emailnetwork.header import HeaderCounter
12 |
13 |
def extract_meta(email):
    """
    Build an `EmailMeta` from one mailbox message.

    Recipients come from the `To` and `Resent-To` headers, cc from `Cc` and
    `Resent-Cc`. The sender is the first address found in `From`. An empty
    cleaned subject is stored as None.
    """
    to_headers = email.get_all('To', []) + email.get_all('Resent-To', [])
    cc_headers = email.get_all('Cc', []) + email.get_all('Resent-Cc', [])

    from_pairs = getaddresses(email.get_all('From'))
    sender = EmailAddress(from_pairs[0])
    recipients = [EmailAddress(pair) for pair in getaddresses(to_headers)]
    copied = [EmailAddress(pair) for pair in getaddresses(cc_headers)]
    subject = clean_subject(email['Subject']) or None

    return EmailMeta(
        sender=sender,
        subject=subject,
        date=email['Date'],
        recipients=recipients,
        cc=copied,
    )
26 |
27 |
def extract_body(email):
    """
    Build an `EmailBody` from one mailbox message.

    The subject goes through `clean_subject` (an empty result becomes None)
    and the body is whatever `clean_body` returns for the message.
    """
    cleaned_subject = clean_subject(email['Subject']) or None
    cleaned_body = clean_body(email)
    return EmailBody(subject=cleaned_subject, body=cleaned_body)
34 |
35 |
class MBoxReader(object):
    """ A class that wraps python's `mailbox.mbox` to provide additional
    functionalities such as length, date filtering and parsing. A key component of
    many `emailnetwork`'s operations.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')

    Args:
        path: Path to an `.mbox` file; it is handed to `mailbox.mbox` as is.
    """

    def __init__(self, path) -> None:
        super().__init__()
        self.path = path
        self.mbox = mbox(path)

    def __iter__(self):
        """Yield every message of the underlying mbox."""
        for msg in self.mbox:
            yield msg

    def __len__(self):
        return self.count()

    def count(self):
        """
        Count the number of emails in the mbox instance.
        Helper function to implement __len__

        Returns the last key plus one, or 0 for an empty mailbox (the
        original indexed `keys()[-1]` unconditionally and raised
        `IndexError` when there were no messages).
        """
        keys = self.mbox.keys()
        return keys[-1] + 1 if keys else 0

    def extract(self):
        """
        Extract the meta data from the Mbox instance

        Yields one `extract_meta` result per message. A message whose
        extraction raises is reported with `print` and skipped (best effort).
        """
        for email in self:
            try:
                emailmeta = extract_meta(email)
                if emailmeta is not None:
                    yield emailmeta

            except Exception as e:
                print(e)
                continue

    def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="):
        """
        Return the `extract_meta` results of all messages matching the filters.

        Args:
            emailaddress (str, optional): Keep only emails whose sender or
                `To` recipients include this address (cc is not checked).
                Matching is case-insensitive because stored addresses are
                lower-cased.
            datestring (str, optional): Date in `YYYY-MM-DD` format to
                compare each email's date against.
            dateoperator (str, optional): One of '>=', '==', '<='.
                Defaults to '=='.

        Returns:
            list: The matching metadata objects. For backward compatibility,
            an unparsable `datestring` returns an explanatory string instead
            of raising.

        Raises:
            ValueError: If `emailaddress` is not a string or `dateoperator`
                is not one of the supported operators.
        """
        if emailaddress is not None and not isinstance(emailaddress, str):
            raise ValueError(
                "Please use a valid string representing an email address")

        if dateoperator not in ['>=', '==', '<=']:
            raise ValueError("Please use one of ['>=', '==', '<=']")

        targetdate = None
        if datestring is not None:
            try:
                targetdate = datetime.strptime(datestring, "%Y-%m-%d")
            except ValueError as err:
                # Print the actual parsing error, not the exception class.
                print(err)
                return "Please use the ISO format for comparison: YYYY-MM-DD"

        # Stored addresses are lower-cased, so normalise the query the same way.
        wanted = emailaddress.lower() if emailaddress is not None else None

        val = []
        for email in self.mbox:
            emailmeta = extract_meta(email)
            if wanted is not None and not self._involves(emailmeta, wanted):
                continue
            if targetdate is not None and not self._date_matches(emailmeta, targetdate, dateoperator):
                continue
            val.append(emailmeta)

        return val

    @staticmethod
    def _involves(emailmeta, address):
        """True when `address` is the sender or one of the recipients of `emailmeta`."""
        if emailmeta.sender.email == address:
            return True
        return any(recipient.email == address for recipient in emailmeta.recipients)

    @staticmethod
    def _date_matches(emailmeta, targetdate, dateoperator):
        """Apply the validated `dateoperator` between `emailmeta` and `targetdate`."""
        if dateoperator == '>=':
            return emailmeta >= targetdate
        if dateoperator == '==':
            return emailmeta == targetdate
        return emailmeta <= targetdate
137 |
138 |
if __name__ == '__main__':
    # Manual smoke run. Uses the sample mailbox shipped in the package's
    # `tests` directory instead of a path on one developer's machine.
    import os

    MBOX_PATH = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'tests', 'test.mbox')
    reader = MBoxReader(MBOX_PATH)

    # Names of all headers containing "spam", case-insensitively.
    headers = HeaderCounter(reader)
    k = headers.keys()
    spamheaders = list(filter(lambda v: "spam" in v.lower(), k))

    summary = DomainSummary(reader)

    email = reader.mbox[1]
    emailmsg = extract_meta(email)
    emailbody = extract_body(email)
    mails = reader.filter_emails(datestring='2020-12-31', dateoperator="==")
--------------------------------------------------------------------------------
/build/lib/emailnetwork/graph.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | import math
3 | import os
4 | import uuid
5 | import matplotlib.pyplot as plt
6 | import networkx as nx
7 | import textwrap
8 |
9 | from emailnetwork.extract import MBoxReader, extract_meta
10 | from emailnetwork.network import PeopleCombination
11 |
12 |
def plot_single_directed(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot a directed graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with the email subject and date as title.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_single_directed(reader, 300) plots the email at index 300 of the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): `id` of the email in the `MBoxReader`. When left at its
            default (`False`) the last email is plotted. `0` selects the first email.
        showtitle (bool, optional): If `True`, render the plot with the email subject and date as title. Defaults to False.
    """

    # Test for the sentinels explicitly: a plain truthiness check would treat
    # the valid index 0 as "not given" and plot the last email instead.
    if id is False or id is None:
        email = reader.mbox[len(reader) - 1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # `extract_meta` stores an empty subject as None, which textwrap cannot wrap.
    subject = textwrap.fill(emailmsg.subject or '', 40)
    # Fall back to the local part of the address when there is no display name.
    sender = emailmsg.sender.name or emailmsg.sender.email.split('@')[0]

    plt.figure(figsize=(9, 6))
    G = nx.DiGraph(name='Single Email Flow')

    # Direct recipients: thick edges labelled with the subject.
    for recipient in emailmsg.recipients:
        G.add_edge(sender, recipient.name or recipient.email,
                   message=subject, color='darkorchid', weight=3)

    # Carbon-copied recipients: thinner edges labelled 'cc'.
    for cc in emailmsg.cc:
        G.add_edge(sender, cc.name or cc.email,
                   message='cc', color='lightsteelblue', weight=2)

    colors = nx.get_edge_attributes(G, 'color').values()
    weights = nx.get_edge_attributes(G, 'weight').values()
    edge_labels = nx.get_edge_attributes(G, 'message')

    pos = nx.planar_layout(G)

    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6, label_pos=0.5)
    nx.draw_planar(G, node_size=0, alpha=1, edge_color=colors, width=list(weights), font_size=8, font_weight='bold', with_labels=True, verticalalignment='bottom')

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        plt.title(subject + '\n Delivery date: ' + emailmsg.date.strftime("%m/%d/%Y"), fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
67 |
def plot_single_undirected(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot an undirected social network graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with the email subject and date as title.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_single_undirected(reader, 300) plots the email at index 300 of the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): `id` of the email in the `MBoxReader`. When left at its
            default (`False`) the last email is plotted. `0` selects the first email.
        showtitle (bool, optional): If `True`, render the plot with the email subject and date as title. Defaults to False.
    """

    # Test for the sentinels explicitly: a plain truthiness check would treat
    # the valid index 0 as "not given" and plot the last email instead.
    if id is False or id is None:
        email = reader.mbox[len(reader) - 1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # `extract_meta` stores an empty subject as None, which textwrap cannot wrap.
    subject = textwrap.fill(emailmsg.subject or '', 40)
    ng = PeopleCombination(emailmsg)
    G = nx.Graph(
        name='Single Email Social Network')

    # Count how often each combination produced by PeopleCombination occurs.
    counter = Counter()
    for combo in ng.combo:
        counter[combo] += 1

    total_combos = sum(counter.values())
    by_freq = {k: v/total_combos for k, v in counter.most_common()}

    # Edge weight is the relative frequency, `count` the absolute one.
    for rel in counter.keys():
        G.add_edge(*rel, weight=by_freq[rel], count=counter[rel])

    # max(..., 1) avoids a ZeroDivisionError when the email produced no
    # combinations and the graph is therefore empty.
    k = 1/math.sqrt(max(G.order(), 1)) * 2
    pos = nx.spring_layout(G, k=k)
    deg = [v for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.90)
    nx.draw_networkx_edges(G, pos, edge_color="steelblue", width=1.0, style='dashed', alpha=0.75)
    nx.draw_networkx_labels(G, pos, {n: n for n in G.nodes}, font_size=8, verticalalignment="bottom")

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        plt.title(subject + '\n Delivery date: ' + emailmsg.date.strftime("%m/%d/%Y"), fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
118 |
def plot_directed(reader:MBoxReader, layout:str='shell', graphml:bool=False) -> None:
    """
    Plot a directed social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_directed(reader)
    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): Can be one of 'shell', 'spring' or 'spiral'. Any other value falls back to 'spiral'. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    emails = reader.extract()
    plt.figure(figsize=(12,12))
    G = nx.MultiDiGraph(name='Email Social Network')
    for email in emails:
        # Nodes are display names, or the local part of the address when the
        # display name is empty. The original `!= '' or None` test was a
        # precedence no-op; a plain truthiness check expresses the intent.
        source_addr = email.sender.name or email.sender.email.split('@')[0]

        all_recipients = [em.name or em.email.split('@')[0] for em in email.recipients + email.cc]

        for recipient in all_recipients:
            # A None subject would make the GraphML export fail, so store ''.
            G.add_edge(source_addr, recipient, message=email.subject or '')

    if graphml:
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        # The file name is random, so report it (as plot_undirected does).
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        pos = nx.spring_layout(G)
    else:
        pos = nx.spiral_layout(G)
    nx.draw(G, pos, node_size=0, alpha=0.4, edge_color='cadetblue', font_size=7, with_labels=True)
    ax = plt.gca()
    ax.margins(0.08)
    plt.show()
159 |
def plot_undirected(reader:MBoxReader, layout:str='shell', graphml:bool=False) -> None:
    """Plot an undirected social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_undirected(reader)

    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): Can be one of 'shell', 'spring' or 'spiral'. Any other value falls back to 'spiral'. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    emails = reader.extract()
    G = nx.Graph(name='Email Social Network')
    plt.figure(figsize=(12,12))

    # Count how often each combination occurs across all emails.
    counter = Counter()
    for email in emails:
        ng = PeopleCombination(email)

        for combo in ng.combo:
            counter[combo] += 1

    total_combos = sum(counter.values())
    by_freq = {k: v/total_combos for k, v in counter.most_common()}
    # Edge weight is the relative frequency, `count` the absolute one.
    for rel in counter.keys():
        G.add_edge(*rel, weight=by_freq[rel], count=counter[rel])

    if graphml:
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        # max(..., 1) avoids a ZeroDivisionError when the mailbox produced no
        # combinations and the graph is therefore empty.
        k = 1/math.sqrt(max(G.order(), 1)) * 2
        pos = nx.spring_layout(G, k=k)
    else:
        pos = nx.spiral_layout(G)

    # Node size grows with the node's degree.
    deg = [v*50 for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.60)
    nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', edge_color='cadetblue', alpha=0.6)
    # Label each node with the part of its name before any '@'.
    nx.draw_networkx_labels(G, pos, {n: n.split('@')[0] for n in G.nodes}, font_size=8, font_color='darkorchid')

    plt.axis('off')
    plt.show()
209 |
if __name__ == '__main__':
    # Demo entry point: plot the sample mailbox under tests/ next to this module.
    # Previously MBOX_PATH was computed but unused and the reader was opened on an
    # absolute path that only exists on one developer's machine.
    MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox'

    # To try your own data, replace MBOX_PATH with the path to another .mbox file.
    reader = MBoxReader(MBOX_PATH)
    # plot_single_directed(reader,300)
    # plot_single_directed(reader, 1, True)
    # plot_directed(reader)
    # plot_directed(reader, "shell")
    plot_undirected(reader, 'spring')
--------------------------------------------------------------------------------
/build/lib/emailnetwork/header.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from email.header import decode_header
3 | from emailnetwork.utils import clean_subject
4 |
5 |
class HeaderCounter(Counter):
    """Counter of header names across all messages of a reader.

    Iterating `reader` must yield message-like objects exposing `.keys()`
    (the header names of that message). Each occurrence of a header name
    increments its count.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        headers = HeaderCounter(reader)
        headers.most_common(8)

    Args:
        reader: Iterable of messages, each providing `.keys()`.
    """

    def __init__(self, reader):
        super().__init__()
        # build_from mutates this counter in place; no rebinding needed.
        self.build_from(reader)

    def __str__(self):
        return f'{self.most_common()}'

    def build_from(self, reader):
        """Add the header names of every message in `reader` to this counter.

        Returns:
            HeaderCounter: this same instance, to allow chaining.
        """
        for email in reader:
            for k in email.keys():
                self[k] += 1

        return self

    def histogram(self, n=25):
        """Show a horizontal bar chart of the `n` most common headers.

        Args:
            n (int, optional): Maximum number of headers to plot. Defaults to 25.
        """
        from matplotlib import pyplot as plt
        plt.style.use('fivethirtyeight')

        # Take the top-n by frequency (the previous slice took the first-seen n),
        # and size the axis from what is actually available: with fewer than n
        # headers, range(n) positions no longer matched the number of values.
        top = self.most_common(n)
        k = [name for name, _ in top]
        v = [count for _, count in top]

        fig = plt.figure(figsize=(7, 10))
        ax = fig.add_subplot(111)
        y_pos = list(range(len(k)))
        ax.barh(y_pos, v, color='plum')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(k)
        # Most common header at the top of the chart.
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_title('Email Header Analysis')
        plt.tight_layout()
        plt.show()
42 |
43 |
if __name__ == '__main__':
    # Demo: print the decoded value of every header whose name mentions "spam".
    from emailnetwork.extract import MBoxReader

    # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')
    reader = MBoxReader('/Users/samuel/EmailNetwork/samuel-supertype.mbox')
    headers = HeaderCounter(reader)

    k = headers.keys()

    # Header names containing "spam", matched case-insensitively.
    containspam = [name for name in k if "spam" in name.lower()]

    for email in reader:
        spam_keys = (key for key in email.keys() if key in containspam)
        for key in spam_keys:
            print({key: decode_header(email[key])})
58 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/network.py:
--------------------------------------------------------------------------------
1 | from itertools import combinations
2 |
class PeopleCombination:
    """Distinct email addresses taking part in one email, and their pairings.

    The sender, recipients and cc entries are merged; `None` entries and entries
    with an empty `.email` are dropped, duplicates are removed, and the remaining
    addresses are stored sorted in `self.people`.

    Usage:
        email = EmailMeta(...)
        PeopleCombination(email)
    """

    def __init__(self, email):
        participants = [email.sender] + email.recipients + email.cc
        unique_addresses = {
            person.email
            for person in participants
            if person is not None and person.email
        }
        self.people = sorted(unique_addresses)

    def __repr__(self):
        return str(self.people)

    @property
    def combo(self):
        """Lazily yield every 2-combination of `self.people`."""
        yield from combinations(self.people, 2)
24 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/summary.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from datetime import datetime
3 |
4 |
class DomainSummary():
    """Frequency of sender domains across the emails of a reader.

    `reader.extract()` is consumed once at construction time; the resulting
    counts are stored in `self.summary` as a `Counter` keyed by each email's
    `origin_domain`.

    Args:
        reader: Object whose `extract()` returns an iterable of email metadata
            objects exposing `origin_domain`.
    """

    def __init__(self, reader):
        self.emailmetas = reader.extract()
        self.summary = self.get_summary()

    def get_summary(self):
        """Count emails per `origin_domain`.

        Returns:
            Counter: domain -> number of emails, in first-seen order.
        """
        return Counter(email.origin_domain for email in self.emailmetas)

    def plot(self):
        """Show a horizontal bar chart of emails per sender domain."""
        import matplotlib.pyplot as plt
        # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
        # later dropped the old names; pick whichever this installation provides.
        style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
        plt.style.use(style)

        plt.figure(figsize=(10, 5))

        domains = list(self.summary.keys())
        freqs = list(self.summary.values())

        plt.barh(domains, freqs, color='cadetblue')
        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")
        plt.title("Sender's Domain Occurences", fontdict={
                  "fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 12})
        plt.show()

    def __str__(self):
        # One "domain : count" row per domain, padded into aligned columns.
        coba = []
        for keys in self.summary:
            coba.append(f"{keys:<25s}: {str(self.summary[keys]):<3s}")
        return "\n".join(coba)
40 |
41 |
class IncomingOutgoingSummary():
    """Monthly counts of incoming and outgoing emails for the mailbox owner.

    The owner's address is guessed by `get_user_email`; an email counts as
    'Outgoing' when its sender address equals that guess and as 'Incoming'
    otherwise. `self.summary` maps '<Month name> <year>' strings to
    `{'Incoming': int, 'Outgoing': int}` in chronological order.

    Args:
        reader: Object exposing `extract()` (iterable of email metadata with
            `date` and `sender.email`) and `mbox` (indexable by message number).
    """

    def __init__(self, reader):
        self.reader = reader
        self.emailmetas = reader.extract()
        self.user_email = self.get_user_email()
        self.summary = self.get_summary()

    def get_user_email(self):
        """Guess the mailbox owner's address from the first (up to) 10 messages.

        The address seen most often as sender or recipient wins; ties go to the
        address encountered first.

        Returns:
            str or None: the guessed address, or None when the mailbox is empty.
        """
        from emailnetwork.extract import extract_meta

        email_addresses = {}
        for i in range(10):
            try:
                email = self.reader.mbox[i]
            except (KeyError, IndexError):
                # Fewer than 10 messages: stop sampling instead of crashing.
                break
            emailmsg = extract_meta(email)
            for recipient in emailmsg.recipients:
                email_addresses[recipient.email] = email_addresses.get(
                    recipient.email, 0) + 1
            email_addresses[emailmsg.sender.email] = email_addresses.get(
                emailmsg.sender.email, 0) + 1

        if not email_addresses:
            # Nothing sampled: no address can be inferred (previously an IndexError).
            return None
        return sorted(email_addresses.items(), key=lambda k: k[1], reverse=True)[0][0]

    def get_summary(self):
        """Bucket every email by month and direction.

        Returns:
            dict: '<Month name> <year>' -> {'Incoming': int, 'Outgoing': int},
            ordered chronologically.
        """
        date = {}
        for email in self.emailmetas:
            month = email.date.strftime('%B %Y')
            bucket = date.setdefault(month, {'Incoming': 0, 'Outgoing': 0})
            if email.sender.email == self.user_email:
                bucket['Outgoing'] += 1
            else:
                bucket['Incoming'] += 1

        # Keys are display strings, so parse them back to dates to sort in time order.
        date = sorted(
            date.items(), key=lambda items: datetime.strptime(items[0], '%B %Y'))
        return dict(date)

    def plot(self):
        """Show a stacked bar chart of incoming/outgoing emails per month."""
        import matplotlib.pyplot as plt
        # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
        # later dropped the old names; pick whichever this installation provides.
        style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
        plt.style.use(style)

        dates = list(self.summary.keys())
        incoming = [counts['Incoming'] for counts in self.summary.values()]
        outgoing = [counts['Outgoing'] for counts in self.summary.values()]

        fig, ax = plt.subplots()

        ax.bar(dates, incoming, 0.4, label='Incoming', color='cadetblue')
        # Outgoing bars are stacked on top of the incoming ones.
        ax.bar(dates, outgoing, 0.4, bottom=incoming, label='Outgoing')

        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")

        plt.xticks(rotation=45)

        ax.set_ylabel('Counts')
        ax.set_title('Number of Incoming and Outgoing Emails per Month', fontdict={"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 12})
        ax.legend()

        plt.show()
102 |
103 |
if __name__ == "__main__":
    # Demo: build both summaries from a local sample mailbox.
    from emailnetwork.extract import MBoxReader

    reader = MBoxReader(
        '/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox'
    )
    summary, summary_2 = DomainSummary(reader), IncomingOutgoingSummary(reader)
109 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/build/lib/emailnetwork/tests/__init__.py
--------------------------------------------------------------------------------
/build/lib/emailnetwork/tests/test_extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | from unittest import TestCase
4 |
5 | from emailnetwork.extract import MBoxReader, extract_meta, extract_body
6 | from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody
7 |
8 | """
9 | Demo mbox is generated from Benjamin Bengfort's Tribe tool
10 | with person names modified for anonymity
11 | """
12 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
13 |
class TestExtract(TestCase):
    """Checks for `MBoxReader`, `extract_meta` and `extract_body` on the demo mbox."""

    def setUp(self):
        self.reader = MBoxReader(MBOX_PATH)
        self.emails = self.reader.extract()

    def tearDown(self):
        self.reader = None

    def test_read_mbox(self):
        self.assertIsInstance(self.reader, MBoxReader)

    def test_length_mbox(self):
        self.assertEqual(len(self.reader), 140)

    def test_extract(self):
        # The first email gets the detailed type checks ...
        first = next(self.emails)
        self.assertIsInstance(first, EmailMeta)
        self.assertIsInstance(first.subject, str)
        self.assertIsInstance(first.date, datetime.datetime)

        # ... and every remaining one must have recipients and a cc list.
        for remaining in self.emails:
            self.assertGreaterEqual(len(remaining.recipients), 1)
            self.assertIsInstance(remaining.cc, list)

    def test_email_address(self):
        sender = next(self.emails).sender
        self.assertIsInstance(sender, EmailAddress)
        self.assertIsInstance(sender.name, str)
        self.assertIsInstance(sender.email, str)

    def test_filter_emails(self):
        start_of_2020 = datetime.datetime(2020, 1, 1)
        address = 'samuelchan@gmail.com'

        # Date filter: on or after a day.
        newmails = self.reader.filter_emails(datestring="2020-01-01", dateoperator=">=")
        self.assertEqual(len(newmails), 4)
        for mail in newmails:
            self.assertGreater(mail.date, start_of_2020)
            self.assertLess(mail.date, datetime.datetime.now())

        # Date filter: on or before a day.
        oldmails = self.reader.filter_emails(datestring="2019-12-31", dateoperator="<=")
        self.assertEqual(len(oldmails), 136)

        # Date filter: exactly one day.
        exactmails = self.reader.filter_emails(datestring="2020-04-17", dateoperator="==")
        self.assertEqual(len(exactmails), 1)
        self.assertEqual(exactmails[0].date.date(), datetime.date(2020, 4, 17))

        # Address filter: the address must be the sender or one of the recipients.
        namedmails = self.reader.filter_emails(emailaddress=address)
        for mail in namedmails:
            involved = [mail.sender.email] + [recipient.email for recipient in mail.recipients]
            self.assertIn(address, involved)

        # Address and date filters combined.
        fullfilteredmails = self.reader.filter_emails(
            emailaddress=address, datestring="2020-01-01", dateoperator=">=")
        for mail in fullfilteredmails:
            involved = [mail.sender.email] + [recipient.email for recipient in mail.recipients]
            self.assertIn(address, involved)
            self.assertGreater(mail.date, start_of_2020)

    # also need tests to fail with expected exception when datetime operator not in [==, <=, >=], emailaddress and datetime in wrong format.
    def test_afunction_throws_exception(self):
        with self.assertRaises(ValueError):
            self.reader.filter_emails(20, "2019-12-31", "<")

    def test_extract_meta_single(self):
        for raw in self.reader.mbox:
            self.assertIsInstance(raw['Subject'], (bytes, str))
            meta = extract_meta(raw)
            self.assertIsInstance(meta, EmailMeta)
            self.assertIsInstance(meta.origin_domain, str)
            self.assertIsInstance(meta.subject, str)

    def test_extract_body_single(self):
        for raw in self.reader.mbox:
            body = extract_body(raw)
            self.assertIsInstance(body, EmailBody)
            self.assertIsInstance(body.subject, str)
            self.assertIsInstance(body.body, str)
91 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/tests/test_graph.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase, mock
3 |
4 | from emailnetwork.extract import MBoxReader
5 | # from emailnetwork.graph import plot_single_email
6 | import emailnetwork.graph as graph
7 |
8 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
9 |
@mock.patch(f"{__name__}.graph.plt")
def test_plot_single_directed(mock_plt):
    """With `plt` mocked, plotting email 1 with its title shown sets the expected title."""
    expected_title = "Three tips to get the most out of Gmail\n Delivery date: 04/17/2020"
    expected_font = {'fontname': 'Helvetica', 'color': 'k', 'fontweight': 'bold', 'fontsize': 8}

    graph.plot_single_directed(MBoxReader(MBOX_PATH), 1, True)

    mock_plt.title.assert_called_once_with(expected_title, fontdict=expected_font)
    assert mock_plt.figure.called
16 |
17 |
class TestGraph(TestCase):
    """Placeholder test case for the graph module; opens the demo mbox per test."""

    def setUp(self):
        reader = MBoxReader(MBOX_PATH)
        self.reader = reader
        self.emails = reader.extract()

    def test_single_graph(self):
        # TODO: to be implemented later
        return None
26 |
27 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/tests/test_summary.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 | from collections import Counter
4 | from datetime import datetime
5 |
6 | from emailnetwork.extract import MBoxReader
7 | from emailnetwork.summary import DomainSummary, IncomingOutgoingSummary
8 |
9 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
10 |
11 |
class TestSummary(TestCase):
    """Type and shape checks for `DomainSummary` and `IncomingOutgoingSummary`."""

    def setUp(self):
        self.reader = MBoxReader(MBOX_PATH)
        self.domain_summary = DomainSummary(self.reader)
        self.incoming_outgoing_summary = IncomingOutgoingSummary(self.reader)

    def tearDown(self):
        self.domain_summary = None
        self.incoming_outgoing_summary = None

    def test_summary_instance(self):
        self.assertIsInstance(self.domain_summary, DomainSummary)
        self.assertIsInstance(self.domain_summary.summary, Counter)
        self.assertIsInstance(self.incoming_outgoing_summary, IncomingOutgoingSummary)
        self.assertIsInstance(self.incoming_outgoing_summary.summary, dict)

    def test_one_summary(self):
        # Domain summary: str keys mapped to positive ints.
        domain_counts = self.domain_summary.summary
        for domain in domain_counts:
            self.assertIsInstance(domain, str)
            self.assertIsInstance(domain_counts[domain], int)
            self.assertGreater(domain_counts[domain], 0)

        # Incoming/outgoing summary: str keys mapped to {'Incoming'|'Outgoing': int}.
        monthly = self.incoming_outgoing_summary.summary
        for month in monthly:
            self.assertIsInstance(month, str)
            self.assertIsInstance(monthly[month], dict)
            for direction in monthly[month]:
                self.assertIn(direction, ('Incoming', 'Outgoing'))
                self.assertIsInstance(monthly[month][direction], int)
45 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 |
4 | from emailnetwork.extract import MBoxReader
5 |
6 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
7 |
class TestNetwork(TestCase):
    """Placeholder test case; no tests are implemented yet."""

    def setUp(self):
        # self.reader = MBoxReader(MBOX_PATH)
        # self.emails = self.reader.extract()
        return None

    # test PeopleCombination
15 |
16 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from dateutil import parser
4 | from dateutil.tz import tzlocal, tzutc
5 | from email.utils import parsedate_tz, mktime_tz
6 | from email.header import decode_header
7 |
8 |
def parse_date(datestring: str):
    """Parse an email date string into a `datetime`.

    The string is first handed to `email.utils.parsedate_tz`; when that succeeds
    the result is returned as a naive `datetime` expressed in UTC. Otherwise the
    string is passed to `parser.parse` and its result is returned unchanged
    (NOTE(review): that value may be timezone-aware - confirm callers cope).

    Usage:
        Primarily used for extract_meta(email): parse_date(email['Date'])
        parse_date('Sat, 19 Sep 2020 12:01:38 +0800')
    Args:
        datestring (str): Date text, e.g. the value of a `Date` header.
    Returns:
        datetime or None: The parsed date, or None when any step raises.
    """
    from datetime import timezone

    try:
        dt = parsedate_tz(datestring)
        if dt is not None:
            # Same naive-UTC value as the deprecated datetime.utcfromtimestamp
            # (deprecated since Python 3.12), without the DeprecationWarning.
            utc = datetime.fromtimestamp(mktime_tz(dt), tz=timezone.utc)
            return utc.replace(tzinfo=None)

        return parser.parse(datestring)
    except Exception:
        # Best-effort by design: unparseable dates are reported as None.
        return None
25 |
26 |
def clean_subject(subject):
    """Decode a (possibly RFC 2047 encoded) subject header into clean text.

    Every fragment returned by `decode_header` is decoded and the fragments are
    concatenated. Previously only the first fragment was used, so a subject such
    as 'Re: =?utf-8?q?...?=' was cut down to 'Re:'.

    Usage:
        clean_subject(email['Subject'])
        clean_subject('Re: =?utf-8?q?caf=C3=A9?=')  # 'Re: café'

    Args:
        subject: Header value accepted by `email.header.decode_header`.
    Returns:
        str: Decoded subject with surrounding whitespace stripped and CRLF
        sequences removed.
    """
    fragments = []
    for chunk, charset in decode_header(subject):
        if isinstance(chunk, bytes):
            try:
                # Fragments without a declared charset are tried as UTF-8.
                chunk = chunk.decode(charset or 'utf-8')
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrong charset: keep what is readable, don't fail.
                chunk = chunk.decode('utf-8', errors='replace')
        fragments.append(chunk)

    return ''.join(fragments).strip().replace('\r\n', '')
42 |
43 |
def _decode_part(part):
    """Decode a message part's payload to text using its declared charset."""
    payload = part.get_payload(decode=True)
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown or mislabelled charset: keep what is readable, don't fail.
        return payload.decode('utf-8', errors='replace')


def clean_body(email):
    """Return the plain-text body of an email message.

    For multipart messages, the first `text/plain` part that is not an attachment
    is returned. Other messages have their whole payload decoded. The payload is
    decoded with the charset declared on the part, where it was previously always
    decoded as strict UTF-8.

    Args:
        email: An `email.message.Message`-like object.
    Returns:
        str or None: The decoded text; None for a multipart message without a
        non-attachment `text/plain` part.
    """
    if not email.is_multipart():
        # not multipart - i.e. plain text, no attachments, keeping fingers crossed
        return _decode_part(email)

    for part in email.walk():
        ctype = part.get_content_type()
        cdispo = str(part.get('Content-Disposition'))

        # skip any text/plain (txt) attachments
        if ctype == 'text/plain' and 'attachment' not in cdispo:
            return _decode_part(part)

    return None
57 |
--------------------------------------------------------------------------------
/build/lib/emailnetwork/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
--------------------------------------------------------------------------------
/coverage.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | /Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/emailnetwork
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
--------------------------------------------------------------------------------
/dist/emailnetwork-0.0.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.1-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/emailnetwork-0.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.1.tar.gz
--------------------------------------------------------------------------------
/dist/emailnetwork-0.0.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.2-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/emailnetwork-0.0.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/dist/emailnetwork-0.0.2.tar.gz
--------------------------------------------------------------------------------
/emailnetwork.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: emailnetwork
3 | Version: 0.0.2
4 | Summary: Network graphing utilities for email/mailbox (.mbox) data
5 | Home-page: http://github.com/onlyphantom/emailnetwork
6 | Author: Samuel Chan
7 | Author-email: s@supertype.ai
8 | License: MIT
9 | Description: # Email Network
10 |
11 | ## Description
12 | Network graphing utilities for email/mailbox data.
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | For social scientists: create social networks from your mailbox data and, among other things:
21 | * Discover subgroups within your organization (whether the different task forces established were as cohesive as it seems on the outside)
22 | * Study social actors (most emails from Marketing involve Peter and Andy) and their relative influence
23 | * Identify the key social groups (Sales team hangs out a lot, but the IT / product division less so)
24 | * Key account managers of the company (Despite being with the company only recently, Margaretha is connected to more key clients than her peers)
25 |
26 |
27 | If you're a graph theorist and looking for something more statistical:
28 | * Support directed and undirected graphs (**implemented in version 0.0.2**, see below)
29 | * Also output statistical measurements such as centrality distribution (**planned for version 0.0.3**)
30 | * Betweenness, closeness, hubness, distance histograms plotting (**planned for version 0.0.3**)
31 | * Exports to `.graphml` format for use in other graphing software (**implemented in version 0.0.2**)
32 |
33 | ## Dependencies
34 | * Python 3.7+
35 | * Only dependencies are NetworkX and Matplotlib
36 |
37 | ## Example Usage
38 | To install `emailnetwork`:
39 | ```
40 | pip install emailnetwork
41 | ```
42 |
43 | A sample `.mbox` file is provided to you, but you can also export your own mailbox from your email service provider. If you use Google (Gmail), you can [use the Google Takeout service](https://takeout.google.com/settings/takeout) to export your mail data.
44 |
45 |
46 | ```python
47 | from emailnetwork.extract import MBoxReader
48 | reader = MBoxReader('path-to-mbox.mbox')
49 | print(f'{len(reader)} emails in the sample mbox.')
50 |
51 | # extract a specific email
52 | email = reader.mbox[5]
53 | emailmsg = extract_meta(email)
54 |
55 | # filter emails by certain date
56 | thisyearmails = reader.filter_by_date('>=', '2021-01-05')
57 |
58 | # print email domains of recipients
59 | print(emailmsg.recipients)
60 | print(emailmsg.recipients[0].domain)
61 |
62 | # extract all emails
63 | emails = reader.extract()
64 | ```
65 |
66 | For graph visualization:
67 | ```py
68 | from emailnetwork.extract import MBoxReader
69 | # Read from .mbox
70 | MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox'
71 | reader = MBoxReader(MBOX_PATH)
72 |
73 | # Try the following:
74 | # plot a single directed graph of the email at index 3
75 | plot_single_directed(reader,3)
76 |
77 | # plot a single undirected graph of the email at index 1, show title in plot
78 | plot_single_undirected(reader, 1, showtitle=True)
79 |
80 | # plot a directed graph, optionally specifying a layout style
81 | plot_directed(reader)
82 | plot_directed(reader, 'shell')
83 | # optionally export a .graphml to your working directory for use
84 | # in other network / graphing software
85 | plot_undirected(reader, 'spring', graphml=True)
86 | ```
87 |
88 | #### Email Header Analysis
89 |
90 | To obtain a histogram:
91 |
92 | ```py
93 | from emailnetwork.extract import MBoxReader
94 | reader = MBoxReader('path-to-mbox')
95 | headers = HeaderCounter(reader)
96 | headers.histogram()
97 | # to show only the top 10 headers, set the optional n parameter
98 | # headers.histogram(n=10)
99 | ```
100 | Because `HeaderCounter` is a subclass of Python's `Counter`, you can also perform operations such as `headers.most_common(8)` to get the 8 most-common headers from the `mbox` file.
101 |
102 | If you want to find all email headers with the word "spam" in it (e.g spam score, other antispam mechanism), you can use Python's `filter()` function:
103 | ```python
104 | reader = MBoxReader('path-to-mbox')
105 | headers = HeaderCounter(reader)
106 | spamheaders = list(filter(lambda v: "spam" in v.lower(), headers.keys()))
107 | # return:
108 | # ['X-Spam-Checked-In-Group', 'X-Microsoft-Antispam-PRVS', 'X-Microsoft-Antispam-Untrusted', 'X-Microsoft-Antispam-Message-Info-Original', 'X-Forefront-Antispam-Report-Untrusted', 'x-ms-exchange-antispam-messagedata', 'X-Microsoft-Antispam', 'X-Microsoft-Antispam-Message-Info', 'X-Forefront-Antispam-Report', 'X-Mimecast-Spam-Score', 'x-microsoft-antispam-prvs', 'x-microsoft-antispam', 'x-microsoft-antispam-message-info', 'x-forefront-antispam-report']
109 | ```
110 |
111 | #### Mailbox Summary
112 |
113 | To get a simple barchart on the distribution of email domains in your `.mbox`, you can create a `DomainSummary` object and call the `.plot()` function:
114 |
115 |
116 |
117 | ```python
118 | from emailnetwork.summary import DomainSummary
119 | summary = DomainSummary(reader)
120 | summary.plot()
121 | ```
122 |
123 | You can also return a `Counter()` (a subclass of `dict`) instead of a plot:
124 |
125 | ```python
126 | summary.summary
127 | # return:
128 | # Counter({'supertype.ai': 203, 'hubspot.com': 115, 'gmail.com': 75, 'google.com': 53, 'adcolony.com': 38, 'fbworkmail.com': 35, 'elementor.com': 29, 'payoneer.com': 15, 'gogame.net': 14, 'zoomd.com': 13, 'am.atlassian.com': 10, 'theafternaut.com': 6, 'alegrium.com': 5, 'accounts.google.com': 4, 'e.atlassian.com': 4, 'tnbaura.com': 4, 'support.lazada.sg': 4, '3kraters.com': 3, 'go.facebookmail.com': 2, 'docs.google.com': 2, 'mail.hellosign.com': 2, 'algorit.ma': 2, 'supertype.atlassian.net': 2, 'ucdconnect.ie': 2, 'mc.facebookmail.com': 1, 'inplacesoftware.com': 1, 'aura.co': 1, 'atlassian.com': 1, 'greenhouse.io': 1})
129 | ```
130 | ##### Why Python 3.7+?
131 | Python 3.7+ is required because the package is written to take advantage of many features of Python 3.7 and above.
132 |
133 | Examples of features that were used extensively in the creation of this package:
134 | * [Dataclasses, new in Python 3.7](https://www.youtube.com/watch?v=sH_jLQvnpBo)
135 | * [Insertion-ordered Dictionaries, new in Python 3.7](https://www.youtube.com/watch?v=h-DBWPjpqWY)
136 | * [Typing (Type hints), new in Python 3.5](https://docs.python.org/3/library/typing.html)
137 | * [Formatted string literal, new in Python 3.6](https://docs.python.org/3/reference/lexical_analysis.html#f-strings)
138 | ## Testing
139 | Git clone, and run `nosetests`. You can also run nosetests with coverage:
140 | ```
141 | nosetests --with-coverage --cover-package=emailnetwork
142 |
143 | .........
144 | Name Stmts Miss Cover
145 | ----------------------------------------------
146 | emailnetwork/__init__.py 2 0 100%
147 | emailnetwork/emails.py 55 11 80%
148 | emailnetwork/extract.py 54 11 80%
149 | emailnetwork/graph.py 120 82 32%
150 | emailnetwork/network.py 13 7 46%
151 | emailnetwork/utils.py 32 17 47%
152 | emailnetwork/version.py 1 0 100%
153 | ----------------------------------------------
154 | TOTAL 277 128 54%
155 | ----------------------------------------------------------------------
156 | Ran 9 tests in 3.226s
157 |
158 | OK
159 | ```
160 |
161 | All tests are located in the `/tests/` directory.
162 |
163 |
164 | ## Authors and Copyright
165 |
166 | Samuel Chan, Supertype [Supertype](https://supertype.ai)
167 |
168 | Vincentius Christopher Calvin, Supertype [https://supertype.ai](https://supertype.ai)
169 |
170 | If you find the code useful in your project, please link to this repository in your citation.
171 |
172 | ##### The MIT License (MIT)
173 |
174 | Copyright (c) 2021 Supertype Pte Ltd
175 |
176 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
177 |
178 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
179 |
180 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
181 | Platform: UNKNOWN
182 | Classifier: License :: OSI Approved :: MIT License
183 | Classifier: Programming Language :: Python :: 3
184 | Classifier: Programming Language :: Python :: 3.7
185 | Classifier: Operating System :: OS Independent
186 | Requires-Python: >=3.7
187 | Description-Content-Type: text/markdown
188 |
--------------------------------------------------------------------------------
/emailnetwork.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | MANIFEST.in
2 | README.md
3 | setup.py
4 | emailnetwork/__init__.py
5 | emailnetwork/emails.py
6 | emailnetwork/extract.py
7 | emailnetwork/graph.py
8 | emailnetwork/header.py
9 | emailnetwork/network.py
10 | emailnetwork/summary.py
11 | emailnetwork/utils.py
12 | emailnetwork/version.py
13 | emailnetwork.egg-info/PKG-INFO
14 | emailnetwork.egg-info/SOURCES.txt
15 | emailnetwork.egg-info/dependency_links.txt
16 | emailnetwork.egg-info/not-zip-safe
17 | emailnetwork.egg-info/requires.txt
18 | emailnetwork.egg-info/top_level.txt
19 | emailnetwork/tests/__init__.py
20 | emailnetwork/tests/test.mbox
21 | emailnetwork/tests/test_extract.py
22 | emailnetwork/tests/test_graph.py
23 | emailnetwork/tests/test_summary.py
24 | emailnetwork/tests/test_utils.py
--------------------------------------------------------------------------------
/emailnetwork.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/emailnetwork.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/emailnetwork.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | networkx
3 |
--------------------------------------------------------------------------------
/emailnetwork.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | emailnetwork
2 |
--------------------------------------------------------------------------------
/emailnetwork/.coverage:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/emailnetwork/.coverage
--------------------------------------------------------------------------------
/emailnetwork/__init__.py:
--------------------------------------------------------------------------------
1 | # Email Network
2 | # Network graphing utilities for email/mailbox (.mbox) data
3 | #
4 | # Author: Samuel Chan
5 | # Credits: Benjamin Bengfort
6 | # Created: Jan 2021
7 | #
8 | # Copyright (c) 2021 Supertype Pte Ltd
9 | #
10 |
11 | """
12 | Network graphing utilities for email/mailbox (.mbox) data
13 | """
14 |
15 | ##########################################################################
16 | ## Package Version
17 | ##########################################################################
18 |
19 | from .version import __version__
20 |
--------------------------------------------------------------------------------
/emailnetwork/emails.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from datetime import datetime
3 | from emailnetwork.utils import parse_date
4 |
@dataclass
class EmailMeta:
    """
    Metadata of a single email: sender, recipients, subject and date.

    Comparison semantics are deliberately date-based: `==`, `>=` and `<=`
    compare the calendar day of `self.date` with the calendar day of a
    `datetime` operand, ignoring the time of day. For any other operand type
    the methods return `NotImplemented` so Python applies its normal fallback
    (identity for `==`, `TypeError` for ordering) instead of silently
    yielding `None`.

    Also Refer to:
    https://www.iana.org/assignments/message-headers/message-headers.xhtml

    """

    # NOTE: the annotations below are kept for interface compatibility, but
    # `sender` must expose a `.domain` attribute (see `__post_init__`) and
    # `date` is replaced by whatever `parse_date` returns.
    sender: str
    subject: str
    date: str
    recipients: list
    cc: list
    origin_domain: str=None

    def __post_init__(self):
        # Derive the domain from the sender and normalise the raw date value.
        self.origin_domain = self.sender.domain
        self.date = parse_date(self.date)

    def __eq__(self, targetdate):
        """Return True when the email was dated on the same day as `targetdate`."""
        if isinstance(targetdate, datetime):
            return self.date.date() == targetdate.date()
        return NotImplemented

    def __ge__(self, targetdate):
        """Return True when the email was dated on or after the day of `targetdate`."""
        if isinstance(targetdate, datetime):
            return self.date.date() >= targetdate.date()
        return NotImplemented

    def __le__(self, targetdate):
        """Return True when the email was dated on or before the day of `targetdate`."""
        if isinstance(targetdate, datetime):
            return self.date.date() <= targetdate.date()
        return NotImplemented
35 |
@dataclass
class EmailAddress:
    """
    A display name together with a lower-cased email address.

    Despite the parameter name, `string` is a two-item `(name, email)` pair,
    which is the shape of each entry returned by `email.utils.getaddresses`.
    """

    name: str=None
    email: str=None

    def __init__(self, string):
        self.name, self.email = string
        # Addresses are compared case-insensitively throughout the package.
        self.email = self.email.lower()

    def __getitem__(self, index=slice(None)):
        """
        Index into the `(name, email)` pair.

        `addr[0]` is the name and `addr[1]` the email. Calling
        `addr.__getitem__()` without an argument still returns the full
        `(name, email)` tuple, as it did before an index was accepted.
        """
        return (self.name, self.email)[index]

    @property
    def domain(self):
        """Text after the last '@' of the address, or None when that text is empty."""
        return self.email.split('@')[-1] or None
51 |
@dataclass
class EmailBody:
    # Subject line of the message; None when not supplied.
    subject: str = None
    # Body text of the message; None when not supplied.
    body: str = None
--------------------------------------------------------------------------------
/emailnetwork/extract.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from email.utils import getaddresses
3 | from mailbox import mbox
4 |
5 | from mailbox import mboxMessage
6 |
7 | from emailnetwork.utils import clean_subject, clean_body
8 | from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody
9 | from emailnetwork.summary import DomainSummary
10 |
11 | from emailnetwork.header import HeaderCounter
12 |
13 |
def extract_meta(email):
    """
    Build an `EmailMeta` from a parsed message.

    Recipients are gathered from the 'To' and 'Resent-To' headers, carbon
    copies from 'Cc' and 'Resent-Cc'. The sender is the first address found
    in the 'From' header(s). An empty cleaned subject is stored as None.
    """
    to_headers = email.get_all('To', []) + email.get_all('Resent-To', [])
    cc_headers = email.get_all('Cc', []) + email.get_all('Resent-Cc', [])

    from_pair = getaddresses(email.get_all('From'))[0]
    sender = EmailAddress(from_pair)
    recipients = [EmailAddress(pair) for pair in getaddresses(to_headers)]
    carbon_copies = [EmailAddress(pair) for pair in getaddresses(cc_headers)]
    subject = clean_subject(email['Subject']) or None

    return EmailMeta(
        sender=sender,
        recipients=recipients,
        cc=carbon_copies,
        subject=subject,
        date=email['Date'],
    )
26 |
27 |
def extract_body(email):
    """
    Build an `EmailBody` from a parsed message.

    The subject is passed through `clean_subject` (empty result becomes None)
    and the body through `clean_body`.
    """
    cleaned_subject = clean_subject(email['Subject']) or None
    cleaned_body = clean_body(email)
    return EmailBody(subject=cleaned_subject, body=cleaned_body)
34 |
35 |
class MBoxReader(object):
    """ A class that wraps python's `mailbox.mbox` to provide additional
    functionalities such as length, date filtering and parsing. A key component of
    many `emailnetwork`'s operations.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')

    Args:
        path: Path to an `.mbox` file; it is handed to `mailbox.mbox` as is.
    """

    def __init__(self, path) -> None:
        super().__init__()
        self.path = path
        self.mbox = mbox(path)

    def __iter__(self):
        """Yield every raw message of the underlying mailbox."""
        for msg in self.mbox:
            yield msg

    def __len__(self):
        return self.count()

    def count(self):
        """
        Count the number of emails in the mbox instance.
        Helper function to implement __len__

        Uses the mailbox's own length so that an empty mailbox yields 0
        instead of raising IndexError on a missing last key.
        """
        return len(self.mbox)

    def extract(self):
        """
        Extract the meta data from the Mbox instance

        Yields one `EmailMeta` per message. Messages whose metadata cannot be
        extracted are reported on stdout and skipped (best effort).
        """
        for email in self:
            try:
                emailmeta = extract_meta(email)
                if emailmeta is not None:
                    yield emailmeta

            except Exception as e:
                print(e)
                continue

    def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="):
        """
        Return the `EmailMeta` of every message matching the given filters.

        Args:
            emailaddress (str, optional): Keep only emails whose sender or
                direct recipients include this address. Defaults to None (no
                address filter).
            datestring (str, optional): Date in `YYYY-MM-DD` format to compare
                each email against. Defaults to None (no date filter).
            dateoperator (str, optional): One of '>=', '==', '<='; how each
                email is compared with `datestring`. Defaults to '=='.

        Returns:
            list: The matching `EmailMeta` objects, in mailbox order.

        Raises:
            ValueError: If `emailaddress` is not a string, `dateoperator` is
                not one of the supported operators, or `datestring` is not in
                `YYYY-MM-DD` format.
        """
        if emailaddress is not None and not isinstance(emailaddress, str):
            raise ValueError(
                "Please use a valid string representing an email address")

        if dateoperator not in ['>=', '==', '<=']:
            raise ValueError("Please use one of ['>=', '==', '<=']")

        targetdate = None
        if datestring is not None:
            try:
                targetdate = datetime.strptime(datestring, "%Y-%m-%d")
            except ValueError as err:
                # Raise like the other validations do, rather than returning
                # an error string where callers expect a list.
                raise ValueError(
                    "Please use the ISO format for comparison: YYYY-MM-DD") from err

        # The comparisons are dispatched to EmailMeta's rich-comparison methods.
        compare = {
            '>=': lambda meta, target: meta >= target,
            '==': lambda meta, target: meta == target,
            '<=': lambda meta, target: meta <= target,
        }[dateoperator]

        val = []
        for email in self.mbox:
            emailmeta = extract_meta(email)

            if emailaddress is not None:
                checkers = [emailmeta.sender.email] + [recipient.email for recipient in emailmeta.recipients]
                if emailaddress not in checkers:
                    continue

            if targetdate is not None and not compare(emailmeta, targetdate):
                continue

            val.append(emailmeta)

        return val
137 |
138 |
if __name__ == '__main__':
    # Ad-hoc manual run against a local mailbox; the path below only exists
    # on the original author's machine.
    reader = MBoxReader('/Users/samuel/EmailNetwork/samuel-supertype.mbox')
    # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')

    # Header names seen in the mailbox, and the subset mentioning "spam".
    headers = HeaderCounter(reader)
    k = headers.keys()
    spamheaders = [header_name for header_name in k if "spam" in header_name.lower()]

    summary = DomainSummary(reader)

    # Metadata and body of the second message, then a same-day date filter.
    email = reader.mbox[1]
    emailmsg = extract_meta(email)
    emailbody = extract_body(email)
    mails = reader.filter_emails(datestring='2020-12-31', dateoperator="==")
--------------------------------------------------------------------------------
/emailnetwork/graph.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | import math
3 | import os
4 | import uuid
5 | import matplotlib.pyplot as plt
6 | import networkx as nx
7 | import textwrap
8 |
9 | from emailnetwork.extract import MBoxReader, extract_meta
10 | from emailnetwork.network import PeopleCombination
11 |
12 |
def plot_single_directed(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot a directed graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with a email subject and date as title.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_single_directed(reader, 300) plots the email with key 300 from the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): `id` of the email in the `MBoxReader`. Defaults to False,
            in which case the last email of the mailbox is plotted. `0` selects the first email.
        showtitle (bool, optional): If `True`, render the plot with a email subject and date as title. Defaults to False.
    """

    # `0` is a valid key, so test for the "not supplied" sentinels explicitly
    # instead of relying on truthiness.
    if id is False or id is None:
        email = reader.mbox[len(reader)-1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # The subject may be None when the message has no usable subject.
    subject = textwrap.fill(emailmsg.subject or '', 40)
    # Fall back to the local part of the address when there is no display name.
    sender = emailmsg.sender.name or emailmsg.sender.email.split('@')[0]

    plt.figure(figsize=(9, 6))
    G = nx.DiGraph(name='Single Email Flow')

    # Direct recipients: heavier, purple edges labelled with the subject.
    for recipient in emailmsg.recipients:
        G.add_edge(sender, recipient.name or recipient.email,
                    message=subject, color='darkorchid', weight=3)

    # Carbon copies: lighter edges labelled 'cc'.
    for cc in emailmsg.cc:
        G.add_edge(sender, cc.name or cc.email,
                    message='cc', color='lightsteelblue', weight=2)

    colors = list(nx.get_edge_attributes(G,'color').values())
    weights = list(nx.get_edge_attributes(G,'weight').values())
    edge_labels = nx.get_edge_attributes(G, 'message')

    pos = nx.planar_layout(G)

    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6, label_pos=0.5)
    nx.draw_planar(G,node_size=0, alpha=1, edge_color=colors, width=weights, font_size=8, font_weight='bold', with_labels=True, verticalalignment='bottom')

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        plt.title(subject + '\n Delivery date: ' + emailmsg.date.strftime("%m/%d/%Y"), fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
67 |
def plot_single_undirected(reader:MBoxReader, id:int=False, showtitle:bool=False) -> None:
    """
    Plot an undirected social network graph from a single email, as determined by `id`.
    If `showtitle` is `True`, render the plot with a email subject and date as title.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_single_undirected(reader, 300) plots the email with key 300 from the mbox
    Args:
        reader (MBoxReader): A `MBoxReader` object
        id (int, optional): `id` of the email in the `MBoxReader`. Defaults to False,
            in which case the last email of the mailbox is plotted. `0` selects the first email.
        showtitle (bool, optional): If `True`, render the plot with a email subject and date as title. Defaults to False.
    """

    # `0` is a valid key, so test for the "not supplied" sentinels explicitly
    # instead of relying on truthiness.
    if id is False or id is None:
        email = reader.mbox[len(reader)-1]
    else:
        email = reader.mbox[id]
    emailmsg = extract_meta(email)

    # The subject may be None when the message has no usable subject.
    subject = textwrap.fill(emailmsg.subject or '', 40)
    ng = PeopleCombination(emailmsg)
    G = nx.Graph(
        name='Single Email Social Network')

    # Count every unordered pair of participants of this email.
    counter = Counter()
    for combo in ng.combo:
        counter[combo] += 1

    total_combos = sum(counter.values())
    by_freq = {k: v/total_combos for k, v in counter.most_common()}

    for rel in counter.keys():
        G.add_edge(*rel, weight=by_freq[rel], count=counter[rel])

    # An email with a single participant produces no pairs and therefore an
    # empty graph; let spring_layout use its default spacing in that case
    # instead of dividing by zero.
    order = G.order()
    k = 1/math.sqrt(order) * 2 if order else None
    pos = nx.spring_layout(G, k=k)
    deg = [v for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.90)
    nx.draw_networkx_edges(G, pos, edge_color="steelblue", width=1.0, style='dashed', alpha=0.75)
    nx.draw_networkx_labels(G, pos, {n: n for n in G.nodes}, font_size=8, verticalalignment="bottom")

    if showtitle:
        font = {"fontname": "Helvetica", "color": "k", "fontweight": "bold", "fontsize": 8}
        plt.title(subject + '\n Delivery date: ' + emailmsg.date.strftime("%m/%d/%Y"), fontdict=font)

    plt.tight_layout(pad=0.5)
    plt.axis('off')
    plt.show()
118 |
def plot_directed(reader:MBoxReader, layout:str='shell', graphml:bool=False) -> None:
    """
    Plot a directed social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_directed(reader)
    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): Can be one of 'shell', 'spring' or 'spiral'. Any other value
            falls back to the spiral layout. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    def _label(address):
        # Prefer the display name; fall back to the local part of the address.
        return address.name if address.name else address.email.split('@')[0]

    emails = reader.extract()
    plt.figure(figsize=(12,12))
    G = nx.MultiDiGraph(name='Email Social Network')
    for email in emails:
        source_addr = _label(email.sender)
        # Keep the edge attribute a string even when the subject is None, so
        # that the optional GraphML export below can serialise it.
        subject = email.subject or ''

        for recipient in email.recipients + email.cc:
            G.add_edge(source_addr, _label(recipient), message=subject)

    if graphml:
        # Short random suffix so repeated exports do not overwrite each other.
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        pos = nx.spring_layout(G)
    else:
        pos = nx.spiral_layout(G)
    nx.draw(G, pos, node_size=0, alpha=0.4, edge_color='cadetblue', font_size=7, with_labels=True)
    ax = plt.gca()
    ax.margins(0.08)
    plt.show()
159 |
def plot_undirected(reader:MBoxReader, layout:str='shell', graphml:bool=False):
    """Plot an undirected social network graph from the entire `mbox`, supplied by `MBoxReader`.
    `layout` determines the underlying `NetworkX` layout.

    Usage:
        reader = MBoxReader('path-to-mbox.mbox')
        plot_undirected(reader)

    Args:
        reader (MBoxReader): A `MBoxReader` object
        layout (str, optional): Can be one of 'shell', 'spring' or 'spiral'. Any other value
            falls back to the spiral layout. Defaults to 'shell'.
        graphml (bool, optional): Determines if a .graphml file is exported to the working directory. Defaults to False.
    """

    emails = reader.extract()
    G = nx.Graph(name='Email Social Network')
    plt.figure(figsize=(12,12))

    # Count how often each unordered pair of addresses shares an email.
    counter = Counter()
    for email in emails:
        counter.update(PeopleCombination(email).combo)

    total_combos = sum(counter.values())
    by_freq = {k: v/total_combos for k, v in counter.most_common()}
    for rel in counter.keys():
        G.add_edge(*rel, weight=by_freq[rel], count=counter[rel])

    if graphml:
        # Short random suffix so repeated exports do not overwrite each other.
        fileName = f'network-{str(uuid.uuid4().hex)[:8]}.graphml'
        nx.write_graphml(G, fileName)
        print(f"Graphml exported as {fileName}")

    if layout == 'shell':
        pos = nx.shell_layout(G)
    elif layout == 'spring':
        # A mailbox without any pair of participants gives an empty graph;
        # let spring_layout use its default spacing instead of dividing by zero.
        order = G.order()
        k = 1/math.sqrt(order) * 2 if order else None
        pos = nx.spring_layout(G, k=k)
    else:
        pos = nx.spiral_layout(G)

    # Node size scales with the number of distinct contacts.
    deg = [v*50 for _, v in G.degree()]
    nx.draw_networkx_nodes(G, pos, node_size=deg, linewidths=1.0, alpha=0.60)
    nx.draw_networkx_edges(G, pos, width=1.0, style='dashed', edge_color='cadetblue', alpha=0.6)
    nx.draw_networkx_labels(G, pos, {n: n.split('@')[0] for n in G.nodes}, font_size=8, font_color='darkorchid')

    plt.axis('off')
    plt.show()
209 |
if __name__ == '__main__':
    # Demo run against the mailbox bundled with the test suite. The path is
    # resolved from this file's absolute location so the script works from any
    # working directory and on any machine.
    MBOX_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tests', 'test.mbox')

    reader = MBoxReader(MBOX_PATH)
    # plot_single_directed(reader,300)
    # plot_single_directed(reader, 1, True)
    # plot_directed(reader)
    # plot_directed(reader, "shell")
    plot_undirected(reader, 'spring')
--------------------------------------------------------------------------------
/emailnetwork/header.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from email.header import decode_header
3 | from emailnetwork.utils import clean_subject
4 |
5 |
class HeaderCounter(Counter):
    """A `Counter` of the header names found in a collection of emails.

    Each key is a header name and each value the number of times that header
    occurs across all messages yielded by the reader.

    Args:
        reader: An iterable of messages; every message must provide `.keys()`
            returning its header names (e.g. an `MBoxReader`).
    """

    def __init__(self, reader):
        super().__init__()
        self.build_from(reader)

    def __str__(self):
        return f'{self.most_common()}'

    def build_from(self, reader):
        """Add the header names of every message in `reader` and return `self`."""
        for email in reader:
            for k in email.keys():
                self[k] += 1

        return self

    def histogram(self, n=25):
        """
        Show a horizontal bar chart of the first `n` headers (in the order
        they were first encountered) and their frequencies.

        Args:
            n (int, optional): Maximum number of headers to draw. Defaults to 25.
        """
        from matplotlib import pyplot as plt
        plt.style.use('fivethirtyeight')
        k = list(self.keys())[:n]
        v = list(self.values())[:n]
        fig = plt.figure(figsize=(7, 10))
        ax = fig.add_subplot(111)
        # Size the axis from the data actually available: there may be fewer
        # than `n` distinct headers.
        y_pos = list(range(len(k)))
        ax.barh(y_pos, v, color='plum')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(k)
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_title('Email Header Analysis')
        plt.tight_layout()
        plt.show()
42 |
43 |
if __name__ == '__main__':
    # Ad-hoc manual run: print the decoded value of every spam-related header.
    from emailnetwork.extract import MBoxReader
    # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')
    reader = MBoxReader('/Users/samuel/EmailNetwork/samuel-supertype.mbox')
    headers = HeaderCounter(reader)

    k = headers.keys()

    # Header names that mention "spam", case-insensitively.
    containspam = [header_name for header_name in k if "spam" in header_name.lower()]

    for email in reader:
        for key in email.keys():
            if key not in containspam:
                continue
            print({key: decode_header(email[key])})
58 |
--------------------------------------------------------------------------------
/emailnetwork/network.py:
--------------------------------------------------------------------------------
1 | from itertools import combinations
2 |
class PeopleCombination:
    """
    The distinct email addresses taking part in one email, and every
    unordered pair that can be formed from them.

    Usage:
        email = EmailMeta(...)
        PeopleCombination(email)
    """

    def __init__(self, email):
        participants = [email.sender] + email.recipients + email.cc
        # Skip missing entries and blank addresses, de-duplicate, then sort
        # so that pairs come out in a stable order.
        unique_addresses = {
            person.email
            for person in participants
            if person is not None and person.email
        }
        self.people = sorted(unique_addresses)

    def __repr__(self):
        return str(self.people)

    @property
    def combo(self):
        """Lazily yield each 2-combination of `self.people`."""
        yield from combinations(self.people, 2)
24 |
--------------------------------------------------------------------------------
/emailnetwork/summary.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from collections import Counter
3 | from datetime import datetime
4 |
5 |
class DomainSummary:
    """
    Frequency of sender domains across all emails of a reader.

    Args:
        reader: Object whose `extract()` returns an iterable of emails that
            expose an `origin_domain` attribute (e.g. an `MBoxReader`).

    Attributes:
        emailmetas: The iterable returned by `reader.extract()`. It is consumed
            while `summary` is built in `__init__`.
        summary (Counter): Maps each sender domain to its number of emails.
    """

    def __init__(self, reader):
        self.emailmetas = reader.extract()
        self.summary = self.get_summary()

    def get_summary(self):
        """Count the emails per `origin_domain` and return the counts as a `Counter`."""
        domains = Counter()
        for email in self.emailmetas:
            domains[email.origin_domain] += 1

        return domains

    def plot(self):
        """Show a horizontal bar chart of the number of emails per sender domain."""

        # The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer
        # matplotlib releases; try the historical name first, then the new one.
        for style_name in ("seaborn", "seaborn-v0_8"):
            try:
                plt.style.use(style_name)
                break
            except OSError:
                continue

        plt.figure(figsize=(10, 5))

        domains = list(self.summary.keys())
        freqs = list(self.summary.values())

        plt.barh(domains, freqs, color="cadetblue")
        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")
        plt.title(
            "Sender's Domain Occurences",
            fontdict={
                "fontname": "Helvetica",
                "color": "k",
                "fontweight": "bold",
                "fontsize": 12,
            },
        )
        plt.show()

    def __str__(self):
        """One `domain: count` line per domain, padded into aligned columns."""
        lines = []
        for domain, count in self.summary.items():
            # str() keeps formatting from failing when a domain is None.
            lines.append(f"{str(domain):<25s}: {str(count):<3s}")
        return "\n".join(lines)
46 |
47 |
class IncomingOutgoingSummary:
    """
    Monthly counts of incoming and outgoing emails for the mailbox owner.

    The owner's address is guessed by `get_user_email`; an email counts as
    "Outgoing" when its sender equals that address and as "Incoming" otherwise.

    Args:
        reader: Object exposing `.mbox` (an iterable of raw messages) and
            `.extract()` (an iterable of email metadata), e.g. an `MBoxReader`.
    """

    def __init__(self, reader):
        self.reader = reader
        self.emailmetas = reader.extract()
        self.user_email = self.get_user_email()
        self.summary = self.get_summary()

    def get_user_email(self):
        """
        Guess the mailbox owner's address.

        Looks at up to the first ten messages and returns the address that
        appears most often as sender or direct recipient (the first one seen
        wins a tie). Returns None when no address could be collected, e.g.
        for an empty mailbox.
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import with emailnetwork.extract).
        from emailnetwork.extract import extract_meta

        email_addresses = {}
        # zip() with range(10) caps the sample at ten messages without
        # failing on mailboxes that contain fewer.
        for _, email in zip(range(10), self.reader.mbox):
            emailmsg = extract_meta(email)
            for recipient in emailmsg.recipients:
                email_addresses[recipient.email] = (
                    email_addresses.get(recipient.email, 0) + 1
                )
            email_addresses[emailmsg.sender.email] = (
                email_addresses.get(emailmsg.sender.email, 0) + 1
            )

        if not email_addresses:
            return None
        return max(email_addresses.items(), key=lambda item: item[1])[0]

    def get_summary(self):
        """
        Return a dict mapping "<Month> <Year>" labels, in chronological order,
        to `{"Incoming": int, "Outgoing": int}` counters.
        """
        date = {}
        for email in self.emailmetas:
            month = email.date.strftime("%B %Y")
            counts = date.setdefault(month, {"Incoming": 0, "Outgoing": 0})
            if email.sender.email == self.user_email:
                counts["Outgoing"] += 1
            else:
                counts["Incoming"] += 1

        # Sort by the actual month rather than alphabetically by label.
        ordered = sorted(
            date.items(), key=lambda items: datetime.strptime(items[0], "%B %Y")
        )
        return dict(ordered)

    def plot(self):
        """Show a stacked bar chart of incoming and outgoing emails per month."""

        # The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer
        # matplotlib releases; try the historical name first, then the new one.
        for style_name in ("seaborn", "seaborn-v0_8"):
            try:
                plt.style.use(style_name)
                break
            except OSError:
                continue

        dates = list(self.summary.keys())
        incoming = [counts["Incoming"] for counts in self.summary.values()]
        outgoing = [counts["Outgoing"] for counts in self.summary.values()]

        fig, ax = plt.subplots()

        # Outgoing bars are stacked on top of the incoming ones.
        ax.bar(dates, incoming, 0.4, label="Incoming", color="cadetblue")
        ax.bar(dates, outgoing, 0.4, bottom=incoming, label="Outgoing")

        for label in plt.gca().get_xticklabels():
            label.set_color("darkorchid")
        for label in plt.gca().get_yticklabels():
            label.set_color("darkorchid")

        plt.xticks(rotation=45)

        ax.set_ylabel("Counts")
        ax.set_title(
            "Number of Incoming and Outgoing Emails per Month",
            fontdict={
                "fontname": "Helvetica",
                "color": "k",
                "fontweight": "bold",
                "fontsize": 12,
            },
        )
        ax.legend()

        plt.show()
117 |
118 |
if __name__ == "__main__":
    # Ad-hoc manual run; the mailbox path only exists on the original
    # author's machine.
    from emailnetwork.extract import MBoxReader

    reader = MBoxReader("/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox")

    # Build both summaries from the same reader.
    summary = DomainSummary(reader)
    summary_2 = IncomingOutgoingSummary(reader)
127 |
--------------------------------------------------------------------------------
/emailnetwork/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlyphantom/emailnetwork/24b323e7c5159b5a02a7933b5f609664689a7316/emailnetwork/tests/__init__.py
--------------------------------------------------------------------------------
/emailnetwork/tests/test_extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | from unittest import TestCase
4 |
5 | from emailnetwork.extract import MBoxReader, extract_meta, extract_body
6 | from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody
7 |
8 | """
9 | Demo mbox is generated from Benjamin Bengfort's Tribe tool
10 | with person names modified for anonymity
11 | """
12 | MBOX_PATH = f"{os.path.dirname(__file__)}/test.mbox"
13 |
14 |
class TestExtract(TestCase):
    """Tests for `MBoxReader`, `extract_meta` and `extract_body` on the bundled demo mbox."""

    def setUp(self):
        self.reader = MBoxReader(MBOX_PATH)
        self.emails = self.reader.extract()
        self.emailAdress = self.reader.mbox[1]

    def tearDown(self):
        self.reader = None

    def _assert_involves(self, email, address):
        # The address must be the sender or one of the direct recipients.
        checkers = [email.sender.email] + [
            recipient.email for recipient in email.recipients
        ]
        self.assertTrue(address in checkers)

    def test_read_mbox(self):
        self.assertIsInstance(self.reader, MBoxReader)

    def test_length_mbox(self):
        self.assertEqual(len(self.reader), 140)

    def test_extract(self):
        firstemail = next(self.emails)
        self.assertIsInstance(firstemail, EmailMeta)
        self.assertIsInstance(firstemail.subject, str)
        self.assertIsInstance(firstemail.date, datetime.datetime)

        # The remaining emails of the generator.
        for msg in self.emails:
            self.assertGreaterEqual(len(msg.recipients), 1)
            self.assertIsInstance(msg.cc, list)

    def test_email_address(self):
        sender = next(self.emails).sender
        self.assertIsInstance(sender, EmailAddress)
        self.assertIsInstance(sender.name, str)
        self.assertIsInstance(sender.email, str)

        first_recipient = extract_meta(self.emailAdress).recipients[0]
        self.assertIsInstance(first_recipient.name, str)
        self.assertIsInstance(first_recipient.email, str)

    def test_filter_emails(self):
        address = "samuelchan@gmail.com"
        start_of_2020 = datetime.datetime(2020, 1, 1)
        april_17 = datetime.date(2020, 4, 17)

        # Date-only filters.
        newmails = self.reader.filter_emails(datestring="2020-01-01", dateoperator=">=")
        self.assertEqual(len(newmails), 4)

        for email in newmails:
            self.assertGreater(email.date, start_of_2020)
            self.assertLess(email.date, datetime.datetime.now())

        oldmails = self.reader.filter_emails(datestring="2019-12-31", dateoperator="<=")
        self.assertEqual(len(oldmails), 136)

        exactmails = self.reader.filter_emails(
            datestring="2020-04-17", dateoperator="=="
        )
        self.assertEqual(len(exactmails), 1)
        self.assertEqual(exactmails[0].date.date(), april_17)

        # Address-only filter.
        namedmails = self.reader.filter_emails(emailaddress=address)
        for email in namedmails:
            self._assert_involves(email, address)

        # Address combined with each date operator.
        fullfilteredmails = self.reader.filter_emails(
            emailaddress=address,
            datestring="2020-01-01",
            dateoperator=">=",
        )
        for email in fullfilteredmails:
            self._assert_involves(email, address)
            self.assertGreater(email.date, start_of_2020)

        fullfilteredmailsequal = self.reader.filter_emails(
            emailaddress=address,
            datestring="2020-04-17",
            dateoperator="==",
        )
        for email in fullfilteredmailsequal:
            self._assert_involves(email, address)
            self.assertEqual(fullfilteredmailsequal[0].date.date(), april_17)

        fullfilteremailless = self.reader.filter_emails(
            emailaddress=address,
            datestring="2019-12-31",
            dateoperator="<=",
        )
        for email in fullfilteremailless:
            self._assert_involves(email, address)

        # No filter at all: every result is still an EmailMeta.
        mailswithoutfilter = self.reader.filter_emails()
        for email in mailswithoutfilter:
            self.assertIsInstance(email, EmailMeta)

    # also need tests to fail with expected exception when datetime operator not in [==, <=, >=], emailaddress and datetime in wrong format.
    def test_afunction_throws_exception(self):
        with self.assertRaises(ValueError):
            self.reader.filter_emails(20, "2019-12-31", "<")

    def test_extract_meta_single(self):
        for email in self.reader.mbox:
            self.assertIsInstance(email["Subject"], (bytes, str))
            emailmsg = extract_meta(email)
            self.assertIsInstance(emailmsg, EmailMeta)
            self.assertIsInstance(emailmsg.origin_domain, str)
            self.assertIsInstance(emailmsg.subject, str)

    def test_extract_body_single(self):
        for email in self.reader.mbox:
            emailbody = extract_body(email)
            self.assertIsInstance(emailbody, EmailBody)
            self.assertIsInstance(emailbody.subject, str)
            self.assertIsInstance(emailbody.body, str)
139 |
--------------------------------------------------------------------------------
/emailnetwork/tests/test_graph.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase, mock
3 |
4 | from emailnetwork.extract import MBoxReader
5 | # from emailnetwork.graph import plot_single_email
6 | import emailnetwork.graph as graph
7 |
8 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
9 |
10 |
class TestGraph(TestCase):
    """Tests for the plotting helpers in `emailnetwork.graph`, with matplotlib mocked out."""

    def setUp(self):
        self.reader = MBoxReader(MBOX_PATH)
        self.emails = self.reader.extract()
        self.layout = ['shell', 'spring', 'spiral']

    def test_single_graph(self):
        # TODO: to be implemented later
        pass

    def test_plot_single_directed(self):
        expected_font = {'fontname': 'Helvetica', 'color': 'k', 'fontweight': 'bold', 'fontsize': 8}
        with mock.patch(f"{__name__}.graph.plt") as mock_plt:
            graph.plot_single_directed(self.reader, 1, True)
            mock_plt.title.assert_called_once_with(
                "Three tips to get the most out of Gmail\n Delivery date: 04/17/2020",
                fontdict=expected_font,
            )
            assert mock_plt.figure.called

    def test_plot_single_undirected(self):
        expected_font = {'fontname': 'Helvetica', 'color': 'k', 'fontweight': 'bold', 'fontsize': 8}
        plt_target = f"{__name__}.graph.plt"
        nx_target = f"{__name__}.graph.nx"
        with mock.patch(plt_target) as mock_plt, mock.patch(nx_target) as mock_nx:
            graph.plot_single_undirected(self.reader, 2, True)
            mock_plt.title.assert_called_once_with(
                "Stay more organized with Gmail's inbox\n Delivery date: 08/13/2020",
                fontdict=expected_font,
            )
            mock_nx.Graph.assert_called_once_with(name='Single Email Social Network')

    def test_plot_directed(self):
        # Every supported layout must reach the figure creation.
        with mock.patch(f"{__name__}.graph.plt") as mock_plt:
            for layout_name in self.layout:
                graph.plot_directed(self.reader, layout_name)
                assert mock_plt.figure.called

    def test_plot_undirected(self):
        # Every supported layout must reach the figure creation.
        with mock.patch(f"{__name__}.graph.plt") as mock_plt:
            for layout_name in self.layout:
                graph.plot_undirected(self.reader, layout_name)
                assert mock_plt.figure.called
46 |
47 |
--------------------------------------------------------------------------------
/emailnetwork/tests/test_summary.py:
--------------------------------------------------------------------------------
1 | import os
2 | import matplotlib.pyplot as plt
3 | from unittest import TestCase, mock
4 | from collections import Counter
5 |
6 | from emailnetwork.extract import MBoxReader
7 | from emailnetwork.summary import DomainSummary, IncomingOutgoingSummary
8 | from emailnetwork.header import HeaderCounter
9 |
10 | import emailnetwork.summary as summary
11 |
12 | MBOX_PATH = f"{os.path.dirname(__file__)}/test.mbox"
13 |
14 |
15 | class TestSummary(TestCase):
16 | def setUp(self):
17 | self.reader = MBoxReader(MBOX_PATH)
18 | self.domain_summary = DomainSummary(self.reader)
19 | self.incoming_outgoing_summary = IncomingOutgoingSummary(self.reader)
20 | self.headers = HeaderCounter(self.reader)
21 |
22 | def tearDown(self):
23 | self.domain_summary = None
24 | self.incoming_outgoing_summary = None
25 |
26 | def test_summary_instance(self):
27 | self.assertIsInstance(self.domain_summary, DomainSummary)
28 | self.assertIsInstance(self.domain_summary.summary, Counter)
29 | self.assertIsInstance(self.incoming_outgoing_summary, IncomingOutgoingSummary)
30 | self.assertIsInstance(self.incoming_outgoing_summary.summary, dict)
31 |
32 | def test_one_summary(self):
33 | for summary in self.domain_summary.summary:
34 | self.assertIsInstance(summary, str)
35 | self.assertIsInstance(self.domain_summary.summary[summary], int)
36 | self.assertGreater(self.domain_summary.summary[summary], 0)
37 |
38 | for summary in self.incoming_outgoing_summary.summary:
39 | self.assertIsInstance(summary, str)
40 | self.assertIsInstance(self.incoming_outgoing_summary.summary[summary], dict)
41 | for keys in self.incoming_outgoing_summary.summary[summary]:
42 | self.assertIn(keys, ("Incoming", "Outgoing"))
43 | self.assertIsInstance(
44 | self.incoming_outgoing_summary.summary[summary][keys], int
45 | )
46 |
47 | def test_header(self):
48 | self.assertIsInstance(self.headers, HeaderCounter)
49 |
50 | @mock.patch(f"{__name__}.summary.plt")
51 | def test_mock_plot(self, mock_plt):
52 | reader = MBoxReader(MBOX_PATH)
53 | ds = DomainSummary(reader=reader)
54 | ds.plot()
55 | mock_plt.title.assert_called_once_with(
56 | "Sender's Domain Occurences",
57 | fontdict={
58 | "fontname": "Helvetica",
59 | "color": "k",
60 | "fontweight": "bold",
61 | "fontsize": 12,
62 | },
63 | )
64 | assert mock_plt.figure.called
65 |
--------------------------------------------------------------------------------
/emailnetwork/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 |
4 | from emailnetwork.extract import MBoxReader
5 |
6 | MBOX_PATH = f'{os.path.dirname(__file__)}/test.mbox'
7 |
8 | class TestNetwork(TestCase):
9 |     """Placeholder test case: it defines no test methods yet."""
10 |     def setUp(self):
11 |         # self.reader = MBoxReader(MBOX_PATH)
12 |         # self.emails = self.reader.extract()
13 |         pass
14 |
15 |     # TODO: test PeopleCombination
16 |
--------------------------------------------------------------------------------
/emailnetwork/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from dateutil import parser
4 | from dateutil.tz import tzlocal, tzutc
5 | from email.utils import parsedate_tz, mktime_tz
6 | from email.header import decode_header
7 |
8 |
9 | def parse_date(datestring: str):
10 | """[summary]
11 | Usage:
12 | Primarily used for extract_meta(email): parse_date(email['Date'])
13 | parse_date('Sat, 19 Sep 2020 12:01:38 +0800')
14 | Args:
15 | datestring (str): [description]
16 | """
17 | try:
18 | dt = parsedate_tz(datestring)
19 | if dt is not None:
20 | return datetime.utcfromtimestamp(mktime_tz(dt))
21 |
22 | return parser.parse(datestring)
23 | except Exception:
24 | return None
25 |
26 |
27 | def clean_subject(subject):
28 | """[summary]
29 | Usage:
30 |
31 | Args:
32 | subject (byte or str)
33 | """
34 | subject, encoding = decode_header(subject)[0]
35 | if isinstance(subject, bytes):
36 | try:
37 | return subject.decode(encoding).strip()
38 | except:
39 | return subject.decode('utf-8').strip()
40 | else:
41 | return subject.strip().replace('\r\n', '')
42 |
43 |
44 | def clean_body(email):
45 | if email.is_multipart():
46 | for part in email.walk():
47 | ctype = part.get_content_type()
48 | cdispo = str(part.get('Content-Disposition'))
49 |
50 | # skip any text/plain (txt) attachments
51 | if ctype == 'text/plain' and 'attachment' not in cdispo:
52 | return part.get_payload(decode=True).decode() # decode
53 | break
54 | # not multipart - i.e. plain text, no attachments, keeping fingers crossed
55 | else:
56 | return email.get_payload(decode=True).decode()
57 |
--------------------------------------------------------------------------------
/emailnetwork/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bleach==3.2.3
2 | certifi==2020.12.5
3 | chardet==4.0.0
4 | colorama==0.4.4
5 | coverage==5.4
6 | cycler==0.10.0
7 | decorator==4.4.2
8 | docutils==0.16
9 | emailnetwork==0.0.1
10 | idna==2.10
11 | keyring==22.0.1
12 | kiwisolver==1.3.1
13 | matplotlib==3.3.3
14 | networkx==2.5
15 | numpy==1.19.5
16 | packaging==20.8
17 | Pillow==8.1.0
18 | pkginfo==1.7.0
19 | Pygments==2.7.4
20 | pyparsing==2.4.7
21 | pytest==6.2.5
22 | pytest-cov==3.0.0
23 | python-dateutil==2.8.1
24 | readme-renderer==28.0
25 | requests==2.25.1
26 | requests-toolbelt==0.9.1
27 | rfc3986==1.4.0
28 | six==1.15.0
29 | tqdm==4.56.0
30 | twine==3.3.0
31 | urllib3==1.26.3
32 | webencodings==0.5.1
33 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from pathlib import Path
3 | from emailnetwork.version import __version__
4 |
5 | setup(name='emailnetwork',
6 | version=__version__,
7 | description='Network graphing utilities for email/mailbox (.mbox) data',
8 | long_description=(Path(__file__).parent/'README.md').read_text(),
9 | long_description_content_type='text/markdown',
10 | url='http://github.com/onlyphantom/emailnetwork',
11 | author='Samuel Chan',
12 | author_email='s@supertype.ai',
13 | license='MIT',
14 | packages=find_packages(exclude=('tests',)),
15 | include_package_data=True,
16 | install_requires=['matplotlib', 'networkx'],
17 | classifiers=[
18 | "License :: OSI Approved :: MIT License",
19 | "Programming Language :: Python :: 3",
20 | "Programming Language :: Python :: 3.7",
21 | "Operating System :: OS Independent",
22 | ],
23 | zip_safe=False,
24 | python_requires='>=3.7')
--------------------------------------------------------------------------------