├── .github └── workflows │ ├── publish-to-test-pypi.yml │ └── test_msgtopdf.yml ├── .gitignore ├── LICENSE ├── README.md ├── msgtopdf ├── __init__.py ├── msgtopdf.py └── scripts │ ├── __init__.py │ └── msg2pdf.py ├── requirements.txt ├── setup.py └── tests ├── example.py └── test_msgtopdf.py /.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish msgtopdf to PyPi 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish msgtopdf to PyPi 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.8 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.8 15 | - name: Install setuptools 16 | run: python -m pip install --upgrade setuptools wheel 17 | - name: Build 18 | run: python setup.py sdist bdist_wheel 19 | - name: Publish to PyPI 20 | if: startsWith(github.event.ref, 'refs/tags') 21 | uses: pypa/gh-action-pypi-publish@master 22 | with: 23 | password: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.github/workflows/test_msgtopdf.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Test msgtopdf 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | python-version: [3.5, 3.6, 3.7, 3.8] 19 | os: [windows-latest, windows-2016] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: add wkhtmltopdf to path 24 | run: echo "::add-path::C:\Program Files\wkhtmltopdf\bin" 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | - name: Test with pytest 34 | run: | 35 | pip install pytest 36 | python -m pytest 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode 132 | .vscode/ 133 | 134 | # project specific 135 | file.msg 136 | file/ 137 | 138 | tests/email.msg 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ushills 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Test msgtopdf](https://github.com/ushills/msgtopdf/workflows/Test%20msgtopdf/badge.svg) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 2 | 3 | # Converts Outlook .msg files to PDF 4 | 5 | `msgtopdf` is a Python 3 module to convert Outlook `.msg` files to PDF and extract the attachments. Unlike the majority of current modules `msgtopdf` maintains the formatting of HTML and RTF messages and embeds any inline images in the PDF output. 6 | 7 | As the module uses the `win32com` library the host machine must have Outlook installed. 8 | 9 | `msgtopdf` uses the [wkhtmltopdf](https://wkhtmltopdf.org/) tool to convert the HTML message to PDF and [wkhtmltopdf](https://wkhtmltopdf.org/) must be installed separately. 10 | 11 | Currently `msgtopdf` extracts the message body and attachments to a new subfolder using the subject of the email as the folder name. 12 | 13 | # Usage 14 | 15 | ## Module Usage 16 | 17 | Example module usage is provided in the `tests/example.py` file. 18 | 19 | 20 | ## Command Line Usage 21 | 22 | The command-line option `msg2pdf` will convert individual files or all `*.msg` files in a directory. 23 | 24 | `msg2pdf --help` for options. 25 | 26 | Usage: msg2pdf [OPTIONS] PATH 27 | 28 | msg2pdf converts Outlook email messages (msg) to pdf. 29 | 30 | The output is a folder for each email using the email subject as the 31 | folder name including a pdf of the email and all attachments. 32 | 33 | Inline images are included in the email pdf. 34 | 35 | Options: 36 | -f, --file Convert an individual file PATH to pdf. 37 | -d, --directory Convert all msg files in directory PATH to pdf. 38 | --help Show this message and exit. 39 | 40 | 41 | 42 | # Requirements 43 | 44 | Install the Windows binary release of [wkhtmltopdf](https://wkhtmltopdf.org/downloads.html) 45 | 46 | Ensure that `wkhtmltopdf` command is found in your `PATH`. 47 | 48 | This can be tested by entering `wkhtmltopdf --version` in your Command Prompt. 49 | 50 | You should receive an output similar to the attached. 51 | 52 | 53 | Microsoft Windows [Version 6.1.7601] 54 | Copyright (c) 2009 Microsoft Corporation. All rights reserved. 55 | 56 | C:\>wkhtmltopdf --version 57 | wkhtmltopdf 0.12.5 (with patched qt) 58 | 59 | C:\> 60 | -------------------------------------------------------------------------------- /msgtopdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .msgtopdf import * 2 | -------------------------------------------------------------------------------- /msgtopdf/msgtopdf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import subprocess 5 | import sys 6 | from pathlib import Path, PurePath 7 | 8 | import win32com.client 9 | 10 | __all__ = ["Msgtopdf"] 11 | 12 | # logging defaults 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format="%(asctime)s - %(levelname)s - %(message)s", 16 | datefmt="%m/%d/%Y %I:%M:%S %p", 17 | ) 18 | 19 | required_paths = ["wkhtmltopdf"] 20 | 21 | 22 | class Msgtopdf: 23 | def __init__(self, msgfile): 24 | if check_paths_exist(required_paths) is False: 25 | sys.exit(1) 26 | outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI") 27 | self.msgfile = PurePath(msgfile) 28 | self.directory = PurePath(self.msgfile).parent 29 | self.file = PurePath(self.msgfile).name 30 | self.file_name = self.file.split(".msg")[0] 31 | self.save_path = self.__define_save_path() 32 | self.msg = outlook.OpenSharedItem(self.msgfile) 33 | 34 | def raw_email_body(self): 35 | if self.msg.BodyFormat == 2: 36 | body = self.msg.HTMLBody 37 | self.email_format = "html" 38 | elif self.msg.BodyFormat == 3: 39 | body = self.msg.RTFBody 40 | self.email_format = "html" 41 | else: 42 | body = self.msg.Body 43 | self.email_format = "txt" 44 | self.raw_body = body 45 | return self.raw_body 46 | 47 | def email2pdf(self): 48 | Path.mkdir(Path(self.save_path)) 49 | html_header = self.__add_header_information() 50 | raw_email_body = self.raw_email_body() 51 | full_email_body = html_header + raw_email_body 52 | clean_email_body = self.replace_CID(full_email_body) 53 | self.html_body_file = PurePath(self.save_path, self.file_name + ".html") 54 | self.extract_email_attachments() 55 | # convert_html_to_pdf(clean_email_body, self.html_body_file) 56 | with open(self.html_body_file, "w", encoding="utf-8") as f: 57 | f.write(clean_email_body) 58 | # save pdf copy using wkhtmltopdf 59 | try: 60 | subprocess.run( 61 | [ 62 | "wkhtmltopdf", 63 | "--enable-local-file-access", 64 | "--log-level", 65 | "warn", 66 | "--encoding", 67 | "utf-8", 68 | "--footer-font-size", 69 | "6", 70 | "--footer-line", 71 | "--footer-center", 72 | "[page] / [topage]", 73 | str(self.html_body_file), 74 | str(PurePath(self.save_path, self.file_name + ".pdf")), 75 | ] 76 | ) 77 | except Exception as e: 78 | logging.critical("Could not call wkhtmltopdf") 79 | logging.debug(e) 80 | self.__delete_redundant_files() 81 | 82 | def extract_email_attachments(self): 83 | count_attachments = self.msg.Attachments.Count 84 | if count_attachments > 0: 85 | for item in range(count_attachments): 86 | attachment_filename = self.msg.Attachments.Item(item + 1).Filename 87 | self.msg.Attachments.Item(item + 1).SaveAsFile( 88 | PurePath(self.save_path, attachment_filename) 89 | ) 90 | 91 | def __define_save_path(self): 92 | msgfile_name = self.file.split(".msg")[0] 93 | msgfile_folder = self.clean_path(msgfile_name) 94 | save_path = PurePath(self.directory, msgfile_folder) 95 | # TODO check if save_path already exists and if so add increment 96 | return save_path 97 | 98 | def __add_header_information(self): 99 | html_str = """ 100 | 101 | 102 | 103 |

104 | 105 | From: {sender}
106 | Sent: {sent}
107 | To: {to}
108 | Cc: {cc}
109 | Subject: {subject}

110 |
111 | """ 112 | formatted_html = html_str.format( 113 | base_href="file:///" + str(self.save_path) + "\\", 114 | sender=self.msg.SenderName, 115 | sent=self.msg.SentOn, 116 | to=self.msg.To, 117 | cc=self.msg.CC, 118 | subject=self.msg.Subject, 119 | attachments=self.msg.Attachments, 120 | ) 121 | return formatted_html 122 | 123 | def replace_CID(self, body): 124 | self.image_files = [] 125 | # search for cid:(capture_group)@* upto " 126 | p = re.compile(r"cid:([^\"@]*)[^\"]*") 127 | r = p.sub(self.__return_image_reference, body) 128 | return r 129 | 130 | def __return_image_reference(self, match): 131 | value = str(match.groups()[0]) 132 | if value not in self.image_files: 133 | self.image_files.append(value) 134 | return value 135 | 136 | def __delete_redundant_files(self): 137 | Path.unlink(Path(self.html_body_file)) 138 | for f in self.image_files: 139 | image_full_path = Path(self.save_path, f) 140 | if Path.exists(image_full_path): 141 | Path.unlink(image_full_path) 142 | 143 | def clean_path(self, path): 144 | c_path = re.sub(r'[\\/\:*"<>\|\.%\$\^&£]', "", path) 145 | c_path = re.sub(r"[ ]{2,}", "", c_path) 146 | c_path = c_path.strip() 147 | return c_path 148 | 149 | 150 | def check_paths_exist(paths_to_check): 151 | path = os.getenv("PATH") 152 | for p in paths_to_check: 153 | if p not in path: 154 | logging.critical("%s not in path", p) 155 | logging.error(path) 156 | return False 157 | return True 158 | -------------------------------------------------------------------------------- /msgtopdf/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ushills/msgtopdf/25c5cf60158220ab840c0fb2fd46bae1cf3d5e22/msgtopdf/scripts/__init__.py -------------------------------------------------------------------------------- /msgtopdf/scripts/msg2pdf.py: -------------------------------------------------------------------------------- 1 | import click 2 | from colorama import init, Fore 3 | from msgtopdf.msgtopdf import Msgtopdf 4 | 5 | from pathlib import Path, PurePath 6 | 7 | # Initialise colorama 8 | init() 9 | 10 | 11 | @click.command() 12 | @click.version_option() 13 | @click.option( 14 | "-f", 15 | "--file", 16 | "path_type", 17 | flag_value="filename", 18 | help="Convert an individual file PATH to pdf.", 19 | ) 20 | @click.option( 21 | "-d", 22 | "--directory", 23 | "path_type", 24 | flag_value="directory", 25 | help="Convert all msg files in directory PATH to pdf.", 26 | ) 27 | @click.argument("path", type=click.Path(exists=True, resolve_path=True)) 28 | def cli(path_type, path): 29 | """msg2pdf converts Outlook email messages (msg) to pdf.\n 30 | The output is a folder for each email using the email subject as the folder name 31 | inculding a pdf of the email and all attachments.\n 32 | Inline images are included in the email pdf.""" 33 | if path_type == "filename": 34 | convert_file(path) 35 | if path_type == "directory": 36 | convert_directory(path) 37 | 38 | 39 | def convert_file(filename): 40 | try: 41 | f = Msgtopdf(filename) 42 | f.email2pdf() 43 | print(Fore.GREEN + f"Converted {filename} to PDF!" + Fore.RESET) 44 | except: 45 | print(Fore.RED + f"Something went wrong!" + Fore.RESET) 46 | 47 | 48 | def convert_directory(directory): 49 | msg_files = list(Path(directory).glob("**/*.msg")) 50 | for f in msg_files: 51 | convert_file(f) 52 | 53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pywin32 2 | click 3 | colorama 4 | 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name="msgtopdf", 9 | version="0.1.6", 10 | author="Ian Hill", 11 | author_email="web@ushills.co.uk", 12 | description="Convert Outlook msg to PDF", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/ushills/msgtopdf", 16 | packages=find_packages(exclude=["tests"]), 17 | install_requires=["pywin32", "Click", "Colorama"], 18 | entry_points=""" 19 | [console_scripts] 20 | msg2pdf=msgtopdf.scripts.msg2pdf:cli 21 | """, 22 | include_package_data=True, 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "Development Status :: 4 - Beta", 26 | "License :: OSI Approved :: MIT License", 27 | "Environment :: Win32 (MS Windows)", 28 | "Operating System :: Microsoft :: Windows", 29 | ], 30 | python_requires=">=3.6", 31 | ) 32 | -------------------------------------------------------------------------------- /tests/example.py: -------------------------------------------------------------------------------- 1 | from msgtopdf import Msgtopdf 2 | from pathlib import Path 3 | 4 | 5 | def main(): 6 | directory = Path.cwd() 7 | msgfile = "file.msg" 8 | msgfile = Path(directory, msgfile) 9 | email = Msgtopdf(msgfile) 10 | email.email2pdf() 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /tests/test_msgtopdf.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | from unittest.mock import MagicMock 4 | import win32com.client 5 | 6 | from msgtopdf.msgtopdf import Msgtopdf 7 | from msgtopdf.msgtopdf import check_paths_exist 8 | 9 | mock_outlook = MagicMock() 10 | win32com.client = mock_outlook 11 | 12 | 13 | def test_init_check_paths_exist(): 14 | assert check_paths_exist(["system32"]) is True 15 | 16 | 17 | def test_init_check_paths_does_not_exist(): 18 | assert check_paths_exist(["notinpath"]) is False 19 | 20 | 21 | class Test_Msgtopdf: 22 | def test_init_directory(self): 23 | email = Msgtopdf("C:/test/email.msg") 24 | assert email.directory == pathlib.PurePath("C:/test") 25 | 26 | def test_init_file(self): 27 | email = Msgtopdf("C:/test/email.msg") 28 | assert email.file == "email.msg" 29 | 30 | def test_init_save_path(self): 31 | email = Msgtopdf("C:/test/email.msg") 32 | assert email.save_path == pathlib.PurePath("C:/test/email") 33 | 34 | def test_replace_CID_single_CID(self): 35 | email = Msgtopdf("C:/test/email.msg") 36 | line = '' 37 | assert email.replace_CID(line) == '' 38 | 39 | def test_replace_CID_alt_CID(self): 40 | email = Msgtopdf("C:/test/email.msg") 41 | line = 'cid:image004.png@01D543A2.096B0830' 42 | assert ( 43 | email.replace_CID(line) 44 | == 'image004.png' 45 | ) 46 | 47 | def test_replace_CID_multiple_CID(self): 48 | email = Msgtopdf("C:/test/email.msg") 49 | line = '' 50 | assert ( 51 | email.replace_CID(line) 52 | == '' 53 | ) 54 | assert email.image_files == ["image001.png", "image002.png"] 55 | 56 | def test_replace_CID_no_replace(self): 57 | email = Msgtopdf("C:/test/email.msg") 58 | body = "

Not an image

" 59 | assert email.replace_CID(body) == "

Not an image

" 60 | 61 | def test_clean_path(self): 62 | email = Msgtopdf("C:/test/email.msg") 63 | path = r"RE:/ test dirty path ^" 64 | assert email.clean_path(path) == "RE test dirty path" 65 | 66 | def test___delete_redundant_files(self): 67 | email = Msgtopdf("C:/test/email.msg") 68 | email.save_path = pathlib.PurePath("./tests/") 69 | email.image_files = ["exists.png", "does_not_exist.png"] 70 | email.html_body_file = "./tests/html_body.html" 71 | # create temporary files for deletion 72 | open("./tests/html_body.html", "w+") 73 | open("./tests/exists.png", "w+") 74 | assert email.image_files == ["exists.png", "does_not_exist.png"] 75 | email._Msgtopdf__delete_redundant_files() 76 | 77 | def test_raw_email_body_html(self): 78 | email = Msgtopdf("C:/test/email.msg") 79 | email.msg.BodyFormat = 2 80 | email.raw_email_body() 81 | assert email.email_format == "html" 82 | email.msg.BodyFormat = 3 83 | email.raw_email_body() 84 | assert email.email_format == "html" 85 | email.msg.BodyFormat = 1 86 | email.raw_email_body() 87 | assert email.email_format == "txt" 88 | --------------------------------------------------------------------------------