├── .github └── workflows │ ├── publish-to-test-pypi.yml │ └── test_msgtopdf.yml ├── .gitignore ├── LICENSE ├── README.md ├── msgtopdf ├── __init__.py ├── msgtopdf.py └── scripts │ ├── __init__.py │ └── msg2pdf.py ├── requirements.txt ├── setup.py └── tests ├── example.py └── test_msgtopdf.py /.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish msgtopdf to PyPi 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish msgtopdf to PyPi 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.8 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.8 15 | - name: Install setuptools 16 | run: python -m pip install --upgrade setuptools wheel 17 | - name: Build 18 | run: python setup.py sdist bdist_wheel 19 | - name: Publish to PyPI 20 | if: startsWith(github.event.ref, 'refs/tags') 21 | uses: pypa/gh-action-pypi-publish@master 22 | with: 23 | password: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.github/workflows/test_msgtopdf.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Test msgtopdf 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | python-version: [3.5, 3.6, 3.7, 3.8] 19 | os: [windows-latest, windows-2016] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: add wkhtmltopdf to path 24 | run: echo "::add-path::C:\Program Files\wkhtmltopdf\bin" 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | - name: Test with pytest 34 | run: | 35 | pip install pytest 36 | python -m pytest 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode 132 | .vscode/ 133 | 134 | # project specific 135 | file.msg 136 | file/ 137 | 138 | tests/email.msg 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ushills 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  [](https://github.com/psf/black) 2 | 3 | # Converts Outlook .msg files to PDF 4 | 5 | `msgtopdf` is a Python 3 module to convert Outlook `.msg` files to PDF and extract the attachments. Unlike the majority of current modules `msgtopdf` maintains the formatting of HTML and RTF messages and embeds any inline images in the PDF output. 6 | 7 | As the module uses the `win32com` library the host machine must have Outlook installed. 8 | 9 | `msgtopdf` uses the [wkhtmltopdf](https://wkhtmltopdf.org/) tool to convert the HTML message to PDF and [wkhtmltopdf](https://wkhtmltopdf.org/) must be installed separately. 10 | 11 | Currently `msgtopdf` extracts the message body and attachments to a new subfolder using the subject of the email as the folder name. 12 | 13 | # Usage 14 | 15 | ## Module Usage 16 | 17 | Example module usage is provided in the `tests/example.py` file. 18 | 19 | 20 | ## Command Line Usage 21 | 22 | The command-line option `msg2pdf` will convert individual files or all `*.msg` files in a directory. 23 | 24 | `msg2pdf --help` for options. 25 | 26 | Usage: msg2pdf [OPTIONS] PATH 27 | 28 | msg2pdf converts Outlook email messages (msg) to pdf. 29 | 30 | The output is a folder for each email using the email subject as the 31 | folder name including a pdf of the email and all attachments. 32 | 33 | Inline images are included in the email pdf. 34 | 35 | Options: 36 | -f, --file Convert an individual file PATH to pdf. 37 | -d, --directory Convert all msg files in directory PATH to pdf. 38 | --help Show this message and exit. 39 | 40 | 41 | 42 | # Requirements 43 | 44 | Install the Windows binary release of [wkhtmltopdf](https://wkhtmltopdf.org/downloads.html) 45 | 46 | Ensure that `wkhtmltopdf` command is found in your `PATH`. 47 | 48 | This can be tested by entering `wkhtmltopdf --version` in your Command Prompt. 49 | 50 | You should receive an output similar to the attached. 51 | 52 | 53 | Microsoft Windows [Version 6.1.7601] 54 | Copyright (c) 2009 Microsoft Corporation. All rights reserved. 55 | 56 | C:\>wkhtmltopdf --version 57 | wkhtmltopdf 0.12.5 (with patched qt) 58 | 59 | C:\> 60 | -------------------------------------------------------------------------------- /msgtopdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .msgtopdf import * 2 | -------------------------------------------------------------------------------- /msgtopdf/msgtopdf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import subprocess 5 | import sys 6 | from pathlib import Path, PurePath 7 | 8 | import win32com.client 9 | 10 | __all__ = ["Msgtopdf"] 11 | 12 | # logging defaults 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format="%(asctime)s - %(levelname)s - %(message)s", 16 | datefmt="%m/%d/%Y %I:%M:%S %p", 17 | ) 18 | 19 | required_paths = ["wkhtmltopdf"] 20 | 21 | 22 | class Msgtopdf: 23 | def __init__(self, msgfile): 24 | if check_paths_exist(required_paths) is False: 25 | sys.exit(1) 26 | outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI") 27 | self.msgfile = PurePath(msgfile) 28 | self.directory = PurePath(self.msgfile).parent 29 | self.file = PurePath(self.msgfile).name 30 | self.file_name = self.file.split(".msg")[0] 31 | self.save_path = self.__define_save_path() 32 | self.msg = outlook.OpenSharedItem(self.msgfile) 33 | 34 | def raw_email_body(self): 35 | if self.msg.BodyFormat == 2: 36 | body = self.msg.HTMLBody 37 | self.email_format = "html" 38 | elif self.msg.BodyFormat == 3: 39 | body = self.msg.RTFBody 40 | self.email_format = "html" 41 | else: 42 | body = self.msg.Body 43 | self.email_format = "txt" 44 | self.raw_body = body 45 | return self.raw_body 46 | 47 | def email2pdf(self): 48 | Path.mkdir(Path(self.save_path)) 49 | html_header = self.__add_header_information() 50 | raw_email_body = self.raw_email_body() 51 | full_email_body = html_header + raw_email_body 52 | clean_email_body = self.replace_CID(full_email_body) 53 | self.html_body_file = PurePath(self.save_path, self.file_name + ".html") 54 | self.extract_email_attachments() 55 | # convert_html_to_pdf(clean_email_body, self.html_body_file) 56 | with open(self.html_body_file, "w", encoding="utf-8") as f: 57 | f.write(clean_email_body) 58 | # save pdf copy using wkhtmltopdf 59 | try: 60 | subprocess.run( 61 | [ 62 | "wkhtmltopdf", 63 | "--enable-local-file-access", 64 | "--log-level", 65 | "warn", 66 | "--encoding", 67 | "utf-8", 68 | "--footer-font-size", 69 | "6", 70 | "--footer-line", 71 | "--footer-center", 72 | "[page] / [topage]", 73 | str(self.html_body_file), 74 | str(PurePath(self.save_path, self.file_name + ".pdf")), 75 | ] 76 | ) 77 | except Exception as e: 78 | logging.critical("Could not call wkhtmltopdf") 79 | logging.debug(e) 80 | self.__delete_redundant_files() 81 | 82 | def extract_email_attachments(self): 83 | count_attachments = self.msg.Attachments.Count 84 | if count_attachments > 0: 85 | for item in range(count_attachments): 86 | attachment_filename = self.msg.Attachments.Item(item + 1).Filename 87 | self.msg.Attachments.Item(item + 1).SaveAsFile( 88 | PurePath(self.save_path, attachment_filename) 89 | ) 90 | 91 | def __define_save_path(self): 92 | msgfile_name = self.file.split(".msg")[0] 93 | msgfile_folder = self.clean_path(msgfile_name) 94 | save_path = PurePath(self.directory, msgfile_folder) 95 | # TODO check if save_path already exists and if so add increment 96 | return save_path 97 | 98 | def __add_header_information(self): 99 | html_str = """ 100 |
101 | 102 |104 | 105 | From: {sender} 106 | Sent: {sent} 107 | To: {to} 108 | Cc: {cc} 109 | Subject: {subject}
110 |Not an image
" 59 | assert email.replace_CID(body) == "Not an image
" 60 | 61 | def test_clean_path(self): 62 | email = Msgtopdf("C:/test/email.msg") 63 | path = r"RE:/ test dirty path ^" 64 | assert email.clean_path(path) == "RE test dirty path" 65 | 66 | def test___delete_redundant_files(self): 67 | email = Msgtopdf("C:/test/email.msg") 68 | email.save_path = pathlib.PurePath("./tests/") 69 | email.image_files = ["exists.png", "does_not_exist.png"] 70 | email.html_body_file = "./tests/html_body.html" 71 | # create temporary files for deletion 72 | open("./tests/html_body.html", "w+") 73 | open("./tests/exists.png", "w+") 74 | assert email.image_files == ["exists.png", "does_not_exist.png"] 75 | email._Msgtopdf__delete_redundant_files() 76 | 77 | def test_raw_email_body_html(self): 78 | email = Msgtopdf("C:/test/email.msg") 79 | email.msg.BodyFormat = 2 80 | email.raw_email_body() 81 | assert email.email_format == "html" 82 | email.msg.BodyFormat = 3 83 | email.raw_email_body() 84 | assert email.email_format == "html" 85 | email.msg.BodyFormat = 1 86 | email.raw_email_body() 87 | assert email.email_format == "txt" 88 | --------------------------------------------------------------------------------