├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── pyhtml2pdf ├── __init__.py ├── compressor.py └── converter.py ├── setup.cfg └── setup.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.7' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | *.pdf 127 | *.html 128 | chromedriver 129 | requirements.txt 130 | main.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyhtml2pdf 2 | Simple python wrapper to convert HTML to PDF with headless Chrome via selenium. 3 | 4 | ## Install 5 | ``` 6 | pip install pyhtml2pdf 7 | ``` 8 | 9 | ## Dependencies 10 | 11 | - [Selenium Chrome Webdriver](https://chromedriver.chromium.org/downloads) (If Chrome is installed on the machine you won't need to install the chrome driver) 12 | - [Ghostscript](https://www.ghostscript.com/download.html) 13 | 14 | ## Example 15 | 16 | ### **Convert to PDF** 17 | 18 | **Use with website url** 19 | 20 | ``` 21 | from pyhtml2pdf import converter 22 | 23 | converter.convert('https://pypi.org', 'sample.pdf') 24 | ``` 25 | 26 | **Use with html file from local machine** 27 | 28 | ``` 29 | import os 30 | from pyhtml2pdf import converter 31 | 32 | path = os.path.abspath('index.html') 33 | converter.convert(f'file:///{path}', 'sample.pdf') 34 | ``` 35 | 36 | **Some JS objects may have animations or take a some time to render. You can set a time out in order to help render those objects. You can set timeout in seconds** 37 | 38 | ``` 39 | converter.convert(source, target, timeout=2) 40 | ``` 41 | 42 | **Compress the converted PDF** 43 | 44 | Some PDFs may be oversized. So there is a built in PDF compression feature. 45 | 46 | The power of the compression, 47 | - 0: default 48 | - 1: prepress 49 | - 2: printer 50 | - 3: ebook 51 | - 4: screen 52 | 53 | ``` 54 | converter.convert(source, target, compress=True, power=0) 55 | ``` 56 | 57 | ### **Pass Print Options** 58 | 59 | You can use print options mentioned [here](https://vanilla.aslushnikov.com/?Page.printToPDF) 60 | 61 | ``` 62 | converter.convert( f"file:///{path}", f"sample.pdf", print_options={"scale": 0.95} ) 63 | ``` 64 | 65 | ### **Compress PDF** 66 | 67 | **Use it to compress a PDF file from local machine** 68 | 69 | ``` 70 | import os 71 | from pyhtml2pdf import compressor 72 | 73 | compressor.compress('sample.pdf', 'compressed_sample.pdf') 74 | ``` 75 | 76 | Inspired the works from, 77 | 78 | - https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter.git 79 | - https://github.com/theeko74/pdfc 80 | 81 | -------------------------------------------------------------------------------- /pyhtml2pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kumaF/pyhtml2pdf/0aded26906c3ec9dd0844c7e745dffdb5e972244/pyhtml2pdf/__init__.py -------------------------------------------------------------------------------- /pyhtml2pdf/compressor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | from pathlib import Path 5 | from tempfile import NamedTemporaryFile, _TemporaryFileWrapper 6 | 7 | 8 | def compress(source: str | os.PathLike | _TemporaryFileWrapper, 9 | target: str | os.PathLike, 10 | power: int = 0, 11 | ghostscript_command: str = None) -> None: 12 | """ 13 | 14 | :param source: Source PDF file 15 | :param target: Target location to save the compressed PDF 16 | :param power: Power of the compression. Default value is 0. This can be 17 | 0: default, 18 | 1: prepress, 19 | 2: printer, 20 | 3: ebook, 21 | 4: screen 22 | :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted 23 | to be inferred from the OS. 24 | If the OS is not Windows, "gs" is used as executable name. 25 | If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit 26 | version, "gswin32c" is used. 27 | """ 28 | quality = { 29 | 0: '/default', 30 | 1: '/prepress', 31 | 2: '/printer', 32 | 3: '/ebook', 33 | 4: '/screen' 34 | } 35 | 36 | if ghostscript_command is None: 37 | if platform.system() == 'Windows': 38 | if platform.machine().endswith('64'): 39 | ghostscript_command = 'gswin64c' 40 | else: 41 | ghostscript_command = 'gswin32c' 42 | else: 43 | ghostscript_command = 'gs' 44 | 45 | if isinstance(source, _TemporaryFileWrapper): 46 | source = source.name 47 | 48 | source = Path(source) 49 | target = Path(target) 50 | 51 | if not source.is_file(): 52 | raise FileNotFoundError('invalid path for input PDF file') 53 | 54 | if source.suffix != '.pdf': 55 | raise ValueError('Input file must be a .pdf file') 56 | 57 | subprocess.call([ghostscript_command, 58 | '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.4', 59 | '-dPDFSETTINGS={}'.format(quality[power]), 60 | '-dNOPAUSE', '-dQUIET', '-dBATCH', 61 | '-sOutputFile={}'.format(target.as_posix()), 62 | source.as_posix()], 63 | shell=platform.system() == 'Windows' 64 | ) 65 | 66 | 67 | def _compress(result: bytes, 68 | target: str | os.PathLike, 69 | power: int, 70 | ghostscript_command: str | None): 71 | with NamedTemporaryFile(suffix='.pdf', delete=platform.system() != 'Windows') as tmp_file: 72 | tmp_file.write(result) 73 | 74 | compress(tmp_file, target, power, ghostscript_command) 75 | -------------------------------------------------------------------------------- /pyhtml2pdf/converter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | import io 4 | from typing import Union, TypedDict 5 | 6 | from selenium import webdriver 7 | from selenium.webdriver.chrome.options import Options 8 | from selenium.webdriver.chrome.service import Service 9 | from selenium.common.exceptions import TimeoutException 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support.expected_conditions import staleness_of 12 | from webdriver_manager.chrome import ChromeDriverManager 13 | from selenium.webdriver.common.by import By 14 | 15 | from .compressor import _compress 16 | 17 | 18 | class PrintOptions(TypedDict): 19 | landscape: bool 20 | displayHeaderFooter: bool 21 | printBackground: bool 22 | scale: float 23 | paperWidth: float 24 | paperHeight: float 25 | marginTop: float 26 | marginBottom: float 27 | marginLeft: float 28 | marginRight: float 29 | pageRanges: str 30 | ignoreInvalidPageRanges: bool 31 | preferCSSPageSize: bool 32 | 33 | 34 | def convert( 35 | source: Union[str, io.BytesIO], 36 | target: Union[str, io.BytesIO], 37 | timeout: int = 2, 38 | compress: bool = False, 39 | power: int = 0, 40 | install_driver: bool = True, 41 | print_options: PrintOptions = {}, 42 | ghostscript_command: str = None 43 | ): 44 | """ 45 | Convert a given html file or website into PDF 46 | 47 | :param str source: source html file or website link or html content or a BytesIO object 48 | :param str | BytesIO target: target location to save the PDF, can be a path or a BytesIO object 49 | :param int timeout: timeout in seconds. Default value is set to 2 seconds 50 | :param bool compress: whether PDF is compressed or not. Default value is False 51 | :param int power: power of the compression. Default value is 0. This can be 0: default, 1: prepress, 2: printer, 3: ebook, 4: screen 52 | :param bool install_driver: whether or not to install using ChromeDriverManager. Default value is True 53 | :param PrintOptions print_options: A dictionary containing options for the printing of the PDF, conforming to the types specified in the PrintOptions TypedDict. 54 | :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted 55 | to be inferred from the OS. 56 | If the OS is not Windows, "gs" is used as executable name. 57 | If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit 58 | version, "gswin32c" is used. 59 | """ 60 | if print_options is None: 61 | print_options = {} 62 | 63 | result = __get_pdf_from_html( 64 | source, timeout, install_driver, print_options) 65 | 66 | if compress: 67 | _compress(result, target, power, ghostscript_command) 68 | else: 69 | if type(target) == io.BytesIO: 70 | return target.write(result) 71 | with open(target, "wb") as file: 72 | file.write(result) 73 | 74 | 75 | def __send_devtools(driver, cmd, params=None): 76 | if params is None: 77 | params = {} 78 | resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id 79 | url = driver.command_executor._url + resource 80 | body = json.dumps({"cmd": cmd, "params": params}) 81 | response = driver.command_executor._request("POST", url, body) 82 | 83 | if not response: 84 | raise Exception(response.get("value")) 85 | 86 | return response.get("value") 87 | 88 | 89 | def __get_pdf_from_html( 90 | source: Union[str, io.BytesIO], timeout: int, install_driver: bool, print_options: dict 91 | ): 92 | ) -> bytes: 93 | webdriver_options = Options() 94 | webdriver_prefs = {} 95 | 96 | webdriver_options.add_argument("--headless") 97 | webdriver_options.add_argument("--disable-gpu") 98 | webdriver_options.add_argument("--no-sandbox") 99 | webdriver_options.add_argument("--disable-dev-shm-usage") 100 | webdriver_options.experimental_options["prefs"] = webdriver_prefs 101 | 102 | webdriver_prefs["profile.default_content_settings"] = {"images": 2} 103 | 104 | if install_driver: 105 | service = Service(ChromeDriverManager().install()) 106 | driver = webdriver.Chrome(service=service, options=webdriver_options) 107 | else: 108 | driver = webdriver.Chrome(options=webdriver_options) 109 | 110 | 111 | # Detect the type of source and create data url if needed 112 | if type(source) == io.BytesIO: 113 | encoded_content = base64.b64encode(source.getvalue()).decode('utf-8') 114 | path = f'data:text/html;base64,{encoded_content}' 115 | if not source.startswith('http') and not source.startswith('file'): 116 | encoded_content = base64.b64encode(source.encode('utf-8')).decode('utf-8') 117 | path = f'data:text/html;base64,{encoded_content}' 118 | else: 119 | path = source 120 | 121 | driver.get(path) 122 | 123 | try: 124 | WebDriverWait(driver, timeout).until( 125 | staleness_of(driver.find_element(by=By.TAG_NAME, value="html")) 126 | ) 127 | except TimeoutException: 128 | calculated_print_options = { 129 | "landscape": False, 130 | "displayHeaderFooter": False, 131 | "printBackground": True, 132 | "preferCSSPageSize": True, 133 | } 134 | calculated_print_options.update(print_options) 135 | result = __send_devtools( 136 | driver, "Page.printToPDF", calculated_print_options) 137 | return base64.b64decode(result["data"]) 138 | finally: 139 | driver.quit() 140 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pyhtml2pdf", # Replace with your own username 8 | version="0.0.8rc1", 9 | author="Kumara Fernando", 10 | author_email="mklmfernando@gmail.com", 11 | description="Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/kumaF/pyhtml2pdf", 15 | packages=setuptools.find_packages(), 16 | install_requires=[ # I get to this in a second 17 | 'selenium', 18 | 'webdriver-manager', 19 | ], 20 | classifiers=[ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: MIT License", 23 | "Operating System :: OS Independent", 24 | ], 25 | python_requires='>=3.6', 26 | ) --------------------------------------------------------------------------------