├── pyhtml2pdf
├── __init__.py
├── compressor.py
└── converter.py
├── setup.cfg
├── setup.py
├── LICENSE
├── .github
└── workflows
│ └── python-publish.yml
├── README.md
└── .gitignore
/pyhtml2pdf/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Packaging script for pyhtml2pdf."""
import setuptools

# Read the README explicitly as UTF-8 so the build does not depend on the
# platform's default locale encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="pyhtml2pdf",
    version="0.0.8rc1",
    author="Kumara Fernando",
    author_email="mklmfernando@gmail.com",
    description="Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/kumaF/pyhtml2pdf",
    packages=setuptools.find_packages(),
    install_requires=[
        'selenium',
        'webdriver-manager',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # pyhtml2pdf/compressor.py uses PEP 604 unions (``str | os.PathLike``) in
    # annotations that are evaluated when the module is imported, which only
    # works on Python 3.10 and newer.
    python_requires='>=3.10',
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.7'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_API_TOKEN }}
40 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pyhtml2pdf
2 | Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.
3 |
4 | ## Install
5 | ```
6 | pip install pyhtml2pdf
7 | ```
8 |
9 | ## Dependencies
10 |
11 | - [Selenium Chrome Webdriver](https://chromedriver.chromium.org/downloads) (If Chrome is installed on the machine you won't need to install the chrome driver)
12 | - [Ghostscript](https://www.ghostscript.com/download.html)
13 |
14 | ## Example
15 |
16 | ### **Convert to PDF**
17 |
18 | **Use with website url**
19 |
20 | ```
21 | from pyhtml2pdf import converter
22 |
23 | converter.convert('https://pypi.org', 'sample.pdf')
24 | ```
25 |
26 | **Use with html file from local machine**
27 |
28 | ```
29 | import os
30 | from pyhtml2pdf import converter
31 |
32 | path = os.path.abspath('index.html')
33 | converter.convert(f'file:///{path}', 'sample.pdf')
34 | ```
35 |
36 | **Some JS objects may have animations or take some time to render. You can set a timeout, in seconds, to give those objects time to render.**
37 |
38 | ```
39 | converter.convert(source, target, timeout=2)
40 | ```
41 |
42 | **Compress the converted PDF**
43 |
44 | Some PDFs may be oversized, so there is a built-in PDF compression feature.
45 |
46 | The `power` argument selects the compression level:
47 | - 0: default
48 | - 1: prepress
49 | - 2: printer
50 | - 3: ebook
51 | - 4: screen
52 |
53 | ```
54 | converter.convert(source, target, compress=True, power=0)
55 | ```
56 |
57 | ### **Pass Print Options**
58 |
59 | You can use print options mentioned [here](https://vanilla.aslushnikov.com/?Page.printToPDF)
60 |
61 | ```
62 | converter.convert( f"file:///{path}", f"sample.pdf", print_options={"scale": 0.95} )
63 | ```
64 |
65 | ### **Compress PDF**
66 |
67 | **Use it to compress a PDF file from local machine**
68 |
69 | ```
70 | import os
71 | from pyhtml2pdf import compressor
72 |
73 | compressor.compress('sample.pdf', 'compressed_sample.pdf')
74 | ```
75 |
76 | Inspired by the following works:
77 |
78 | - https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter.git
79 | - https://github.com/theeko74/pdfc
80 |
81 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
126 | *.pdf
127 | *.html
128 | chromedriver
129 | requirements.txt
130 | main.py
--------------------------------------------------------------------------------
/pyhtml2pdf/compressor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | import subprocess
4 | from pathlib import Path
5 | from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
6 |
7 |
8 | def compress(source: str | os.PathLike | _TemporaryFileWrapper,
9 | target: str | os.PathLike,
10 | power: int = 0,
11 | ghostscript_command: str = None) -> None:
12 | """
13 |
14 | :param source: Source PDF file
15 | :param target: Target location to save the compressed PDF
16 | :param power: Power of the compression. Default value is 0. This can be
17 | 0: default,
18 | 1: prepress,
19 | 2: printer,
20 | 3: ebook,
21 | 4: screen
22 | :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted
23 | to be inferred from the OS.
24 | If the OS is not Windows, "gs" is used as executable name.
25 | If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit
26 | version, "gswin32c" is used.
27 | """
28 | quality = {
29 | 0: '/default',
30 | 1: '/prepress',
31 | 2: '/printer',
32 | 3: '/ebook',
33 | 4: '/screen'
34 | }
35 |
36 | if ghostscript_command is None:
37 | if platform.system() == 'Windows':
38 | if platform.machine().endswith('64'):
39 | ghostscript_command = 'gswin64c'
40 | else:
41 | ghostscript_command = 'gswin32c'
42 | else:
43 | ghostscript_command = 'gs'
44 |
45 | if isinstance(source, _TemporaryFileWrapper):
46 | source = source.name
47 |
48 | source = Path(source)
49 | target = Path(target)
50 |
51 | if not source.is_file():
52 | raise FileNotFoundError('invalid path for input PDF file')
53 |
54 | if source.suffix != '.pdf':
55 | raise ValueError('Input file must be a .pdf file')
56 |
57 | subprocess.call([ghostscript_command,
58 | '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.4',
59 | '-dPDFSETTINGS={}'.format(quality[power]),
60 | '-dNOPAUSE', '-dQUIET', '-dBATCH',
61 | '-sOutputFile={}'.format(target.as_posix()),
62 | source.as_posix()],
63 | shell=platform.system() == 'Windows'
64 | )
65 |
66 |
def _compress(result: bytes,
              target: str | os.PathLike,
              power: int,
              ghostscript_command: str | None) -> None:
    """
    Write in-memory PDF bytes to a temporary file and compress it into ``target``.

    :param result: Raw bytes of the PDF to compress
    :param target: Target location to save the compressed PDF
    :param power: Power of the compression, forwarded to ``compress``
    :param ghostscript_command: Name of the ghostscript executable, forwarded to ``compress``
    """
    # delete=False on every platform: the file is closed before ghostscript
    # opens it by name (so all bytes are on disk) and removed explicitly below.
    tmp_file = NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        with tmp_file:
            tmp_file.write(result)

        compress(tmp_file.name, target, power, ghostscript_command)
    finally:
        # Always clean up, including on Windows and when compression fails.
        Path(tmp_file.name).unlink(missing_ok=True)
75 |
--------------------------------------------------------------------------------
/pyhtml2pdf/converter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import base64
3 | import io
4 | from typing import Union, TypedDict
5 |
6 | from selenium import webdriver
7 | from selenium.webdriver.chrome.options import Options
8 | from selenium.webdriver.chrome.service import Service
9 | from selenium.common.exceptions import TimeoutException
10 | from selenium.webdriver.support.ui import WebDriverWait
11 | from selenium.webdriver.support.expected_conditions import staleness_of
12 | from webdriver_manager.chrome import ChromeDriverManager
13 | from selenium.webdriver.common.by import By
14 |
15 | from .compressor import _compress
16 |
17 |
class PrintOptions(TypedDict, total=False):
    """
    Options forwarded to Chrome's ``Page.printToPDF`` DevTools command.

    Every key is optional (``total=False``): callers pass only the options
    they want to override, e.g. ``{"scale": 0.95}``.
    """
    landscape: bool
    displayHeaderFooter: bool
    printBackground: bool
    scale: float
    paperWidth: float
    paperHeight: float
    marginTop: float
    marginBottom: float
    marginLeft: float
    marginRight: float
    pageRanges: str
    ignoreInvalidPageRanges: bool
    preferCSSPageSize: bool
32 |
33 |
def convert(
    source: Union[str, io.BytesIO],
    target: Union[str, io.BytesIO],
    timeout: int = 2,
    compress: bool = False,
    power: int = 0,
    install_driver: bool = True,
    print_options: Union[PrintOptions, None] = None,
    ghostscript_command: Union[str, None] = None
):
    """
    Convert a given html file or website into PDF

    :param str source: source html file or website link or html content or a BytesIO object
    :param str | BytesIO target: target location to save the PDF, can be a path or a BytesIO object
    :param int timeout: timeout in seconds. Default value is set to 2 seconds
    :param bool compress: whether PDF is compressed or not. Default value is False
    :param int power: power of the compression. Default value is 0. This can be 0: default, 1: prepress, 2: printer, 3: ebook, 4: screen
    :param bool install_driver: whether or not to install using ChromeDriverManager. Default value is True
    :param PrintOptions print_options: A dictionary containing options for the printing of the PDF, conforming to the types specified in the PrintOptions TypedDict. None (the default) means no overrides.
    :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted
                                to be inferred from the OS.
                                If the OS is not Windows, "gs" is used as executable name.
                                If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit
                                version, "gswin32c" is used.
    :return: the number of bytes written when ``target`` is a BytesIO object, otherwise None
    """
    # None instead of a shared mutable ``{}`` default.
    if print_options is None:
        print_options = {}

    result = __get_pdf_from_html(
        source, timeout, install_driver, print_options)

    if not compress:
        if isinstance(target, io.BytesIO):
            return target.write(result)
        with open(target, "wb") as file:
            file.write(result)
        return None

    if not isinstance(target, io.BytesIO):
        _compress(result, target, power, ghostscript_command)
        return None

    # Ghostscript can only write to a path, so for an in-memory target the
    # compressed PDF goes through a scratch file and is then copied over.
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        compressed_path = os.path.join(scratch_dir, "compressed.pdf")
        _compress(result, compressed_path, power, ghostscript_command)
        with open(compressed_path, "rb") as compressed_file:
            return target.write(compressed_file.read())
73 |
74 |
def __send_devtools(driver, cmd, params=None):
    """
    Send a Chrome DevTools command through the driver's command executor.

    :param driver: driver exposing ``session_id`` and a ``command_executor``
                   with ``_url`` and ``_request`` attributes
    :param str cmd: DevTools command name, e.g. "Page.printToPDF"
    :param dict params: parameters of the command. Defaults to an empty dict
    :return: the "value" entry of the response
    :raises Exception: if the executor returns no (or an empty) response
    """
    if params is None:
        params = {}
    resource = f"/session/{driver.session_id}/chromium/send_command_and_get_result"
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        # A falsy response may be None, so it must not be dereferenced here.
        raise Exception(f"No response received for DevTools command {cmd!r}")

    return response.get("value")
87 |
88 |
def __get_pdf_from_html(
    source: Union[str, io.BytesIO], timeout: int, install_driver: bool, print_options: dict
) -> bytes:
    """
    Render ``source`` in headless Chrome and return the printed PDF as bytes.

    :param source: an ``http...``/``file...`` URL, raw html content, or a BytesIO
                   object holding html content
    :param int timeout: seconds to wait after loading before printing
    :param bool install_driver: whether to install the driver using ChromeDriverManager
    :param dict print_options: overrides for the default print options
    :return: the decoded PDF bytes
    """
    # Detect the type of source and create data url if needed. This is done
    # before a browser is started so that bad input cannot leave one running.
    if isinstance(source, io.BytesIO):
        encoded_content = base64.b64encode(source.getvalue()).decode('utf-8')
        path = f'data:text/html;base64,{encoded_content}'
    elif not source.startswith(('http', 'file')):
        encoded_content = base64.b64encode(source.encode('utf-8')).decode('utf-8')
        path = f'data:text/html;base64,{encoded_content}'
    else:
        path = source

    webdriver_options = Options()
    webdriver_prefs = {"profile.default_content_settings": {"images": 2}}

    webdriver_options.add_argument("--headless")
    webdriver_options.add_argument("--disable-gpu")
    webdriver_options.add_argument("--no-sandbox")
    webdriver_options.add_argument("--disable-dev-shm-usage")
    webdriver_options.experimental_options["prefs"] = webdriver_prefs

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    try:
        driver.get(path)

        # Waiting for the <html> element to go stale gives scripts and
        # animations up to ``timeout`` seconds to finish; timing out is the
        # expected outcome. The page is printed either way.
        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            pass

        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options)
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        return base64.b64decode(result["data"])
    finally:
        # Always shut the browser down, including when loading or printing fails.
        driver.quit()
140 |
--------------------------------------------------------------------------------