├── .github
    └── workflows
    │   └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── pyhtml2pdf
    ├── __init__.py
    ├── compressor.py
    └── converter.py
├── setup.cfg
└── setup.py


/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.7'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | db.sqlite3-journal
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # IPython
 80 | profile_default/
 81 | ipython_config.py
 82 | 
 83 | # pyenv
 84 | .python-version
 85 | 
 86 | # pipenv
 87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 90 | #   install all needed dependencies.
 91 | #Pipfile.lock
 92 | 
 93 | # celery beat schedule file
 94 | celerybeat-schedule
 95 | 
 96 | # SageMath parsed files
 97 | *.sage.py
 98 | 
 99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 | 
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 | 
112 | # Rope project settings
113 | .ropeproject
114 | 
115 | # mkdocs documentation
116 | /site
117 | 
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 | 
123 | # Pyre type checker
124 | .pyre/
125 | 
126 | *.pdf
127 | *.html
128 | chromedriver
129 | requirements.txt
130 | main.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pyhtml2pdf
 2 | Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.
 3 | 
 4 | ## Install
 5 | ```
 6 | pip install pyhtml2pdf
 7 | ```
 8 | 
 9 | ## Dependencies
10 | 
11 |  - [Selenium Chrome Webdriver](https://chromedriver.chromium.org/downloads) (If Chrome is installed on the machine you won't need to install the chrome driver)
12 |  - [Ghostscript](https://www.ghostscript.com/download.html)
13 | 
14 | ## Example
15 | 
16 | ### **Convert to PDF**
17 | 
18 | **Use with website url**
19 | 
20 | ```
21 | from pyhtml2pdf import converter
22 | 
23 | converter.convert('https://pypi.org', 'sample.pdf')
24 | ```
25 | 
26 | **Use with html file from local machine**
27 | 
28 | ```
29 | import os
30 | from pyhtml2pdf import converter
31 | 
32 | path = os.path.abspath('index.html')
33 | converter.convert(f'file:///{path}', 'sample.pdf')
34 | ```
35 | 
 36 | **Some JS objects may have animations or take some time to render. You can set a timeout, in seconds, to give those objects time to render.**
37 | 
38 | ```
39 | converter.convert(source, target, timeout=2)
40 | ```
41 | 
42 | **Compress the converted PDF**
43 | 
44 | Some PDFs may be oversized. So there is a built in PDF compression feature.
45 | 
 46 | The available values for the power of the compression are:
47 |  - 0: default
48 |  - 1: prepress
49 |  - 2: printer
50 |  - 3: ebook
51 |  - 4: screen
52 | 
53 | ```
54 | converter.convert(source, target, compress=True, power=0)
55 | ```
56 | 
57 | ### **Pass Print Options**
58 | 
59 | You can use print options mentioned [here](https://vanilla.aslushnikov.com/?Page.printToPDF)
60 | 
61 | ```
62 | converter.convert( f"file:///{path}", f"sample.pdf", print_options={"scale": 0.95} )
63 | ```
64 | 
65 | ### **Compress PDF**
66 | 
67 | **Use it to compress a PDF file from local machine**
68 | 
69 | ```
70 | import os
71 | from pyhtml2pdf import compressor
72 | 
73 | compressor.compress('sample.pdf', 'compressed_sample.pdf')
74 | ```
75 | 
 76 | Inspired by the following projects:
77 | 
78 |  - https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter.git
79 |  - https://github.com/theeko74/pdfc
80 | 
81 | 


--------------------------------------------------------------------------------
/pyhtml2pdf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kumaF/pyhtml2pdf/0aded26906c3ec9dd0844c7e745dffdb5e972244/pyhtml2pdf/__init__.py


--------------------------------------------------------------------------------
/pyhtml2pdf/compressor.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import platform
 3 | import subprocess
 4 | from pathlib import Path
 5 | from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
 6 | 
 7 | 
 8 | def compress(source: str | os.PathLike | _TemporaryFileWrapper,
 9 |              target: str | os.PathLike,
10 |              power: int = 0,
11 |              ghostscript_command: str = None) -> None:
12 |     """
13 | 
14 |     :param source: Source PDF file
15 |     :param target: Target location to save the compressed PDF
16 |     :param power: Power of the compression. Default value is 0. This can be
17 |                     0: default,
18 |                     1: prepress,
19 |                     2: printer,
20 |                     3: ebook,
21 |                     4: screen
22 |     :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted
23 |                                 to be inferred from the OS.
24 |                                 If the OS is not Windows, "gs" is used as executable name.
25 |                                 If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit
26 |                                 version, "gswin32c" is used.
27 |     """
28 |     quality = {
29 |         0: '/default',
30 |         1: '/prepress',
31 |         2: '/printer',
32 |         3: '/ebook',
33 |         4: '/screen'
34 |     }
35 | 
36 |     if ghostscript_command is None:
37 |         if platform.system() == 'Windows':
38 |             if platform.machine().endswith('64'):
39 |                 ghostscript_command = 'gswin64c'
40 |             else:
41 |                 ghostscript_command = 'gswin32c'
42 |         else:
43 |             ghostscript_command = 'gs'
44 | 
45 |     if isinstance(source, _TemporaryFileWrapper):
46 |         source = source.name
47 | 
48 |     source = Path(source)
49 |     target = Path(target)
50 | 
51 |     if not source.is_file():
52 |         raise FileNotFoundError('invalid path for input PDF file')
53 | 
54 |     if source.suffix != '.pdf':
55 |         raise ValueError('Input file must be a .pdf file')
56 | 
57 |     subprocess.call([ghostscript_command,
58 |                      '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.4',
59 |                      '-dPDFSETTINGS={}'.format(quality[power]),
60 |                      '-dNOPAUSE', '-dQUIET', '-dBATCH',
61 |                      '-sOutputFile={}'.format(target.as_posix()),
62 |                      source.as_posix()],
63 |                     shell=platform.system() == 'Windows'
64 |                     )
65 | 
66 | 
def _compress(result: bytes,
              target: str | os.PathLike,
              power: int,
              ghostscript_command: str | None):
    """
    Write in-memory PDF bytes to a temporary file and compress it into ``target``.

    :param result: Raw bytes of the PDF to compress
    :param target: Target location to save the compressed PDF
    :param power: Power of the compression, forwarded to ``compress``
    :param ghostscript_command: Name of the ghostscript executable, forwarded to ``compress``
    """
    # delete=False on every platform: the file is closed before Ghostscript
    # runs (so all bytes are on disk and the handle is released) and is
    # removed explicitly afterwards.
    tmp_file = NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        with tmp_file:
            tmp_file.write(result)

        compress(tmp_file.name, target, power, ghostscript_command)
    finally:
        os.unlink(tmp_file.name)
75 | 


--------------------------------------------------------------------------------
/pyhtml2pdf/converter.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import base64
  3 | import io
  4 | from typing import Union, TypedDict
  5 | 
  6 | from selenium import webdriver
  7 | from selenium.webdriver.chrome.options import Options
  8 | from selenium.webdriver.chrome.service import Service
  9 | from selenium.common.exceptions import TimeoutException
 10 | from selenium.webdriver.support.ui import WebDriverWait
 11 | from selenium.webdriver.support.expected_conditions import staleness_of
 12 | from webdriver_manager.chrome import ChromeDriverManager
 13 | from selenium.webdriver.common.by import By
 14 | 
 15 | from .compressor import _compress
 16 | 
 17 | 
 18 | class PrintOptions(TypedDict):
 19 |     landscape: bool
 20 |     displayHeaderFooter: bool
 21 |     printBackground: bool
 22 |     scale: float
 23 |     paperWidth: float
 24 |     paperHeight: float
 25 |     marginTop: float
 26 |     marginBottom: float
 27 |     marginLeft: float
 28 |     marginRight: float
 29 |     pageRanges: str
 30 |     ignoreInvalidPageRanges: bool
 31 |     preferCSSPageSize: bool
 32 | 
 33 | 
 34 | def convert(
 35 |     source: Union[str, io.BytesIO],
 36 |     target: Union[str, io.BytesIO],
 37 |     timeout: int = 2,
 38 |     compress: bool = False,
 39 |     power: int = 0,
 40 |     install_driver: bool = True,
 41 |     print_options: PrintOptions = {},
 42 |     ghostscript_command: str = None
 43 | ):
 44 |     """
 45 |     Convert a given html file or website into PDF
 46 | 
 47 |     :param str source: source html file or website link or html content or a BytesIO object
 48 |     :param str | BytesIO target: target location to save the PDF, can be a path or a BytesIO object
 49 |     :param int timeout: timeout in seconds. Default value is set to 2 seconds
 50 |     :param bool compress: whether PDF is compressed or not. Default value is False
 51 |     :param int power: power of the compression. Default value is 0. This can be 0: default, 1: prepress, 2: printer, 3: ebook, 4: screen
 52 |     :param bool install_driver: whether or not to install using ChromeDriverManager. Default value is True
 53 |     :param PrintOptions print_options: A dictionary containing options for the printing of the PDF, conforming to the types specified in the PrintOptions TypedDict.
 54 |     :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted
 55 |                             to be inferred from the OS.
 56 |                             If the OS is not Windows, "gs" is used as executable name.
 57 |                             If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit
 58 |                             version, "gswin32c" is used.
 59 |     """
 60 |     if print_options is None:
 61 |         print_options = {}
 62 | 
 63 |     result = __get_pdf_from_html(
 64 |         source, timeout, install_driver, print_options)
 65 | 
 66 |     if compress:
 67 |         _compress(result, target, power, ghostscript_command)
 68 |     else:
 69 |         if type(target) == io.BytesIO:
 70 |             return target.write(result)
 71 |         with open(target, "wb") as file:
 72 |             file.write(result)
 73 | 
 74 | 
 75 | def __send_devtools(driver, cmd, params=None):
 76 |     if params is None:
 77 |         params = {}
 78 |     resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
 79 |     url = driver.command_executor._url + resource
 80 |     body = json.dumps({"cmd": cmd, "params": params})
 81 |     response = driver.command_executor._request("POST", url, body)
 82 | 
 83 |     if not response:
 84 |         raise Exception(response.get("value"))
 85 | 
 86 |     return response.get("value")
 87 | 
 88 | 
 89 | def __get_pdf_from_html(
 90 |     source: Union[str, io.BytesIO], timeout: int, install_driver: bool, print_options: dict
 91 | ):
 92 | ) -> bytes:
 93 |     webdriver_options = Options()
 94 |     webdriver_prefs = {}
 95 | 
 96 |     webdriver_options.add_argument("--headless")
 97 |     webdriver_options.add_argument("--disable-gpu")
 98 |     webdriver_options.add_argument("--no-sandbox")
 99 |     webdriver_options.add_argument("--disable-dev-shm-usage")
100 |     webdriver_options.experimental_options["prefs"] = webdriver_prefs
101 | 
102 |     webdriver_prefs["profile.default_content_settings"] = {"images": 2}
103 | 
104 |     if install_driver:
105 |         service = Service(ChromeDriverManager().install())
106 |         driver = webdriver.Chrome(service=service, options=webdriver_options)
107 |     else:
108 |         driver = webdriver.Chrome(options=webdriver_options)
109 | 
110 |     
111 |     # Detect the type of source and create data url if needed
112 |     if type(source) == io.BytesIO:
113 |         encoded_content = base64.b64encode(source.getvalue()).decode('utf-8')
114 |         path = f'data:text/html;base64,{encoded_content}'
115 |     if not source.startswith('http') and not source.startswith('file'):
116 |         encoded_content = base64.b64encode(source.encode('utf-8')).decode('utf-8')
117 |         path = f'data:text/html;base64,{encoded_content}'
118 |     else:
119 |         path = source
120 | 
121 |     driver.get(path)
122 | 
123 |     try:
124 |         WebDriverWait(driver, timeout).until(
125 |             staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
126 |         )
127 |     except TimeoutException:
128 |         calculated_print_options = {
129 |             "landscape": False,
130 |             "displayHeaderFooter": False,
131 |             "printBackground": True,
132 |             "preferCSSPageSize": True,
133 |         }
134 |         calculated_print_options.update(print_options)
135 |         result = __send_devtools(
136 |             driver, "Page.printToPDF", calculated_print_options)
137 |         return base64.b64decode(result["data"])
138 |     finally:
139 |         driver.quit()
140 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# Underscore form: setuptools deprecates dash-separated keys in setup.cfg.
description_file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="pyhtml2pdf", # Replace with your own username
 8 |     version="0.0.8rc1",
 9 |     author="Kumara Fernando",
10 |     author_email="mklmfernando@gmail.com",
11 |     description="Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/kumaF/pyhtml2pdf",
15 |     packages=setuptools.find_packages(),
16 |     install_requires=[            # I get to this in a second
17 |           'selenium',
18 |           'webdriver-manager',
19 |     ],
20 |     classifiers=[
21 |         "Programming Language :: Python :: 3",
22 |         "License :: OSI Approved :: MIT License",
23 |         "Operating System :: OS Independent",
24 |     ],
25 |     python_requires='>=3.6',
26 | )


--------------------------------------------------------------------------------