├── .github └── workflows │ ├── linting.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── img ├── logo.png ├── pdfitdown_ui_demo.mp4 └── thumbnail.png ├── pyproject.toml ├── src └── pdfitdown │ ├── __init__.py │ ├── __pycache__ │ ├── pdfconversion.cpython-313.pyc │ └── reader.cpython-313.pyc │ ├── pdfconversion.py │ ├── pdfitdown_cli.py │ └── pdfitdown_ui.py ├── tests ├── data │ ├── test.txt │ ├── test0.png │ ├── test1.pptx │ ├── test2.md │ ├── test3.json │ ├── test4.docx │ └── test5.zip ├── llamaparse │ ├── test1.pptx │ └── test4.docx ├── test_llamaparse.py ├── test_markitdown.py └── test_ui.py └── uv.lock /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Install uv 13 | uses: astral-sh/setup-uv@v6 14 | 15 | - name: Set up Python 16 | run: uv python install 3.12 17 | 18 | - name: Install pre-commit 19 | shell: bash 20 | run: uv venv && source .venv/bin/activate && uv pip install pre-commit 21 | 22 | - name: Run linter 23 | shell: bash 24 | run: uv run -- pre-commit run -a 25 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: PyPI Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0-9].[0-9]+.[0-9]+*" 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | 18 | - name: Install Hatch 19 | run: pip install hatch 20 | 21 | - name: Publish on PyPi 22 | env: 23 | HATCH_INDEX_USER: __token__ 24 | HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }} 25 | run: | 26 | hatch build 27 | hatch publish -y 28 | 29 | - name: Create GitHub Release 30 | uses: ncipollo/release-action@v1 31 | with: 32 | artifacts: "dist/*" 33 | generateReleaseNotes: true 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow comes from https://github.com/ofek/hatch-mypyc 2 | # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml 3 | name: test 4 | 5 | on: 6 | # push: 7 | # branches: 8 | # - main 9 | pull_request: 10 | 11 | concurrency: 12 | group: test-${{ github.head_ref }} 13 | cancel-in-progress: true 14 | 15 | env: 16 | PYTHONUNBUFFERED: "1" 17 | FORCE_COLOR: "1" 18 | 19 | jobs: 20 | test-linux: 21 | name: Python ${{ matrix.python-version }} on Linux 22 | runs-on: ubuntu-latest 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | python-version: ["3.10", "3.11", "3.12"] 27 | 28 | steps: 29 | - uses: actions/checkout@v4 30 | 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | 36 | - name: Install Hatch 37 | run: pip install --upgrade hatch 38 | 39 | - name: Run tests 40 | env: 41 | llamacloud_api_key: ${{ secrets.LLAMACLOUD_API_KEY }} 42 | run: hatch run test 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | src/pdfitdown.egg-info/ 3 | .pytest_cache/ 4 | build/ 5 | .ruff_cache/ 6 | tests/__pycache__/ 7 | tests/data/*.pdf 8 | .env 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | exclude: llama-index-core/llama_index/core/_static 14 | - id: check-yaml 15 | exclude: llama-index-core/llama_index/core/_static 16 | - id: detect-private-key 17 | - id: end-of-file-fixer 18 | exclude: llama-index-core/llama_index/core/_static 19 | - id: mixed-line-ending 20 | exclude: llama-index-core/llama_index/core/_static 21 | - id: trailing-whitespace 22 | exclude: llama-index-core/llama_index/core/_static 23 | 24 | - repo: https://github.com/charliermarsh/ruff-pre-commit 25 | rev: v0.11.8 26 | hooks: 27 | - id: ruff 28 | args: [--exit-non-zero-on-fix, --fix] 29 | exclude: ".*poetry.lock|.*_static" 30 | 31 | # - repo: https://github.com/psf/black-pre-commit-mirror 32 | # rev: 23.10.1 33 | # hooks: 34 | # - id: black-jupyter 35 | # name: black-src 36 | # alias: black 37 | # exclude: "^docs|.*poetry.lock|.*_static" 38 | 39 | - repo: https://github.com/pre-commit/mirrors-mypy 40 | rev: v1.0.1 41 | hooks: 42 | - id: mypy 43 | additional_dependencies: 44 | [ 45 | "types-requests", 46 | "types-Deprecated", 47 | "types-redis", 48 | "types-setuptools", 49 | "types-PyYAML", 50 | "types-protobuf==4.24.0.4", 51 | ] 52 | args: 53 | [ 54 | --namespace-packages, 55 | --explicit-package-bases, 56 | --disallow-untyped-defs, 57 | --ignore-missing-imports, 58 | --python-version=3.9, 59 | ] 60 | entry: bash -c "export MYPYPATH=ingest_anything" 61 | 62 | - repo: https://github.com/psf/black-pre-commit-mirror 63 | rev: 23.10.1 64 | hooks: 65 | - id: black-jupyter 66 | name: black-docs-py 67 | alias: black 68 | files: ^(docs/|examples/) 69 | # Using PEP 8's line length in docs prevents excess left/right scrolling 70 | args: [--line-length=79] 71 | 72 | - repo: https://github.com/pre-commit/mirrors-prettier 73 | rev: v3.0.3 74 | hooks: 75 | - id: prettier 76 | exclude: llama-index-core/llama_index/core/_static|poetry.lock|llama-index-legacy/llama_index/legacy/_static|docs/docs 77 | 78 | - repo: https://github.com/pappasam/toml-sort 79 | rev: v0.23.1 80 | hooks: 81 | - id: toml-sort-fix 82 | exclude: ".*poetry.lock|.*_static" 83 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `PdfItDown` 2 | 3 | Do you want to contribute to this project? Make sure to read this guidelines first :) 4 | 5 | ## Issue 6 | 7 | **When to do it**: 8 | 9 | - You found bugs but you don't know how to solve them or don't have time/will to do the solve 10 | - You want new features but you don't know how to implement them or don't have time/will to do the implementation 11 | 12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_ 13 | 14 | **How to do it**: 15 | 16 | - Open an issue 17 | - Give the issue a meaningful title (short but effective problem description) 18 | - Describe the problem 19 | 20 | ## Traditional contribution 21 | 22 | **When to do it**: 23 | 24 | - You found bugs and corrected them 25 | - You optimized/improved the code 26 | - You added new features that you think could be useful to others 27 | 28 | **How to do it**: 29 | 30 | 1. Fork this repository 31 | 2. Test your changes locally 32 | 33 | ``` 34 | uv pip install hatch 35 | cd PdfItDown/ 36 | hatch run test 37 | ``` 38 | 39 | 3. If all the test pass, you can commit your changes. 40 | 4. Submit pull request (make sure to provide a thorough description of the changes) 41 | 42 | > [!NOTE] > _If you add a new feature, you might need to add new tests!_ 43 | 44 | ### Thanks for contributing! 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Clelia (Astra) Bertelli 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

PdfItDown

3 |

Convert Everything to PDF

4 |
5 |
6 |
7 | Join Discord Server 8 |
9 |
10 |
11 | PdfItDown Logo 12 |
13 | 14 | **PdfItDown** is a python package that relies on [`markitdown` by Microsoft](https://github.com/microsoft/markitdown/), [`markdown_pdf`](https://github.com/vb64/markdown-pdf), [img2pdf](https://pypi.org/project/img2pdf/) and [`LlamaIndex`](https://www.llamaindex.ai/). Visit us on our [documentation website](https://pdfitdown.eu)! 15 | 16 | ### Applicability 17 | 18 | **PdfItDown** is applicable to the following file formats: 19 | 20 | - Markdown 21 | - PowerPoint 22 | - Word 23 | - Excel 24 | - HTML 25 | - Text-based formats (CSV, XML, JSON) 26 | - ZIP files (iterates over contents) 27 | - Image files (PNG, JPG) 28 | 29 | The format-specific support needs to be evaluated for the specific reader you are using. 30 | 31 | ### How does it work? 32 | 33 | **PdfItDown** works in a very simple way: 34 | 35 | - From **markdown** to PDF 36 | 37 | ```mermaid 38 | graph LR 39 | 2(Input File) --> 3[Markdown content] 40 | 3[Markdown content] --> 4[markdown-pdf] 41 | 4[markdown-pdf] --> 5(PDF file) 42 | ``` 43 | 44 | - From **image** to PDF 45 | 46 | ```mermaid 47 | graph LR 48 | 2(Input File) --> 3[Bytes] 49 | 3[Bytes] --> 4[img2pdf] 50 | 4[img2pdf] --> 5(PDF file) 51 | ``` 52 | 53 | - From other **text-based** file formats to PDF 54 | 55 | ```mermaid 56 | graph LR 57 | 2(Input File) --> 3[LlamaIndex-compatible Reader - defaults to MarkItDown] 58 | 3[LlamaIndex-compatible Reader - defaults to MarkItDown] --> 4[Markdown content] 59 | 4[Markdown content] --> 5[markdown-pdf] 60 | 5[markdown-pdf] --> 6(PDF file) 61 | ``` 62 | 63 | ### Installation and Usage 64 | 65 | To install **PdfItDown**, just run: 66 | 67 | ```bash 68 | python3 -m pip install pdfitdown 69 | ``` 70 | 71 | You can now use the **command line tool**: 72 | 73 | ``` 74 | usage: pdfitdown [-h] [-i INPUTFILE] [-o OUTPUTFILE] [-t TITLE] [-d DIRECTORY] 75 | 76 | options: 77 | -h, --help show this help message and exit 78 | -i, --inputfile INPUTFILE 79 | Path to the input file(s) that need to be converted to PDF. The path should be comma 80 | separated: input1.csv,input2.md,...,inputN.xml. 81 | -o, --outputfile OUTPUTFILE 82 | Path to the output PDF file(s). If more than one input file is provided, you should provide an 83 | equally long list of output files. The path should be comma separated: 84 | output1.pdf,output2.pdf,...,outputN.pdf. Defaults to 'None' 85 | -t, --title TITLE Title to include in the PDF metadata. Default: 'File Converted with PdfItDown'. If more than 86 | one file is provided, it will be ignored. 87 | -d, --directory DIRECTORY 88 | Directory whose files you want to bulk-convert to PDF. If the --inputfile argument is also 89 | provided, it will be ignored. Defaults to None. 90 | ``` 91 | 92 | An example usage can be: 93 | 94 | ```bash 95 | pdfitdown -i README.md -o README.pdf -t "README" 96 | ``` 97 | 98 | Or you can use it **inside your python scripts**: 99 | 100 | ```python 101 | from pdfitdown.pdfconversion import Converter 102 | 103 | converter = Converter() 104 | converter.convert(file_path = "business_grow.md", output_path = "business_growth.pdf", title="Business Growth for Q3 in 2024") 105 | converter.convert(file_path = "logo.png", output_path = "logo.pdf") 106 | converter.convert(file_path = "users.xlsx", output_path = "users.pdf") 107 | ``` 108 | 109 | You can also convert **multiple files at once**: 110 | 111 | - In the CLI: 112 | 113 | ```bash 114 | # with custom output paths 115 | pdfitdown -i "test0.png,test1.csv" -o "testoutput0.pdf,testoutput1.pdf" 116 | # with inferred output paths 117 | pdfitdown -i "test0.png,test1.csv" 118 | ``` 119 | 120 | - In the Python API: 121 | 122 | ```python 123 | from pdfitdown.pdfconversion import Converter 124 | from llama_parse import LlamaParse 125 | from dotenv import load_dotenv 126 | import os 127 | 128 | load_dotenv() 129 | 130 | reader = LlamaParse(api_key=os.getenv("llamacloud_api_key"), result_type="markdown") 131 | converter = Converter(reader=reader) 132 | # with custom output paths 133 | converter.multiple_convert(file_paths = ["business_growth.md", "logo.png"], output_paths = ["business_growth.pdf", "logo.pdf"]) 134 | # with inferred output paths 135 | converter.multiple_convert(file_paths = ["business_growth.md", "logo.png"]) 136 | ``` 137 | 138 | You can bulk-convert **all the files in a directory**: 139 | 140 | - In the CLI: 141 | 142 | ```bash 143 | pdfitdown -d tests/data/testdir 144 | ``` 145 | 146 | - In the Python API: 147 | 148 | ```python 149 | from pdfitdown.pdfconversion import Converter 150 | 151 | converter = Converter() 152 | output_paths = converter.convert_directory(directory_path = "tests/data/testdir") 153 | print(output_paths) 154 | ``` 155 | 156 | Or you can just launch a [Gradio](https://gradio.app)-based user interface: 157 | 158 | ```bash 159 | pdfitdown_ui 160 | ``` 161 | 162 | You will be able to see the application running on `http://localhost:7860` within seconds! 163 | 164 | Watch the demo here: 165 | 166 | [![Watch the video demo!](https://raw.githubusercontent.com/AstraBert/PdfItDown/main/img/thumbnail.png)](https://raw.githubusercontent.com/AstraBert/PdfItDown/main/img/pdfitdown_ui_demo.mp4) 167 | 168 | ### Contributing 169 | 170 | Contributions are always welcome! 171 | 172 | Find contribution guidelines at [CONTRIBUTING.md](https://github.com/AstraBert/PdfItDown/tree/main/CONTRIBUTING.md) 173 | 174 | ### License and Funding 175 | 176 | This project is open-source and is provided under an [MIT License](https://github.com/AstraBert/PdfItDown/tree/main/LICENSE). 177 | 178 | If you found it useful, please consider [funding it](https://github.com/sponsors/AstraBert). 179 | -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/logo.png -------------------------------------------------------------------------------- /img/pdfitdown_ui_demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/pdfitdown_ui_demo.mp4 -------------------------------------------------------------------------------- /img/thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/thumbnail.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling>=1.0.0"] 3 | build-backend = "hatchling.build" 4 | 5 | [options.package_data] 6 | pdfitdown = ["*"] 7 | 8 | [project] 9 | name = "pdfitdown" 10 | version = "1.5.1" 11 | authors = [ 12 | {name = "Clelia (Astra) Bertelli", email = "astraberte9@gmail.com"} 13 | ] 14 | description = "PdfItDown - Convert Everything to PDF" 15 | readme = "README.md" 16 | requires-python = ">=3.10,<3.13" 17 | classifiers = [ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent" 21 | ] 22 | dependencies = [ 23 | 'llama-index-readers-markitdown', 24 | 'markdown-pdf', 25 | 'img2pdf', 26 | 'pillow', 27 | 'gradio', 28 | 'termcolor' 29 | ] 30 | 31 | [project.scripts] 32 | pdfitdown = "pdfitdown.pdfitdown_cli:main" 33 | pdfitdown_ui = "pdfitdown.pdfitdown_ui:main" 34 | 35 | [project.urls] 36 | Homepage = "https://github.com/AstraBert/PdfItDown" 37 | Issues = "https://github.com/AstraBert/PdfItDown/issues" 38 | 39 | [tool.hatch.build.targets.wheel] 40 | only-include = ["src/pdfitdown"] 41 | 42 | [tool.hatch.build.targets.wheel.sources] 43 | "src" = "" 44 | 45 | [tool.hatch.envs.default] 46 | dependencies = [ 47 | "pytest", 48 | "llama_parse" 49 | ] 50 | 51 | [tool.hatch.envs.default.py-version] 52 | 40 = "3.10" 53 | 41 = "3.11" 54 | 42 = "3.12" 55 | 56 | [tool.hatch.envs.default.scripts] 57 | test = "cp src/pdfitdown/pdfconversion.py tests/ && cp src/pdfitdown/pdfitdown_ui.py tests/ && pytest tests/*.py -p no:warnings && rm tests/pdfconversion.py && rm tests/pdfitdown_ui.py" 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["src"] 61 | include = ["pdfitdown*"] 62 | -------------------------------------------------------------------------------- /src/pdfitdown/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | -------------------------------------------------------------------------------- /src/pdfitdown/__pycache__/pdfconversion.cpython-313.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/src/pdfitdown/__pycache__/pdfconversion.cpython-313.pyc -------------------------------------------------------------------------------- /src/pdfitdown/__pycache__/reader.cpython-313.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/src/pdfitdown/__pycache__/reader.cpython-313.pyc -------------------------------------------------------------------------------- /src/pdfitdown/pdfconversion.py: -------------------------------------------------------------------------------- 1 | import img2pdf 2 | import warnings 3 | import os 4 | from PIL import Image 5 | from markdown_pdf import MarkdownPdf, Section 6 | from pydantic import BaseModel, field_validator, model_validator 7 | from pathlib import Path 8 | from typing import List, Optional 9 | from typing_extensions import Self 10 | from llama_index.core.readers.base import BaseReader 11 | from llama_index.readers.markitdown import MarkItDownReader 12 | 13 | class FilePath(BaseModel): 14 | file: str 15 | @field_validator("file") 16 | def is_valid_file(cls, file: str): 17 | p = Path(file) 18 | if not p.is_file(): 19 | raise ValueError(f"{file} is not a file") 20 | return file 21 | 22 | class FileExistsWarning(Warning): 23 | """Warns you that a file exists""" 24 | 25 | class DirPath(BaseModel): 26 | path: str 27 | @model_validator(mode="after") 28 | def validate_dir_path(self) -> Self: 29 | if Path(self.path).is_dir(): 30 | if len(os.listdir(self.path)) == 0: 31 | raise ValueError("You should provide a non-empty directory") 32 | else: 33 | return self 34 | else: 35 | raise ValueError("You should provide the path for an existing directory") 36 | 37 | class OutputPath(BaseModel): 38 | file: str 39 | @field_validator("file") 40 | def file_exists_warning(cls, file: str): 41 | if os.path.splitext(file)[1] != ".pdf": 42 | raise ValueError("Output file must be a PDF") 43 | p = Path(file) 44 | if p.is_file(): 45 | warnings.warn(f"The file {file} already exists, you are about to overwrite it", FileExistsWarning) 46 | return file 47 | 48 | class MultipleFileConversion(BaseModel): 49 | input_files: List[FilePath] 50 | output_files: List[str] | List[OutputPath] | None 51 | @model_validator(mode="after") 52 | def validate_multiple_file_conversion(self) -> Self: 53 | if self.output_files is not None and len(self.input_files) != len(self.output_files): 54 | raise ValueError("Input and output files must be lists of the same length") 55 | else: 56 | if self.output_files is None: 57 | self.output_files = [OutputPath(file=(fl.file.replace(os.path.splitext(fl.file)[1],".pdf"))) for fl in self.input_files] 58 | else: 59 | if isinstance(self.output_files[0], str): 60 | self.output_files = [OutputPath(file=fl) for fl in self.output_files] 61 | return self 62 | 63 | class Converter: 64 | """A class for converting .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .png, .zip files into PDF""" 65 | def __init__(self, reader: Optional[BaseReader] = None) -> None: 66 | """ 67 | Initialize the Converter class. 68 | 69 | Args: 70 | reader (Optional[BaseReader]): the reader to extract the file text (needs to be LlamaIndex-compatible). Defaults to MarkItDown reader. 71 | Returns: 72 | None 73 | """ 74 | if reader is not None: 75 | self._reader = reader 76 | else: 77 | self._reader = MarkItDownReader() 78 | return 79 | def convert(self, file_path: str, output_path: str, title: str = "File Converted with PdfItDown"): 80 | """ 81 | Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .png, .zip). 82 | 83 | Args: 84 | file_path (str): The path to the input file 85 | output_path (str): The path to the output file 86 | title (str): The title for the PDF document (defaults to: 'File Converted with PdfItDown') 87 | Returns: 88 | output_path (str): Path to the output file 89 | Raises: 90 | ValidationError: if the format of the input file is not support or if the format of the output file is not PDF 91 | FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten 92 | """ 93 | self.file_input = FilePath(file=file_path) 94 | self.file_output = OutputPath(file=output_path) 95 | if os.path.splitext(self.file_input.file)[1] == ".md": 96 | f = open(self.file_input.file, "r") 97 | finstr = f.read() 98 | f.close() 99 | pdf = MarkdownPdf(toc_level=0) 100 | pdf.add_section(Section(finstr)) 101 | pdf.meta["title"] = title 102 | pdf.save(self.file_output.file) 103 | return self.file_output.file 104 | elif os.path.splitext(self.file_input.file)[1] in [".jpg", ".png"]: 105 | image = Image.open(self.file_input.file) 106 | pdf_bytes = img2pdf.convert(image.filename) 107 | with open(self.file_output.file, "wb") as file: 108 | file.write(pdf_bytes) 109 | file.close() 110 | image.close() 111 | return self.file_output.file 112 | else: 113 | try: 114 | result = self._reader.load_data([self.file_input.file]) 115 | finstr = result[0].text 116 | pdf = MarkdownPdf(toc_level=0) 117 | pdf.add_section(Section(finstr)) 118 | pdf.meta["title"] = title 119 | pdf.save(self.file_output.file) 120 | return self.file_output.file 121 | except Exception: 122 | return None 123 | def multiple_convert(self, file_paths: List[str], output_paths: Optional[List[str]] = None): 124 | """ 125 | Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .png, .zip). Converts multiple files at once. 126 | Args: 127 | file_paths (str): The paths to the input files 128 | output_paths (Optional[str]): The path to the output files 129 | Returns: 130 | output_paths (List[str]): Paths to the output files 131 | Raises: 132 | ValidationError: if the format of the input file is not support or if the format of the output file is not PDF 133 | FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten 134 | """ 135 | input_files = [FilePath(file=fl) for fl in file_paths] 136 | to_convert_list = MultipleFileConversion(input_files=input_files, output_files=output_paths) 137 | output_fls: List[OutputPath] = [] 138 | for i in range(len(to_convert_list.input_files)): 139 | result = self.convert(file_path=to_convert_list.input_files[i].file, output_path=to_convert_list.output_files[i].file) 140 | if result is not None: 141 | output_fls.append(to_convert_list.output_files[i]) 142 | return [el.file for el in output_fls] 143 | def convert_directory(self, directory_path: str): 144 | """ 145 | Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .png, .zip). Converts all the files in a directory at once. 146 | Args: 147 | directory_path (str): The paths to the input files 148 | Returns: 149 | output_paths (List[str]): Paths to the output files 150 | Raises: 151 | ValidationError: if the format of the input file is not support or if the format of the output file is not PDF 152 | FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten 153 | """ 154 | dirpath = DirPath(path=directory_path) 155 | fls = [] 156 | p = os.walk(dirpath.path) 157 | for root, parent, file in p: 158 | for f in file: 159 | fls.append(root+"/"+f) 160 | output_paths = self.multiple_convert(file_paths=fls) 161 | return output_paths 162 | -------------------------------------------------------------------------------- /src/pdfitdown/pdfitdown_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from argparse import ArgumentParser 3 | from .pdfconversion import Converter 4 | from termcolor import cprint 5 | from pydantic import ValidationError 6 | 7 | def main(): 8 | parser = ArgumentParser() 9 | parser.add_argument("-i", "--inputfile", 10 | help="Path to the input file(s) that need to be converted to PDF. The path should be comma separated: input1.csv,input2.md,...,inputN.xml.", 11 | required=False, type=str, default=None) 12 | parser.add_argument("-o", "--outputfile", 13 | help="Path to the output PDF file(s). If more than one input file is provided, you should provide an equally long list of output files. The path should be comma separated: output1.pdf,output2.pdf,...,outputN.pdf. Defaults to 'None'", 14 | required=False, type=str, default=None) 15 | parser.add_argument("-t", "--title", 16 | help="Title to include in the PDF metadata. Default: 'File Converted with PdfItDown'. If more than one file is provided, it will be ignored.", 17 | required=False, default="File Converted with PdfItDown", type=str) 18 | parser.add_argument("-d", "--directory", 19 | help="Directory whose files you want to bulk-convert to PDF. If the --inputfile argument is also provided, it will be ignored. Defaults to None.", 20 | required=False, default=None, type=str) 21 | args = parser.parse_args() 22 | inf = args.inputfile 23 | outf = args.outputfile 24 | diri = args.directory 25 | titl = args.title 26 | conv = Converter() 27 | try: 28 | if inf is not None: 29 | if outf is not None and len(inf.split(",")) > 1: 30 | outf = conv.multiple_convert(inf.split(","), outf.split(",")) 31 | cprint("Conversion successful!🎉", color="green", attrs=["bold"], file=sys.stdout) 32 | sys.exit(0) 33 | elif outf is None and len(inf.split(",")) > 1: 34 | outf = conv.multiple_convert(inf.split(","), outf) 35 | cprint("Conversion successful!🎉", color="green", attrs=["bold"], file=sys.stdout) 36 | sys.exit(0) 37 | elif outf is not None and len(inf.split(",")) == 1: 38 | outf = conv.convert(inf.split(",")[0], outf.split(",")[0], title=titl) 39 | cprint("Conversion successful!🎉", color="green", attrs=["bold"], file=sys.stdout) 40 | sys.exit(0) 41 | else: 42 | cprint("ERROR! Invalid input provided, check your input and output files",color="red", file=sys.stderr) 43 | sys.exit(1) 44 | elif inf is None and diri is None: 45 | cprint("ERROR! You should provide at least one of --inputfile or --directory",color="red", file=sys.stderr) 46 | sys.exit(1) 47 | else: 48 | outf = conv.convert_directory(diri) 49 | cprint("Conversion successful!🎉", color="green", attrs=["bold"], file=sys.stdout) 50 | sys.exit(0) 51 | except ValidationError as e: 52 | cprint(f"ERROR! Error:\n\n{e}\n\nwas raised during conversion",color="red", file=sys.stderr) 53 | sys.exit(1) 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /src/pdfitdown/pdfitdown_ui.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | try: 4 | from .pdfconversion import Converter 5 | except ImportError: 6 | from pdfconversion import Converter 7 | from typing import List 8 | import gradio as gr 9 | 10 | class FileNotConvertedWarning(Warning): 11 | """The file was not in one of the specified formats for conversion to PDF,thus it was not converted""" 12 | 13 | def to_pdf(files: List[str]) -> List[str]: 14 | pdfs = [] 15 | converter = Converter() 16 | for fl in files: 17 | try: 18 | outf = converter.convert(fl, fl.replace(os.path.splitext(fl)[1], ".pdf")) 19 | except Exception as e: 20 | warnings.warn(f"File {fl} not converted because of an error during the conversion: {e}", FileNotConvertedWarning) 21 | else: 22 | pdfs.append(outf) 23 | return pdfs 24 | 25 | def convert_files(files: List[str]) -> List[str]: 26 | pdfs = to_pdf(files) 27 | return pdfs 28 | 29 | def main(): 30 | iface = gr.Interface( 31 | fn=convert_files, 32 | inputs=gr.File(label="Upload your file", file_count="multiple"), 33 | outputs=gr.File(label="Converted PDF", file_count="multiple"), 34 | title="File to PDF Converter", 35 | description="Upload a file in .docx, .xlsx, .html, .pptx, .json, .csv, .xml, .md, .jpg/.jpeg, .png, .zip format, and get it converted to PDF." 36 | ) 37 | iface.launch() 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tests/data/test.txt: -------------------------------------------------------------------------------- 1 | This is example text 2 | -------------------------------------------------------------------------------- /tests/data/test0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test0.png -------------------------------------------------------------------------------- /tests/data/test1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test1.pptx -------------------------------------------------------------------------------- /tests/data/test2.md: -------------------------------------------------------------------------------- 1 | ## This is a test markdown 2 | 3 | This is a test markdown 4 | -------------------------------------------------------------------------------- /tests/data/test3.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "v1", 3 | "kind": "ExampleData", 4 | "metadata": { 5 | "name": "sample-data", 6 | "creationTimestamp": "2023-10-27T10:00:00Z", 7 | "labels": { 8 | "environment": "production", 9 | "application": "data-processor" 10 | }, 11 | "annotations": { 12 | "description": "This is a sample data set for testing purposes." 13 | }, 14 | "uid": "a1b2c3d4-e5f6-7890-1234-567890abcdef" 15 | }, 16 | "spec": { 17 | "dataType": "string", 18 | "source": "Database", 19 | "retentionPolicy": { 20 | "duration": "30d", 21 | "action": "archive" 22 | }, 23 | "validationRules": [ 24 | { 25 | "field": "name", 26 | "type": "regex", 27 | "pattern": "^[a-zA-Z0-9\\s]+$" 28 | }, 29 | { 30 | "field": "age", 31 | "type": "range", 32 | "min": 0, 33 | "max": 120 34 | } 35 | ] 36 | }, 37 | "status": { 38 | "state": "Active", 39 | "lastProcessed": "2023-10-27T09:55:00Z", 40 | "processedRecords": 12345, 41 | "errors": 0, 42 | "message": "Data processing is running smoothly." 43 | }, 44 | "data": [ 45 | { 46 | "id": 1, 47 | "name": "John Doe", 48 | "age": 30, 49 | "city": "New York" 50 | }, 51 | { 52 | "id": 2, 53 | "name": "Jane Smith", 54 | "age": 25, 55 | "city": "Los Angeles" 56 | }, 57 | { 58 | "id": 3, 59 | "name": "Peter Jones", 60 | "age": 40, 61 | "city": "Chicago" 62 | } 63 | ] 64 | } 65 | -------------------------------------------------------------------------------- /tests/data/test4.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test4.docx -------------------------------------------------------------------------------- /tests/data/test5.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test5.zip -------------------------------------------------------------------------------- /tests/llamaparse/test1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/llamaparse/test1.pptx -------------------------------------------------------------------------------- /tests/llamaparse/test4.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/llamaparse/test4.docx -------------------------------------------------------------------------------- /tests/test_llamaparse.py: -------------------------------------------------------------------------------- 1 | from pdfconversion import Converter 2 | from llama_parse import LlamaParse 3 | import pathlib 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | reader = LlamaParse(api_key=os.getenv("llamacloud_api_key"), result_type="markdown") 10 | converter = Converter(reader=reader) 11 | 12 | def test_single_file(): 13 | test_cases = [ 14 | { 15 | "test_name": "Successful HTML conversion", 16 | "file_input": "tests/data/test1.pptx", 17 | "file_output": "tests/data/test1.pdf", 18 | "expected": True 19 | }, 20 | { 21 | "test_name": "Successful md file conversion", 22 | "file_input": "tests/data/test2.md", 23 | "file_output": "tests/data/test2.pdf", 24 | "expected": True 25 | }, 26 | { 27 | "test_name": "Successful image file conversion", 28 | "file_input": "tests/data/test0.png", 29 | "file_output": "tests/data/test0.pdf", 30 | "expected": True 31 | }, 32 | { 33 | "test_name": "Unsuccessful file conversion", 34 | "file_input": "tests/data/tes.md", 35 | "file_output": "tests/data/tes.pdf", 36 | "expected": False 37 | }, 38 | ] 39 | for c in test_cases: 40 | print(c["test_name"]) 41 | try: 42 | result = converter.convert(file_path=c["file_input"], output_path=c["file_output"]) 43 | assert pathlib.Path(result).is_file() == c["expected"] 44 | if pathlib.Path(result).is_file(): 45 | os.remove(result) 46 | except Exception: 47 | result = c["file_output"] 48 | assert pathlib.Path(result).is_file() == c["expected"] 49 | 50 | def test_multiple_files(): 51 | test_cases = [ 52 | { 53 | "test_name": "Specified output files", 54 | "file_input": ["tests/data/test1.pptx","tests/data/test4.docx","tests/data/test2.md"], 55 | "file_output": ["tests/data/test0_1.pdf","tests/data/test_1.pdf","tests/data/test2_1.pdf"], 56 | "expected": [True, True, True] 57 | }, 58 | { 59 | "test_name": "Unspecified output files", 60 | "file_input": ["tests/data/test1.pptx","tests/data/test4.docx","tests/data/test2.md"], 61 | "file_output": None, 62 | "expected": [True, True, True] 63 | }, 64 | { 65 | "test_name": "Unspecified output files", 66 | "file_input": ["tests/data/test1.pptx","tests/data/test4.docx","tests/data/test2.md"], 67 | "file_output": ["tests/data/test0_2.pdf"], 68 | "expected": False 69 | }, 70 | ] 71 | for c in test_cases: 72 | print(c["test_name"]) 73 | try: 74 | result = converter.multiple_convert(file_paths=c["file_input"], output_paths=c["file_output"]) 75 | assert [pathlib.Path(r).is_file() for r in result] == c["expected"] 76 | for f in result: 77 | if pathlib.Path(f).is_file(): 78 | os.remove(f) 79 | except Exception: 80 | assert pathlib.Path(c["file_output"][0]).is_file() == c["expected"] 81 | 82 | 83 | def test_dir(): 84 | test_cases = [ 85 | { 86 | "test_name": "Correct dir path", 87 | "file_input": "tests/llamaparse", 88 | "file_output": ["tests/llamaparse/test1.pdf", "tests/llamaparse/test4.pdf"], 89 | "expected": [True, True] 90 | }, 91 | { 92 | "test_name": "Wrong dir path", 93 | "file_input": "tests/data/llamapars", 94 | "file_output": ["tests/llamaparse/test1.pdf", "tests/llamaparse/test4.pdf"], 95 | "expected": [False, False] 96 | }, 97 | ] 98 | for c in test_cases: 99 | print(c["test_name"]) 100 | try: 101 | converter.convert_directory(directory_path=c["file_input"]) 102 | assert [pathlib.Path(r).is_file() for r in c["file_output"]] == c["expected"] 103 | for f in c["file_output"]: 104 | if pathlib.Path(f).is_file(): 105 | os.remove(f) 106 | except Exception: 107 | assert [pathlib.Path(r).is_file() for r in c["file_output"]] == c["expected"] 108 | -------------------------------------------------------------------------------- /tests/test_markitdown.py: -------------------------------------------------------------------------------- 1 | from pdfconversion import Converter 2 | import pathlib 3 | import os 4 | 5 | converter = Converter() 6 | 7 | def test_single_file(): 8 | test_cases = [ 9 | { 10 | "test_name": "Successful image conversion", 11 | "file_input": "tests/data/test0.png", 12 | "file_output": "tests/data/test0.pdf", 13 | "expected": True 14 | }, 15 | { 16 | "test_name": "Successful text file conversion", 17 | "file_input": "tests/data/test.txt", 18 | "file_output": "tests/data/test.pdf", 19 | "expected": True 20 | }, 21 | { 22 | "test_name": "Successful md file conversion", 23 | "file_input": "tests/data/test2.md", 24 | "file_output": "tests/data/test2.pdf", 25 | "expected": True 26 | }, 27 | { 28 | "test_name": "Unsuccessful file conversion", 29 | "file_input": "tests/data/test1.pptx", 30 | "file_output": "tests/data/test1.pdf", 31 | "expected": True 32 | }, 33 | { 34 | "test_name": "Unsuccessful file conversion", 35 | "file_input": "tests/data/tes.md", 36 | "file_output": "tests/data/tes.pdf", 37 | "expected": False 38 | }, 39 | ] 40 | for c in test_cases: 41 | print(c["test_name"]) 42 | try: 43 | result = converter.convert(file_path=c["file_input"], output_path=c["file_output"]) 44 | assert pathlib.Path(result).is_file() == c["expected"] 45 | if pathlib.Path(result).is_file(): 46 | os.remove(result) 47 | except Exception: 48 | result = c["file_output"] 49 | assert pathlib.Path(result).is_file() == c["expected"] 50 | 51 | def test_multiple_files(): 52 | test_cases = [ 53 | { 54 | "test_name": "Specified output files", 55 | "file_input": ["tests/data/test0.png","tests/data/test.txt","tests/data/test2.md"], 56 | "file_output": ["tests/data/test0_1.pdf","tests/data/test_1.pdf","tests/data/test2_1.pdf"], 57 | "expected": [True, True, True] 58 | }, 59 | { 60 | "test_name": "Unspecified output files", 61 | "file_input": ["tests/data/test0.png","tests/data/test.txt","tests/data/test2.md"], 62 | "file_output": None, 63 | "expected": [True, True, True] 64 | }, 65 | { 66 | "test_name": "Unspecified output files", 67 | "file_input": ["tests/data/test0.png","tests/data/test.txt","tests/data/test2.md"], 68 | "file_output": ["tests/data/test0_2.pdf"], 69 | "expected": False 70 | }, 71 | ] 72 | for c in test_cases: 73 | print(c["test_name"]) 74 | try: 75 | result = converter.multiple_convert(file_paths=c["file_input"], output_paths=c["file_output"]) 76 | assert [pathlib.Path(r).is_file() for r in result] == c["expected"] 77 | for f in result: 78 | if pathlib.Path(f).is_file(): 79 | os.remove(f) 80 | except Exception: 81 | assert pathlib.Path(c["file_output"][0]).is_file() == c["expected"] 82 | 83 | 84 | def test_dir(): 85 | test_cases = [ 86 | { 87 | "test_name": "Correct dir path", 88 | "file_input": "tests/data", 89 | "file_output": ["tests/data/test0.pdf","tests/data/test1.pdf", "tests/data/test.pdf","tests/data/test2.pdf", "tests/data/test3.pdf", "tests/data/test4.pdf", "tests/data/test5.pdf"], 90 | "expected": [True, True, True, True, True, True, True] 91 | }, 92 | { 93 | "test_name": "Wrong dir path", 94 | "file_input": "tests/dat", 95 | "file_output": ["tests/data/test0.pdf","tests/data/test1.pdf", "tests/data/test.pdf","tests/data/test2.pdf", "tests/data/test3.pdf", "tests/data/test4.pdf", "tests/data/test5.pdf"], 96 | "expected": [False, False, False, False, False, False, False] 97 | }, 98 | ] 99 | for c in test_cases: 100 | print(c["test_name"]) 101 | try: 102 | converter.convert_directory(directory_path=c["file_input"]) 103 | assert [pathlib.Path(r).is_file() for r in c["file_output"]] == c["expected"] 104 | for f in c["file_output"]: 105 | if pathlib.Path(f).is_file(): 106 | os.remove(f) 107 | except Exception: 108 | assert [pathlib.Path(r).is_file() for r in c["file_output"]] == c["expected"] 109 | -------------------------------------------------------------------------------- /tests/test_ui.py: -------------------------------------------------------------------------------- 1 | from pdfitdown_ui import to_pdf 2 | import os 3 | from pathlib import Path 4 | 5 | def test_to_pdf(): 6 | test_files = ["tests/data/test0.png","tests/data/test.txt","tests/data/test2.md"] 7 | expected_outputs = ["tests/data/test0.pdf","tests/data/test.pdf","tests/data/test2.pdf"] 8 | assert to_pdf(test_files) == expected_outputs 9 | for p in expected_outputs: 10 | if Path(p).is_file(): 11 | os.remove(p) 12 | --------------------------------------------------------------------------------