├── .github
└── workflows
│ ├── linting.yml
│ ├── release.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── img
├── logo.png
├── pdfitdown_ui_demo.mp4
└── thumbnail.png
├── pyproject.toml
├── src
└── pdfitdown
│ ├── __init__.py
│ ├── __pycache__
│ ├── pdfconversion.cpython-313.pyc
│ └── reader.cpython-313.pyc
│ ├── pdfconversion.py
│ ├── pdfitdown_cli.py
│ └── pdfitdown_ui.py
├── tests
├── data
│ ├── test.txt
│ ├── test0.png
│ ├── test1.pptx
│ ├── test2.md
│ ├── test3.json
│ ├── test4.docx
│ └── test5.zip
├── llamaparse
│ ├── test1.pptx
│ └── test4.docx
├── test_llamaparse.py
├── test_markitdown.py
└── test_ui.py
└── uv.lock
/.github/workflows/linting.yml:
--------------------------------------------------------------------------------
# Runs the repository's pre-commit hooks against every pull request.
name: Linting

on:
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      - name: Set up Python
        run: uv python install 3.12

      # Creates .venv and installs pre-commit into it; with an explicit
      # `shell: bash` the step still stops at the first failing command.
      - name: Install pre-commit
        shell: bash
        run: |
          uv venv
          source .venv/bin/activate
          uv pip install pre-commit

      # -a runs every configured hook against all files.
      - name: Run linter
        shell: bash
        run: uv run -- pre-commit run -a
25 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
# Builds the package with Hatch, publishes it to PyPI and creates a GitHub
# Release whenever a version tag is pushed.
name: PyPI Release

on:
  push:
    tags:
      # One or more digits per component, so double-digit majors (v10.0.0)
      # trigger a release too; the trailing * allows suffixes such as rc1.
      - "v[0-9]+.[0-9]+.[0-9]+*"

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      # Presumably required by the release step below to create the release
      # and upload the built artifacts.
      contents: write

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Hatch
        run: pip install hatch

      - name: Publish on PyPi
        env:
          HATCH_INDEX_USER: __token__
          HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          hatch build
          hatch publish -y

      # Attaches everything hatch wrote to dist/ to the GitHub Release.
      - name: Create GitHub Release
        uses: ncipollo/release-action@v1
        with:
          artifacts: "dist/*"
          generateReleaseNotes: true
34 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: test

on:
  # push:
  #   branches:
  #     - main
  pull_request:

# A newer run for the same pull-request branch cancels the one in progress.
concurrency:
  group: test-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  test-linux:
    name: Python ${{ matrix.python-version }} on Linux
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining Python versions finish even if one of them fails.
      fail-fast: false
      matrix:
        # Quoted so that 3.10 is not read as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
          - "3.12"

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Hatch
        run: pip install --upgrade hatch

      # `test` is the Hatch script declared in pyproject.toml.
      - name: Run tests
        env:
          llamacloud_api_key: ${{ secrets.LLAMACLOUD_API_KEY }}
        run: hatch run test
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Build and packaging output
dist/
build/
src/pdfitdown.egg-info/

# Tool caches
.pytest_cache/
.ruff_cache/

# Python bytecode: ignored everywhere, not only under tests/.
# Files that are already tracked (e.g. src/pdfitdown/__pycache__/*.pyc) stay
# tracked until they are removed with `git rm --cached`.
__pycache__/
*.py[cod]
tests/__pycache__/

# PDFs generated by the test-suite
tests/data/*.pdf

# Local environment variables / secrets
.env
9 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
---
# Hooks executed by `pre-commit run -a` (see the Linting workflow).
default_language_version:
  python: python3

repos:
  # NOTE(review): the llama-index-core/... excludes below do not match any
  # path listed in this repository's tree - presumably inherited from another
  # project's configuration; confirm before removing them.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-byte-order-marker
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-toml
        exclude: llama-index-core/llama_index/core/_static
      - id: check-yaml
        exclude: llama-index-core/llama_index/core/_static
      - id: detect-private-key
      - id: end-of-file-fixer
        exclude: llama-index-core/llama_index/core/_static
      - id: mixed-line-ending
        exclude: llama-index-core/llama_index/core/_static
      - id: trailing-whitespace
        exclude: llama-index-core/llama_index/core/_static

  # Canonical location of the ruff hooks (formerly charliermarsh/ruff-pre-commit).
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.8
    hooks:
      - id: ruff
        args: [--exit-non-zero-on-fix, --fix]
        exclude: ".*poetry.lock|.*_static"

  # - repo: https://github.com/psf/black-pre-commit-mirror
  #   rev: 23.10.1
  #   hooks:
  #     - id: black-jupyter
  #       name: black-src
  #       alias: black
  #       exclude: "^docs|.*poetry.lock|.*_static"

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.0.1
    hooks:
      - id: mypy
        additional_dependencies:
          - "types-requests"
          - "types-Deprecated"
          - "types-redis"
          - "types-setuptools"
          - "types-PyYAML"
          - "types-protobuf==4.24.0.4"
        args:
          - "--namespace-packages"
          - "--explicit-package-bases"
          - "--disallow-untyped-defs"
          - "--ignore-missing-imports"
          - "--python-version=3.9"
        # NOTE(review): this replaces the hook's own entry point with a shell
        # command that only exports a variable, so mypy itself does not appear
        # to be invoked; MYPYPATH also names `ingest_anything`, which is not a
        # directory of this repository - confirm whether this is intended.
        entry: bash -c "export MYPYPATH=ingest_anything"

  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 23.10.1
    hooks:
      - id: black-jupyter
        name: black-docs-py
        alias: black
        files: ^(docs/|examples/)
        # Using PEP 8's line length in docs prevents excess left/right scrolling
        args: [--line-length=79]

  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v3.0.3
    hooks:
      - id: prettier
        exclude: llama-index-core/llama_index/core/_static|poetry.lock|llama-index-legacy/llama_index/legacy/_static|docs/docs

  - repo: https://github.com/pappasam/toml-sort
    rev: v0.23.1
    hooks:
      - id: toml-sort-fix
        exclude: ".*poetry.lock|.*_static"
83 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to `PdfItDown`
2 |
3 | Do you want to contribute to this project? Make sure to read these guidelines first :)
4 |
5 | ## Issue
6 |
7 | **When to do it**:
8 |
9 | - You found bugs but you don't know how to solve them or don't have the time or will to solve them
10 | - You want new features but you don't know how to implement them or don't have the time or will to implement them
11 |
12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
13 |
14 | **How to do it**:
15 |
16 | - Open an issue
17 | - Give the issue a meaningful title (short but effective problem description)
18 | - Describe the problem
19 |
20 | ## Traditional contribution
21 |
22 | **When to do it**:
23 |
24 | - You found bugs and corrected them
25 | - You optimized/improved the code
26 | - You added new features that you think could be useful to others
27 |
28 | **How to do it**:
29 |
30 | 1. Fork this repository
31 | 2. Test your changes locally
32 |
33 | ```
34 | uv pip install hatch
35 | cd PdfItDown/
36 | hatch run test
37 | ```
38 |
39 | 3. If all the tests pass, you can commit your changes.
40 | 4. Submit a pull request (make sure to provide a thorough description of the changes)
41 |
42 | > [!NOTE] > _If you add a new feature, you might need to add new tests!_
43 |
44 | ### Thanks for contributing!
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Clelia (Astra) Bertelli
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
PdfItDown
3 | Convert Everything to PDF
4 |
5 |
6 |
7 |

8 |
9 |
10 |
11 |

12 |
13 |
14 | **PdfItDown** is a python package that relies on [`markitdown` by Microsoft](https://github.com/microsoft/markitdown/), [`markdown_pdf`](https://github.com/vb64/markdown-pdf), [img2pdf](https://pypi.org/project/img2pdf/) and [`LlamaIndex`](https://www.llamaindex.ai/). Visit us on our [documentation website](https://pdfitdown.eu)!
15 |
16 | ### Applicability
17 |
18 | **PdfItDown** is applicable to the following file formats:
19 |
20 | - Markdown
21 | - PowerPoint
22 | - Word
23 | - Excel
24 | - HTML
25 | - Text-based formats (CSV, XML, JSON)
26 | - ZIP files (iterates over contents)
27 | - Image files (PNG, JPG)
28 |
29 | The format-specific support needs to be evaluated for the specific reader you are using.
30 |
31 | ### How does it work?
32 |
33 | **PdfItDown** works in a very simple way:
34 |
35 | - From **markdown** to PDF
36 |
37 | ```mermaid
38 | graph LR
39 | 2(Input File) --> 3[Markdown content]
40 | 3[Markdown content] --> 4[markdown-pdf]
41 | 4[markdown-pdf] --> 5(PDF file)
42 | ```
43 |
44 | - From **image** to PDF
45 |
46 | ```mermaid
47 | graph LR
48 | 2(Input File) --> 3[Bytes]
49 | 3[Bytes] --> 4[img2pdf]
50 | 4[img2pdf] --> 5(PDF file)
51 | ```
52 |
53 | - From other **text-based** file formats to PDF
54 |
55 | ```mermaid
56 | graph LR
57 | 2(Input File) --> 3[LlamaIndex-compatible Reader - defaults to MarkItDown]
58 | 3[LlamaIndex-compatible Reader - defaults to MarkItDown] --> 4[Markdown content]
59 | 4[Markdown content] --> 5[markdown-pdf]
60 | 5[markdown-pdf] --> 6(PDF file)
61 | ```
62 |
63 | ### Installation and Usage
64 |
65 | To install **PdfItDown**, just run:
66 |
67 | ```bash
68 | python3 -m pip install pdfitdown
69 | ```
70 |
71 | You can now use the **command line tool**:
72 |
73 | ```
74 | usage: pdfitdown [-h] [-i INPUTFILE] [-o OUTPUTFILE] [-t TITLE] [-d DIRECTORY]
75 |
76 | options:
77 | -h, --help show this help message and exit
78 | -i, --inputfile INPUTFILE
79 | Path to the input file(s) that need to be converted to PDF. The path should be comma
80 | separated: input1.csv,input2.md,...,inputN.xml.
81 | -o, --outputfile OUTPUTFILE
82 | Path to the output PDF file(s). If more than one input file is provided, you should provide an
83 | equally long list of output files. The path should be comma separated:
84 | output1.pdf,output2.pdf,...,outputN.pdf. Defaults to 'None'
85 | -t, --title TITLE Title to include in the PDF metadata. Default: 'File Converted with PdfItDown'. If more than
86 | one file is provided, it will be ignored.
87 | -d, --directory DIRECTORY
88 | Directory whose files you want to bulk-convert to PDF. If the --inputfile argument is also
89 | provided, it will be ignored. Defaults to None.
90 | ```
91 |
92 | An example usage can be:
93 |
94 | ```bash
95 | pdfitdown -i README.md -o README.pdf -t "README"
96 | ```
97 |
98 | Or you can use it **inside your python scripts**:
99 |
100 | ```python
101 | from pdfitdown.pdfconversion import Converter
102 |
103 | converter = Converter()
104 | converter.convert(file_path = "business_growth.md", output_path = "business_growth.pdf", title="Business Growth for Q3 in 2024")
105 | converter.convert(file_path = "logo.png", output_path = "logo.pdf")
106 | converter.convert(file_path = "users.xlsx", output_path = "users.pdf")
107 | ```
108 |
109 | You can also convert **multiple files at once**:
110 |
111 | - In the CLI:
112 |
113 | ```bash
114 | # with custom output paths
115 | pdfitdown -i "test0.png,test1.csv" -o "testoutput0.pdf,testoutput1.pdf"
116 | # with inferred output paths
117 | pdfitdown -i "test0.png,test1.csv"
118 | ```
119 |
120 | - In the Python API:
121 |
122 | ```python
123 | from pdfitdown.pdfconversion import Converter
124 | from llama_parse import LlamaParse
125 | from dotenv import load_dotenv
126 | import os
127 |
128 | load_dotenv()
129 |
130 | reader = LlamaParse(api_key=os.getenv("llamacloud_api_key"), result_type="markdown")
131 | converter = Converter(reader=reader)
132 | # with custom output paths
133 | converter.multiple_convert(file_paths = ["business_growth.md", "logo.png"], output_paths = ["business_growth.pdf", "logo.pdf"])
134 | # with inferred output paths
135 | converter.multiple_convert(file_paths = ["business_growth.md", "logo.png"])
136 | ```
137 |
138 | You can bulk-convert **all the files in a directory**:
139 |
140 | - In the CLI:
141 |
142 | ```bash
143 | pdfitdown -d tests/data/testdir
144 | ```
145 |
146 | - In the Python API:
147 |
148 | ```python
149 | from pdfitdown.pdfconversion import Converter
150 |
151 | converter = Converter()
152 | output_paths = converter.convert_directory(directory_path = "tests/data/testdir")
153 | print(output_paths)
154 | ```
155 |
156 | Or you can just launch a [Gradio](https://gradio.app)-based user interface:
157 |
158 | ```bash
159 | pdfitdown_ui
160 | ```
161 |
162 | You will be able to see the application running on `http://localhost:7860` within seconds!
163 |
164 | Watch the demo here:
165 |
166 | [Watch the PdfItDown UI demo](https://raw.githubusercontent.com/AstraBert/PdfItDown/main/img/pdfitdown_ui_demo.mp4)
167 |
168 | ### Contributing
169 |
170 | Contributions are always welcome!
171 |
172 | Find contribution guidelines at [CONTRIBUTING.md](https://github.com/AstraBert/PdfItDown/tree/main/CONTRIBUTING.md)
173 |
174 | ### License and Funding
175 |
176 | This project is open-source and is provided under an [MIT License](https://github.com/AstraBert/PdfItDown/tree/main/LICENSE).
177 |
178 | If you found it useful, please consider [funding it](https://github.com/sponsors/AstraBert).
179 |
--------------------------------------------------------------------------------
/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/logo.png
--------------------------------------------------------------------------------
/img/pdfitdown_ui_demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/pdfitdown_ui_demo.mp4
--------------------------------------------------------------------------------
/img/thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/img/thumbnail.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["hatchling>=1.0.0"]
build-backend = "hatchling.build"

# NOTE(review): setuptools-style table; the build backend declared above is
# hatchling, so this is presumably unused - confirm before removing.
[options.package_data]
pdfitdown = ["*"]

[project]
name = "pdfitdown"
version = "1.5.1"
authors = [
    {name = "Clelia (Astra) Bertelli", email = "astraberte9@gmail.com"}
]
description = "PdfItDown - Convert Everything to PDF"
readme = "README.md"
requires-python = ">=3.10,<3.13"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent"
]
dependencies = [
    'llama-index-readers-markitdown',
    # Imported directly by pdfitdown.pdfconversion, so they are declared
    # explicitly instead of being relied upon as transitive dependencies.
    # pydantic>=2 because the code uses field_validator / model_validator.
    'llama-index-core',
    'pydantic>=2',
    'typing-extensions',
    'markdown-pdf',
    'img2pdf',
    'pillow',
    'gradio',
    'termcolor'
]

# Console commands installed with the package.
[project.scripts]
pdfitdown = "pdfitdown.pdfitdown_cli:main"
pdfitdown_ui = "pdfitdown.pdfitdown_ui:main"

[project.urls]
Homepage = "https://github.com/AstraBert/PdfItDown"
Issues = "https://github.com/AstraBert/PdfItDown/issues"

# Ship only the package directory and strip the leading "src/" from its path.
[tool.hatch.build.targets.wheel]
only-include = ["src/pdfitdown"]

[tool.hatch.build.targets.wheel.sources]
"src" = ""

# Extra packages available when running `hatch run test`.
[tool.hatch.envs.default]
dependencies = [
    "pytest",
    "llama_parse"
]

# NOTE(review): the numeric keys below look accidental - confirm that Hatch
# reads this table at all.
[tool.hatch.envs.default.py-version]
40 = "3.10"
41 = "3.11"
42 = "3.12"

# The test script copies the modules next to the tests, runs pytest and then
# removes the copies; the clean-up is skipped when pytest fails (&& chain).
[tool.hatch.envs.default.scripts]
test = "cp src/pdfitdown/pdfconversion.py tests/ && cp src/pdfitdown/pdfitdown_ui.py tests/ && pytest tests/*.py -p no:warnings && rm tests/pdfconversion.py && rm tests/pdfitdown_ui.py"

# NOTE(review): setuptools configuration; presumably ignored by the hatchling
# backend - confirm before removing.
[tool.setuptools.packages.find]
where = ["src"]
include = ["pdfitdown*"]
61 | include = ["pdfitdown*"]
62 |
--------------------------------------------------------------------------------
/src/pdfitdown/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")  # NOTE(review): on import this ignores warnings of every category process-wide, which also hides this package's own FileExistsWarning - confirm this is intended
3 |
--------------------------------------------------------------------------------
/src/pdfitdown/__pycache__/pdfconversion.cpython-313.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/src/pdfitdown/__pycache__/pdfconversion.cpython-313.pyc
--------------------------------------------------------------------------------
/src/pdfitdown/__pycache__/reader.cpython-313.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/src/pdfitdown/__pycache__/reader.cpython-313.pyc
--------------------------------------------------------------------------------
/src/pdfitdown/pdfconversion.py:
--------------------------------------------------------------------------------
1 | import img2pdf
2 | import warnings
3 | import os
4 | from PIL import Image
5 | from markdown_pdf import MarkdownPdf, Section
6 | from pydantic import BaseModel, field_validator, model_validator
7 | from pathlib import Path
8 | from typing import List, Optional
9 | from typing_extensions import Self
10 | from llama_index.core.readers.base import BaseReader
11 | from llama_index.readers.markitdown import MarkItDownReader
12 |
class FilePath(BaseModel):
    """Pydantic model wrapping the path of an input file that must already exist.

    Attributes:
        file (str): Path to the input file.
    """

    file: str

    @field_validator("file")
    @classmethod
    def is_valid_file(cls, file: str) -> str:
        """
        Check that ``file`` points to an existing regular file.

        Args:
            file (str): The path to validate
        Returns:
            file (str): The same path, unchanged
        Raises:
            ValueError: if the path does not point to an existing file
        """
        if not Path(file).is_file():
            raise ValueError(f"{file} is not a file")
        return file
21 |
class FileExistsWarning(Warning):
    """Warning category signalling that a file already exists at the given path."""
24 |
class DirPath(BaseModel):
    """Pydantic model wrapping the path of an existing, non-empty directory.

    Attributes:
        path (str): Path to the directory.
    """

    path: str

    @model_validator(mode="after")
    def validate_dir_path(self) -> Self:
        """
        Reject paths that are not directories, as well as directories without entries.

        Returns:
            self (Self): The validated model
        Raises:
            ValueError: if the path is not an existing directory or the directory is empty
        """
        # Guard clauses: fail first on a missing directory, then on an empty one.
        if not Path(self.path).is_dir():
            raise ValueError("You should provide the path for an existing directory")
        if len(os.listdir(self.path)) == 0:
            raise ValueError("You should provide a non-empty directory")
        return self
36 |
class OutputPath(BaseModel):
    """Pydantic model wrapping the path of the PDF file that a conversion writes.

    Attributes:
        file (str): Path to the output PDF file.
    """

    file: str

    @field_validator("file")
    @classmethod
    def file_exists_warning(cls, file: str) -> str:
        """
        Check that ``file`` has a PDF extension and warn when it would be overwritten.

        Args:
            file (str): The output path to validate
        Returns:
            file (str): The same path, unchanged
        Raises:
            ValueError: if the extension of the path is not ``.pdf`` (compared case-insensitively)
        Warns:
            FileExistsWarning: if a file already exists at the path
        """
        # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted as a PDF path.
        if os.path.splitext(file)[1].lower() != ".pdf":
            raise ValueError("Output file must be a PDF")
        if Path(file).is_file():
            warnings.warn(f"The file {file} already exists, you are about to overwrite it", FileExistsWarning)
        return file
47 |
class MultipleFileConversion(BaseModel):
    """Pydantic model pairing a list of input files with their output PDF paths.

    Attributes:
        input_files (List[FilePath]): Validated input files.
        output_files (List[str] | List[OutputPath] | None): Output paths; when None, one
            path per input file is inferred by swapping the input extension for ``.pdf``.
    """

    input_files: List[FilePath]
    output_files: List[str] | List[OutputPath] | None

    @model_validator(mode="after")
    def validate_multiple_file_conversion(self) -> Self:
        """
        Normalise ``output_files`` into a list of ``OutputPath`` matching ``input_files``.

        Returns:
            self (Self): The validated model
        Raises:
            ValueError: if explicit output paths are given and their number differs from the
                number of input files, or if an output path is not a PDF path
        """
        if self.output_files is None:
            # Replace only the trailing extension. str.replace() would rewrite every
            # occurrence of the extension inside the path and, for a file without an
            # extension, insert ".pdf" between all characters.
            self.output_files = [
                OutputPath(file=os.path.splitext(fl.file)[0] + ".pdf") for fl in self.input_files
            ]
            return self
        if len(self.input_files) != len(self.output_files):
            raise ValueError("Input and output files must be lists of the same length")
        # Convert element by element: this also copes with empty lists, where indexing the
        # first element to detect the item type would raise IndexError.
        self.output_files = [
            fl if isinstance(fl, OutputPath) else OutputPath(file=fl) for fl in self.output_files
        ]
        return self
62 |
class Converter:
    """A class for converting .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .jpeg, .zip files into PDF"""

    def __init__(self, reader: Optional[BaseReader] = None) -> None:
        """
        Initialize the Converter class.

        Args:
            reader (Optional[BaseReader]): the reader to extract the file text (needs to be LlamaIndex-compatible). Defaults to MarkItDown reader.
        Returns:
            None
        """
        self._reader = reader if reader is not None else MarkItDownReader()

    @staticmethod
    def _save_markdown_as_pdf(markdown_text: str, output_file: str, title: str) -> None:
        """Render ``markdown_text`` into a PDF saved at ``output_file``, storing ``title`` in its metadata."""
        pdf = MarkdownPdf(toc_level=0)
        pdf.add_section(Section(markdown_text))
        pdf.meta["title"] = title
        pdf.save(output_file)

    def convert(self, file_path: str, output_path: str, title: str = "File Converted with PdfItDown") -> Optional[str]:
        """
        Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .jpeg, .zip).

        Markdown files are rendered directly, image files are embedded with img2pdf and every
        other file is first turned into text by the configured reader.

        Args:
            file_path (str): The path to the input file
            output_path (str): The path to the output file
            title (str): The title for the PDF document (defaults to: 'File Converted with PdfItDown')
        Returns:
            output_path (Optional[str]): Path to the output file, or None if the reader-based conversion failed
        Raises:
            ValidationError: if the input path is not an existing file or if the format of the output file is not PDF
        Warns:
            FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten
            RuntimeWarning: if the reader-based conversion failed and None is returned
        """
        self.file_input = FilePath(file=file_path)
        self.file_output = OutputPath(file=output_path)
        # Dispatch on the lower-cased extension so that e.g. ".MD" and ".PNG" take the
        # same route as their lower-case spellings.
        extension = os.path.splitext(self.file_input.file)[1].lower()
        if extension == ".md":
            # The context manager closes the handle even if reading fails.
            with open(self.file_input.file, "r", encoding="utf-8") as f:
                markdown_text = f.read()
            self._save_markdown_as_pdf(markdown_text, self.file_output.file, title)
            return self.file_output.file
        if extension in (".jpg", ".jpeg", ".png"):
            # Opening the file with Pillow fails early when it is not a readable image.
            with Image.open(self.file_input.file) as image:
                pdf_bytes = img2pdf.convert(image.filename)
            with open(self.file_output.file, "wb") as file:
                file.write(pdf_bytes)
            return self.file_output.file
        try:
            result = self._reader.load_data([self.file_input.file])
            self._save_markdown_as_pdf(result[0].text, self.file_output.file, title)
        except Exception as e:
            # Best effort: callers treat None as "not converted", so report the reason
            # instead of failing silently and keep returning None.
            warnings.warn(f"The file {self.file_input.file} could not be converted to PDF: {e}", RuntimeWarning, stacklevel=2)
            return None
        return self.file_output.file

    def multiple_convert(self, file_paths: List[str], output_paths: Optional[List[str]] = None) -> List[str]:
        """
        Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .jpeg, .zip). Converts multiple files at once.

        Args:
            file_paths (List[str]): The paths to the input files
            output_paths (Optional[List[str]]): The paths to the output files. If None, they are inferred from the input paths
        Returns:
            output_paths (List[str]): Paths to the output files that were actually produced
        Raises:
            ValidationError: if an input path is not an existing file, if an output path is not a PDF or if the two lists differ in length
        Warns:
            FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten
        """
        input_files = [FilePath(file=fl) for fl in file_paths]
        to_convert = MultipleFileConversion(input_files=input_files, output_files=output_paths)
        converted: List[str] = []
        for source, target in zip(to_convert.input_files, to_convert.output_files):
            # convert() returns None for files the reader could not handle: leave them out.
            if self.convert(file_path=source.file, output_path=target.file) is not None:
                converted.append(target.file)
        return converted

    def convert_directory(self, directory_path: str) -> List[str]:
        """
        Convert various document types into PDF format (supports .docx, .html, .xml, .json, .csv, .md, .pptx, .xlsx, .png, .jpg, .jpeg, .zip). Converts all the files in a directory at once.

        The directory is walked recursively and the output paths are inferred from the input paths.

        Args:
            directory_path (str): The path to the directory that contains the input files
        Returns:
            output_paths (List[str]): Paths to the output files
        Raises:
            ValidationError: if the path is not an existing, non-empty directory
        Warns:
            FileExistsWarning: if the output PDF path is an existing file, it warns you that the file will be overwritten
        """
        dirpath = DirPath(path=directory_path)
        fls = []
        for root, _subdirs, filenames in os.walk(dirpath.path):
            for name in filenames:
                # os.path.join uses the platform separator and avoids doubled slashes.
                fls.append(os.path.join(root, name))
        return self.multiple_convert(file_paths=fls)
162 |
--------------------------------------------------------------------------------
/src/pdfitdown/pdfitdown_cli.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from argparse import ArgumentParser
3 | from .pdfconversion import Converter
4 | from termcolor import cprint
5 | from pydantic import ValidationError
6 |
def main() -> None:
    """
    Entry point of the `pdfitdown` command line tool.

    Parses the command line, converts either the comma-separated input files or all
    the files of a directory to PDF, prints the outcome and exits with status 0 on
    success or 1 on failure. When --inputfile is given, --directory is not used.
    """
    import os  # local import: only needed to infer the output path of a single input file

    parser = ArgumentParser()
    parser.add_argument("-i", "--inputfile",
                        help="Path to the input file(s) that need to be converted to PDF. The path should be comma separated: input1.csv,input2.md,...,inputN.xml.",
                        required=False, type=str, default=None)
    parser.add_argument("-o", "--outputfile",
                        help="Path to the output PDF file(s). If more than one input file is provided, you should provide an equally long list of output files. The path should be comma separated: output1.pdf,output2.pdf,...,outputN.pdf. Defaults to 'None'",
                        required=False, type=str, default=None)
    parser.add_argument("-t", "--title",
                        help="Title to include in the PDF metadata. Default: 'File Converted with PdfItDown'. If more than one file is provided, it will be ignored.",
                        required=False, default="File Converted with PdfItDown", type=str)
    parser.add_argument("-d", "--directory",
                        help="Directory whose files you want to bulk-convert to PDF. If the --inputfile argument is also provided, it will be ignored. Defaults to None.",
                        required=False, default=None, type=str)
    args = parser.parse_args()
    conv = Converter()
    try:
        if args.inputfile is not None:
            # Split the comma-separated lists once instead of in every branch.
            inputs = args.inputfile.split(",")
            outputs = args.outputfile.split(",") if args.outputfile is not None else None
            if len(inputs) > 1:
                converted = conv.multiple_convert(inputs, outputs)
                succeeded = len(converted) > 0
                if succeeded and len(converted) < len(inputs):
                    cprint(f"WARNING! {len(inputs) - len(converted)} file(s) could not be converted", color="yellow", file=sys.stderr)
            else:
                # Without --outputfile the output path is inferred from the input path,
                # as already happens when several input files are given.
                target = outputs[0] if outputs is not None else os.path.splitext(inputs[0])[0] + ".pdf"
                # convert() returns None when the conversion did not produce a PDF.
                succeeded = conv.convert(inputs[0], target, title=args.title) is not None
        elif args.directory is not None:
            succeeded = len(conv.convert_directory(args.directory)) > 0
        else:
            cprint("ERROR! You should provide at least one of --inputfile or --directory", color="red", file=sys.stderr)
            sys.exit(1)
    except ValidationError as e:
        cprint(f"ERROR! Error:\n\n{e}\n\nwas raised during conversion", color="red", file=sys.stderr)
        sys.exit(1)
    if not succeeded:
        cprint("ERROR! The conversion did not produce any PDF, check your input files", color="red", file=sys.stderr)
        sys.exit(1)
    cprint("Conversion successful!🎉", color="green", attrs=["bold"], file=sys.stdout)
    sys.exit(0)


if __name__ == "__main__":
    main()
57 |
--------------------------------------------------------------------------------
/src/pdfitdown/pdfitdown_ui.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import os
3 | try:
4 | from .pdfconversion import Converter
5 | except ImportError:
6 | from pdfconversion import Converter
7 | from typing import List
8 | import gradio as gr
9 |
class FileNotConvertedWarning(Warning):
    """Warning category signalling that a file could not be converted to PDF and was therefore skipped."""
12 |
def to_pdf(files: List[str]) -> List[str]:
    """
    Convert each of the given files to a PDF written next to it.

    Files that cannot be converted are skipped with a FileNotConvertedWarning.

    Args:
        files (List[str]): Paths to the input files
    Returns:
        pdfs (List[str]): Paths to the PDF files that were produced
    """
    pdfs = []
    converter = Converter()
    for fl in files:
        # Swap only the trailing extension: str.replace() would rewrite every occurrence
        # of the extension in the path and break on files without an extension.
        target = os.path.splitext(fl)[0] + ".pdf"
        try:
            outf = converter.convert(fl, target)
        except Exception as e:
            warnings.warn(f"File {fl} not converted because of an error during the conversion: {e}", FileNotConvertedWarning)
            continue
        if outf is None:
            # convert() returns None when it could not produce a PDF; do not hand None
            # back to the caller as if it were a file path.
            warnings.warn(f"File {fl} not converted because the conversion did not produce a PDF", FileNotConvertedWarning)
            continue
        pdfs.append(outf)
    return pdfs
24 |
def convert_files(files: List[str]) -> List[str]:
    """Callback handed to the Gradio interface: delegates to ``to_pdf`` and returns its list of PDF paths."""
    return to_pdf(files)
28 |
def main():
    """Build the Gradio interface around ``convert_files`` and launch it."""
    # Both widgets accept or return several files at once.
    upload_widget = gr.File(label="Upload your file", file_count="multiple")
    download_widget = gr.File(label="Converted PDF", file_count="multiple")
    app = gr.Interface(
        fn=convert_files,
        inputs=upload_widget,
        outputs=download_widget,
        title="File to PDF Converter",
        description="Upload a file in .docx, .xlsx, .html, .pptx, .json, .csv, .xml, .md, .jpg/.jpeg, .png, .zip format, and get it converted to PDF."
    )
    app.launch()


if __name__ == "__main__":
    main()
41 |
--------------------------------------------------------------------------------
/tests/data/test.txt:
--------------------------------------------------------------------------------
1 | This is example text
2 |
--------------------------------------------------------------------------------
/tests/data/test0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test0.png
--------------------------------------------------------------------------------
/tests/data/test1.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test1.pptx
--------------------------------------------------------------------------------
/tests/data/test2.md:
--------------------------------------------------------------------------------
1 | ## This is a test markdown
2 |
3 | This is a test markdown
4 |
--------------------------------------------------------------------------------
/tests/data/test3.json:
--------------------------------------------------------------------------------
1 | {
2 | "apiVersion": "v1",
3 | "kind": "ExampleData",
4 | "metadata": {
5 | "name": "sample-data",
6 | "creationTimestamp": "2023-10-27T10:00:00Z",
7 | "labels": {
8 | "environment": "production",
9 | "application": "data-processor"
10 | },
11 | "annotations": {
12 | "description": "This is a sample data set for testing purposes."
13 | },
14 | "uid": "a1b2c3d4-e5f6-7890-1234-567890abcdef"
15 | },
16 | "spec": {
17 | "dataType": "string",
18 | "source": "Database",
19 | "retentionPolicy": {
20 | "duration": "30d",
21 | "action": "archive"
22 | },
23 | "validationRules": [
24 | {
25 | "field": "name",
26 | "type": "regex",
27 | "pattern": "^[a-zA-Z0-9\\s]+$"
28 | },
29 | {
30 | "field": "age",
31 | "type": "range",
32 | "min": 0,
33 | "max": 120
34 | }
35 | ]
36 | },
37 | "status": {
38 | "state": "Active",
39 | "lastProcessed": "2023-10-27T09:55:00Z",
40 | "processedRecords": 12345,
41 | "errors": 0,
42 | "message": "Data processing is running smoothly."
43 | },
44 | "data": [
45 | {
46 | "id": 1,
47 | "name": "John Doe",
48 | "age": 30,
49 | "city": "New York"
50 | },
51 | {
52 | "id": 2,
53 | "name": "Jane Smith",
54 | "age": 25,
55 | "city": "Los Angeles"
56 | },
57 | {
58 | "id": 3,
59 | "name": "Peter Jones",
60 | "age": 40,
61 | "city": "Chicago"
62 | }
63 | ]
64 | }
65 |
--------------------------------------------------------------------------------
/tests/data/test4.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test4.docx
--------------------------------------------------------------------------------
/tests/data/test5.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/data/test5.zip
--------------------------------------------------------------------------------
/tests/llamaparse/test1.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/llamaparse/test1.pptx
--------------------------------------------------------------------------------
/tests/llamaparse/test4.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/PdfItDown/593434bd104b2c6f89464c5943ebe9573c60d475/tests/llamaparse/test4.docx
--------------------------------------------------------------------------------
/tests/test_llamaparse.py:
--------------------------------------------------------------------------------
1 | from pdfconversion import Converter
2 | from llama_parse import LlamaParse
3 | import pathlib
4 | import os
5 | from dotenv import load_dotenv
6 |
7 | load_dotenv()  # runs at import time, before the environment lookup below
8 |
9 | reader = LlamaParse(api_key=os.getenv("llamacloud_api_key"), result_type="markdown")  # os.getenv yields None when llamacloud_api_key is not set
10 | converter = Converter(reader=reader)  # module-level converter shared by every test in this file
11 |
def test_single_file():
    """Convert single files with the module-level LlamaParse-backed converter.

    Each case gives an input path, the PDF path requested for it, and whether
    a PDF is expected to exist once the conversion attempt has finished.
    The conversion call is the only thing wrapped in ``except Exception``:
    ``AssertionError`` is itself an ``Exception`` subclass, so asserting
    inside that handler's ``try`` would swallow real test failures.
    """
    test_cases = [
        {
            "test_name": "Successful pptx file conversion",
            "file_input": "tests/data/test1.pptx",
            "file_output": "tests/data/test1.pdf",
            "expected": True
        },
        {
            "test_name": "Successful md file conversion",
            "file_input": "tests/data/test2.md",
            "file_output": "tests/data/test2.pdf",
            "expected": True
        },
        {
            "test_name": "Successful image file conversion",
            "file_input": "tests/data/test0.png",
            "file_output": "tests/data/test0.pdf",
            "expected": True
        },
        {
            "test_name": "Unsuccessful file conversion",
            "file_input": "tests/data/tes.md",
            "file_output": "tests/data/tes.pdf",
            "expected": False
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        try:
            result = converter.convert(file_path=c["file_input"], output_path=c["file_output"])
        except Exception:
            # Some cases are meant to fail; the outcome is judged below by
            # whether the output file exists, not by the exception itself.
            result = None
        # Prefer the path reported by the converter; fall back to the requested
        # path when the call failed or returned something that is not a path.
        if not isinstance(result, (str, os.PathLike)):
            result = c["file_output"]
        produced = pathlib.Path(result)
        try:
            assert produced.is_file() == c["expected"], c["test_name"]
        finally:
            # Clean up even when the assertion fails, so a stray PDF cannot
            # influence the cases and tests that run afterwards.
            if produced.is_file():
                os.remove(produced)
49 |
def test_multiple_files():
    """Convert several files in one call with the LlamaParse-backed converter.

    ``expected`` is either a list of booleans (one per returned path, stating
    whether that PDF must exist) or ``False``, meaning the call is expected
    to fail and the first requested output must not have been written.
    """
    test_cases = [
        {
            "test_name": "Specified output files",
            "file_input": ["tests/data/test1.pptx", "tests/data/test4.docx", "tests/data/test2.md"],
            "file_output": ["tests/data/test0_1.pdf", "tests/data/test_1.pdf", "tests/data/test2_1.pdf"],
            "expected": [True, True, True]
        },
        {
            "test_name": "Unspecified output files",
            "file_input": ["tests/data/test1.pptx", "tests/data/test4.docx", "tests/data/test2.md"],
            "file_output": None,
            "expected": [True, True, True]
        },
        {
            "test_name": "Mismatched number of output files",
            "file_input": ["tests/data/test1.pptx", "tests/data/test4.docx", "tests/data/test2.md"],
            "file_output": ["tests/data/test0_2.pdf"],
            "expected": False
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        # file_output may be None (converter picks the names); normalise to a list.
        requested = [pathlib.Path(p) for p in (c["file_output"] or [])]
        try:
            result = converter.multiple_convert(file_paths=c["file_input"], output_paths=c["file_output"])
            produced = [pathlib.Path(r) for r in result]
        except Exception:
            # Only the conversion is guarded; assertions live outside so that
            # an AssertionError is never swallowed by this handler.
            produced = None
        try:
            if c["expected"] is False:
                # Failure case: nothing may have been written to the first requested path.
                assert not requested[0].is_file(), c["test_name"]
            else:
                assert produced is not None, f'{c["test_name"]}: conversion failed'
                assert [p.is_file() for p in produced] == c["expected"], c["test_name"]
        finally:
            # Remove every PDF this case may have created, pass or fail.
            for p in [*(produced or []), *requested]:
                if p.is_file():
                    os.remove(p)
81 |
82 |
def test_dir():
    """Convert a whole directory with the LlamaParse-backed converter.

    Each case names a directory to convert, the PDF paths to inspect
    afterwards, and whether each of those paths is expected to exist.
    """
    test_cases = [
        {
            "test_name": "Correct dir path",
            "file_input": "tests/llamaparse",
            "file_output": ["tests/llamaparse/test1.pdf", "tests/llamaparse/test4.pdf"],
            "expected": [True, True]
        },
        {
            "test_name": "Wrong dir path",
            "file_input": "tests/data/llamapars",
            "file_output": ["tests/llamaparse/test1.pdf", "tests/llamaparse/test4.pdf"],
            "expected": [False, False]
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        outputs = [pathlib.Path(p) for p in c["file_output"]]
        try:
            converter.convert_directory(directory_path=c["file_input"])
        except Exception:
            # A failing conversion is a legitimate outcome for some cases; the
            # verdict comes from the existence check below.
            pass
        try:
            assert [p.is_file() for p in outputs] == c["expected"], c["test_name"]
        finally:
            # Always clean up: the next case inspects the same paths, so a
            # leftover PDF from a failed case would corrupt its result.
            for p in outputs:
                if p.is_file():
                    os.remove(p)
108 |
--------------------------------------------------------------------------------
/tests/test_markitdown.py:
--------------------------------------------------------------------------------
1 | from pdfconversion import Converter
2 | import pathlib
3 | import os
4 |
5 | converter = Converter()  # module-level converter built with no arguments, shared by every test in this file
6 |
def test_single_file():
    """Convert single files with the module-level default converter.

    Each case gives an input path, the PDF path requested for it, and whether
    a PDF is expected to exist once the conversion attempt has finished.
    The conversion call is the only thing wrapped in ``except Exception``:
    ``AssertionError`` is itself an ``Exception`` subclass, so asserting
    inside that handler's ``try`` would swallow real test failures.
    """
    test_cases = [
        {
            "test_name": "Successful image conversion",
            "file_input": "tests/data/test0.png",
            "file_output": "tests/data/test0.pdf",
            "expected": True
        },
        {
            "test_name": "Successful text file conversion",
            "file_input": "tests/data/test.txt",
            "file_output": "tests/data/test.pdf",
            "expected": True
        },
        {
            "test_name": "Successful md file conversion",
            "file_input": "tests/data/test2.md",
            "file_output": "tests/data/test2.pdf",
            "expected": True
        },
        {
            "test_name": "Successful pptx file conversion",
            "file_input": "tests/data/test1.pptx",
            "file_output": "tests/data/test1.pdf",
            "expected": True
        },
        {
            "test_name": "Unsuccessful file conversion",
            "file_input": "tests/data/tes.md",
            "file_output": "tests/data/tes.pdf",
            "expected": False
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        try:
            result = converter.convert(file_path=c["file_input"], output_path=c["file_output"])
        except Exception:
            # Some cases are meant to fail; the outcome is judged below by
            # whether the output file exists, not by the exception itself.
            result = None
        # Prefer the path reported by the converter; fall back to the requested
        # path when the call failed or returned something that is not a path.
        if not isinstance(result, (str, os.PathLike)):
            result = c["file_output"]
        produced = pathlib.Path(result)
        try:
            assert produced.is_file() == c["expected"], c["test_name"]
        finally:
            # Clean up even when the assertion fails, so a stray PDF cannot
            # influence the cases and tests that run afterwards.
            if produced.is_file():
                os.remove(produced)
50 |
def test_multiple_files():
    """Convert several files in one call with the default converter.

    ``expected`` is either a list of booleans (one per returned path, stating
    whether that PDF must exist) or ``False``, meaning the call is expected
    to fail and the first requested output must not have been written.
    """
    test_cases = [
        {
            "test_name": "Specified output files",
            "file_input": ["tests/data/test0.png", "tests/data/test.txt", "tests/data/test2.md"],
            "file_output": ["tests/data/test0_1.pdf", "tests/data/test_1.pdf", "tests/data/test2_1.pdf"],
            "expected": [True, True, True]
        },
        {
            "test_name": "Unspecified output files",
            "file_input": ["tests/data/test0.png", "tests/data/test.txt", "tests/data/test2.md"],
            "file_output": None,
            "expected": [True, True, True]
        },
        {
            "test_name": "Mismatched number of output files",
            "file_input": ["tests/data/test0.png", "tests/data/test.txt", "tests/data/test2.md"],
            "file_output": ["tests/data/test0_2.pdf"],
            "expected": False
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        # file_output may be None (converter picks the names); normalise to a list.
        requested = [pathlib.Path(p) for p in (c["file_output"] or [])]
        try:
            result = converter.multiple_convert(file_paths=c["file_input"], output_paths=c["file_output"])
            produced = [pathlib.Path(r) for r in result]
        except Exception:
            # Only the conversion is guarded; assertions live outside so that
            # an AssertionError is never swallowed by this handler.
            produced = None
        try:
            if c["expected"] is False:
                # Failure case: nothing may have been written to the first requested path.
                assert not requested[0].is_file(), c["test_name"]
            else:
                assert produced is not None, f'{c["test_name"]}: conversion failed'
                assert [p.is_file() for p in produced] == c["expected"], c["test_name"]
        finally:
            # Remove every PDF this case may have created, pass or fail.
            for p in [*(produced or []), *requested]:
                if p.is_file():
                    os.remove(p)
82 |
83 |
def test_dir():
    """Convert a whole directory with the default converter.

    Each case names a directory to convert, the PDF paths to inspect
    afterwards, and whether each of those paths is expected to exist.
    """
    data_pdfs = [
        "tests/data/test0.pdf",
        "tests/data/test1.pdf",
        "tests/data/test.pdf",
        "tests/data/test2.pdf",
        "tests/data/test3.pdf",
        "tests/data/test4.pdf",
        "tests/data/test5.pdf",
    ]
    test_cases = [
        {
            "test_name": "Correct dir path",
            "file_input": "tests/data",
            "file_output": data_pdfs,
            "expected": [True, True, True, True, True, True, True]
        },
        {
            "test_name": "Wrong dir path",
            "file_input": "tests/dat",
            "file_output": data_pdfs,
            "expected": [False, False, False, False, False, False, False]
        },
    ]
    for c in test_cases:
        print(c["test_name"])
        outputs = [pathlib.Path(p) for p in c["file_output"]]
        try:
            converter.convert_directory(directory_path=c["file_input"])
        except Exception:
            # A failing conversion is a legitimate outcome for some cases; the
            # verdict comes from the existence check below.
            pass
        try:
            assert [p.is_file() for p in outputs] == c["expected"], c["test_name"]
        finally:
            # Always clean up: the next case inspects the same paths, so a
            # leftover PDF from a failed case would corrupt its result.
            for p in outputs:
                if p.is_file():
                    os.remove(p)
109 |
--------------------------------------------------------------------------------
/tests/test_ui.py:
--------------------------------------------------------------------------------
1 | from pdfitdown_ui import to_pdf
2 | import os
3 | from pathlib import Path
4 |
def test_to_pdf():
    """Check that ``to_pdf`` returns the expected PDF path for each input file.

    Cleanup runs in a ``finally`` block so that generated PDFs are removed
    even when ``to_pdf`` raises or the assertion fails.
    """
    test_files = ["tests/data/test0.png", "tests/data/test.txt", "tests/data/test2.md"]
    expected_outputs = ["tests/data/test0.pdf", "tests/data/test.pdf", "tests/data/test2.pdf"]
    try:
        assert to_pdf(test_files) == expected_outputs
    finally:
        for p in expected_outputs:
            if Path(p).is_file():
                os.remove(p)
12 |
--------------------------------------------------------------------------------