├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ └── feature-request.md ├── SECURITY.md ├── dependabot.yml ├── scripts │ └── check_pr_title.py └── workflows │ ├── benchmark.yaml │ ├── create-github-release.yaml │ ├── github-ci.yaml │ ├── publish-to-pypi.yaml │ ├── release.yaml │ └── title-check.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── Makefile ├── _static │ ├── logo.png │ ├── releasing.drawio │ └── releasing.drawio.png ├── conf.py ├── dev │ ├── PR_Header_example.png │ ├── cmaps.md │ ├── deprecations.md │ ├── documentation.md │ ├── intro.md │ ├── pdf-format.md │ ├── pypdf-parsing.md │ ├── pypdf-writing.md │ ├── releasing.md │ └── testing.md ├── index.rst ├── make.bat ├── meta │ ├── changelog-v1.md │ ├── comparisons.md │ ├── faq.md │ ├── history.md │ ├── project-governance.md │ ├── scope-of-pypdf.md │ └── taking-ownership.md ├── modules │ ├── Destination.rst │ ├── DocumentInformation.rst │ ├── Field.rst │ ├── Fit.rst │ ├── PageObject.rst │ ├── PageRange.rst │ ├── PaperSize.rst │ ├── PdfDocCommon.rst │ ├── PdfReader.rst │ ├── PdfWriter.rst │ ├── RectangleObject.rst │ ├── Transformation.rst │ ├── XmpInformation.rst │ ├── annotations.rst │ ├── constants.rst │ ├── errors.rst │ └── generic.rst └── user │ ├── add-javascript.md │ ├── add-watermark.md │ ├── adding-pdf-annotations.md │ ├── annotation-circle.png │ ├── annotation-highlight.png │ ├── annotation-line.png │ ├── annotation-polygon.png │ ├── annotation-polyline.png │ ├── annotation-popup.png │ ├── annotation-square.png │ ├── cropping-and-transforming.md │ ├── encryption-decryption.md │ ├── error-hierarchy.png │ ├── extract-attachments.md │ ├── extract-images.md │ ├── extract-text.md │ ├── file-size.md │ ├── forms.md │ ├── free-text-annotation.png │ ├── installation.md │ ├── merge-45-deg-rot.png │ ├── 
merge-rotate-expand.png │ ├── merge-translated.png │ ├── merging-pdfs.md │ ├── metadata.md │ ├── migration-1-to-2.md │ ├── nup-dest1.png │ ├── nup-dest2.png │ ├── nup-source.png │ ├── page-stamped.png │ ├── page.png │ ├── pdf-version-support.md │ ├── pdfa-compliance.md │ ├── plain-merge.png │ ├── post-processing-in-text-extraction.md │ ├── reading-pdf-annotations.md │ ├── robustness.md │ ├── scaling.png │ ├── stamp.png │ ├── streaming-data.md │ ├── suppress-warnings.md │ ├── text-annotation.png │ ├── viewer-preferences.md │ └── watermark.png ├── make_release.py ├── pypdf ├── __init__.py ├── _cmap.py ├── _codecs │ ├── __init__.py │ ├── _codecs.py │ ├── adobe_glyphs.py │ ├── pdfdoc.py │ ├── std.py │ ├── symbol.py │ └── zapfding.py ├── _crypt_providers │ ├── __init__.py │ ├── _base.py │ ├── _cryptography.py │ ├── _fallback.py │ └── _pycryptodome.py ├── _doc_common.py ├── _encryption.py ├── _merger.py ├── _page.py ├── _page_labels.py ├── _protocols.py ├── _reader.py ├── _text_extraction │ ├── __init__.py │ └── _layout_mode │ │ ├── __init__.py │ │ ├── _fixed_width_page.py │ │ ├── _font.py │ │ ├── _font_widths.py │ │ ├── _text_state_manager.py │ │ └── _text_state_params.py ├── _utils.py ├── _version.py ├── _writer.py ├── _xobj_image_helpers.py ├── annotations │ ├── __init__.py │ ├── _base.py │ ├── _markup_annotations.py │ └── _non_markup_annotations.py ├── constants.py ├── errors.py ├── filters.py ├── generic │ ├── __init__.py │ ├── _base.py │ ├── _data_structures.py │ ├── _files.py │ ├── _fit.py │ ├── _image_inline.py │ ├── _outline.py │ ├── _rectangle.py │ ├── _utils.py │ └── _viewerpref.py ├── pagerange.py ├── papersizes.py ├── py.typed ├── types.py └── xmp.py ├── pyproject.toml ├── requirements ├── ci-3.11.txt ├── ci.in ├── ci.txt ├── dev.in ├── dev.txt ├── docs.in └── docs.txt ├── resources ├── 010-pdflatex-forms.txt ├── AEO.1172.layout.rot180.txt ├── AEO.1172.layout.txt ├── AutoCad_Diagram.pdf ├── AutoCad_Simple.pdf ├── Claim Maker Alerts Guide_pg2.layout.txt ├── 
Epic.Page.layout.txt ├── FormTestFromOo.pdf ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf ├── SF424_page2.pdf ├── Sample_Td-matrix.pdf ├── Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf ├── Seige_of_Vicksburg_Sample_OCR.pdf ├── attachment.pdf ├── box.pdf ├── bytes.pdf ├── commented-xmp.pdf ├── commented.pdf ├── crazyones-encrypted-256.pdf ├── crazyones.pdf ├── crazyones.txt ├── crazyones_layout_vertical_space.txt ├── crazyones_layout_vertical_space_font_height_weight.txt ├── encrypted-file.pdf ├── encrypted_doc_no_id.pdf ├── encryption │ ├── r2-empty-password.pdf │ ├── r2-owner-password.pdf │ ├── r2-user-password.pdf │ ├── r3-empty-password.pdf │ ├── r3-user-password.pdf │ ├── r4-aes-user-password.pdf │ ├── r4-owner-password.pdf │ ├── r4-user-password.pdf │ ├── r5-empty-password.pdf │ ├── r5-owner-password.pdf │ ├── r5-user-password.pdf │ ├── r6-both-passwords.pdf │ ├── r6-empty-password.pdf │ ├── r6-owner-password.pdf │ ├── r6-user-password.pdf │ └── unencrypted.pdf ├── form.pdf ├── form_acrobatReader.pdf ├── form_evince.pdf ├── git.pdf ├── hello-world.pdf ├── imagemagick-ASCII85Decode.pdf ├── imagemagick-CCITTFaxDecode.pdf ├── imagemagick-images.pdf ├── imagemagick-lzw.pdf ├── indirect-rotation.pdf ├── inkscape-abc.pdf ├── issue-297.pdf ├── issue-301.pdf ├── issue-604.pdf ├── issue-914-xmp-data.pdf ├── jpeg.pdf ├── jpeg.txt ├── labeled-edges-center-image.pdf ├── libreoffice-form.pdf ├── libreoffice-writer-password.pdf ├── lzw_decoder_table_overflow.bin ├── metadata.pdf ├── missing_info.pdf ├── multicolumn-lorem-ipsum.txt ├── multilang.pdf ├── outline-without-title.pdf ├── outlines-with-invalid-destinations.pdf ├── pdflatex-forms.pdf ├── pdflatex-outline.pdf ├── reportlab-inline-image.pdf ├── selenium-pypdf-issue-177.pdf ├── side-by-side-subfig.pdf ├── test Orient.pdf ├── test_watermarking_reportlab_rendering.png ├── toy.layout.txt ├── toy.pdf └── two-different-pages.pdf └── tests ├── __init__.py ├── bench.py ├── conftest.py ├── example_files.yaml ├── generic ├── 
__init__.py ├── test_files.py └── test_image_inline.py ├── scripts ├── __init__.py ├── data │ └── commits__version_4_0_1.json └── test_make_release.py ├── test_annotations.py ├── test_cmap.py ├── test_codecs.py ├── test_constants.py ├── test_doc_common.py ├── test_encryption.py ├── test_filters.py ├── test_forms.py ├── test_generic.py ├── test_images.py ├── test_javascript.py ├── test_merger.py ├── test_page.py ├── test_page_labels.py ├── test_pagerange.py ├── test_papersizes.py ├── test_pdfa.py ├── test_protocols.py ├── test_reader.py ├── test_text_extraction.py ├── test_utils.py ├── test_workflows.py ├── test_writer.py ├── test_xmp.py └── test_xobject_image_helpers.py /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # This file helps us to ignore style / formatting / doc changes 2 | # in git blame. That is useful when we're trying to find the root cause of an 3 | # error. 4 | 5 | # Docstring formatting 6 | a89ff74d8c0203278a039d9496a3d8df4d134f84 7 | 8 | # STY: Apply pre-commit (black, isort) + use snake_case variables (#832) 9 | eef03d935dfeacaa75848b39082cf94d833d3174 10 | 11 | # STY: Apply black and isort 12 | baeb7d23278de0f8d00ca9f2b656bf0674f08937 13 | 14 | # STY: Documentation, Variable names (#839) 15 | 444fca22836df061d9d23e71ffb7d68edcdfa766 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Report a bug 3 | about: Something broke! 4 | title: '' 5 | labels: Bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Replace this: What happened? What were you trying to achieve? 11 | 12 | ## Environment 13 | 14 | Which environment were you using when you encountered the problem? 
15 | 16 | ```bash 17 | $ python -m platform 18 | # TODO: Your output goes here 19 | 20 | $ python -c "import pypdf;print(pypdf._debug_versions)" 21 | # TODO: Your output goes here 22 | ``` 23 | 24 | ## Code + PDF 25 | 26 | This is a minimal, complete example that shows the issue: 27 | 28 | ```python 29 | # TODO: Your code goes here 30 | ``` 31 | 32 | Share here the PDF file(s) that cause the issue. The smaller they are, the 33 | better. Let us know if we may add them to our tests! 34 | 35 | ## Traceback 36 | 37 | This is the complete traceback I see: 38 | 39 | ``` 40 | # TODO: Your traceback goes here (if applicable) 41 | ``` 42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request a Feature 3 | about: What do you think is missing in pypdf? 4 | title: '' 5 | labels: Feature Request 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Explanation 11 | 12 | Explain briefly what you want to achieve. 13 | 14 | ## Code Example 15 | 16 | How would your feature be used? (Remove this if it is not applicable.) 17 | 18 | ```python 19 | from pypdf import PdfReader, PdfWriter 20 | 21 | ... # your new feature in action! 22 | ``` 23 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Security fixes are applied to the latest version. 
6 | 7 | ## Reporting a Vulnerability 8 | 9 | If you find a potential security issue, please report it using the 10 | [private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature of GitHub to 11 | automatically inform all relevant team members. Otherwise, please 12 | get in touch with stefan6419846 through e-mail (current maintainer, 13 | address in GitHub profile). 14 | 15 | We will try to find a fix in a timely manner and will then issue a security 16 | advisory together with the update via GitHub 17 | ([example](https://github.com/py-pdf/pypdf/security/advisories/GHSA-xcjx-m2pj-8g79)). 18 | 19 | If you don't get a reaction within 30 days, please open a public issue on 20 | GitHub. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Set update schedule for GitHub Actions 2 | 3 | version: 2 4 | updates: 5 | 6 | - package-ecosystem: "github-actions" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | commit-message: 11 | prefix: "DEV" 12 | -------------------------------------------------------------------------------- /.github/scripts/check_pr_title.py: -------------------------------------------------------------------------------- 1 | """Check that all PR titles follow the desired scheme.""" # noqa: INP001 2 | 3 | import os 4 | import sys 5 | 6 | KNOWN_PREFIXES = ( 7 | "SEC: ", 8 | "BUG: ", 9 | "ENH: ", 10 | "DEP: ", 11 | "PI: ", 12 | "ROB: ", 13 | "DOC: ", 14 | "TST: ", 15 | "DEV: ", 16 | "STY: ", 17 | "MAINT: ", 18 | "REL: ", # For internal use only. 
19 | ) 20 | PR_TITLE = os.getenv("PR_TITLE", "") 21 | 22 | if not PR_TITLE.startswith(KNOWN_PREFIXES) or not PR_TITLE.split(": ", maxsplit=1)[1]: 23 | sys.stderr.write( 24 | f"The PR title '{PR_TITLE}' does not follow the projects naming scheme: " 25 | "https://pypdf.readthedocs.io/en/latest/dev/intro.html#commit-messages\n", 26 | ) 27 | sys.stderr.write( 28 | "If you do not know which one to choose or if multiple apply, make a best guess. " 29 | "Nobody will complain if it does not quite fit :-)\n", 30 | ) 31 | sys.exit(1) 32 | else: 33 | sys.stdout.write(f"PR title '{PR_TITLE}' appears to be valid.\n") 34 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yaml: -------------------------------------------------------------------------------- 1 | name: Benchmarking pypdf 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | deployments: write 10 | 11 | jobs: 12 | benchmark: 13 | name: Run pytest-benchmark 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.x"] 18 | steps: 19 | - name: Checkout Code 20 | uses: actions/checkout@v4 21 | with: 22 | submodules: 'recursive' 23 | - name: Setup Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install requirements (Python 3) 28 | run: | 29 | pip install -r requirements/ci-3.11.txt 30 | - name: Install pypdf 31 | run: | 32 | pip install . 
33 | - name: Run benchmark 34 | run: | 35 | pytest tests/bench.py --benchmark-json output.json 36 | - name: Store benchmark result 37 | uses: benchmark-action/github-action-benchmark@v1 38 | with: 39 | name: Python Benchmark with pytest-benchmark 40 | tool: 'pytest' 41 | output-file-path: output.json 42 | # Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096 43 | github-token: ${{ secrets.GITHUB_TOKEN }} 44 | auto-push: true 45 | # Show alert with commit comment on detecting possible performance regression 46 | alert-threshold: '200%' 47 | comment-on-alert: true 48 | fail-on-alert: true 49 | alert-comment-cc-users: '@MartinThoma' 50 | -------------------------------------------------------------------------------- /.github/workflows/create-github-release.yaml: -------------------------------------------------------------------------------- 1 | name: Create a GitHub release page 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*' 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | build_and_publish: 14 | name: Create a GitHub release page 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout Repository 18 | uses: actions/checkout@v4 19 | - name: Prepare variables 20 | id: prepare_variables 21 | run: | 22 | git fetch --tags --force 23 | latest_tag=$(git describe --tags --abbrev=0) 24 | echo "latest_tag=${latest_tag}" >> "$GITHUB_ENV" 25 | echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_ENV" 26 | EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64) 27 | echo "tag_body<<$EOF" >> "$GITHUB_ENV" 28 | git --no-pager tag -l "${latest_tag}" --format='%(contents:body)' >> "$GITHUB_ENV" 29 | echo "$EOF" >> "$GITHUB_ENV" 30 | - name: Create GitHub Release 🚀 31 | uses: softprops/action-gh-release@v2 32 | with: 33 | tag_name: ${{ env.latest_tag }} 34 | name: Version ${{ env.latest_tag }}, ${{ env.date }} 35 | draft: false 36 | prerelease: false 37 | body: 
${{ env.tag_body }} 38 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*' 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Build distribution 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.x" 20 | - name: Install pypa/build 21 | run: >- 22 | python3 -m 23 | pip install 24 | build 25 | --user 26 | - name: Build a binary wheel and a source tarball 27 | run: python3 -m build 28 | - name: Store the distribution packages 29 | uses: actions/upload-artifact@v4 30 | with: 31 | name: python-package-distributions 32 | path: dist/ 33 | 34 | publish-to-pypi: 35 | name: Publish Python distribution to PyPI 36 | needs: 37 | - build 38 | runs-on: ubuntu-latest 39 | environment: 40 | name: pypi 41 | url: https://pypi.org/p/pypdf 42 | permissions: 43 | id-token: write # IMPORTANT: mandatory for trusted publishing 44 | 45 | steps: 46 | - name: Download all the dists 47 | uses: actions/download-artifact@v4 48 | with: 49 | name: python-package-distributions 50 | path: dist/ 51 | - name: Publish distribution to PyPI 52 | uses: pypa/gh-action-pypi-publish@release/v1 53 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | # This action assumes that there is a REL-commit which already has a 2 | # Markdown-formatted git tag. Hence the CHANGELOG is already adjusted 3 | # and it's decided what should be in the release. 4 | # This action only ensures the release is done with the proper contents 5 | # and that it's announced with a Github release. 
6 | name: Create git tag 7 | on: 8 | push: 9 | branches: 10 | - main 11 | 12 | permissions: 13 | contents: write 14 | 15 | env: 16 | HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} 17 | 18 | jobs: 19 | build_and_publish: 20 | name: Publish a new version 21 | runs-on: ubuntu-latest 22 | if: "${{ startsWith(github.event.head_commit.message, 'REL: ') }}" 23 | steps: 24 | - name: Checkout Repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Extract version from commit message 28 | id: extract_version 29 | run: | 30 | VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') 31 | echo "version=$VERSION" >> $GITHUB_OUTPUT 32 | 33 | - name: Extract tag message from commit message 34 | id: extract_message 35 | run: | 36 | VERSION="${{ steps.extract_version.outputs.version }}" 37 | delimiter="$(openssl rand -hex 8)" 38 | MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) 39 | echo "message<<${delimiter}" >> $GITHUB_OUTPUT 40 | echo "$MESSAGE" >> $GITHUB_OUTPUT 41 | echo "${delimiter}" >> $GITHUB_OUTPUT 42 | 43 | - name: Create Git Tag 44 | run: | 45 | VERSION="${{ steps.extract_version.outputs.version }}" 46 | MESSAGE="${{ steps.extract_message.outputs.message }}" 47 | git config user.name github-actions 48 | git config user.email github-actions@github.com 49 | git tag "$VERSION" -m "$MESSAGE" 50 | git push origin $VERSION 51 | -------------------------------------------------------------------------------- /.github/workflows/title-check.yml: -------------------------------------------------------------------------------- 1 | name: 'PR Title Check' 2 | on: 3 | pull_request: 4 | # check when PR 5 | # * is created, 6 | # * title is edited, and 7 | # * new commits are added (to ensure failing title blocks merging) 8 | types: [opened, reopened, edited, synchronize] 9 | 10 | jobs: 11 | title-check: 12 | name: Title check 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v4 17 | 
- name: Check PR title 18 | env: 19 | PR_TITLE: ${{ github.event.pull_request.title }} 20 | run: python .github/scripts/check_pr_title.py 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | .DS_Store 4 | .tox 5 | build 6 | .idea/* 7 | *.egg-info/ 8 | dist/* 9 | __pycache__/ 10 | 11 | # in-project virtual environments 12 | venv/ 13 | .venv/ 14 | 15 | # Code coverage artifacts 16 | .coverage* 17 | coverage.xml 18 | 19 | # Editors / IDEs 20 | .vscode/ 21 | 22 | # Docs 23 | docs/_build/ 24 | 25 | .cspell/ 26 | 27 | # Files generated by some of the scripts 28 | dont_commit_*.pdf 29 | pypdf-output.pdf 30 | annotated-pdf-link.pdf 31 | Image9.png 32 | pypdf_pdfLocation.txt 33 | 34 | .python-version 35 | tests/pdf_cache/ 36 | docs/meta/CHANGELOG.md 37 | docs/meta/CONTRIBUTORS.md 38 | extracted-images/ 39 | 40 | RELEASE_COMMIT_MSG.md 41 | RELEASE_TAG_MSG.md 42 | .envrc 43 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sample-files"] 2 | path = sample-files 3 | url = https://github.com/py-pdf/sample-files 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit run --all-files 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: check-ast 7 | - id: check-case-conflict 8 | - id: check-docstring-first 9 | - id: check-yaml 10 | - id: debug-statements 11 | - id: end-of-file-fixer 12 | exclude: "resources/.*|docs/make.bat" 13 | - id: fix-byte-order-marker 14 | - id: trailing-whitespace 15 | - id: mixed-line-ending 16 | args: ['--fix=lf'] 17 | exclude: "docs/make.bat" 18 | - id: 
check-added-large-files 19 | args: ['--maxkb=1000'] 20 | 21 | - repo: https://github.com/charliermarsh/ruff-pre-commit 22 | rev: v0.11.0 23 | hooks: 24 | - id: ruff 25 | args: ['--fix'] 26 | 27 | - repo: https://github.com/asottile/pyupgrade 28 | rev: v3.19.1 29 | hooks: 30 | - id: pyupgrade 31 | args: [--py38-plus] 32 | 33 | - repo: https://github.com/pre-commit/mirrors-mypy 34 | rev: 'v1.16.0' 35 | hooks: 36 | - id: mypy 37 | additional_dependencies: [types-Pillow==10.2.0.20240822] 38 | files: ^pypdf/.* 39 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | version: 2 3 | 4 | 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "3.12" 9 | 10 | # Build documentation in the "docs/" directory with Sphinx 11 | sphinx: 12 | configuration: docs/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: all 16 | 17 | # Optionally declare the Python requirements required to build your docs 18 | python: 19 | install: 20 | - requirements: requirements/docs.txt 21 | - method: pip 22 | path: . 23 | extra_requirements: 24 | - full 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Please check the [documentation page dedicated to development](https://pypdf.readthedocs.io/en/stable/dev/intro.html). 2 | 3 | ## Creating issues / tickets 4 | 5 | Please go here: https://github.com/py-pdf/pypdf/issues 6 | 7 | Typically you should not send e-mails. E-mails might only reach one person and 8 | it could go into spam or that person might be busy. Please create issues on 9 | GitHub instead. 10 | 11 | Please use the templates provided. 
12 | 13 | Keep in mind that although PDF has an official specification, there are tons of 14 | variations which might require special handling. Thus, please always provide a 15 | reproducing example file for us to work with. Otherwise, we have to guess possible 16 | issues, leading to unnecessary overhead - especially since most of the contributions 17 | happen during our free time. 18 | 19 | If you already know a fix, consider opening a pull request after reporting the issue 20 | to make life easier for everyone. 21 | 22 | ## Creating Pull Requests 23 | 24 | We appreciate it when people make PRs, but please be aware that pypdf is used by many 25 | people. That means: 26 | 27 | * We rarely make breaking changes and have a [deprecation process](https://pypdf.readthedocs.io/en/latest/dev/deprecations.html). 28 | * New features, especially those adding to the public interface, typically need to be 29 | discussed first. 30 | 31 | Before you make bigger changes, open an issue to make the suggestion. 32 | Note which interface changes you want to make. 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2008, Mathieu Fenniak 2 | Some contributions copyright (c) 2007, Ashish Kulkarni 3 | Some contributions copyright (c) 2014, Steve Witham 4 | 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright notice, 12 | this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 
16 | * The name of the author may not be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | maint: 2 | pre-commit autoupdate 3 | pip-compile -U requirements/ci.in 4 | pip-compile -U requirements/dev.in 5 | pip-compile -U requirements/docs.in 6 | 7 | release: 8 | python make_release.py 9 | git commit -eF RELEASE_COMMIT_MSG.md 10 | 11 | clean: 12 | python -m pip install pyclean 13 | pyclean . 
14 | rm -rf tests/__pycache__ pypdf/__pycache__ htmlcov docs/_build dist pypdf.egg-info .pytest_cache .mypy_cache .benchmarks 15 | 16 | test: 17 | pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=60 pypdf 18 | 19 | testtype: 20 | pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 --typeguard-packages=pypdf 21 | 22 | benchmark: 23 | pytest tests/bench.py 24 | 25 | mypy: 26 | mypy pypdf --ignore-missing-imports --check-untyped --strict 27 | 28 | ruff: 29 | ruff check pypdf tests make_release.py 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/releasing.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/_static/releasing.drawio.png -------------------------------------------------------------------------------- /docs/dev/PR_Header_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/dev/PR_Header_example.png -------------------------------------------------------------------------------- /docs/dev/cmaps.md: -------------------------------------------------------------------------------- 1 | # CMaps 2 | 3 | Looking at the cmap of "crazyones": 4 | 5 | ```bash 6 | pdftk crazyones.pdf output crazyones-uncomp.pdf uncompress 7 | ``` 8 | 9 | You can see this: 10 | 11 | ```text 12 | begincmap 13 | /CMapName /T1Encoding-UTF16 def 14 | /CMapType 2 def 15 | /CIDSystemInfo << 16 | /Registry (Adobe) 17 | /Ordering (UCS) 18 | /Supplement 0 19 | >> def 20 | 1 begincodespacerange 21 | <00> 22 | endcodespacerange 23 | 1 beginbfchar 24 | <1B> 25 | endbfchar 26 | endcmap 27 | CMapName currentdict /CMap defineresource pop 28 | ``` 29 | 30 | ## codespacerange 31 | 32 | A codespacerange maps a complete sequence of bytes to a range of unicode glyphs. 
33 | It defines a starting point: 34 | 35 | ```text 36 | 1 beginbfchar 37 | <1B> 38 | ``` 39 | 40 | That means that `1B` (Hex for 27) maps to the unicode character [`FB00`](https://unicode-table.com/en/FB00/) - the ligature ff (two lowercase f's). 41 | 42 | The two numbers in `begincodespacerange` mean that it starts with an offset of 43 | 0 (hence from `1B ➜ FB00`) up to an offset of FF (dec: 255), hence 1B+FF = 282 44 | ➜ [FBFF](https://www.compart.com/de/unicode/U+FBFF). 45 | 46 | Within the text stream, there is 47 | 48 | ```text 49 | (The)-342(mis\034ts.) 50 | ``` 51 | 52 | `\034` is octal for 28 decimal. 53 | -------------------------------------------------------------------------------- /docs/dev/deprecations.md: -------------------------------------------------------------------------------- 1 | # The Deprecation Process 2 | 3 | pypdf strives to be an excellent library for its current users and for new 4 | ones. We are careful with introducing potentially breaking changes, but we 5 | will do them if they provide value for the community in the long run. 6 | 7 | We hope and think that deprecations will not happen frequently. If they do, 8 | users can rely on the following procedure. 9 | 10 | ## Semantic Versioning 11 | 12 | pypdf uses [semantic versioning](https://semver.org/). If you want to avoid 13 | breaking changes, please use dependency pinning (also known as version pinning). 14 | In Python, this is done by specifying the exact version you want to use in a 15 | `requirements.txt` file. A tool that can support you is `pip-compile` from 16 | [`pip-tools`](https://pypi.org/project/pip-tools/). 17 | 18 | If you are using [Poetry](https://pypi.org/project/poetry/), it is done with the 19 | `poetry.lock` file. 20 | 21 | ## How pypdf deprecates features 22 | 23 | Assume the current version of pypdf is `x.y.z`. After a discussion (e.g. via 24 | GitHub issues) we decided to remove a class / function / method. This is how 25 | we do it: 26 | 27 | 1.
`x.y.(z+1)`: Add a DeprecationWarning. If there is a replacement, 28 | the replacement is also introduced and the warning informs about the change 29 | and when it will happen. 30 | The docs let users know about the deprecation and when it will happen and the new function. 31 | The CHANGELOG informs about it. 32 | 2. `(x+1).0.0`: Remove / change the code in the breaking way by replacing 33 | DeprecationWarnings by DeprecationErrors. 34 | We do this to help people who didn't look at the warnings before. 35 | The CHANGELOG informs about it. 36 | 3. `(x+2).0.0`: The DeprecationErrors are removed. 37 | 38 | This means the users have 3 warnings in the CHANGELOG, a DeprecationWarning 39 | until the next major release and a DeprecationError until the major release 40 | after that. 41 | 42 | Please note that adding warnings can be a breaking change for some users; most 43 | likely just in the CI. 44 | This means it needs to be properly documented. 45 | -------------------------------------------------------------------------------- /docs/dev/documentation.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## API Reference 4 | 5 | ### Method / Function Docstrings 6 | 7 | We use Google-Style Docstrings: 8 | 9 | ``` 10 | def example(param1: int, param2: str) -> bool: 11 | """ 12 | Example function with PEP 484 type annotations. 13 | 14 | Args: 15 | param1: The first parameter. 16 | param2: The second parameter. 17 | 18 | Returns: 19 | The return value. True for success, False otherwise. 20 | 21 | Raises: 22 | AttributeError: The ``Raises`` section is a list of all exceptions 23 | that are relevant to the interface. 24 | ValueError: If `param2` is equal to `param1`. 25 | 26 | Examples: 27 | Examples should be written in doctest format, and should illustrate how 28 | to use the function. 
29 | 30 | >>> print([i for i in example_generator(4)]) 31 | [0, 1, 2, 3] 32 | """ 33 | ``` 34 | 35 | * The order of sections is (1) Args (2) Returns (3) Raises (4) Examples 36 | * If there is no return value, remove the 'Returns' block 37 | * Properties should not have any sections 38 | 39 | 40 | ## Issues and PRs 41 | 42 | An issue can be used to discuss what we want to achieve. 43 | 44 | A PR can be used to discuss how we achieve it. 45 | 46 | ## Commit Messages 47 | 48 | We want to have descriptive commits in the `main` branch. For this reason, every 49 | pull request (PR) is squashed. That means no matter how many commits a PR has, 50 | in the end only one combined commit will be in `main`. 51 | 52 | The title of the PR will be used as the first line of that combined commit message. 53 | 54 | The first comment within the commit will be used as the message body. 55 | 56 | See [developer intro](intro.md#commit-messages) for more details. 57 | -------------------------------------------------------------------------------- /docs/dev/intro.md: -------------------------------------------------------------------------------- 1 | # Developer Intro 2 | 3 | pypdf is a library and hence its users are developers. This document is not for 4 | the users, but for people who want to work on pypdf itself. 5 | 6 | ## Installing Requirements 7 | 8 | ``` 9 | pip install -r requirements/dev.txt 10 | ``` 11 | 12 | ## Running Tests 13 | 14 | See [testing pypdf with pytest](testing.md). 15 | 16 | ## The sample-files git submodule 17 | The reason for having the submodule `sample-files` is that we want to keep 18 | the size of the pypdf repository small while we also want to have an extensive 19 | test suite. Those two goals contradict each other. 20 | 21 | The `resources` folder should contain a select set of core examples that cover 22 | most cases we typically want to test for. 
The `sample-files` might cover a lot 23 | more edge cases, the behavior we get when file sizes get bigger, different 24 | PDF producers. 25 | 26 | In order to get the sample-files folder, you need to execute: 27 | 28 | ``` 29 | git submodule update --init 30 | ``` 31 | 32 | ## Tools: git and pre-commit 33 | 34 | Git is a command line application for version control. If you don't know it, 35 | you can [play ohmygit](https://ohmygit.org/) to learn it. 36 | 37 | GitHub is the service where the pypdf project is hosted. While git is free and 38 | open source, GitHub is a paid service by Microsoft, but free in a lot of 39 | cases. 40 | 41 | [pre-commit](https://pypi.org/project/pre-commit/) is a command line application 42 | that uses git hooks to automatically execute code. This allows you to avoid 43 | style issues and other code quality issues. After you entered `pre-commit install` 44 | once in your local copy of pypdf, it will automatically be executed when 45 | you `git commit`. 46 | 47 | ## Commit Messages 48 | 49 | Having a clean commit message helps people to quickly understand what the commit 50 | is about, without actually looking at the changes. The first line of the 51 | commit message is used to [auto-generate the CHANGELOG](https://github.com/py-pdf/pypdf/blob/main/make_release.py). 52 | For this reason, the format should be: 53 | 54 | ``` 55 | PREFIX: DESCRIPTION 56 | 57 | BODY 58 | ``` 59 | 60 | The `PREFIX` can be: 61 | 62 | * `SEC`: Security improvements. Typically an infinite loop that was possible. 63 | * `BUG`: A bug was fixed. Likely there is one or multiple issues. Then write in 64 | the `BODY`: `Closes #123` where 123 is the issue number on GitHub. 65 | It would be absolutely amazing if you could write a regression test in those 66 | cases. That is a test that would fail without the fix. 67 | A bug is always an issue for pypdf users - test code or CI that was fixed is 68 | not considered a bug here. 69 | * `ENH`: A new feature! 
Describe in the body what it can be used for. 70 | * `DEP`: A deprecation. Either marking something as "this is going to be removed" 71 | or actually removing it. 72 | * `PI`: A performance improvement. This could also be a reduction in the 73 | file size of PDF files generated by pypdf. 74 | * `ROB`: A robustness change. Dealing better with broken PDF files. 75 | * `DOC`: A documentation change. 76 | * `TST`: Adding or adjusting tests. 77 | * `DEV`: Developer experience improvements, e.g. pre-commit or setting up CI. 78 | * `MAINT`: Quite a lot of different stuff. Performance improvements are for sure 79 | the most interesting changes in here. Refactorings as well. 80 | * `STY`: A style change. Something that makes pypdf code more consistent. 81 | Typically a small change. It could also be better error messages for 82 | end users. 83 | 84 | The prefix is used to generate the CHANGELOG. Every PR must have exactly one - 85 | if you feel like several match, take the top one from this list that matches for 86 | your PR. 87 | 88 | ## Pull Request Size 89 | 90 | Smaller Pull Requests (PRs) are preferred as it's typically easier to merge 91 | them. For example, if you have some typos, a few code-style changes, a new 92 | feature, and a bug-fix, that could be 3 or 4 PRs. 93 | 94 | A PR must be complete. That means if you introduce a new feature it must be 95 | finished within the PR and have a test for that feature. 96 | 97 | ## Benchmarks 98 | 99 | We need to keep an eye on performance and thus we have a few benchmarks. 100 | 101 | See [py-pdf.github.io/pypdf/dev/bench](https://py-pdf.github.io/pypdf/dev/bench/) 102 | -------------------------------------------------------------------------------- /docs/dev/pypdf-parsing.md: -------------------------------------------------------------------------------- 1 | # How pypdf parses PDF files 2 | 3 | pypdf uses {class}`~pypdf.PdfReader` to parse PDF files. 
4 | The method {py:meth}`PdfReader.read ` shows the basic 5 | structure of parsing: 6 | 7 | 1. **Finding and reading the cross-reference tables / trailer**: The 8 | cross-reference table (xref table) is a table of byte offsets that indicate 9 | the locations of objects within the file. The trailer provides additional 10 | information such as the root object (Catalog) and the Info object containing 11 | metadata. 12 | 2. **Parsing the objects**: After locating the xref table and the trailer, pypdf 13 | proceeds to parse the objects in the PDF. Objects in a PDF can be of various 14 | types such as dictionaries, arrays, streams, and simple data types (e.g., 15 | integers, strings). pypdf parses these objects and stores them in 16 | {py:meth}`PdfReader.resolved_objects `, 17 | populated by {py:meth}`cache_indirect_object `. 18 | 3. **Decoding content streams**: The content of a PDF is typically stored in 19 | content streams, which are sequences of PDF operators and operands. pypdf 20 | decodes these content streams by applying filters (e.g., `FlateDecode`, 21 | `LZWDecode`) specified in the stream's dictionary. This is only done when the 22 | object is requested by {py:meth}`PdfReader.get_object ` 23 | which uses the `PdfReader._get_object_from_stream` method. 24 | 25 | ## References 26 | 27 | [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf): 28 | * 7.5 File Structure 29 | * 7.5.4 Cross-Reference Table 30 | * 7.8 Content Streams and Resources 31 | -------------------------------------------------------------------------------- /docs/dev/pypdf-writing.md: -------------------------------------------------------------------------------- 1 | # How pypdf writes PDF files 2 | 3 | pypdf uses {py:class}`PdfWriter ` to write PDF files. pypdf has 4 | {py:class}`PdfObject ` and several subclasses with the 5 | {py:meth}`write_to_stream ` method. 
6 | The {py:meth}`PdfWriter.write ` method uses the 7 | `write_to_stream` methods of the referenced objects. 8 | 9 | The {py:meth}`PdfWriter.write_stream ` method 10 | has the following core steps: 11 | 12 | 1. `_sweep_indirect_references`: This step ensures that any circular references 13 | to objects are correctly handled. It adds the object reference numbers of any 14 | circularly referenced objects to an external reference map, so that 15 | self-page-referencing trees can reference the correct new object location, 16 | rather than copying in a new copy of the page object. 17 | 2. **Write the File Header and Body** with `_write_pdf_structure`: In this step, 18 | the PDF header and objects are written to the output stream. This includes 19 | the PDF version (e.g., %PDF-1.7) and the objects that make up the content of 20 | the PDF, such as pages, annotations, and form fields. The locations (byte 21 | offsets) of these objects are stored for later use in generating the xref 22 | table. 23 | 3. **Write the Cross-Reference Table** with `_write_xref_table`: Using the stored 24 | object locations, this step generates and writes the cross-reference table 25 | (xref table) to the output stream. The cross-reference table contains the 26 | byte offsets for each object in the PDF file, allowing for quick random 27 | access to objects when reading the PDF. 28 | 4. **Write the File Trailer** with `_write_trailer`: The trailer is written to 29 | the output stream in this step. The trailer contains essential information, 30 | such as the number of objects in the PDF, the location of the root object 31 | (Catalog), and the Info object containing metadata. The trailer also 32 | specifies the location of the xref table. 33 | 34 | 35 | ## How others do it 36 | 37 | Looking at alternative software designs and implementations can help to improve 38 | our choices. 
39 | 40 | ### fpdf2 41 | 42 | [fpdf2](https://pypi.org/project/fpdf2/) has a [`PDFObject` class](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py) 43 | with a serialize method which roughly maps to `pypdf.PdfObject.write_to_stream`. 44 | Some other similarities include: 45 | 46 | * [fpdf.output.OutputProducer.bufferize](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/output.py#L370-L485) vs {py:meth}`pypdf.PdfWriter.write_stream <pypdf.PdfWriter.write_stream>` 47 | * [fpdf.syntax.Name](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L124) vs {py:class}`pypdf.generic.NameObject <pypdf.generic.NameObject>` 48 | * [fpdf.syntax.build_obj_dict](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L222) vs {py:class}`pypdf.generic.DictionaryObject <pypdf.generic.DictionaryObject>` 49 | * [fpdf.structure_tree.NumberTree](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/structure_tree.py#L17) vs 50 | {py:class}`pypdf.generic.TreeObject <pypdf.generic.TreeObject>` 51 | 52 | 53 | ### pdfrw 54 | 55 | [pdfrw](https://pypi.org/project/pdfrw/), in contrast, seems to work more with 56 | the standard Python objects (bool, float, string) and not wrap them in custom 57 | objects, if possible. 
It still has: 58 | 59 | * [PdfArray](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfarray.py#L13) 60 | * [PdfDict](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfdict.py#L49) 61 | * [PdfName](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfname.py#L65) 62 | * [PdfString](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfstring.py#L322) 63 | * [PdfIndirect](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfindirect.py#L10) 64 | 65 | The core classes of pdfrw are 66 | [PdfReader](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfreader.py#L26) 67 | and 68 | [PdfWriter](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfwriter.py#L224) 69 | -------------------------------------------------------------------------------- /docs/dev/releasing.md: -------------------------------------------------------------------------------- 1 | # Releasing 2 | 3 | A `pypdf` release contains the following artifacts: 4 | 5 | * A new [release on PyPI](https://pypi.org/project/pypdf/) 6 | * A [release commit](https://github.com/py-pdf/pypdf/commit/91391b18bb8ec9e6e561e2795d988e8634a01a50) 7 | * Containing a changelog update 8 | * A new [git tag](https://github.com/py-pdf/pypdf/tags) 9 | * A [Github release](https://github.com/py-pdf/pypdf/releases/tag/3.15.0) 10 | 11 | ## Who does it? 12 | 13 | `pypdf` should typically only be released by one of the core maintainers / the 14 | core maintainer. At the moment, this is either stefan6419846 or pubpub-zz and Martin Thoma. 15 | 16 | Any owner of the py-pdf organization also has the technical permissions to 17 | release. 18 | 19 | ## How is it done? 20 | 21 | ### With direct push permissions 22 | 23 | This is the typical way for the core maintainer/benevolent dictator. 24 | 25 | The release contains the following steps: 26 | 27 | 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`. 28 | This also prepares the release commit message. 29 | 2. 
Create a release commit: `git commit -eF RELEASE_COMMIT_MSG.md`. 30 | 3. Push commit: `git push`. 31 | 4. CI now builds a source and a wheels package which it pushes to PyPI. It also 32 | creates the corresponding tag and a GitHub release. 33 | 34 | ![](../_static/releasing.drawio.png) 35 | 36 | ### Using a Pull Request 37 | 38 | This is the typical way for collaborators who do not have direct push permissions for 39 | the `main` branch. 40 | 41 | The release contains the following steps: 42 | 43 | 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`. 44 | This also prepares the release commit message. 45 | 2. Push the changes to a dedicated branch. 46 | 3. Open a pull request starting with `REL: `, followed by the new version number. 47 | 4. Wait for the approval of another eligible maintainer. 48 | 5. Merge the pull request with the name being the PR title and the body being 49 | the content of `RELEASE_COMMIT_MSG.md`. 50 | 6. CI now builds a source and a wheels package which it pushes to PyPI. It also 51 | creates the corresponding tag and a GitHub release. 52 | 53 | ### The Release Tag 54 | 55 | * Use the release version as the tag name. No need for a leading "v". 56 | * Use the changelog entry as the body. 57 | 58 | 59 | ## When are releases done? 60 | 61 | There is no need to wait for anything. If the CI is green (all tests succeeded), 62 | we can release. 63 | 64 | At the moment, there is no fixed release cycle - except that we usually release 65 | on Sunday. 66 | -------------------------------------------------------------------------------- /docs/dev/testing.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | pypdf uses [`pytest`](https://docs.pytest.org/en/7.1.x/) for testing. 
4 | 5 | To run the tests you need to install the CI (Continuous Integration) requirements by running `pip install -r requirements/ci.txt` or 6 | `pip install -r requirements/ci-3.11.txt` if running Python ≥ 3.11. 7 | 8 | ## Deselecting groups of tests 9 | 10 | pypdf makes use of the following pytest markers: 11 | 12 | * `slow`: Tests that require more than 5 seconds. 13 | * `samples`: Tests that require the [`sample-files` git submodule](https://github.com/py-pdf/sample-files) to be initialized. As of October 2022, this is about 25 MB. 14 | * `enable_socket`: Tests that download PDF documents. They are stored locally and thus only need to be downloaded once. As of October 2022, this is about 200 MB. 15 | * To successfully run the tests, please download most of the documents beforehand: `python -c "from tests import download_test_pdfs; download_test_pdfs()"` 16 | 17 | You can disable them by `pytest -m "not enable_socket"` or `pytest -m "not samples"`. 18 | You can even disable all of them: `pytest -m "not enable_socket and not samples and not slow"`. 19 | 20 | Please note that this reduces test coverage. The CI will always test all files. 21 | 22 | ## Docstrings in Unit tests 23 | 24 | The first line of a docstring in a unit test should be written in a way that 25 | you could prefix it with "This test ensures that ...", e.g. 26 | 27 | * Invalid XML in xmp_metadata is gracefully handled. 28 | * The identity is returning its input. 29 | * xmp_modify_date is extracted correctly. 30 | 31 | This way, plugins like [`pytest-testdox`](https://pypi.org/project/pytest-testdox/) 32 | can generate really nice output when the tests are running. This looks similar 33 | to the output of [mocha.js](https://mochajs.org/). 34 | 35 | If the test is a regression test, write 36 | 37 | > This test is a regression test for issue #1234 38 | 39 | If the regression test is just one parameter of other tests, then add it as 40 | a comment for that parameter. 
41 | 42 | ## Evaluate a PR in-progress version 43 | 44 | You may want to test a version from a PR which has not been released yet. 45 | The easiest way is to use pip and install a version from git: 46 | 47 | a) Go the PR and identify the repository and branch. 48 | 49 | Example from below : repository: __pubpub-zz__ / branch: __iss2200__ : 50 | ![PR Header example](PR_Header_example.png) 51 | 52 | b) you can then install the version using pip from git: 53 | 54 | Example: 55 | ``` 56 | pip install git+https://github.com/pubpub-zz/pypdf.git@iss2200 57 | ``` 58 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pypdf documentation main file, created by 2 | sphinx-quickstart on Thu Apr 7 20:13:19 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pypdf 7 | ================= 8 | 9 | pypdf is a `free `_ and open 10 | source pure-python PDF library capable of splitting, 11 | merging, cropping, and transforming the pages of PDF files. It can also add 12 | custom data, viewing options, and passwords to PDF files. 13 | pypdf can retrieve text and metadata from PDFs as well. 14 | 15 | See `pdfly `_ for a CLI application that uses pypdf to interact with PDFs. 16 | 17 | You can contribute to `pypdf on GitHub `_. 18 | 19 | .. 
toctree:: 20 | :caption: User Guide 21 | :maxdepth: 1 22 | 23 | user/installation 24 | user/migration-1-to-2 25 | user/robustness 26 | user/suppress-warnings 27 | user/metadata 28 | user/extract-text 29 | user/post-processing-in-text-extraction 30 | user/extract-images 31 | user/extract-attachments 32 | user/encryption-decryption 33 | user/merging-pdfs 34 | user/cropping-and-transforming 35 | user/reading-pdf-annotations 36 | user/adding-pdf-annotations 37 | user/add-watermark 38 | user/add-javascript 39 | user/viewer-preferences 40 | user/forms 41 | user/streaming-data 42 | user/file-size 43 | user/pdf-version-support 44 | user/pdfa-compliance 45 | 46 | 47 | .. toctree:: 48 | :caption: API Reference 49 | :maxdepth: 1 50 | 51 | modules/PdfReader 52 | modules/PdfWriter 53 | modules/Destination 54 | modules/DocumentInformation 55 | modules/Field 56 | modules/Fit 57 | modules/PageObject 58 | modules/PageRange 59 | modules/PaperSize 60 | modules/RectangleObject 61 | modules/Transformation 62 | modules/XmpInformation 63 | modules/annotations 64 | modules/constants 65 | modules/errors 66 | modules/generic 67 | modules/PdfDocCommon 68 | 69 | .. toctree:: 70 | :caption: Developer Guide 71 | :maxdepth: 1 72 | 73 | dev/intro 74 | dev/pdf-format 75 | dev/pypdf-parsing 76 | dev/pypdf-writing 77 | dev/cmaps 78 | dev/deprecations 79 | dev/documentation 80 | dev/testing 81 | dev/releasing 82 | 83 | .. 
toctree:: 84 | :caption: About pypdf 85 | :maxdepth: 1 86 | 87 | meta/CHANGELOG 88 | meta/changelog-v1 89 | meta/project-governance 90 | meta/taking-ownership 91 | meta/history 92 | meta/CONTRIBUTORS 93 | meta/scope-of-pypdf 94 | meta/comparisons 95 | meta/faq 96 | 97 | Indices and tables 98 | ================== 99 | 100 | * :ref:`genindex` 101 | * :ref:`modindex` 102 | * :ref:`search` 103 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/meta/comparisons.md: -------------------------------------------------------------------------------- 1 | # pypdf vs X 2 | 3 | pypdf is a [free] and open source pure-python PDF library capable of 4 | splitting, merging, cropping, and transforming the pages of PDF files. 5 | It can also add custom data, viewing options, and passwords to PDF 6 | files. pypdf can retrieve text and metadata from PDFs as well. 
7 | 8 | ## PyMuPDF and PikePDF 9 | 10 | [PyMuPDF] is a Python binding to [MuPDF] and [PikePDF] is the Python 11 | binding to [QPDF]. 12 | 13 | While both are excellent libraries for various use-cases, using them is 14 | not always possible even when they support the use-case. Both of them 15 | are powered by C libraries which makes installation harder and might 16 | cause security concerns. For MuPDF you might also need to buy a 17 | commercial license. 18 | 19 | A core feature of pypdf is that it's pure Python. That means there is 20 | no C dependency. It has been used for over 10 years and for this reason 21 | there is a lot of support available via StackOverflow and examples on the internet. 22 | 23 | ## pypdf 24 | 25 | PyPDF2 was merged back into `pypdf`. The development continues at `pypdf`. 26 | 27 | ## PyPDF3 and PyPDF4 28 | 29 | Developing and maintaining open source software is extremely 30 | time-intensive and in the case of pypdf not paid at all. Providing 31 | continuous support is hard. 32 | 33 | pypdf was initially released in 2012 on PyPI and received releases 34 | until 2016. From 2016 to 2022 there was no update - but people were 35 | still using it. 36 | 37 | As pypdf is free software, there were attempts to fork it and continue 38 | the development. PyPDF3 was first released in 2018 and still receives 39 | updates. PyPDF4 has only one release from 2018. 40 | 41 | Martin Thoma has worked on bringing the community back to one path of 42 | development. He has already deprecated PyPDF2 in favor of pypdf, and pypdf now has 43 | more features and a cleaner interface than PyPDF2. See [history of 44 | pypdf](history.md). 
45 | 46 | [free]: https://en.wikipedia.org/wiki/Free_software 47 | [PyMuPDF]: https://pypi.org/project/PyMuPDF/ 48 | [MuPDF]: https://mupdf.com/ 49 | [PikePDF]: https://pypi.org/project/pikepdf/ 50 | [QPDF]: https://github.com/qpdf/qpdf 51 | 52 | 53 | ## pdfminer.six and pdfplumber 54 | 55 | [`pdfminer.six`](https://pypi.org/project/pdfminer.six/) is capable of 56 | extracting the [font size](https://stackoverflow.com/a/69962459/562769) 57 | / font weight (bold-ness). It has no capabilities for writing PDF files. 58 | 59 | [`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file. Please note that the image conversion is done via ImageMagick (see [`pdfplumber`'s documentation](https://github.com/jsvine/pdfplumber#visual-debugging)). 60 | 61 | The `pdfplumber` community is active in answering questions and the library is maintained as of May 2023. 62 | 63 | ## pdfrw / pdfrw2 64 | 65 | I don't have experience with any of those libraries. Please add a 66 | comparison if you know pypdf and [`pdfrw`](https://pypi.org/project/pdfrw/)! 67 | 68 | Please be aware that there is also 69 | [`pdfminer`](https://pypi.org/project/pdfminer/) which is not maintained. 70 | Then there is [`pdfrw2`](https://pypi.org/project/pdfrw2/) which doesn't have 71 | a large community behind it. 72 | 73 | ## Document Generation 74 | 75 | There are (Python) [tools to generate PDF documents](https://github.com/py-pdf/awesome-pdf#generators). 76 | pypdf is not one of them. 
77 | 78 | 79 | ## CLI applications 80 | 81 | pypdf is a pure Python PDF library. If you're looking for an application which 82 | you can use from the terminal, give [`pdfly`](https://pdfly.readthedocs.io/en/latest/) 83 | a shot. 84 | -------------------------------------------------------------------------------- /docs/meta/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ## How is pypdf related to PyPDF2? 4 | 5 | PyPDF2 was a fork from the original pyPdf. After several years, the fork was 6 | merged back into `pypdf` (now all lowercase). 7 | 8 | ## Which Python versions are supported? 9 | 10 | pypdf 3.0+ supports Python 3.6 and later. 11 | PyPDF2 2.0+ supports Python 3.6 and later. 12 | PyPDF2 1.27.10 supported Python 2.7 to 3.10. 13 | 14 | [Matthew]: https://github.com/mstamy2 15 | [source]: https://github.com/py-pdf/PyPDF2/commit/24b270d876518d15773224b5d0d6c2206db29f64#commitcomment-5038317 16 | [this sort of thing]: https://github.com/py-pdf/PyPDF2/issues/24 17 | [GitHub issue]: https://github.com/py-pdf/PyPDF2/issues 18 | 19 | ## Who uses pypdf? 20 | 21 | pyPdf is vendored [into](https://github.com/Buyanbat/XacCRM/tree/ee78e8df967182f661b6494a86444501e7d89c8f/report/pyPdf) [several](https://github.com/MyBook/calibre/tree/ca1efe3c21f6553e096dab745b3cdeb36244a5a9/src/pyPdf) [projects](https://github.com/Giacomo-De-Florio-Dev/Make_Your_PDF_Safe/tree/ec439f92243d12d54ae024668792470c6b40ee96/MakeYourPDFsafe_V1.3/PyPDF2). That 22 | means the code of pyPdf was copied into that project. 
23 | 24 | Projects that depend on pypdf: 25 | 26 | * [Camelot](https://github.com/camelot-dev/camelot): A Python library to extract tabular data from PDFs 27 | * [edi](https://github.com/OCA/edi): Electronic Data Interchange modules 28 | * [amazon-textract-textractor](https://github.com/aws-samples/amazon-textract-textractor/blob/42444b08c672607eadbdcd64f3c5adb2d85383de/helper/setup.py): Analyze documents with Amazon Textract and generate output in multiple formats. 29 | * [maigret](https://github.com/soxoj/maigret): Collect a dossier on a person by username from thousands of sites 30 | * [deda](https://github.com/dfd-tud/deda): tracking Dots Extraction, Decoding and Anonymisation toolkit 31 | * [opencanary](https://github.com/thinkst/opencanary) 32 | * Document Conversions 33 | * [rst2pdf](https://github.com/rst2pdf/rst2pdf) 34 | * [xhtml2pdf](https://github.com/xhtml2pdf/xhtml2pdf) 35 | * [doc2text](https://github.com/jlsutherland/doc2text) 36 | * [pdfalyzer](https://pypi.org/project/pdfalyzer/): A PDF analysis tool for visualizing the inner tree-like data structure of a PDF in spectacularly large and colorful diagrams as well as scanning the binary streams embedded in the PDF for hidden potentially malicious content. 37 | 38 | ## How do I cite pypdf? 39 | 40 | In BibTeX format: 41 | 42 | ``` 43 | @misc{pypdf, 44 | title = {The {pypdf} library}, 45 | author = {Mathieu Fenniak and 46 | Matthew Stamy and 47 | pubpub-zz and 48 | Martin Thoma and 49 | Matthew Peveler and 50 | exiledkingcc and {pypdf Contributors}}, 51 | year = {2024}, 52 | url = {https://pypi.org/project/pypdf/}, 53 | note = {See https://pypdf.readthedocs.io/en/latest/meta/CONTRIBUTORS.html for all contributors} 54 | } 55 | ``` 56 | 57 | ## Which License does pypdf use? 58 | 59 | `pypdf` uses the [BSD-3-Clause license](https://en.wikipedia.org/wiki/BSD_licenses#3-clause), see the LICENSE file. 
60 | -------------------------------------------------------------------------------- /docs/meta/history.md: -------------------------------------------------------------------------------- 1 | # History of pypdf 2 | 3 | ## The Origins: pyPdf (2005-2010) 4 | 5 | In 2005, [Mathieu Fenniak] launched pyPdf "as a PDF toolkit..." 6 | focused on 7 | 8 | - document manipulation: by-page splitting, concatenation, and 9 | merging; 10 | - document introspection; 11 | - page cropping; and 12 | - document encryption and decryption. 13 | 14 | The last release on PyPI was [pyPdf 1.13](https://pypi.org/project/pyPdf/#history) 15 | in 2010. 16 | 17 | ## PyPDF2 is born (2011-2016) 18 | 19 | At the end of 2011, after consultation with Mathieu and others, Phaseit 20 | sponsored PyPDF2 as a fork of pyPdf on GitHub. The initial impetus was 21 | to handle a wider range of input PDF instances; Phaseit\'s commercial 22 | work often encounters PDF instances \"in the wild\" that it needs to 23 | manage (mostly concatenate and paginate), but that deviate so much from 24 | PDF standards that pyPdf can\'t read them. PyPDF2 reads a considerably 25 | wider range of real-world PDF instances. 26 | 27 | Neither pyPdf nor PyPDF2 aims to be universal, that is, to provide all 28 | possible PDF-related functionality. Note that the similar-appearing 29 | [pyfpdf] of Mariano Reingart is most comparable to [ReportLab], in that 30 | both ReportLab and pyfpdf emphasize document generation. Interestingly 31 | enough, pyfpdf builds in a basic HTML→PDF converter while PyPDF2 has no 32 | knowledge of HTML. 33 | 34 | So what is PyPDF2 truly about? Think about popular [pdftk] for a moment. 35 | PyPDF2 does what pdftk does, and it does so within your current Python 36 | process, and it handles a wider range of variant PDF formats 37 | \[explain\]. PyPDF2 has its own FAQ to answer other questions that have 38 | arisen. 
39 | 40 | The Reddit [/r/python crowd chatted] obliquely and briefly about PyPDF2 41 | in March 2012. 42 | 43 | The core developer / maintainer was Matthew Stamy. 44 | 45 | ## PyPDF3 and PyPDF4 (2018 - 2022) 46 | 47 | Two approaches were made to get PyPDF2 active again: PyPDF3 and PyPDF4. 48 | 49 | PyPDF3 had its first release in 2018 and its last one in February 2022. 50 | It never got the user base of PyPDF2. 51 | 52 | PyPDF4 only had one release in 2018. 53 | 54 | ## PyPDF2: Reborn (2022) 55 | 56 | Martin Thoma took over maintenance of PyPDF2 in April 2022. It had over 100 57 | open PRs and 321 open issues. 58 | 59 | [pubpub-zz](https://github.com/pubpub-zz) was extremely active, especially 60 | for text extraction. 61 | 62 | [Matthew Peveler](https://github.com/MasterOdin) helped a lot with reviews 63 | and general project decisions. 64 | 65 | [exiledkingcc](https://github.com/exiledkingcc) added support for modern 66 | encryption schemes. 67 | 68 | 69 | ## pypdf: Back to the Roots (2023-2024) 70 | 71 | In order to make things simpler for beginners, PyPDF2 was merged back into 72 | pypdf. Now all lowercase, without a number. We hope that the folks who 73 | develop PyPDF3 and PyPDF4 also join us. 74 | 75 | Compared to `PyPDF2 >= 3.0.0`, `pypdf >= 3.1.0` now offers: 76 | 77 | * AES reading and writing support. Not only with PyCryptoDome, but also with cryptography. 78 | * Text extraction improvements, e.g. for math content. [pypdf is now comparable with Tika, pypdfium2, and PyMuPDF](https://github.com/py-pdf/benchmarks) 79 | * Annotation support 80 | * Performance Improvements and Bugfixes 81 | * Page Label support 82 | 83 | stefan6419846 made his [first PR for pypdf](https://github.com/py-pdf/pypdf/pull/2022) 84 | in July 2023 and joined the project. 
85 | 86 | 87 | [Mathieu Fenniak]: https://mathieu.fenniak.net/ 88 | [pyfpdf]: https://github.com/reingart/pyfpdf 89 | [ReportLab]: https://www.reportlab.com/software/opensource/rl-toolkit/ 90 | [pdftk]: https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/ 91 | [/r/python crowd chatted]: https://www.reddit.com/r/Python/comments/qsvfm/pypdf2_updates_pypdf_pypdf2_is_an_opensource/ 92 | -------------------------------------------------------------------------------- /docs/meta/scope-of-pypdf.md: -------------------------------------------------------------------------------- 1 | # Scope of pypdf 2 | 3 | What features should pypdf have and which features will it never have? 4 | 5 | pypdf aims at making interactions with PDF documents simpler. Core tasks that 6 | pypdf can perform are: 7 | 8 | * Document manipulation: Splitting, merging, cropping, and transforming the pages of PDF files 9 | * Data Extraction: Extract text and metadata from PDF documents 10 | * Security: Decrypt / encrypt PDF documents 11 | 12 | Typical indicators that something should be done by pypdf: 13 | 14 | * The task needs in-depth knowledge of the PDF format 15 | * It currently requires a lot of code or even is impossible to do with pypdf 16 | * It's neither mentioned in "belongs in user code" nor in "out of scope" 17 | * It already is in the issue list with the [is-feature tag](https://github.com/py-pdf/pypdf/labels/is-feature). 18 | 19 | The [moonshot extensions](https://github.com/py-pdf/pypdf/discussions/1181) are 20 | features we would like to have, but are currently not able to add (PRs are 21 | welcome 😉) 22 | 23 | ## Belongs in user code 24 | 25 | Here are a few indicators that a feature belongs in user code (and not in pypdf): 26 | 27 | 1. The use-case is very specific. Most people will not encounter the same need. 28 | 2. It can be done without knowledge of the PDF specification 29 | 3.
It cannot be done without (non-pdf) domain knowledge. Anything that is 30 | specific to your industry. 31 | 32 | ## Out of scope 33 | 34 | While this list is infinitely long, there are a few topics that are asked 35 | about multiple times. 36 | 37 | Those topics are out of scope for pypdf. They will never be part of pypdf: 38 | 39 | 1. **Optical Character Recognition (OCR)**: OCR is about extracting text from 40 | images. That is very different from the kind of text extraction pypdf is 41 | doing. Please note that images can be within PDF documents. In the case of 42 | scanned documents, the whole page is an image. Some scanners automatically 43 | execute OCR and add a text-layer behind the scanned page. That is something 44 | pypdf can use, if it's present. As a rule-of-thumb: If you cannot mark/copy 45 | the text, it's likely an image. A noteworthy open source OCR project is 46 | [tesseract](https://github.com/tesseract-ocr/tesseract). 47 | 2. **Format Conversion**: Converting docx / HTML to PDF or PDF to those formats. 48 | You might want to have a look at [`pdfkit`](https://pypi.org/project/pdfkit/) 49 | and similar projects. 50 | 51 | Out of scope for the moment, but might be added if there are enough contributors: 52 | 53 | * **Digital Signature Support** ([reference 54 | ticket](https://github.com/py-pdf/pypdf/issues/302)): Cryptography is 55 | complicated. It's important to get it right. pypdf currently doesn't have 56 | enough active contributors to properly add digital signature support. For the 57 | moment, [pyhanko](https://pypi.org/project/pyHanko/) seems to be the best 58 | choice. 59 | * **PDF Generation from Scratch**: pypdf can manipulate existing PDF documents, 60 | add annotations, combine / split / crop / transform. It can add blank pages.
61 | But if you want to generate invoices, you might want to have a look at 62 | [`reportlab`](https://pypi.org/project/reportlab/) / 63 | [`fpdf2`](https://pypi.org/project/fpdf2/) or document conversion tools like 64 | [`pdfkit`](https://pypi.org/project/pdfkit/). 65 | * **Replacing words within a PDF**: [Extracting text from PDF is hard](../user/extract-text.md#why-text-extraction-is-hard). 66 | Replacing text in a reliable way is even harder. For example, one word might 67 | be split into multiple tokens. Hence it's not a simple "search and replace" 68 | in some cases. 69 | * **(Not) Extracting headers/footers/page numbers**: While you can apply 70 | heuristics, there is no way to always make it work. PDF documents simply 71 | don't contain the information about what a header/footer/page number is. 72 | 73 | 74 | ### Library vs Application 75 | 76 | It's also worth pointing out that `pypdf` is designed to be a library. It is not 77 | an application. That has several implications: 78 | 79 | * Execution: pypdf cannot be executed directly, but only be called from within 80 | a program written by a pypdf user. In contrast, an application is executed 81 | on its own. 82 | * Dependencies: pypdf should have a minimal set of dependencies and only 83 | restrict them where it is strictly necessary. In contrast, applications should 84 | be installed in environments which are isolated from other applications. They 85 | can pin their dependencies. 86 | 87 | If you're looking for a way to interact with PDF files via Shell, you should 88 | either write a script using pypdf or use [`pdfly`](https://pypi.org/project/pdfly/). 89 | -------------------------------------------------------------------------------- /docs/meta/taking-ownership.md: -------------------------------------------------------------------------------- 1 | # Taking Ownership of pypdf 2 | 3 | pypdf is currently maintained by stefan6419846. We want to avoid that 4 | pypdf ever goes unmaintained again.
This document serves as a guide to avoid 5 | that if I become unavailable, e.g. due to severe health issues. 6 | 7 | This currently is just an abstract scenario. I'm fine and I will likely do this 8 | for several more years, but I have seen how projects stand still for many years 9 | because of the maintainer becoming inactive. 10 | 11 | ## What belongs to pypdf? 12 | 13 | The resources needed for maintaining pypdf are: 14 | 15 | * PyPI: [pypdf](https://pypi.org/project/pypdf/) and [PyPDF2](https://pypi.org/project/PyPDF2/) 16 | * Github: [pypdf](https://github.com/py-pdf/pypdf) (the repository, not the organization) 17 | * ReadTheDocs: [pypdf](https://readthedocs.org/projects/pypdf/) and [PyPDF2](https://readthedocs.org/projects/pypdf2/) 18 | 19 | ## When may somebody take ownership? 20 | 21 | **No activity in 180 days**: If I don't answer e-mails (see my GitHub profile) 22 | and don't make any commits / merges for half a year, you can consider pypdf "not 23 | maintained". 24 | 25 | ## Who may take ownership? 26 | 27 | Preferably, one of the owners of the GitHub `py-pdf` organization takes care of 28 | that. 29 | 30 | As of 27th of August 2023, the following people might be candidates: 31 | 32 | * [Lucas-C](https://github.com/Lucas-C): He maintains fpdf2 and is a py-pdf owner 33 | * [pubpub-zz](https://github.com/pubpub-zz): He is one of the most active contributors 34 | to pypdf 35 | * [Matthew Peveler](https://github.com/MasterOdin): Less active, but he is very 36 | careful about breaking changes and an experienced software developer. 37 | * [exiledkingcc](https://github.com/exiledkingcc): He has contributed the core 38 | changes related to encryption. 39 | 40 | ## How to take ownership? 
41 | 42 | * PyPI: Follow [PEP 541 – Package Index Name Retention](https://peps.python.org/pep-0541/) 43 | * GitHub: Talk with one of the other py-pdf organization owners 44 | * ReadTheDocs: Follow the [Abandoned projects policy](https://docs.readthedocs.io/en/latest/abandoned-projects.html) 45 | -------------------------------------------------------------------------------- /docs/modules/Destination.rst: -------------------------------------------------------------------------------- 1 | The Destination Class 2 | --------------------- 3 | 4 | .. autoclass:: pypdf.generic.Destination 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/DocumentInformation.rst: -------------------------------------------------------------------------------- 1 | The DocumentInformation Class 2 | ----------------------------- 3 | 4 | .. autoclass:: pypdf.DocumentInformation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/Field.rst: -------------------------------------------------------------------------------- 1 | The Field Class 2 | --------------- 3 | 4 | .. autoclass:: pypdf.generic.Field 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/Fit.rst: -------------------------------------------------------------------------------- 1 | The Fit Class 2 | ------------- 3 | 4 | .. autoclass:: pypdf.generic.Fit 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/PageObject.rst: -------------------------------------------------------------------------------- 1 | The PageObject Class 2 | -------------------- 3 | 4 | .. 
autoclass:: pypdf._page.PageObject 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autoclass:: pypdf._page.VirtualListImages 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | .. autoclass:: pypdf._page.ImageFile 15 | :members: 16 | :inherited-members: File 17 | :undoc-members: 18 | 19 | .. autofunction:: pypdf.mult 20 | -------------------------------------------------------------------------------- /docs/modules/PageRange.rst: -------------------------------------------------------------------------------- 1 | The PageRange Class 2 | ------------------- 3 | 4 | .. autoclass:: pypdf.PageRange 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/PaperSize.rst: -------------------------------------------------------------------------------- 1 | The PaperSize Class 2 | ------------------- 3 | 4 | .. autoclass:: pypdf.PaperSize 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Add blank page with PaperSize 10 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 11 | .. code-block:: python 12 | :linenos: 13 | 14 | from pypdf import PaperSize, PdfWriter 15 | 16 | writer = PdfWriter(clone_from="sample.pdf") 17 | writer.add_blank_page(PaperSize.A8.width, PaperSize.A8.height) 18 | with open("output.pdf", "wb") as output_stream: 19 | writer.write(output_stream) 20 | 21 | Insert blank page with PaperSize 22 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 23 | .. 
code-block:: python 24 | :linenos: 25 | 26 | from pypdf import PaperSize, PdfWriter 27 | 28 | writer = PdfWriter(clone_from="sample.pdf") 29 | writer.insert_blank_page(PaperSize.A8.width, PaperSize.A8.height, 1) 30 | with open("output.pdf", "wb") as output_stream: 31 | writer.write(output_stream) 32 | -------------------------------------------------------------------------------- /docs/modules/PdfDocCommon.rst: -------------------------------------------------------------------------------- 1 | The PdfDocCommon Class 2 | ---------------------- 3 | 4 | **PdfDocCommon** is an abstract class which is inherited by :class:`~pypdf.PdfReader` and :class:`~pypdf.PdfWriter`. 5 | 6 | Where identified in the API, you can use any of the derived class. 7 | 8 | .. autoclass:: pypdf._doc_common.PdfDocCommon 9 | :members: 10 | :inherited-members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/modules/PdfReader.rst: -------------------------------------------------------------------------------- 1 | The PdfReader Class 2 | ------------------- 3 | 4 | .. autoclass:: pypdf.PdfReader 5 | :members: 6 | :inherited-members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | .. autoclass:: pypdf.PasswordType 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | -------------------------------------------------------------------------------- /docs/modules/PdfWriter.rst: -------------------------------------------------------------------------------- 1 | The PdfWriter Class 2 | ------------------- 3 | 4 | .. autoclass:: pypdf.PdfWriter 5 | :members: 6 | :inherited-members: 7 | :undoc-members: 8 | :show-inheritance: 9 | 10 | .. 
autoclass:: pypdf.ObjectDeletionFlag 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | -------------------------------------------------------------------------------- /docs/modules/RectangleObject.rst: -------------------------------------------------------------------------------- 1 | The RectangleObject Class 2 | ------------------------- 3 | 4 | .. autoclass:: pypdf.generic.RectangleObject 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/Transformation.rst: -------------------------------------------------------------------------------- 1 | The Transformation Class 2 | ------------------------ 3 | 4 | .. autoclass:: pypdf.Transformation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/XmpInformation.rst: -------------------------------------------------------------------------------- 1 | The XmpInformation Class 2 | ------------------------- 3 | 4 | .. autoclass:: pypdf.xmp.XmpInformation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/annotations.rst: -------------------------------------------------------------------------------- 1 | The annotations module 2 | ---------------------- 3 | 4 | .. automodule:: pypdf.annotations 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/constants.rst: -------------------------------------------------------------------------------- 1 | Constants 2 | --------- 3 | 4 | .. autoclass:: pypdf.constants.AnnotationFlag 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autoclass:: pypdf.constants.ImageType 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | .. 
autoclass:: pypdf.constants.PageLabelStyle 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | .. autoclass:: pypdf.constants.UserAccessPermissions 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | .. autoclass:: pypdf.constants.FieldDictionaryAttributes 25 | :members: 26 | :undoc-members: 27 | :exclude-members: FT, Parent, Kids, T, TU, TM, V, DV, AA, Opt, attributes, attributes_dict 28 | :show-inheritance: 29 | -------------------------------------------------------------------------------- /docs/modules/errors.rst: -------------------------------------------------------------------------------- 1 | Errors 2 | ------ 3 | 4 | .. automodule:: pypdf.errors 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/modules/generic.rst: -------------------------------------------------------------------------------- 1 | Generic PDF objects 2 | ------------------- 3 | 4 | .. automodule:: pypdf.generic 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | :exclude-members: Destination, Field, Fit, RectangleObject 9 | 10 | 11 | .. autoclass:: pypdf._protocols.PdfObjectProtocol 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | 17 | .. autoclass:: pypdf._protocols.XmpInformationProtocol 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | .. autoclass:: pypdf._protocols.PdfCommonDocProtocol 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | 29 | .. autoclass:: pypdf._protocols.PdfReaderProtocol 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | 35 | .. 
autoclass:: pypdf._protocols.PdfWriterProtocol 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/user/add-javascript.md: -------------------------------------------------------------------------------- 1 | # Adding JavaScript to a PDF 2 | 3 | PDF readers vary in the extent they support JavaScript, with some not supporting it at all. 4 | 5 | Adobe has documentation on its support here: 6 | [https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html](https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html) 7 | 8 | ## Launch print window on opening 9 | 10 | ```python 11 | from pypdf import PdfWriter 12 | 13 | writer = PdfWriter(clone_from="example.pdf") 14 | 15 | # Add JavaScript to launch the print window on opening this PDF. 16 | writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 17 | 18 | # Write to pypdf-output.pdf. 19 | with open("pypdf-output.pdf", "wb") as fp: 20 | writer.write(fp) 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/user/add-watermark.md: -------------------------------------------------------------------------------- 1 | # Adding a Stamp or Watermark to a PDF 2 | 3 | Adding stamps or watermarks are two common ways to manipulate PDF files. 4 | A stamp is adding something on top of the document, a watermark is in the 5 | background of the document. 6 | 7 | ## Stamp (Overlay) / Watermark (Underlay) 8 | 9 | The process of stamping and watermarking is the same, you just need to set `over` parameter to `True` for stamping and `False` for watermarking. 
10 | 11 | You can use {func}`~pypdf._page.PageObject.merge_page` if you don't need to transform the stamp: 12 | 13 | ```python 14 | from pypdf import PdfReader, PdfWriter 15 | 16 | stamp = PdfReader("bg.pdf").pages[0] 17 | writer = PdfWriter(clone_from="source.pdf") 18 | for page in writer.pages: 19 | page.merge_page(stamp, over=False) # here set to False for watermarking 20 | 21 | writer.write("out.pdf") 22 | ``` 23 | 24 | Otherwise use {func}`~pypdf._page.PageObject.merge_transformed_page` with {class}`~pypdf.Transformation` if you need to translate, rotate, scale, etc. the stamp before merging it to the content page. 25 | 26 | ```python 27 | from pathlib import Path 28 | from typing import List, Union 29 | 30 | from pypdf import PdfReader, PdfWriter, Transformation 31 | 32 | 33 | def stamp( 34 | content_pdf: Union[Path, str], 35 | stamp_pdf: Union[Path, str], 36 | pdf_result: Union[Path, str], 37 | page_indices: Union[None, List[int]] = None, 38 | ): 39 | stamp_page = PdfReader(stamp_pdf).pages[0] 40 | 41 | writer = PdfWriter() 42 | # page_indices can be a List(array) of page, tuples are for range definition 43 | reader = PdfReader(content_pdf) 44 | writer.append(reader, pages=page_indices) 45 | 46 | for content_page in writer.pages: 47 | content_page.merge_transformed_page( 48 | stamp_page, 49 | Transformation().scale(0.5), 50 | ) 51 | 52 | writer.write(pdf_result) 53 | 54 | 55 | stamp("example.pdf", "stamp.pdf", "out.pdf") 56 | ``` 57 | 58 | If you are experiencing wrongly rotated watermarks/stamps, try to use 59 | {func}`~pypdf._page.PageObject.transfer_rotation_to_content` on the corresponding pages beforehand 60 | to fix the page boxes. 61 | 62 | Example of stamp: 63 | ![stamp.png](stamp.png) 64 | 65 | Example of watermark: 66 | ![watermark.png](watermark.png) 67 | 68 | 69 | ## Stamping images directly 70 | 71 | The above code only works for stamps that are already in PDF format. 
72 | However, you can easily convert an image to PDF image using 73 | [Pillow](https://pypi.org/project/Pillow/). 74 | 75 | 76 | ```python 77 | from io import BytesIO 78 | from pathlib import Path 79 | from typing import List, Union 80 | 81 | from PIL import Image 82 | from pypdf import PageRange, PdfReader, PdfWriter, Transformation 83 | 84 | 85 | def image_to_pdf(stamp_img: Union[Path, str]) -> PdfReader: 86 | img = Image.open(stamp_img) 87 | img_as_pdf = BytesIO() 88 | img.save(img_as_pdf, "pdf") 89 | return PdfReader(img_as_pdf) 90 | 91 | 92 | def stamp_img( 93 | content_pdf: Union[Path, str], 94 | stamp_img: Union[Path, str], 95 | pdf_result: Union[Path, str], 96 | page_indices: Union[PageRange, List[int], None] = None, 97 | ): 98 | # Convert the image to a PDF 99 | stamp_pdf = image_to_pdf(stamp_img) 100 | 101 | # Then use the same stamp code from above 102 | stamp_page = stamp_pdf.pages[0] 103 | 104 | writer = PdfWriter() 105 | 106 | reader = PdfReader(content_pdf) 107 | writer.append(reader, pages=page_indices) 108 | for content_page in writer.pages: 109 | content_page.merge_transformed_page( 110 | stamp_page, 111 | Transformation(), 112 | ) 113 | 114 | with open(pdf_result, "wb") as fp: 115 | writer.write(fp) 116 | 117 | 118 | stamp_img("example.pdf", "example.png", "out.pdf") 119 | ``` 120 | -------------------------------------------------------------------------------- /docs/user/annotation-circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-circle.png -------------------------------------------------------------------------------- /docs/user/annotation-highlight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-highlight.png 
-------------------------------------------------------------------------------- /docs/user/annotation-line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-line.png -------------------------------------------------------------------------------- /docs/user/annotation-polygon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-polygon.png -------------------------------------------------------------------------------- /docs/user/annotation-polyline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-polyline.png -------------------------------------------------------------------------------- /docs/user/annotation-popup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-popup.png -------------------------------------------------------------------------------- /docs/user/annotation-square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-square.png -------------------------------------------------------------------------------- /docs/user/encryption-decryption.md: -------------------------------------------------------------------------------- 1 | # Encryption and Decryption of PDFs 2 | 3 | PDF encryption makes use of [`RC4`](https://en.wikipedia.org/wiki/RC4) and 4 | [`AES`](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) algorithms 5 | with 
different key lengths. `pypdf` supports all of them up to `PDF-2.0`, which 6 | is the latest PDF standard. 7 | 8 | `pypdf` uses an extra dependency to do encryption or decryption for `AES` algorithms. 9 | We recommend [`pyca/cryptography`](https://cryptography.io/en/latest/). Alternatively, 10 | you can use [`pycryptodome`](https://pypi.org/project/pycryptodome/). 11 | 12 | ```{note} 13 | Please see the note in the [installation guide](installation.md) 14 | for installing the extra dependencies if interacting with PDFs that use AES. 15 | ``` 16 | 17 | ## Encrypt 18 | 19 | You can encrypt a PDF by using a password: 20 | 21 | ```python 22 | from pypdf import PdfReader, PdfWriter 23 | 24 | reader = PdfReader("example.pdf") 25 | writer = PdfWriter(clone_from=reader) 26 | 27 | # Add a password to the new PDF 28 | writer.encrypt("my-secret-password", algorithm="AES-256") 29 | 30 | # Save the new PDF to a file 31 | with open("encrypted-pdf.pdf", "wb") as f: 32 | writer.write(f) 33 | ``` 34 | The algorithm can be one of `RC4-40`, `RC4-128`, `AES-128`, `AES-256-R5`, `AES-256`. 35 | We recommend using `AES-256-R5`. 36 | 37 | ```{warning} 38 | pypdf uses `RC4` by default for compatibility if you omit the "algorithm" parameter. 39 | Since `RC4` is insecure, you should use `AES` algorithms.
40 | ``` 41 | 42 | ## Decrypt 43 | 44 | You can decrypt a PDF using the appropriate password: 45 | 46 | ```python 47 | from pypdf import PdfReader, PdfWriter 48 | 49 | reader = PdfReader("encrypted-pdf.pdf") 50 | 51 | if reader.is_encrypted: 52 | reader.decrypt("my-secret-password") 53 | 54 | writer = PdfWriter(clone_from=reader) 55 | 56 | # Save the new PDF to a file 57 | with open("decrypted-pdf.pdf", "wb") as f: 58 | writer.write(f) 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/user/error-hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/error-hierarchy.png -------------------------------------------------------------------------------- /docs/user/extract-attachments.md: -------------------------------------------------------------------------------- 1 | # Extract Attachments 2 | 3 | PDF documents can contain attachments. Attachments have a name, but it might not 4 | be unique. For this reason, the value of `reader.attachments["attachment_name"]` 5 | is a list. 
6 | 7 | You can extract all attachments like this: 8 | 9 | ```python 10 | from pypdf import PdfReader 11 | 12 | reader = PdfReader("example.pdf") 13 | 14 | for name, content_list in reader.attachments.items(): 15 | for i, content in enumerate(content_list): 16 | with open(f"{name}-{i}", "wb") as fp: 17 | fp.write(content) 18 | ``` 19 | 20 | Alternatively, you can retrieve them in an object-oriented fashion if you need 21 | further details for these files: 22 | 23 | ```python 24 | from pypdf import PdfReader 25 | 26 | reader = PdfReader("example.pdf") 27 | 28 | for attachment in reader.attachment_list: 29 | print(attachment.name, attachment.alternative_name, attachment.content) 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/user/extract-images.md: -------------------------------------------------------------------------------- 1 | # Extract Images 2 | 3 | ```{note} 4 | In order to use the following code you need to install optional 5 | dependencies, see [installation guide](installation.md). 6 | ``` 7 | 8 | Every page of a PDF document can contain an arbitrary amount of images. 9 | The names of the files may not be unique. 10 | 11 | ```python 12 | from pypdf import PdfReader 13 | 14 | reader = PdfReader("example.pdf") 15 | 16 | page = reader.pages[0] 17 | 18 | for count, image_file_object in enumerate(page.images): 19 | with open(str(count) + image_file_object.name, "wb") as fp: 20 | fp.write(image_file_object.data) 21 | ``` 22 | 23 | # Other images 24 | 25 | Some other objects can contain images, such as stamp annotations. 
26 | 27 | For example, this document contains such stamps: 28 | [test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf) 29 | 30 | You can extract the image from the annotation with the following code: 31 | 32 | ```python 33 | from pypdf import PdfReader 34 | 35 | reader = PdfReader("test_stamp.pdf") 36 | im = ( 37 | reader.pages[0]["/Annots"][0] 38 | .get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"] 39 | .decode_as_image() 40 | ) 41 | 42 | im.show() 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/user/file-size.md: -------------------------------------------------------------------------------- 1 | # Reduce PDF File Size 2 | 3 | There are multiple ways to reduce the size of a given PDF file. The easiest 4 | one is to remove content (e.g. images) or pages. 5 | 6 | ## Removing duplication 7 | 8 | Some PDF documents contain the same object multiple times. For example, if an 9 | image appears three times in a PDF it could be embedded three times. Or it can 10 | be embedded once and referenced twice. 11 | 12 | When adding data to a PdfWriter, the data is copied while respecting the original format. 13 | For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object. 14 | 15 | Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed). 16 | 17 | In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)` 18 | 19 | * `remove_identicals` enables/disables compression merging identical objects. 20 | * `remove_orphans` enables/disables suppression of unused objects. 
21 | 22 | It is recommended to apply this process just before writing to the file/stream. 23 | 24 | It depends on the PDF how well this works, but we have seen an 86% file 25 | reduction (from 5.7 MB to 0.8 MB) within a real PDF. 26 | 27 | 28 | ## Removing Images 29 | 30 | 31 | ```python 32 | from pypdf import PdfWriter 33 | 34 | writer = PdfWriter(clone_from="example.pdf") 35 | 36 | writer.remove_images() 37 | 38 | with open("out.pdf", "wb") as f: 39 | writer.write(f) 40 | ``` 41 | 42 | ## Reducing Image Quality 43 | 44 | If we reduce the quality of the images within the PDF, we can **sometimes** 45 | reduce the file size of the PDF overall. That depends on how well the reduced 46 | quality image can be compressed. 47 | 48 | ```python 49 | from pypdf import PdfWriter 50 | 51 | writer = PdfWriter(clone_from="example.pdf") 52 | 53 | for page in writer.pages: 54 | for img in page.images: 55 | img.replace(img.image, quality=80) 56 | 57 | with open("out.pdf", "wb") as f: 58 | writer.write(f) 59 | ``` 60 | 61 | ## Lossless Compression 62 | 63 | pypdf supports the FlateDecode filter which uses the zlib/deflate compression 64 | method. It is a lossless compression, meaning the resulting PDF looks exactly 65 | the same. 66 | 67 | Deflate compression can be applied to a page via 68 | {meth}`page.compress_content_streams <pypdf._page.PageObject.compress_content_streams>`: 69 | 70 | ```python 71 | from pypdf import PdfWriter 72 | 73 | writer = PdfWriter(clone_from="example.pdf") 74 | 75 | for page in writer.pages: 76 | page.compress_content_streams() # This is CPU intensive! 77 | 78 | with open("out.pdf", "wb") as f: 79 | writer.write(f) 80 | ``` 81 | 82 | `page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress) 83 | and supports the `level` parameter: `level=0` means no compression, 84 | `level=9` refers to the highest compression. 85 | 86 | Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB) 87 | with a real PDF.
88 | 89 | ## Removing Sources 90 | 91 | When a page is removed from the page list, its content will still be present in 92 | the PDF file. This means that the data may still be used elsewhere. 93 | 94 | Simply removing a page from the page list will reduce the page count but not the 95 | file size. In order to exclude the content completely, the pages should not be 96 | added to the PDF using the PdfWriter.append() function. Instead, only the 97 | desired pages should be selected for inclusion 98 | (note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page 99 | deletion feature). 100 | 101 | There can be issues with poor PDF formatting, such as when all pages are linked 102 | to the same resource. In such cases, dropping references to specific pages 103 | becomes useless because there is only one source for all pages. 104 | 105 | Cropping is an ineffective method for reducing the file size because it only 106 | adjusts the viewboxes and not the external parts of the source image. Therefore, 107 | the content that is no longer visible will still be present in the PDF. 108 | 109 | ## Going Further 110 | 111 | The presentation [Putting a Squeeze on Your PDF](https://youtube.com/watch?v=tgOABUhVwFs) has other suggestions. One takeaway is that most of the significant size optimizations usually come from image and font modification. However, font optimization, such as replacing, merging, and subsetting, is not within the functionality of pypdf at the moment. 
112 | -------------------------------------------------------------------------------- /docs/user/free-text-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/free-text-annotation.png -------------------------------------------------------------------------------- /docs/user/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | There are several ways to install pypdf. The most common option is to use pip. 4 | 5 | ## pip 6 | 7 | pypdf requires Python 3.8+ to run. 8 | 9 | Typically Python comes with `pip`, a package installer. Using it you can 10 | install pypdf: 11 | 12 | ```bash 13 | pip install pypdf 14 | ``` 15 | 16 | If you are not a super-user (a system administrator / root), you can also just 17 | install pypdf for your current user: 18 | 19 | ```bash 20 | pip install --user pypdf 21 | ``` 22 | 23 | ### Optional dependencies 24 | 25 | pypdf tries to be as self-contained as possible, but for some tasks the amount 26 | of work to properly maintain the code would be too high. This is especially the 27 | case for cryptography and image formats. 28 | 29 | If you simply want to install all optional dependencies, run: 30 | 31 | ``` 32 | pip install pypdf[full] 33 | ``` 34 | 35 | Alternatively, you can install just some: 36 | 37 | If you plan to use pypdf for encrypting or decrypting PDFs that use AES, you 38 | will need to install some extra dependencies. Encryption using RC4 is supported 39 | using the regular installation. 
40 | 41 | ``` 42 | pip install pypdf[crypto] 43 | ``` 44 | 45 | If you plan to use image extraction, you need Pillow: 46 | 47 | ``` 48 | pip install pypdf[image] 49 | ``` 50 | 51 | For JBIG2 support, you need to install a global OS-level package as well: 52 | [`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure 53 | depends on your operating system. For Ubuntu, just use the following for example: 54 | 55 | ``` 56 | sudo apt-get install jbig2dec 57 | ``` 58 | 59 | ## Python Version Support 60 | 61 | Since pypdf 4.0, every release, including point releases, should work with all 62 | supported versions of [Python](https://devguide.python.org/versions/). Thus 63 | every point release is designed to work with all existing Python versions, 64 | excluding end-of-life versions. 65 | 66 | Previous versions of pypdf support the following versions of Python: 67 | 68 | | Python | 3.11 | 3.10 | 3.9 | 3.8 | 3.7 | 3.6 | 2.7 | 69 | | ---------------------- |:----:|:----:|:---:|:---:|:---:|:---:|:---:| 70 | | pypdf 3.x | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | 71 | | PyPDF2 >= 2.0 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | 72 | | PyPDF2 1.20.0 - 1.28.4 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 73 | | PyPDF2 1.15.0 - 1.20.0 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | 74 | 75 | 76 | ## Anaconda 77 | 78 | Anaconda users can [install pypdf via conda-forge](https://anaconda.org/conda-forge/pypdf).
79 | 80 | 81 | ## Development Version 82 | 83 | In case you want to use the current version under development: 84 | 85 | ```bash 86 | pip install git+https://github.com/py-pdf/pypdf.git 87 | ``` 88 | -------------------------------------------------------------------------------- /docs/user/merge-45-deg-rot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-45-deg-rot.png -------------------------------------------------------------------------------- /docs/user/merge-rotate-expand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-rotate-expand.png -------------------------------------------------------------------------------- /docs/user/merge-translated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-translated.png -------------------------------------------------------------------------------- /docs/user/metadata.md: -------------------------------------------------------------------------------- 1 | # Metadata 2 | 3 | ## Reading metadata 4 | 5 | ```python 6 | from pypdf import PdfReader 7 | 8 | reader = PdfReader("example.pdf") 9 | 10 | meta = reader.metadata 11 | 12 | # All of the following could be None! 
13 | print(meta.title) 14 | print(meta.author) 15 | print(meta.subject) 16 | print(meta.creator) 17 | print(meta.producer) 18 | print(meta.creation_date) 19 | print(meta.modification_date) 20 | ``` 21 | 22 | ## Writing metadata 23 | 24 | ```python 25 | from datetime import datetime 26 | from pypdf import PdfReader, PdfWriter 27 | 28 | reader = PdfReader("example.pdf") 29 | writer = PdfWriter() 30 | 31 | # Add all pages to the writer 32 | for page in reader.pages: 33 | writer.add_page(page) 34 | 35 | # If you want to add the old metadata, include these two lines 36 | if reader.metadata is not None: 37 | writer.add_metadata(reader.metadata) 38 | 39 | # Format the current date and time for the metadata 40 | utc_time = "-05'00'" # UTC time optional 41 | time = datetime.now().strftime(f"D\072%Y%m%d%H%M%S{utc_time}") 42 | 43 | # Add the new metadata 44 | writer.add_metadata( 45 | { 46 | "/Author": "Martin", 47 | "/Producer": "Libre Writer", 48 | "/Title": "Title", 49 | "/Subject": "Subject", 50 | "/Keywords": "Keywords", 51 | "/CreationDate": time, 52 | "/ModDate": time, 53 | "/Creator": "Creator", 54 | "/CustomField": "CustomField", 55 | } 56 | ) 57 | 58 | # Save the new PDF to a file 59 | with open("meta-pdf.pdf", "wb") as f: 60 | writer.write(f) 61 | ``` 62 | 63 | ## Updating metadata 64 | 65 | ```python 66 | from pypdf import PdfWriter 67 | 68 | writer = PdfWriter(clone_from="example.pdf") 69 | 70 | # Change some values 71 | writer.add_metadata( 72 | { 73 | "/Author": "Martin", 74 | "/Producer": "Libre Writer", 75 | "/Title": "Title", 76 | } 77 | ) 78 | 79 | # Clear all data but keep the entry in PDF 80 | writer.metadata = {} 81 | 82 | # Replace all entries with new set of entries 83 | writer.metadata = { 84 | "/Author": "Martin", 85 | "/Producer": "Libre Writer", 86 | } 87 | 88 | # Save the new PDF to a file 89 | with open("meta-pdf.pdf", "wb") as f: 90 | writer.write(f) 91 | ``` 92 | 93 | ## Removing metadata entry 94 | 95 | ```python 96 | from pypdf import 
PdfWriter 97 | 98 | writer = PdfWriter("example.pdf") 99 | 100 | # Remove Metadata (/Info entry) 101 | writer.metadata = None 102 | 103 | # Save the new PDF to a file 104 | with open("meta-pdf.pdf", "wb") as f: 105 | writer.write(f) 106 | ``` 107 | -------------------------------------------------------------------------------- /docs/user/nup-dest1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-dest1.png -------------------------------------------------------------------------------- /docs/user/nup-dest2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-dest2.png -------------------------------------------------------------------------------- /docs/user/nup-source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-source.png -------------------------------------------------------------------------------- /docs/user/page-stamped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/page-stamped.png -------------------------------------------------------------------------------- /docs/user/page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/page.png -------------------------------------------------------------------------------- /docs/user/pdf-version-support.md: -------------------------------------------------------------------------------- 1 | # PDF Version Support 2 | 3 | PDF comes in the 
following versions: 4 | 5 | * 1993: 1.0 6 | * 1994: 1.1 7 | * 1996: 1.2 8 | * 1999: 1.3 9 | * 2001: 1.4 10 | * 2003: 1.5 11 | * 2004: 1.6 12 | * 2008: 1.7, ISO 32000-1:2008 13 | * 2017: 2.0, ISO 32000-2:2017 14 | 15 | The general format didn't change, but new features got added. It can be that 16 | pypdf can do the operations you want on PDF 2.0 files without fully supporting 17 | all features of PDF 2.0. 18 | 19 | ## PDF Feature Support by pypdf 20 | 21 | | Feature | PDF Version | pypdf Support | 22 | | --------------------------------------- |:-----------:|:--------------:| 23 | | CMaps | 1.4 | ✅ | 24 | | Transparent Graphics | 1.4 | ✅ | 25 | | Content Stream Compression | 1.5 | ✅ | 26 | | Cross-reference Streams | 1.5 | ❓ | 27 | | Object Streams | 1.5 | ✅ | 28 | | Optional Content Groups (OCGs) | 1.5 | ❓ | 29 | | AES Encryption | 1.6 | ✅ | 30 | 31 | See [History of PDF](https://en.wikipedia.org/wiki/History_of_PDF) for more 32 | features. 33 | 34 | Some PDF features are not supported by pypdf, but other libraries can be used 35 | for them: 36 | 37 | * [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF ([#302](https://github.com/py-pdf/pypdf/issues/302)) 38 | * [camelot-py](https://pypi.org/project/camelot-py/): Table Extraction ([#231](https://github.com/py-pdf/pypdf/issues/231)) 39 | -------------------------------------------------------------------------------- /docs/user/plain-merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/plain-merge.png -------------------------------------------------------------------------------- /docs/user/post-processing-in-text-extraction.md: -------------------------------------------------------------------------------- 1 | # Post-Processing of Text Extraction 2 | 3 | Post-processing can recognizably improve the results of text extraction. 
It is, 4 | however, outside of the scope of pypdf itself. Hence the library will not give 5 | any direct support for it. It is a natural language processing (NLP) task. 6 | 7 | This page lists a few examples of what can be done as well as a community recipe 8 | that can be used as a general-purpose post-processing step. If you know more 9 | about the specific domain of your documents, e.g. the language, it is likely 10 | that you can find custom solutions that work better in your context. 11 | 12 | ## Ligature Replacement 13 | 14 | ```python 15 | def replace_ligatures(text: str) -> str: 16 | ligatures = { 17 | "ﬀ": "ff", 18 | "ﬁ": "fi", 19 | "ﬂ": "fl", 20 | "ﬃ": "ffi", 21 | "ﬄ": "ffl", 22 | "ﬅ": "ft", 23 | "ﬆ": "st", 24 | # "Ꜳ": "AA", 25 | # "Æ": "AE", 26 | "ꜳ": "aa", 27 | } 28 | for search, replace in ligatures.items(): 29 | text = text.replace(search, replace) 30 | return text 31 | ``` 32 | 33 | ## Dehyphenation 34 | 35 | Hyphens are used to break words up so that the appearance of the page is nicer.
36 | 37 | ```python 38 | from typing import List 39 | 40 | 41 | def remove_hyphens(text: str) -> str: 42 | """ 43 | 44 | This fails for: 45 | * Natural dashes: well-known, self-replication, use-cases, non-semantic, 46 | Post-processing, Window-wise, viewpoint-dependent 47 | * Trailing math operands: 2 - 4 48 | * Names: Lopez-Ferreras, VGG-19, CIFAR-100 49 | """ 50 | lines = [line.rstrip() for line in text.split("\n")] 51 | 52 | # Find dashes 53 | line_numbers = [] 54 | for line_no, line in enumerate(lines[:-1]): 55 | if line.endswith("-"): 56 | line_numbers.append(line_no) 57 | 58 | # Replace 59 | for line_no in line_numbers: 60 | lines = dehyphenate(lines, line_no) 61 | 62 | return "\n".join(lines) 63 | 64 | 65 | def dehyphenate(lines: List[str], line_no: int) -> List[str]: 66 | next_line = lines[line_no + 1] 67 | word_suffix = next_line.split(" ")[0] 68 | 69 | lines[line_no] = lines[line_no][:-1] + word_suffix 70 | lines[line_no + 1] = lines[line_no + 1][len(word_suffix) :] 71 | return lines 72 | ``` 73 | 74 | ## Header/Footer Removal 75 | 76 | The following header/footer removal has several drawbacks: 77 | 78 | * False-positives, e.g. for the first page when there is a date like 2024. 79 | * False-negatives in many cases: 80 | * Dynamic part, e.g. page label is in the header. 81 | * Even/odd pages have different headers. 82 | * Some pages, e.g. the first one or chapter pages, do not have a header. 
83 | 84 | ```python 85 | def remove_footer(extracted_texts: list[str], page_labels: list[str]): 86 | def remove_page_labels(extracted_texts, page_labels): 87 | processed = [] 88 | for text, label in zip(extracted_texts, page_labels): 89 | text_left = text.lstrip() 90 | if text_left.startswith(label): 91 | text = text_left[len(label) :] 92 | 93 | text_right = text.rstrip() 94 | if text_right.endswith(label): 95 | text = text_right[: -len(label)] 96 | 97 | processed.append(text) 98 | return processed 99 | 100 | extracted_texts = remove_page_labels(extracted_texts, page_labels) 101 | return extracted_texts 102 | ``` 103 | 104 | ## Other ideas 105 | 106 | * Whitespaces in units: Between a number and its unit should be a space. 107 | ([source](https://tex.stackexchange.com/questions/20962/should-i-put-a-space-between-a-number-and-its-unit)). 108 | That means: 42 ms, 42 GHz, 42 GB. 109 | * Percent: English style guides prescribe writing the percent sign following the number without any space between (e.g. 50%). 110 | * Whitespaces before dots: Should typically be removed. 111 | * Whitespaces after dots: Should typically be added. 
112 | -------------------------------------------------------------------------------- /docs/user/reading-pdf-annotations.md: -------------------------------------------------------------------------------- 1 | # Reading PDF Annotations 2 | 3 | PDF 2.0 defines the following annotation types: 4 | 5 | * Text 6 | * Link 7 | * FreeText 8 | * Line 9 | * Square 10 | * Circle 11 | * Polygon 12 | * PolyLine 13 | * Highlight 14 | * Underline 15 | * Squiggly 16 | * StrikeOut 17 | * Caret 18 | * Stamp 19 | * Ink 20 | * Popup 21 | * FileAttachment 22 | * Sound 23 | * Movie 24 | * Screen 25 | * Widget 26 | * PrinterMark 27 | * TrapNet 28 | * Watermark 29 | * 3D 30 | * Redact 31 | * Projection 32 | * RichMedia 33 | 34 | In general, annotations can be read like this: 35 | 36 | ```python 37 | from pypdf import PdfReader 38 | 39 | reader = PdfReader("annotated.pdf") 40 | 41 | for page in reader.pages: 42 | if "/Annots" in page: 43 | for annotation in page["/Annots"]: 44 | obj = annotation.get_object() 45 | print({"subtype": obj["/Subtype"], "location": obj["/Rect"]}) 46 | ``` 47 | 48 | Examples of reading three of the most common annotations: 49 | 50 | ## Text 51 | 52 | ```python 53 | from pypdf import PdfReader 54 | 55 | reader = PdfReader("example.pdf") 56 | 57 | for page in reader.pages: 58 | if "/Annots" in page: 59 | for annotation in page["/Annots"]: 60 | subtype = annotation.get_object()["/Subtype"] 61 | if subtype == "/Text": 62 | print(annotation.get_object()["/Contents"]) 63 | ``` 64 | 65 | ## Highlights 66 | 67 | ```python 68 | from pypdf import PdfReader 69 | 70 | reader = PdfReader("example.pdf") 71 | 72 | for page in reader.pages: 73 | if "/Annots" in page: 74 | for annotation in page["/Annots"]: 75 | subtype = annotation.get_object()["/Subtype"] 76 | if subtype == "/Highlight": 77 | coords = annotation.get_object()["/QuadPoints"] 78 | x1, y1, x2, y2, x3, y3, x4, y4 = coords 79 | ``` 80 | 81 | ## Attachments 82 | 83 | ```python 84 | from pypdf import PdfReader 85 | 86 
| reader = PdfReader("example.pdf") 87 | 88 | attachments = {} 89 | for page in reader.pages: 90 | if "/Annots" in page: 91 | for annotation in page["/Annots"]: 92 | subtype = annotation.get_object()["/Subtype"] 93 | if subtype == "/FileAttachment": 94 | fileobj = annotation.get_object()["/FS"] 95 | attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data() 96 | ``` 97 | -------------------------------------------------------------------------------- /docs/user/robustness.md: -------------------------------------------------------------------------------- 1 | # Robustness and strict=False 2 | 3 | PDF is [specified in various versions](https://pdfa.org/resource/pdf-specification-archive/). 4 | The specification of PDF 2.0 has 1003 pages. This length makes it hard to get 5 | everything right. As a consequence, a lot of PDF files are not strictly following the 6 | specification. 7 | 8 | If a PDF file does not follow the specification, it is not always possible to 9 | be certain what the intended effect would be. Think of the following broken 10 | Python code as an example: 11 | 12 | ```python 13 | # Broken 14 | function (foo, bar): 15 | 16 | # Potentially intended: 17 | def function(foo, bar): 18 | ... 19 | 20 | # Also possible: 21 | function = (foo, bar) 22 | ``` 23 | 24 | Writing a parser you can go two paths: Either you try to be forgiving and try 25 | to figure out what the user intended, or you are strict and just tell the user 26 | that they should fix their stuff. 27 | 28 | pypdf gives you the option to be strict or not. 29 | 30 | pypdf has two core objects: 31 | 32 | * {class}`~pypdf.PdfReader` 33 | * {class}`~pypdf.PdfWriter` 34 | 35 | Only the PdfReader has a `strict` parameter, since presumably you do not want 36 | to write a non-conforming PDF. 37 | 38 | Choosing `strict=True` means that pypdf will raise an exception if a PDF does 39 | not follow the specification. 
40 | 41 | Choosing `strict=False` means that pypdf will try to be forgiving and do 42 | something reasonable, but it will log a warning message. It is a best-effort 43 | approach. 44 | -------------------------------------------------------------------------------- /docs/user/scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/scaling.png -------------------------------------------------------------------------------- /docs/user/stamp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/stamp.png -------------------------------------------------------------------------------- /docs/user/streaming-data.md: -------------------------------------------------------------------------------- 1 | # Streaming Data with pypdf 2 | 3 | In some cases you might want to avoid saving things explicitly as a file 4 | to disk, e.g. when you want to store the PDF in a database or AWS S3. 5 | 6 | pypdf supports streaming data to a file-like object: 7 | 8 | ```python 9 | from io import BytesIO 10 | 11 | # Prepare example 12 | with open("example.pdf", "rb") as fh: 13 | bytes_stream = BytesIO(fh.read()) 14 | 15 | # Read from bytes_stream 16 | reader = PdfReader(bytes_stream) 17 | 18 | # Write to bytes_stream 19 | writer = PdfWriter() 20 | with BytesIO() as bytes_stream: 21 | writer.write(bytes_stream) 22 | ``` 23 | 24 | ## Writing a PDF directly to AWS S3 25 | 26 | Suppose you want to manipulate a PDF and write it directly to AWS S3 without having 27 | to write the document to a file first. 
We have the original PDF in `raw_bytes_data` as `bytes` 28 | and want to protect it with the password `my-secret-password`: 29 | 30 | ```python 31 | from io import BytesIO 32 | 33 | import boto3 34 | from pypdf import PdfReader, PdfWriter 35 | 36 | 37 | reader = PdfReader(BytesIO(raw_bytes_data)) 38 | writer = PdfWriter() 39 | 40 | # Add all pages to the writer 41 | for page in reader.pages: 42 | writer.add_page(page) 43 | 44 | # Add a password to the new PDF 45 | writer.encrypt("my-secret-password") 46 | 47 | # Write the new PDF to an in-memory stream and pass it to S3 48 | with BytesIO() as bytes_stream: 49 | writer.write(bytes_stream) 50 | bytes_stream.seek(0) 51 | s3 = boto3.client("s3") 52 | s3.write_get_object_response( 53 | Body=bytes_stream, RequestRoute=request_route, RequestToken=request_token 54 | ) 55 | ``` 56 | 57 | ## Reading PDFs directly from cloud services 58 | 59 | One option is to first download the file and then pass the local file path to `PdfReader`. 60 | Another option is to get a byte stream. 61 | 62 | For AWS S3 it works like this: 63 | 64 | ```python 65 | from io import BytesIO 66 | 67 | import boto3 68 | from pypdf import PdfReader 69 | 70 | 71 | s3 = boto3.client("s3") 72 | obj = s3.get_object(Bucket="my-bucket", Key="my/doc.pdf") 73 | reader = PdfReader(BytesIO(obj["Body"].read())) 74 | ``` 75 | 76 | To use with Google Cloud Storage: 77 | 78 | ```python 79 | from io import BytesIO 80 | 81 | from google.cloud import storage 82 | 83 | # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] must be set 84 | storage_client = storage.Client() 85 | blob = storage_client.bucket("my-bucket").blob("mydoc.pdf") 86 | file_stream = BytesIO() 87 | blob.download_to_file(file_stream) 88 | reader = PdfReader(file_stream) 89 | ``` 90 | -------------------------------------------------------------------------------- /docs/user/suppress-warnings.md: -------------------------------------------------------------------------------- 1 | # Exceptions, Warnings, and Log messages 2 | 3 | pypdf makes
use of three mechanisms to show if something went wrong: 4 | 5 | * **Exceptions** are error cases that pypdf users should explicitly handle. 6 | In the `strict=True` mode, most log messages with the warning level will 7 | become exceptions. This can be useful in applications where you can require 8 | a user to fix the broken PDF. 9 | * **Warnings** are avoidable issues, such as using deprecated classes / 10 | functions / parameters. Another example is missing capabilities of pypdf. 11 | In those cases, pypdf users should adjust their code. Warnings 12 | are issued by the `warnings` module - those are different from the log-level 13 | "warning". 14 | * **Log messages** are informative messages that can be used for post-mortem 15 | analysis. Most of the time, users can ignore them. They come in different 16 | *levels*, such as info / warning / error indicating the severity. 17 | Examples are non-standard compliant PDF files which pypdf can deal with or 18 | a missing implementation that leads to a part of the text not being extracted. 19 | 20 | 21 | ## Exceptions 22 | 23 | Exceptions need to be caught if you want to handle them. For example, you could 24 | want to read the text from a PDF as a part of a search function. 25 | 26 | Most PDF files do not follow the specification. In this case pypdf needs to 27 | guess which kinds of mistakes were potentially done when the PDF file was created. 28 | See [the robustness page](robustness.md) for the related issues. 29 | 30 | As a user, you likely do not care about it. If it is readable in any way, you 31 | want the text. 
You might use pdfminer.six as a fallback and do this: 32 | 33 | ```python 34 | from pypdf import PdfReader 35 | from pdfminer.high_level import extract_text as fallback_text_extraction 36 | 37 | text = "" 38 | try: 39 | reader = PdfReader("example.pdf") 40 | for page in reader.pages: 41 | text += page.extract_text() 42 | except Exception as exc: 43 | text = fallback_text_extraction("example.pdf") 44 | ``` 45 | 46 | You could also capture [`pypdf.errors.PyPdfError`](https://github.com/py-pdf/pypdf/blob/main/pypdf/errors.py) 47 | if you prefer something more specific. 48 | 49 | ## Warnings 50 | 51 | The [`warnings` module](https://docs.python.org/3/library/warnings.html) allows 52 | you to ignore warnings: 53 | 54 | ```python 55 | import warnings 56 | 57 | warnings.filterwarnings("ignore") 58 | ``` 59 | 60 | In many cases, you actually want to start Python with the `-W` flag so that you 61 | see all warnings. This is especially true for Continuous Integration (CI). 62 | 63 | ## Log messages 64 | 65 | Log messages can be noisy in some cases. 
pypdf hopefully has a reasonable 66 | level of log messages, but you can reduce which types of messages you want to 67 | see: 68 | 69 | ```python 70 | import logging 71 | 72 | logger = logging.getLogger("pypdf") 73 | logger.setLevel(logging.ERROR) 74 | ``` 75 | 76 | The [`logging` module](https://docs.python.org/3/library/logging.html#logging-levels) 77 | defines six log levels: 78 | 79 | * CRITICAL 80 | * ERROR 81 | * WARNING 82 | * INFO 83 | * DEBUG 84 | * NOTSET 85 | -------------------------------------------------------------------------------- /docs/user/text-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/text-annotation.png -------------------------------------------------------------------------------- /docs/user/viewer-preferences.md: -------------------------------------------------------------------------------- 1 | # Adding Viewer Preferences 2 | 3 | It is possible to set viewer preferences of a PDF file. 4 | They are described in §12.2 of the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf). 5 | 6 | Note that the `/ViewerPreferences` dictionary does not exist by default. 7 | If it is not already present, it must be created by calling the 8 | {func}`~pypdf.PdfWriter.create_viewer_preferences` method. 9 | 10 | If viewer preferences exist in a PDF file being read with {class}`~pypdf.PdfReader`, 11 | you can access them as properties of {attr}`~pypdf.PdfReader.viewer_preferences`. 12 | Otherwise, the {attr}`~pypdf.PdfReader.viewer_preferences` property will be set to `None`.
13 | 14 | ## Example 15 | 16 | ```python 17 | from pypdf import PdfWriter 18 | from pypdf.generic import ArrayObject, NumberObject 19 | 20 | writer = PdfWriter() 21 | 22 | writer.create_viewer_preferences() 23 | 24 | # /HideToolbar 25 | writer.viewer_preferences.hide_toolbar = True 26 | # /HideMenubar 27 | writer.viewer_preferences.hide_menubar = True 28 | # /HideWindowUI 29 | writer.viewer_preferences.hide_windowui = True 30 | # /FitWindow 31 | writer.viewer_preferences.fit_window = True 32 | # /CenterWindow 33 | writer.viewer_preferences.center_window = True 34 | # /DisplayDocTitle 35 | writer.viewer_preferences.display_doctitle = True 36 | 37 | # /NonFullScreenPageMode 38 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseNone" # default 39 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseOutlines" 40 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseThumbs" 41 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseOC" 42 | 43 | # /Direction 44 | writer.viewer_preferences.direction = "/L2R" # default 45 | writer.viewer_preferences.direction = "/R2L" 46 | 47 | # /ViewArea 48 | writer.viewer_preferences.view_area = "/CropBox" 49 | # /ViewClip 50 | writer.viewer_preferences.view_clip = "/CropBox" 51 | # /PrintArea 52 | writer.viewer_preferences.print_area = "/CropBox" 53 | # /PrintClip 54 | writer.viewer_preferences.print_clip = "/CropBox" 55 | 56 | # /PrintScaling 57 | writer.viewer_preferences.print_scaling = "/None" 58 | writer.viewer_preferences.print_scaling = "/AppDefault" # default according to PDF spec 59 | 60 | # /Duplex 61 | writer.viewer_preferences.duplex = "/Simplex" 62 | writer.viewer_preferences.duplex = "/DuplexFlipShortEdge" 63 | writer.viewer_preferences.duplex = "/DuplexFlipLongEdge" 64 | 65 | # /PickTrayByPDFSize 66 | writer.viewer_preferences.pick_tray_by_pdfsize = True 67 | # /PrintPageRange 68 | writer.viewer_preferences.print_pagerange = ArrayObject( 69 | [NumberObject("1"), NumberObject("10"), 
NumberObject("20"), NumberObject("30")] 70 | ) 71 | # /NumCopies 72 | writer.viewer_preferences.num_copies = 2 73 | 74 | for i in range(40): 75 | writer.add_blank_page(10, 10) 76 | 77 | with open("output.pdf", "wb") as output_stream: 78 | writer.write(output_stream) 79 | ``` 80 | 81 | The names beginning with a slash character are part of the PDF file format. They are 82 | included here to ease searching the pypdf documentation 83 | for these names from the PDF specification. 84 | -------------------------------------------------------------------------------- /docs/user/watermark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/watermark.png -------------------------------------------------------------------------------- /pypdf/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | pypdf is a free and open-source pure-python PDF library capable of splitting, 3 | merging, cropping, and transforming the pages of PDF files. It can also add 4 | custom data, viewing options, and passwords to PDF files. pypdf can retrieve 5 | text and metadata from PDFs as well. 6 | 7 | You can read the full docs at https://pypdf.readthedocs.io/. 
def fill_from_encoding(enc: str) -> List[str]:
    """
    Build a 256-entry table mapping every byte value to its character.

    Args:
        enc: Name of a Python text codec, e.g. ``"cp1252"``.

    Returns:
        A list of 256 strings where index ``i`` holds the result of decoding
        the single byte ``i`` with ``enc``. Bytes that cannot be decoded fall
        back to ``chr(i)``.
    """
    lst: List[str] = []
    for x in range(256):
        try:
            lst.append(bytes((x,)).decode(enc))
        except (UnicodeError, LookupError):
            # UnicodeError: the codec cannot decode this single byte.
            # LookupError: the codec name is unknown or not a text encoding.
            # Both keep the historical fallback of the identity mapping.
            lst.append(chr(x))
    return lst


def rev_encoding(enc: List[str]) -> Dict[str, int]:
    """
    Invert an encoding table into a character -> byte value mapping.

    Args:
        enc: Encoding table as produced by :func:`fill_from_encoding`; index
            ``i`` holds the character for byte ``i``. Only the first 256
            entries are considered; shorter tables are accepted.

    Returns:
        A dict mapping each character to its byte value. ``"\\u0000"``
        entries are skipped and never appear in the result.

    Raises:
        AssertionError: If a character occurs at more than one position.
    """
    rev: Dict[str, int] = {}
    # Slicing keeps the original 256-entry limit while tolerating short or
    # empty tables instead of failing with IndexError.
    for i, char in enumerate(enc[:256]):
        if char == "\u0000":
            continue
        # Sanity check on the static tables: a duplicate would silently make
        # the reverse mapping ambiguous.
        assert char not in rev, f"{char} at {i} already at {rev[char]}"
        rev[char] = i
    return rev
"=", 64 | ">", 65 | "?", 66 | "@", 67 | "A", 68 | "B", 69 | "C", 70 | "D", 71 | "E", 72 | "F", 73 | "G", 74 | "H", 75 | "I", 76 | "J", 77 | "K", 78 | "L", 79 | "M", 80 | "N", 81 | "O", 82 | "P", 83 | "Q", 84 | "R", 85 | "S", 86 | "T", 87 | "U", 88 | "V", 89 | "W", 90 | "X", 91 | "Y", 92 | "Z", 93 | "[", 94 | "\\", 95 | "]", 96 | "^", 97 | "_", 98 | "‘", 99 | "a", 100 | "b", 101 | "c", 102 | "d", 103 | "e", 104 | "f", 105 | "g", 106 | "h", 107 | "i", 108 | "j", 109 | "k", 110 | "l", 111 | "m", 112 | "n", 113 | "o", 114 | "p", 115 | "q", 116 | "r", 117 | "s", 118 | "t", 119 | "u", 120 | "v", 121 | "w", 122 | "x", 123 | "y", 124 | "z", 125 | "{", 126 | "|", 127 | "}", 128 | "~", 129 | "\x7f", 130 | "\x80", 131 | "\x81", 132 | "\x82", 133 | "\x83", 134 | "\x84", 135 | "\x85", 136 | "\x86", 137 | "\x87", 138 | "\x88", 139 | "\x89", 140 | "\x8a", 141 | "\x8b", 142 | "\x8c", 143 | "\x8d", 144 | "\x8e", 145 | "\x8f", 146 | "\x90", 147 | "\x91", 148 | "\x92", 149 | "\x93", 150 | "\x94", 151 | "\x95", 152 | "\x96", 153 | "\x97", 154 | "\x98", 155 | "\x99", 156 | "\x9a", 157 | "\x9b", 158 | "\x9c", 159 | "\x9d", 160 | "\x9e", 161 | "\x9f", 162 | "\xa0", 163 | "¡", 164 | "¢", 165 | "£", 166 | "⁄", 167 | "¥", 168 | "ƒ", 169 | "§", 170 | "¤", 171 | "'", 172 | "“", 173 | "«", 174 | "‹", 175 | "›", 176 | "fi", 177 | "fl", 178 | "°", 179 | "–", 180 | "†", 181 | "‡", 182 | "·", 183 | "µ", 184 | "¶", 185 | "•", 186 | "‚", 187 | "„", 188 | "”", 189 | "»", 190 | "…", 191 | "‰", 192 | "¾", 193 | "¿", 194 | "À", 195 | "`", 196 | "´", 197 | "ˆ", 198 | "˜", 199 | "¯", 200 | "˘", 201 | "˙", 202 | "¨", 203 | "É", 204 | "˚", 205 | "¸", 206 | "Ì", 207 | "˝", 208 | "˛", 209 | "ˇ", 210 | "—", 211 | "Ñ", 212 | "Ò", 213 | "Ó", 214 | "Ô", 215 | "Õ", 216 | "Ö", 217 | "×", 218 | "Ø", 219 | "Ù", 220 | "Ú", 221 | "Û", 222 | "Ü", 223 | "Ý", 224 | "Þ", 225 | "ß", 226 | "à", 227 | "Æ", 228 | "â", 229 | "ª", 230 | "ä", 231 | "å", 232 | "æ", 233 | "ç", 234 | "Ł", 235 | "Ø", 236 | "Œ", 237 | "º", 238 | "ì", 
239 | "í", 240 | "î", 241 | "ï", 242 | "ð", 243 | "æ", 244 | "ò", 245 | "ó", 246 | "ô", 247 | "ı", 248 | "ö", 249 | "÷", 250 | "ł", 251 | "ø", 252 | "œ", 253 | "ß", 254 | "ü", 255 | "ý", 256 | "þ", 257 | "ÿ", 258 | ] 259 | -------------------------------------------------------------------------------- /pypdf/_crypt_providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, exiledkingcc 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # * The name of the author may not be used to endorse or promote products 14 | # derived from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | # POSSIBILITY OF SUCH DAMAGE. 27 | 28 | from pypdf._crypt_providers._base import CryptBase, CryptIdentity 29 | 30 | try: 31 | from pypdf._crypt_providers._cryptography import ( 32 | CryptAES, 33 | CryptRC4, 34 | aes_cbc_decrypt, 35 | aes_cbc_encrypt, 36 | aes_ecb_decrypt, 37 | aes_ecb_encrypt, 38 | crypt_provider, 39 | rc4_decrypt, 40 | rc4_encrypt, 41 | ) 42 | from pypdf._utils import Version 43 | 44 | if Version(crypt_provider[1]) <= Version("3.0"): 45 | # This is due to the backend parameter being required back then: 46 | # https://cryptography.io/en/latest/changelog/#v3-1 47 | raise ImportError("cryptography<=3.0 is not supported") # pragma: no cover 48 | except ImportError: 49 | try: 50 | from pypdf._crypt_providers._pycryptodome import ( # type: ignore 51 | CryptAES, 52 | CryptRC4, 53 | aes_cbc_decrypt, 54 | aes_cbc_encrypt, 55 | aes_ecb_decrypt, 56 | aes_ecb_encrypt, 57 | crypt_provider, 58 | rc4_decrypt, 59 | rc4_encrypt, 60 | ) 61 | except ImportError: 62 | from pypdf._crypt_providers._fallback import ( # type: ignore 63 | CryptAES, 64 | CryptRC4, 65 | aes_cbc_decrypt, 66 | aes_cbc_encrypt, 67 | aes_ecb_decrypt, 68 | aes_ecb_encrypt, 69 | crypt_provider, 70 | rc4_decrypt, 71 | rc4_encrypt, 72 | ) 73 | 74 | __all__ = [ 75 | "CryptAES", 76 | "CryptBase", 77 | "CryptIdentity", 78 | "CryptRC4", 79 | "aes_cbc_decrypt", 80 | "aes_cbc_encrypt", 81 | "aes_ecb_decrypt", 82 | "aes_ecb_encrypt", 83 | "crypt_provider", 84 | "rc4_decrypt", 85 | 
"rc4_encrypt", 86 | ] 87 | -------------------------------------------------------------------------------- /pypdf/_crypt_providers/_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, exiledkingcc 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # * The name of the author may not be used to endorse or promote products 14 | # derived from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | # POSSIBILITY OF SUCH DAMAGE. 
class CryptBase:
    """
    Pass-through cipher interface.

    Both operations hand their input back untouched; subclasses replace
    them with a real algorithm.
    """

    def encrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return ``data`` unchanged."""
        return data

    def decrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return ``data`` unchanged."""
        return data


class CryptIdentity(CryptBase):
    """Identity cipher: inherits the pass-through methods of :class:`CryptBase` as they are."""
# Message carried by the DependencyError that the AES entry points raise.
_DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"


# (name, version) pair identifying this provider.
crypt_provider = ("local_crypt_fallback", "0.0.0")


class CryptRC4(CryptBase):
    """
    Pure-Python RC4 stream cipher.

    The key schedule is computed once in the constructor; every call to
    :meth:`encrypt` starts again from that scheduled state.
    """

    def __init__(self, key: bytes) -> None:
        """
        Build the 256-entry permutation from ``key``.

        Args:
            key: The RC4 key; its bytes are cycled over the 256 schedule steps.
        """
        self.s = state = bytearray(range(256))
        swap_at = 0
        for idx in range(256):
            swap_at = (swap_at + state[idx] + key[idx % len(key)]) % 256
            state[idx], state[swap_at] = state[swap_at], state[idx]

    def encrypt(self, data: bytes) -> bytes:
        """
        XOR ``data`` with the RC4 keystream.

        Args:
            data: The bytes to transform.

        Returns:
            A ``bytes`` object of the same length as ``data``.
        """
        # Work on a copy so the scheduled state stored on the instance is not consumed.
        perm = bytearray(self.s)
        transformed = []
        x = y = 0
        for byte in data:
            x = (x + 1) % 256
            y = (y + perm[x]) % 256
            perm[x], perm[y] = perm[y], perm[x]
            keystream_byte = perm[(perm[x] + perm[y]) % 256]
            transformed.append(byte ^ keystream_byte)
        return bytes(transformed)

    def decrypt(self, data: bytes) -> bytes:
        """Return ``self.encrypt(data)``; the XOR transform is its own inverse."""
        return self.encrypt(data)


class CryptAES(CryptBase):
    """AES placeholder of this provider: both operations raise ``DependencyError``."""

    def __init__(self, key: bytes) -> None:
        """Accept ``key`` without storing it."""

    def encrypt(self, data: bytes) -> bytes:
        """Raise ``DependencyError``; ``data`` is not used."""
        raise DependencyError(_DEPENDENCY_ERROR_STR)

    def decrypt(self, data: bytes) -> bytes:
        """Raise ``DependencyError``; ``data`` is not used."""
        raise DependencyError(_DEPENDENCY_ERROR_STR)


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot helper: schedule ``key`` and encrypt ``data`` with :class:`CryptRC4`."""
    cipher = CryptRC4(key)
    return cipher.encrypt(data)


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot helper: schedule ``key`` and decrypt ``data`` with :class:`CryptRC4`."""
    cipher = CryptRC4(key)
    return cipher.decrypt(data)
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """
    AES-ECB encryption entry point of this provider.

    Raises:
        DependencyError: Always; neither argument is used.
    """
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """
    AES-ECB decryption entry point of this provider.

    Raises:
        DependencyError: Always; neither argument is used.
    """
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """
    AES-CBC encryption entry point of this provider.

    Raises:
        DependencyError: Always; none of the arguments is used.
    """
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """
    AES-CBC decryption entry point of this provider.

    Raises:
        DependencyError: Always; none of the arguments is used.
    """
    raise DependencyError(_DEPENDENCY_ERROR_STR)
# (name, version) pair identifying this provider.
crypt_provider = ("pycryptodome", __version__)


class CryptRC4(CryptBase):
    """RC4 cipher that delegates to ``Crypto.Cipher.ARC4``."""

    def __init__(self, key: bytes) -> None:
        """Remember ``key``; a cipher object is created for every call."""
        self.key = key

    def encrypt(self, data: bytes) -> bytes:
        """Encrypt ``data`` with a newly created ARC4 cipher for the stored key."""
        cipher = ARC4.ARC4Cipher(self.key)
        return cipher.encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        """Decrypt ``data`` with a newly created ARC4 cipher for the stored key."""
        cipher = ARC4.ARC4Cipher(self.key)
        return cipher.decrypt(data)


class CryptAES(CryptBase):
    """AES-CBC cipher whose output carries the 16-byte IV in front of the ciphertext."""

    def __init__(self, key: bytes) -> None:
        """Remember ``key`` for later calls."""
        self.key = key

    def encrypt(self, data: bytes) -> bytes:
        """
        Encrypt ``data`` in CBC mode under a random IV.

        Args:
            data: The plaintext; it is padded to a multiple of 16 bytes.

        Returns:
            The IV followed by the ciphertext.
        """
        init_vector = secrets.token_bytes(16)
        padded = pad(data, 16)
        cipher = AES.new(self.key, AES.MODE_CBC, init_vector)
        return init_vector + cipher.encrypt(padded)

    def decrypt(self, data: bytes) -> bytes:
        """
        Decrypt IV-prefixed CBC data and strip the trailing padding.

        Args:
            data: The first 16 bytes are taken as IV, the rest as ciphertext.

        Returns:
            The plaintext with its last ``n`` bytes removed, where ``n`` is
            the value of the final decrypted byte; an empty result when there
            is nothing after the IV.
        """
        init_vector, payload = data[:16], data[16:]
        if not payload:
            # for empty encrypted data
            return payload

        if len(payload) % 16 != 0:
            # just for robustness, it does not happen under normal circumstances
            payload = pad(payload, 16)

        cipher = AES.new(self.key, AES.MODE_CBC, init_vector)
        plain = cipher.decrypt(payload)
        padding_length = plain[-1]
        return plain[:-padding_length]


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """Encrypt ``data`` with a one-off ARC4 cipher for ``key``."""
    cipher = ARC4.ARC4Cipher(key)
    return cipher.encrypt(data)


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """Decrypt ``data`` with a one-off ARC4 cipher for ``key``."""
    cipher = ARC4.ARC4Cipher(key)
    return cipher.decrypt(data)


def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """Encrypt ``data`` with AES in ECB mode; no padding is applied here."""
    cipher = AES.new(key, AES.MODE_ECB)
    return cipher.encrypt(data)


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """Decrypt ``data`` with AES in ECB mode; no padding is removed here."""
    cipher = AES.new(key, AES.MODE_ECB)
    return cipher.decrypt(data)


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Encrypt ``data`` with AES in CBC mode under ``iv``; no padding is applied here."""
    cipher = AES.new(key, AES.MODE_CBC, iv)
    return cipher.encrypt(data)


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Decrypt ``data`` with AES in CBC mode under ``iv``; no padding is removed here."""
    cipher = AES.new(key, AES.MODE_CBC, iv)
    return cipher.decrypt(data)
class PdfMerger:
    """
    Use :class:`PdfWriter` instead.

    .. deprecated:: 5.0.0
    """

    def __init__(self) -> None:
        # The constructor does nothing except report the deprecation, naming
        # PdfWriter as the replacement.
        deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0")


class PdfObjectProtocol(Protocol):
    """Structural type listing the attribute and methods expected of a PDF object."""

    indirect_reference: Any

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> Any:
        ...  # pragma: no cover

    def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
        ...  # pragma: no cover

    def get_object(self) -> Optional["PdfObjectProtocol"]:
        ...  # pragma: no cover

    def hash_value(self) -> bytes:
        ...  # pragma: no cover

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        ...  # pragma: no cover


# NOTE(review): ``Protocol`` is not repeated in the bases, so for type checkers
# this is an ordinary subclass of PdfObjectProtocol rather than a protocol
# class itself — confirm that this is intended.
class XmpInformationProtocol(PdfObjectProtocol):
    pass


class PdfCommonDocProtocol(Protocol):
    """Structural type listing the members that PdfReaderProtocol and PdfWriterProtocol both extend."""

    @property
    def pdf_header(self) -> str:
        ...  # pragma: no cover

    @property
    def pages(self) -> List[Any]:
        ...  # pragma: no cover

    @property
    def root_object(self) -> PdfObjectProtocol:
        ...  # pragma: no cover

    def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
        ...  # pragma: no cover

    @property
    def strict(self) -> bool:
        ...  # pragma: no cover


class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
    """PdfCommonDocProtocol extended with the abstract ``xref`` and ``trailer`` properties."""

    @property
    @abstractmethod
    def xref(self) -> Dict[int, Dict[int, Any]]:
        ...  # pragma: no cover

    @property
    @abstractmethod
    def trailer(self) -> Dict[str, Any]:
        ...  # pragma: no cover


class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
    """PdfCommonDocProtocol extended with writer attributes and the abstract ``write`` and ``_add_object`` methods."""

    _objects: List[Any]
    _id_translated: Dict[int, Dict[int, int]]

    incremental: bool
    _reader: Any  # PdfReader

    @abstractmethod
    def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
        ...  # pragma: no cover

    @abstractmethod
    def _add_object(self, obj: Any) -> Any:
        ...
# pragma: no cover 87 | -------------------------------------------------------------------------------- /pypdf/_text_extraction/_layout_mode/__init__.py: -------------------------------------------------------------------------------- 1 | """Layout mode text extraction extension for pypdf""" 2 | from ._fixed_width_page import ( 3 | fixed_char_width, 4 | fixed_width_page, 5 | text_show_operations, 6 | y_coordinate_groups, 7 | ) 8 | from ._font import Font 9 | 10 | __all__ = [ 11 | "Font", 12 | "fixed_char_width", 13 | "fixed_width_page", 14 | "text_show_operations", 15 | "y_coordinate_groups", 16 | ] 17 | -------------------------------------------------------------------------------- /pypdf/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "5.6.0" 2 | -------------------------------------------------------------------------------- /pypdf/annotations/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PDF specifies several annotation types which pypdf makes available here. 3 | 4 | The names of the annotations and their attributes do not reflect the names in 5 | the specification in all cases. For example, the PDF standard defines a 6 | 'Square' annotation that does not actually need to be square. For this reason, 7 | pypdf calls it 'Rectangle'. 8 | 9 | At their core, all annotation types are DictionaryObjects. That means if pypdf 10 | does not implement a feature, users can easily extend the given functionality. 
class AnnotationDictionary(DictionaryObject, ABC):
    """Base class of the annotation types: a dictionary created with ``/Type`` set to ``/Annot``."""

    def __init__(self) -> None:
        from ..generic._base import NameObject

        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        type_key = NameObject("/Type")
        self[type_key] = NameObject("/Annot")
        # The flags were NOT added to the constructor on purpose:
        # We expect that most users don't want to change the default.
        # If they do, they can use the property. The default is 0.

    @property
    def flags(self) -> AnnotationFlag:
        """The entry stored under ``/F``; ``AnnotationFlag(0)`` when the key is absent."""
        flag_key = NameObject("/F")
        fallback = AnnotationFlag(0)
        return self.get(flag_key, fallback)

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        flag_key = NameObject("/F")
        self[flag_key] = NumberObject(value)


# Flag value with no bit set.
NO_FLAGS = AnnotationFlag(0)


class Link(AnnotationDictionary):
    """Link annotation that targets either a URL or a page index, never both."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
        **kwargs: Any,
    ) -> None:
        """
        Build the link dictionary.

        Args:
            rect: Rectangle stored under ``/Rect``.
            border: Up to three numbers, optionally followed by a dash-pattern
                sequence as fourth element; three zeros are used when omitted.
            url: Target of an external link (stored as a ``/URI`` action).
            target_page_index: Target of an internal link.
            fit: Fit type and arguments recorded for an internal link.
            **kwargs: Forwarded to the parent constructor.

        Raises:
            ValueError: If neither or both of ``url`` and
                ``target_page_index`` are given.
        """
        # NOTE(review): the AnnotationDictionary constructor visible in this
        # file accepts no keyword arguments, so any extra kwargs would raise
        # TypeError here — confirm.
        super().__init__(**kwargs)
        if TYPE_CHECKING:
            from ..types import BorderArrayType

        is_external = url is not None
        is_internal = target_page_index is not None
        if not (is_external or is_internal):
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. Both were None."
            )
        if is_external and is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. "
                f"{url=}, {target_page_index=}"
            )

        border_arr: BorderArrayType
        if border is None:
            border_arr = [NumberObject(0)] * 3
        else:
            border_arr = [NumberObject(n) for n in border[:3]]
            if len(border) == 4:
                # A fourth element is taken as the dash pattern.
                dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
                border_arr.append(dash_pattern)

        link_fields = {
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/Rect"): RectangleObject(rect),
            NameObject("/Border"): ArrayObject(border_arr),
        }
        self.update(link_fields)

        if is_external:
            uri_action = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
            self[NameObject("/A")] = uri_action
        if is_internal:
            # This needs to be updated later!
            dest_deferred = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
            self[NameObject("/Dest")] = dest_deferred


class Popup(AnnotationDictionary):
    """Popup annotation, optionally attached to a parent annotation."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Build the popup dictionary.

        Args:
            rect: Rectangle stored under ``/Rect``.
            parent: Object whose ``indirect_reference`` becomes ``/Parent``;
                a falsy value leaves ``/Parent`` unset.
            open: Value stored under ``/Open``.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        popup_fields = {
            NameObject("/Subtype"): NameObject("/Popup"),
            NameObject("/Rect"): RectangleObject(rect),
            NameObject("/Open"): BooleanObject(open),
        }
        self.update(popup_fields)
        if not parent:
            return

        # This needs to be an indirect object
        try:
            self[NameObject("/Parent")] = parent.indirect_reference
        except AttributeError:
            from .._utils import logger_warning

            logger_warning(
                "Unregistered Parent object : No Parent field set",
                __name__,
            )
# NOTE(review): DeprecationError and DependencyError derive from Exception
# directly, not from PyPdfError, although PyPdfError is documented as the base
# of all pypdf exceptions — confirm this is intended.
class DeprecationError(Exception):
    """Raised when a deprecated feature is used."""


class DependencyError(Exception):
    """
    Raised when a required dependency (a library or module that pypdf depends on)
    is not available or cannot be imported.
    """


class PyPdfError(Exception):
    """Base class for all exceptions raised by pypdf."""


class PdfReadError(PyPdfError):
    """Raised when there is an issue reading a PDF file."""


class PageSizeNotDefinedError(PyPdfError):
    """Raised when the page size of a PDF document is not defined."""


class PdfReadWarning(UserWarning):
    """Issued when there is a potential issue reading a PDF file, but it can still be read."""


class PdfStreamError(PdfReadError):
    """Raised when there is an issue reading the stream of data in a PDF file."""


class ParseError(PyPdfError):
    """
    Raised when there is an issue parsing (analyzing and understanding the
    structure and meaning of) a PDF file.
    """


class FileNotDecryptedError(PdfReadError):
    """
    Raised when a PDF file that has been encrypted
    (meaning it requires a password to be accessed) has not been successfully
    decrypted.
    """


class WrongPasswordError(FileNotDecryptedError):
    """Raised when the wrong password is used to try to decrypt an encrypted PDF file."""


class EmptyFileError(PdfReadError):
    """Raised when a PDF file is empty or has no content."""


class EmptyImageDataError(PyPdfError):
    """Raised when trying to process an image that has no data."""


STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"


class OutlineItem(Destination):
    """Destination that serializes itself as a ``<< ... >>`` dictionary of outline keys plus ``/Dest``."""

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write this item to ``stream``.

        The keys ``/Title``, ``/Parent``, ``/First``, ``/Last``, ``/Next``
        and ``/Prev`` are written in that order when present, each followed
        by its raw value; ``/Dest`` with ``self.dest_array`` is always
        written last.

        Args:
            stream: The output stream.
            encryption_key: Deprecated; a non-``None`` value is reported via
                ``deprecation_no_replacement``.
        """
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )

        stream.write(b"<<\n")

        optional_names = ("/Title", "/Parent", "/First", "/Last", "/Next", "/Prev")
        present_keys = [NameObject(name) for name in optional_names if name in self]
        for entry_key in present_keys:
            entry_key.write_to_stream(stream)
            stream.write(b" ")
            self.raw_get(entry_key).write_to_stream(stream)
            stream.write(b"\n")

        dest_key = NameObject("/Dest")
        dest_key.write_to_stream(stream)
        stream.write(b" ")
        self.dest_array.write_to_stream(stream)
        stream.write(b"\n")
        stream.write(b">>")
import ArrayObject 5 | 6 | 7 | class RectangleObject(ArrayObject): 8 | """ 9 | This class is used to represent *page boxes* in pypdf. 10 | 11 | These boxes include: 12 | 13 | * :attr:`artbox <pypdf._page.PageObject.artbox>` 14 | * :attr:`bleedbox <pypdf._page.PageObject.bleedbox>` 15 | * :attr:`cropbox <pypdf._page.PageObject.cropbox>` 16 | * :attr:`mediabox <pypdf._page.PageObject.mediabox>` 17 | * :attr:`trimbox <pypdf._page.PageObject.trimbox>` 18 | """ 19 | 20 | def __init__( 21 | self, arr: Union["RectangleObject", Tuple[float, float, float, float]] 22 | ) -> None: 23 | # must have four points 24 | assert len(arr) == 4 25 | # automatically convert arr[x] into NumberObject(arr[x]) if necessary 26 | ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) # type: ignore 27 | 28 | def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]: 29 | if not isinstance(value, (FloatObject, NumberObject)): 30 | value = FloatObject(value) 31 | return value 32 | 33 | def scale(self, sx: float, sy: float) -> "RectangleObject": 34 | return RectangleObject( 35 | ( 36 | float(self.left) * sx, 37 | float(self.bottom) * sy, 38 | float(self.right) * sx, 39 | float(self.top) * sy, 40 | ) 41 | ) 42 | 43 | def __repr__(self) -> str: 44 | return f"RectangleObject({list(self)!r})" 45 | 46 | @property 47 | def left(self) -> FloatObject: 48 | return self[0] 49 | 50 | @left.setter 51 | def left(self, f: float) -> None: 52 | self[0] = FloatObject(f) 53 | 54 | @property 55 | def bottom(self) -> FloatObject: 56 | return self[1] 57 | 58 | @bottom.setter 59 | def bottom(self, f: float) -> None: 60 | self[1] = FloatObject(f) 61 | 62 | @property 63 | def right(self) -> FloatObject: 64 | return self[2] 65 | 66 | @right.setter 67 | def right(self, f: float) -> None: 68 | self[2] = FloatObject(f) 69 | 70 | @property 71 | def top(self) -> FloatObject: 72 | return self[3] 73 | 74 | @top.setter 75 | def top(self, f: float) -> None: 76 | self[3] = FloatObject(f) 77 | 78 | @property 79 | def lower_left(self) -> Tuple[float, float]: 80 | """ 81 | Property to read and modify the lower left coordinate of this box 82 | in (x,y) form.
83 | """ 84 | return self.left, self.bottom 85 | 86 | @lower_left.setter 87 | def lower_left(self, value: Tuple[float, float]) -> None: 88 | self[0], self[1] = (self._ensure_is_number(x) for x in value) 89 | 90 | @property 91 | def lower_right(self) -> Tuple[float, float]: 92 | """ 93 | Property to read and modify the lower right coordinate of this box 94 | in (x,y) form. 95 | """ 96 | return self.right, self.bottom 97 | 98 | @lower_right.setter 99 | def lower_right(self, value: Tuple[float, float]) -> None: 100 | self[2], self[1] = (self._ensure_is_number(x) for x in value) 101 | 102 | @property 103 | def upper_left(self) -> Tuple[float, float]: 104 | """ 105 | Property to read and modify the upper left coordinate of this box 106 | in (x,y) form. 107 | """ 108 | return self.left, self.top 109 | 110 | @upper_left.setter 111 | def upper_left(self, value: Tuple[float, float]) -> None: 112 | self[0], self[3] = (self._ensure_is_number(x) for x in value) 113 | 114 | @property 115 | def upper_right(self) -> Tuple[float, float]: 116 | """ 117 | Property to read and modify the upper right coordinate of this box 118 | in (x,y) form. 
119 | """ 120 | return self.right, self.top 121 | 122 | @upper_right.setter 123 | def upper_right(self, value: Tuple[float, float]) -> None: 124 | self[2], self[3] = (self._ensure_is_number(x) for x in value) 125 | 126 | @property 127 | def width(self) -> float: 128 | return self.right - self.left 129 | 130 | @property 131 | def height(self) -> float: 132 | return self.top - self.bottom 133 | -------------------------------------------------------------------------------- /pypdf/papersizes.py: -------------------------------------------------------------------------------- 1 | """Helper to get paper sizes.""" 2 | 3 | from typing import NamedTuple 4 | 5 | 6 | class Dimensions(NamedTuple): 7 | width: int 8 | height: int 9 | 10 | 11 | class PaperSize: 12 | """(width, height) of the paper in portrait mode in pixels at 72 ppi.""" 13 | 14 | # Notes of how to calculate it: 15 | # 1. Get the size of the paper in millimeters 16 | # 2. Convert it to inches (25.4 millimeters is equal to 1 inch) 17 | # 3. Convert it to pixels at 72dpi (1 inch is equal to 72 pixels) 18 | 19 | # All Din-A paper sizes follow this pattern: 20 | # 2 x A(n) = A(n - 1) 21 | # So the width of the next bigger one is the height of the smaller one 22 | # The ratio is always approximately 1:2**0.5 23 | # Additionally, A0 is defined to have an area of 1 m**2 24 | # https://en.wikipedia.org/wiki/ISO_216 25 | # Be aware of rounding issues!
26 | A0 = Dimensions(2384, 3370) # 841mm x 1189mm 27 | A1 = Dimensions(1684, 2384) 28 | A2 = Dimensions(1191, 1684) 29 | A3 = Dimensions(842, 1191) 30 | A4 = Dimensions( 31 | 595, 842 32 | ) # Printer paper, documents - this is by far the most common 33 | A5 = Dimensions(420, 595) # Paperback books 34 | A6 = Dimensions(298, 420) # Postcards 35 | A7 = Dimensions(210, 298) 36 | A8 = Dimensions(147, 210) 37 | 38 | # Envelopes 39 | C4 = Dimensions(649, 918) 40 | 41 | 42 | _din_a = ( 43 | PaperSize.A0, 44 | PaperSize.A1, 45 | PaperSize.A2, 46 | PaperSize.A3, 47 | PaperSize.A4, 48 | PaperSize.A5, 49 | PaperSize.A6, 50 | PaperSize.A7, 51 | PaperSize.A8, 52 | ) 53 | -------------------------------------------------------------------------------- /pypdf/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/pypdf/py.typed -------------------------------------------------------------------------------- /pypdf/types.py: -------------------------------------------------------------------------------- 1 | """Helpers for working with PDF types.""" 2 | 3 | import sys 4 | from typing import List, Literal, Union 5 | 6 | if sys.version_info[:2] >= (3, 10): 7 | # Python 3.10+: https://peps.python.org/pep-0613/ 8 | from typing import TypeAlias 9 | else: 10 | from typing_extensions import TypeAlias 11 | 12 | from .generic._base import NameObject, NullObject, NumberObject 13 | from .generic._data_structures import ArrayObject, Destination 14 | from .generic._outline import OutlineItem 15 | 16 | BorderArrayType: TypeAlias = List[Union[NameObject, NumberObject, ArrayObject]] 17 | OutlineItemType: TypeAlias = Union[OutlineItem, Destination] 18 | FitType: TypeAlias = Literal[ 19 | "/XYZ", "/Fit", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV" 20 | ] 21 | # Those go with the FitType: They specify values for the fit 22 | ZoomArgType: TypeAlias =
Union[NumberObject, NullObject, float] 23 | ZoomArgsType: TypeAlias = List[ZoomArgType] 24 | 25 | # Recursive types like the following are not yet supported by mypy: 26 | # OutlineType = List[Union[Destination, "OutlineType"]] 27 | # See https://github.com/python/mypy/issues/731 28 | # Hence use this for the moment: 29 | OutlineType = List[Union[Destination, List[Union[Destination, List[Destination]]]]] 30 | 31 | LayoutType: TypeAlias = Literal[ 32 | "/NoLayout", 33 | "/SinglePage", 34 | "/OneColumn", 35 | "/TwoColumnLeft", 36 | "/TwoColumnRight", 37 | "/TwoPageLeft", 38 | "/TwoPageRight", 39 | ] 40 | PagemodeType: TypeAlias = Literal[ 41 | "/UseNone", 42 | "/UseOutlines", 43 | "/UseThumbs", 44 | "/FullScreen", 45 | "/UseOC", 46 | "/UseAttachments", 47 | ] 48 | AnnotationSubtype: TypeAlias = Literal[ 49 | "/Text", 50 | "/Link", 51 | "/FreeText", 52 | "/Line", 53 | "/Square", 54 | "/Circle", 55 | "/Polygon", 56 | "/PolyLine", 57 | "/Highlight", 58 | "/Underline", 59 | "/Squiggly", 60 | "/StrikeOut", 61 | "/Caret", 62 | "/Stamp", 63 | "/Ink", 64 | "/Popup", 65 | "/FileAttachment", 66 | "/Sound", 67 | "/Movie", 68 | "/Screen", 69 | "/Widget", 70 | "/PrinterMark", 71 | "/TrapNet", 72 | "/Watermark", 73 | "/3D", 74 | "/Redact", 75 | "/Projection", 76 | "/RichMedia", 77 | ] 78 | -------------------------------------------------------------------------------- /requirements/ci-3.11.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in 6 | # 7 | cffi==1.17.1 8 | # via cryptography 9 | coverage[toml]==7.6.4 10 | # via 11 | # -r requirements/ci.in 12 | # pytest-cov 13 | cryptography==44.0.1 14 | # via -r requirements/ci.in 15 | defusedxml==0.7.1 16 | # via fpdf2 17 | exceptiongroup==1.2.2 18 | # via pytest 19 | execnet==2.1.1 20 | # via pytest-xdist 21 | 
fonttools==4.54.1 22 | # via fpdf2 23 | fpdf2==2.8.1 24 | # via -r requirements/ci.in 25 | iniconfig==2.0.0 26 | # via pytest 27 | mypy==1.16.0 28 | # via -r requirements/ci.in 29 | mypy-extensions==1.0.0 30 | # via mypy 31 | packaging==24.1 32 | # via pytest 33 | pillow==11.0.0 34 | # via 35 | # -r requirements/ci.in 36 | # fpdf2 37 | pluggy==1.5.0 38 | # via pytest 39 | py-cpuinfo==9.0.0 40 | # via pytest-benchmark 41 | pycparser==2.22 42 | # via cffi 43 | pytest==8.3.3 44 | # via 45 | # -r requirements/ci.in 46 | # pytest-benchmark 47 | # pytest-cov 48 | # pytest-socket 49 | # pytest-timeout 50 | # pytest-xdist 51 | pytest-benchmark==4.0.0 52 | # via -r requirements/ci.in 53 | pytest-cov==5.0.0 54 | # via -r requirements/ci.in 55 | pytest-socket==0.7.0 56 | # via -r requirements/ci.in 57 | pytest-timeout==2.3.1 58 | # via -r requirements/ci.in 59 | pytest-xdist==3.6.1 60 | # via -r requirements/ci.in 61 | pyyaml==6.0.2 62 | # via -r requirements/ci.in 63 | ruff==0.11.0 64 | # via -r requirements/ci.in 65 | tomli==2.0.2 66 | # via 67 | # coverage 68 | # mypy 69 | # pytest 70 | typeguard==4.3.0 71 | # via -r requirements/ci.in 72 | types-pillow==10.2.0.20240822 73 | # via -r requirements/ci.in 74 | typing-extensions==4.12.2 75 | # via 76 | # mypy 77 | # typeguard 78 | -------------------------------------------------------------------------------- /requirements/ci.in: -------------------------------------------------------------------------------- 1 | coverage 2 | fpdf2 3 | mypy 4 | pillow 5 | cryptography 6 | pytest 7 | pytest-benchmark 8 | pytest-socket 9 | pytest-timeout 10 | pytest-xdist 11 | pytest-cov 12 | # ruff # only take this for 3.11 13 | typeguard 14 | types-Pillow 15 | pyyaml 16 | -------------------------------------------------------------------------------- /requirements/ci.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the 
following command: 4 | # 5 | # pip-compile requirements/ci.in 6 | # 7 | cffi==1.17.1 8 | # via cryptography 9 | coverage[toml]==7.6.1 10 | # via 11 | # -r requirements/ci.in 12 | # pytest-cov 13 | cryptography==44.0.1 14 | # via -r requirements/ci.in 15 | defusedxml==0.7.1 16 | # via fpdf2 17 | exceptiongroup==1.2.2 18 | # via pytest 19 | execnet==2.1.1 20 | # via pytest-xdist 21 | fonttools==4.54.1 22 | # via fpdf2 23 | fpdf2==2.8.1 24 | # via -r requirements/ci.in 25 | importlib-metadata==8.5.0 26 | # via typeguard 27 | iniconfig==2.0.0 28 | # via pytest 29 | mypy==1.13.0 30 | # via -r requirements/ci.in 31 | mypy-extensions==1.0.0 32 | # via mypy 33 | packaging==24.1 34 | # via pytest 35 | pillow==10.4.0 36 | # via 37 | # -r requirements/ci.in 38 | # fpdf2 39 | pluggy==1.5.0 40 | # via pytest 41 | py-cpuinfo==9.0.0 42 | # via pytest-benchmark 43 | pycparser==2.22 44 | # via cffi 45 | pytest==8.3.3 46 | # via 47 | # -r requirements/ci.in 48 | # pytest-benchmark 49 | # pytest-cov 50 | # pytest-socket 51 | # pytest-timeout 52 | # pytest-xdist 53 | pytest-benchmark==4.0.0 54 | # via -r requirements/ci.in 55 | pytest-cov==5.0.0 56 | # via -r requirements/ci.in 57 | pytest-socket==0.7.0 58 | # via -r requirements/ci.in 59 | pytest-timeout==2.3.1 60 | # via -r requirements/ci.in 61 | pytest-xdist==3.6.1 62 | # via -r requirements/ci.in 63 | pyyaml==6.0.2 64 | # via -r requirements/ci.in 65 | tomli==2.0.2 66 | # via 67 | # coverage 68 | # mypy 69 | # pytest 70 | typeguard==4.3.0 71 | # via -r requirements/ci.in 72 | types-pillow==10.2.0.20240822 73 | # via -r requirements/ci.in 74 | typing-extensions==4.12.2 75 | # via 76 | # mypy 77 | # typeguard 78 | zipp==3.20.2 79 | # via importlib-metadata 80 | -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | pillow 2 | pip-tools 3 | pre-commit 4 | pytest-cov 5 | flit 6 | wheel 7 | 
-------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/dev.in 6 | # 7 | build==1.2.2.post1 8 | # via pip-tools 9 | certifi==2024.8.30 10 | # via requests 11 | cfgv==3.4.0 12 | # via pre-commit 13 | charset-normalizer==3.4.0 14 | # via requests 15 | click==8.1.7 16 | # via pip-tools 17 | coverage[toml]==7.6.1 18 | # via pytest-cov 19 | distlib==0.3.9 20 | # via virtualenv 21 | docutils==0.20.1 22 | # via flit 23 | exceptiongroup==1.2.2 24 | # via pytest 25 | filelock==3.16.1 26 | # via virtualenv 27 | flit==3.9.0 28 | # via -r requirements/dev.in 29 | flit-core==3.9.0 30 | # via flit 31 | identify==2.6.1 32 | # via pre-commit 33 | idna==3.10 34 | # via requests 35 | importlib-metadata==8.5.0 36 | # via build 37 | iniconfig==2.0.0 38 | # via pytest 39 | nodeenv==1.9.1 40 | # via pre-commit 41 | packaging==24.1 42 | # via 43 | # build 44 | # pytest 45 | pillow==10.4.0 46 | # via -r requirements/dev.in 47 | pip-tools==7.4.1 48 | # via -r requirements/dev.in 49 | platformdirs==4.3.6 50 | # via virtualenv 51 | pluggy==1.5.0 52 | # via pytest 53 | pre-commit==3.5.0 54 | # via -r requirements/dev.in 55 | pyproject-hooks==1.2.0 56 | # via 57 | # build 58 | # pip-tools 59 | pytest==8.3.3 60 | # via pytest-cov 61 | pytest-cov==5.0.0 62 | # via -r requirements/dev.in 63 | pyyaml==6.0.2 64 | # via pre-commit 65 | requests==2.32.3 66 | # via flit 67 | tomli==2.0.2 68 | # via 69 | # build 70 | # coverage 71 | # pip-tools 72 | # pytest 73 | tomli-w==1.0.0 74 | # via flit 75 | urllib3==2.2.3 76 | # via requests 77 | virtualenv==20.27.0 78 | # via pre-commit 79 | wheel==0.44.0 80 | # via 81 | # -r requirements/dev.in 82 | # pip-tools 83 | zipp==3.20.2 84 | # via importlib-metadata 85 | 86 | # The following packages are 
considered to be unsafe in a requirements file: 87 | # pip 88 | # setuptools 89 | -------------------------------------------------------------------------------- /requirements/docs.in: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | myst_parser 4 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile requirements/docs.in 6 | # 7 | alabaster==1.0.0 8 | # via sphinx 9 | babel==2.16.0 10 | # via sphinx 11 | certifi==2024.8.30 12 | # via requests 13 | charset-normalizer==3.4.0 14 | # via requests 15 | docutils==0.21.2 16 | # via 17 | # myst-parser 18 | # sphinx 19 | # sphinx-rtd-theme 20 | idna==3.10 21 | # via requests 22 | imagesize==1.4.1 23 | # via sphinx 24 | jinja2==3.1.6 25 | # via 26 | # myst-parser 27 | # sphinx 28 | markdown-it-py==3.0.0 29 | # via 30 | # mdit-py-plugins 31 | # myst-parser 32 | markupsafe==3.0.1 33 | # via jinja2 34 | mdit-py-plugins==0.4.2 35 | # via myst-parser 36 | mdurl==0.1.2 37 | # via markdown-it-py 38 | myst-parser==4.0.0 39 | # via -r requirements/docs.in 40 | packaging==24.1 41 | # via sphinx 42 | pygments==2.18.0 43 | # via sphinx 44 | pyyaml==6.0.2 45 | # via myst-parser 46 | requests==2.32.3 47 | # via sphinx 48 | snowballstemmer==2.2.0 49 | # via sphinx 50 | sphinx==8.1.3 51 | # via 52 | # -r requirements/docs.in 53 | # myst-parser 54 | # sphinx-rtd-theme 55 | # sphinxcontrib-jquery 56 | sphinx-rtd-theme==3.0.1 57 | # via -r requirements/docs.in 58 | sphinxcontrib-applehelp==2.0.0 59 | # via sphinx 60 | sphinxcontrib-devhelp==2.0.0 61 | # via sphinx 62 | sphinxcontrib-htmlhelp==2.1.0 63 | # via sphinx 64 | sphinxcontrib-jquery==4.1 65 | # via sphinx-rtd-theme 66 | sphinxcontrib-jsmath==1.0.1 67 | # via 
sphinx 68 | sphinxcontrib-qthelp==2.0.0 69 | # via sphinx 70 | sphinxcontrib-serializinghtml==2.0.0 71 | # via sphinx 72 | tomli==2.0.2 73 | # via sphinx 74 | urllib3==2.2.3 75 | # via requests 76 | -------------------------------------------------------------------------------- /resources/010-pdflatex-forms.txt: -------------------------------------------------------------------------------- 1 | Name 2 | 3 | Check 4 | 5 | Submit 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 1 -------------------------------------------------------------------------------- /resources/AEO.1172.layout.rot180.txt: -------------------------------------------------------------------------------- 1 | 9 1of Page 2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO.Date Printed: 17/Nov/2022 2 | PRODUCT SUMMARY 3 | Fit / Other: 4 | 1172 KNIT SHORTIE Style Desc: 5 | SUMMER-B 2023 Season: 6 | 50 / 170 Division / Dept: 7 | AMERICAN EAGLE OUTFITTERSCompany: 8 | SUMMER-B 2023 1172 KNIT SHORTIE STYLE: 1172 9 | STATUS: FNL 10 | -------------------------------------------------------------------------------- /resources/AEO.1172.layout.txt: -------------------------------------------------------------------------------- 1 | STATUS: FNL 2 | STYLE: 1172 1172 KNIT SHORTIE SUMMER-B 2023 3 | Company: AMERICAN EAGLE OUTFITTERS 4 | Division / Dept: 50 / 170 5 | Season: SUMMER-B 2023 6 | Style Desc: 1172 KNIT SHORTIE 7 | Fit / Other: 8 | PRODUCT SUMMARY 9 | Date Printed: 17/Nov/2022 2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO. 
Page 1of 9 10 | -------------------------------------------------------------------------------- /resources/AutoCad_Diagram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/AutoCad_Diagram.pdf -------------------------------------------------------------------------------- /resources/AutoCad_Simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/AutoCad_Simple.pdf -------------------------------------------------------------------------------- /resources/Epic.Page.layout.txt: -------------------------------------------------------------------------------- 1 | All Postprocedure Notes 2 | Last edited 10/11/23 0919 by Danny Chaung, DO 3 | Date of Service 10/11/23 0918 4 | Status: Signed 5 | Anesthesia Post Evaluation 6 | 7 | Procedure Summary 8 | 9 | Date: 10/11/23 Room / Location: EHMC ENDOSCOPY 10 | Anesthesia Start: 0852 Anesthesia Stop: 0918 11 | Procedure: COLONOSCOPY Diagnosis: Cancer screening 12 | Scheduled Providers: Walter A Klein, MD; Danny Chaung, Responsible Provider: Danny Chaung, DO 13 | DO 14 | Anesthesia Type: general ASA Status: 2 15 | 16 | 17 | Patient location during evaluation: PACU 18 | Post op Vital Signs: stable 19 | 20 | Level of consciousness: awake and alert 21 | Pain management: adequate analgesia 22 | Airway patency: patent 23 | Anesthetic complications: no 24 | Respiratory status: unassisted 25 | Hydration status: continuing 26 | Post-op Complications: No 27 | 28 | 29 | 30 | Assessment: Nausea and Vomiting: absent 31 | 32 | 33 | 34 | 35 | MIPS Measure #404 - Smoking Abstinence 36 | Is the patient a current smoker? 
No (XX404) 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /resources/FormTestFromOo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/FormTestFromOo.pdf -------------------------------------------------------------------------------- /resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf -------------------------------------------------------------------------------- /resources/SF424_page2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/SF424_page2.pdf -------------------------------------------------------------------------------- /resources/Sample_Td-matrix.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.4 2 | %---- 3 | 1 0 obj 4 | << /Pages 3 0 R 5 | /Type /Catalog /Version /1.4 >> 6 | endobj 7 | 2 0 obj 8 | << /CreationDate (D:20220823232400+02'00') /Producer (vim) >> 9 | endobj 10 | 3 0 obj 11 | << /Count 1 /Kids [ 4 0 R ] /Type /Pages >> 12 | endobj 13 | 4 0 obj 14 | << /Contents [ 5 0 R ] /MediaBox [ 0 0 842 595 ] /Parent 3 0 R 15 | /Resources << /Font << /F1 6 0 R >> >> /TrimBox [ 0 0 842 595 ] 16 | /Type /Page >> 17 | endobj 18 | 5 0 obj 19 | << /Length 0575 >> 20 | stream 21 | 0.0 G 22 | 0.0 0.0 1.0 rg 200 100 200 100 re B 23 | 0.2 0.2 1.0 rg 400 100 200 100 re B 24 | 0.4 0.4 1.0 rg 200 200 200 100 re B 25 | 0.6 0.6 1.0 rg 400 200 200 100 re B 26 | 27 | 0.3 0.0 0.0 rg 28 | BT 29 | % Move text to 210 110 via Td-operation 30 | /F1 12 Tf 31 | 210 110 Td 32 | (Hello PDF\041) Tj 33 | 34 | % 
Tm-operation without scale followed by Td 35 | 1 0 0 1 200 0 Tm 36 | 210 110 Td 37 | (Hello PDF 200 0 Td!) Tj 38 | 39 | % Tm-operation with horizontal scale 40 | 2 0 0 1 0 0 Tm 41 | 105 210 Td 42 | (Hello PDF 2 1!) Tj 43 | 44 | % Tm-operation with dual scale 45 | /F1 2.5 Tf 46 | 10 0 0 7 0 0 Tm 47 | 41 30 Td 48 | (Hello PDF 10 7!) Tj 49 | 50 | ET 51 | endstream 52 | endobj 53 | 6 0 obj 54 | << /Type /Font 55 | /Subtype /Type1 56 | /BaseFont /Helvetica-Bold 57 | >> 58 | endobj 59 | xref 60 | 0 7 61 | 0000000000 65535 f 62 | 0000000015 00000 n 63 | 0000000081 00000 n 64 | 0000000158 00000 n 65 | 0000000217 00000 n 66 | 0000000380 00000 n 67 | 0000001006 00000 n 68 | trailer << /Size 7 /Info 2 0 R /Root 1 0 R >> 69 | startxref 70 | 1087 71 | %%EOF 72 | 73 | -------------------------------------------------------------------------------- /resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf -------------------------------------------------------------------------------- /resources/Seige_of_Vicksburg_Sample_OCR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/Seige_of_Vicksburg_Sample_OCR.pdf -------------------------------------------------------------------------------- /resources/attachment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/attachment.pdf -------------------------------------------------------------------------------- /resources/box.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/box.pdf -------------------------------------------------------------------------------- /resources/bytes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/bytes.pdf -------------------------------------------------------------------------------- /resources/commented-xmp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/commented-xmp.pdf -------------------------------------------------------------------------------- /resources/commented.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/commented.pdf -------------------------------------------------------------------------------- /resources/crazyones-encrypted-256.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/crazyones-encrypted-256.pdf -------------------------------------------------------------------------------- /resources/crazyones.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/crazyones.pdf -------------------------------------------------------------------------------- /resources/crazyones.txt: -------------------------------------------------------------------------------- 1 | The Crazy Ones 2 | October 14, 1998 3 | Heres to the crazy ones. The misfits. The rebels. The troublemakers. 4 | The round pegs in the square holes. 
5 | The ones who see things differently. Theyre not fond of rules. And 6 | they have no respect for the status quo. You can quote them, 7 | disagree with them, glorify or vilify them. 8 | About the only thing you cant do is ignore them. Because they change 9 | things. They invent. They imagine. They heal. They explore. They 10 | create. They inspire. They push the human race forward. 11 | Maybe they have to be crazy. 12 | How else can you stare at an empty canvas and see a work of art? Or 13 | sit in silence and hear a song thats never been written? Or gaze at 14 | a red planet and see a laboratory on wheels? 15 | We make tools for these kinds of people. 16 | While some see them as the crazy ones, we see genius. Because the 17 | people who are crazy enough to think they can change the world, 18 | are the ones who do. -------------------------------------------------------------------------------- /resources/crazyones_layout_vertical_space.txt: -------------------------------------------------------------------------------- 1 | The Crazy Ones 2 | October 14, 1998 3 | 4 | Heres to the crazy ones. The misfits. The rebels. The troublemakers. 5 | The round pegs in the square holes. 6 | The ones who see things differently. Theyre not fond of rules. And 7 | they have no respect for the status quo. You can quote them, 8 | disagree with them, glorify or vilify them. 9 | About the only thing you cant do is ignore them. Because they change 10 | things. They invent. They imagine. They heal. They explore. They 11 | create. They inspire. They push the human race forward. 12 | Maybe they have to be crazy. 13 | How else can you stare at an empty canvas and see a work of art? Or 14 | sit in silence and hear a song thats never been written? Or gaze at 15 | a red planet and see a laboratory on wheels? 16 | We make tools for these kinds of people. 17 | While some see them as the crazy ones, we see genius. 
Because the 18 | people who are crazy enough to think they can change the world, 19 | are the ones who do. -------------------------------------------------------------------------------- /resources/crazyones_layout_vertical_space_font_height_weight.txt: -------------------------------------------------------------------------------- 1 | The Crazy Ones 2 | October 14, 1998 3 | 4 | Heres to the crazy ones. The misfits. The rebels. The troublemakers. 5 | The round pegs in the square holes. 6 | 7 | The ones who see things differently. Theyre not fond of rules. And 8 | they have no respect for the status quo. You can quote them, 9 | disagree with them, glorify or vilify them. 10 | 11 | About the only thing you cant do is ignore them. Because they change 12 | things. They invent. They imagine. They heal. They explore. They 13 | create. They inspire. They push the human race forward. 14 | 15 | Maybe they have to be crazy. 16 | 17 | How else can you stare at an empty canvas and see a work of art? Or 18 | sit in silence and hear a song thats never been written? Or gaze at 19 | a red planet and see a laboratory on wheels? 20 | 21 | We make tools for these kinds of people. 22 | 23 | While some see them as the crazy ones, we see genius. Because the 24 | people who are crazy enough to think they can change the world, 25 | are the ones who do. 
-------------------------------------------------------------------------------- /resources/encrypted-file.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encrypted-file.pdf -------------------------------------------------------------------------------- /resources/encrypted_doc_no_id.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encrypted_doc_no_id.pdf -------------------------------------------------------------------------------- /resources/encryption/r2-empty-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-empty-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r2-owner-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-owner-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r2-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r3-empty-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r3-empty-password.pdf 
-------------------------------------------------------------------------------- /resources/encryption/r3-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r3-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r4-aes-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-aes-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r4-owner-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-owner-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r4-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r5-empty-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-empty-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r5-owner-password.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-owner-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r5-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r6-both-passwords.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-both-passwords.pdf -------------------------------------------------------------------------------- /resources/encryption/r6-empty-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-empty-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r6-owner-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-owner-password.pdf -------------------------------------------------------------------------------- /resources/encryption/r6-user-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-user-password.pdf -------------------------------------------------------------------------------- /resources/encryption/unencrypted.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/unencrypted.pdf -------------------------------------------------------------------------------- /resources/form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form.pdf -------------------------------------------------------------------------------- /resources/form_acrobatReader.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form_acrobatReader.pdf -------------------------------------------------------------------------------- /resources/form_evince.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form_evince.pdf -------------------------------------------------------------------------------- /resources/git.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/git.pdf -------------------------------------------------------------------------------- /resources/hello-world.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/hello-world.pdf -------------------------------------------------------------------------------- /resources/imagemagick-ASCII85Decode.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-ASCII85Decode.pdf -------------------------------------------------------------------------------- /resources/imagemagick-CCITTFaxDecode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-CCITTFaxDecode.pdf -------------------------------------------------------------------------------- /resources/imagemagick-images.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-images.pdf -------------------------------------------------------------------------------- /resources/imagemagick-lzw.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-lzw.pdf -------------------------------------------------------------------------------- /resources/indirect-rotation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/indirect-rotation.pdf -------------------------------------------------------------------------------- /resources/inkscape-abc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/inkscape-abc.pdf -------------------------------------------------------------------------------- /resources/issue-297.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.1 2 | 3 | 1 0 obj 4 | << 5 | /Type /Catalog 6 | /Outlines 2 0 R 7 | /Pages 3 0 R 8 | 
/OpenAction 7 0 R 9 | >> 10 | endobj 11 | 12 | 2 0 obj 13 | << 14 | /Type /Outlines 15 | /Count 0 16 | >> 17 | endobj 18 | 19 | 3 0 obj 20 | << 21 | /Type /Pages 22 | /Kids [4 0 R] 23 | /Count 1 24 | >> 25 | endobj 26 | 27 | 4 0 obj 28 | << 29 | /Type /Page 30 | /Parent 3 0 R 31 | /MediaBox [0 0 612 792] 32 | /Contents 5 0 R 33 | /Resources << 34 | /ProcSet [/PDF /Text] 35 | /Font << /F1 6 0 R >> 36 | >> 37 | >> 38 | endobj 39 | 40 | 5 0 obj 41 | << /Length 56 >> 42 | stream 43 | BT /F1 12 Tf 100 700 Td 15 TL (test example) Tj ET 44 | endstream 45 | endobj 46 | 47 | 6 0 obj 48 | << 49 | /Type /Font 50 | /Subtype /Type1 51 | /Name /F1 52 | /BaseFont /Helvetica 53 | /Encoding /MacRomanEncoding 54 | >> 55 | endobj 56 | 57 | 7 0 obj 58 | << 59 | /Type /Action 60 | /S /JavaScript 61 | /JS (app.alert({cMsg: 'Hello alert', cTitle: 'Testing PDF', nIcon: 3});) 62 | >> 63 | endobj 64 | 65 | xref 66 | 0 8 67 | 0000000000 65535 f 68 | 0000000012 00000 n 69 | 0000000109 00000 n 70 | 0000000165 00000 n 71 | 0000000234 00000 n 72 | 0000000439 00000 n 73 | 0000000553 00000 n 74 | 0000000677 00000 n 75 | trailer 76 | << 77 | /Size 8 78 | /Root 1 0 R 79 | >> 80 | startxref 81 | 842 82 | %%EOF -------------------------------------------------------------------------------- /resources/issue-301.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-301.pdf -------------------------------------------------------------------------------- /resources/issue-604.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-604.pdf -------------------------------------------------------------------------------- /resources/issue-914-xmp-data.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-914-xmp-data.pdf -------------------------------------------------------------------------------- /resources/jpeg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/jpeg.pdf -------------------------------------------------------------------------------- /resources/labeled-edges-center-image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/labeled-edges-center-image.pdf -------------------------------------------------------------------------------- /resources/libreoffice-form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/libreoffice-form.pdf -------------------------------------------------------------------------------- /resources/libreoffice-writer-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/libreoffice-writer-password.pdf -------------------------------------------------------------------------------- /resources/lzw_decoder_table_overflow.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/lzw_decoder_table_overflow.bin -------------------------------------------------------------------------------- /resources/metadata.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/metadata.pdf -------------------------------------------------------------------------------- /resources/missing_info.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/missing_info.pdf -------------------------------------------------------------------------------- /resources/multilang.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/multilang.pdf -------------------------------------------------------------------------------- /resources/outline-without-title.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/outline-without-title.pdf -------------------------------------------------------------------------------- /resources/outlines-with-invalid-destinations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/outlines-with-invalid-destinations.pdf -------------------------------------------------------------------------------- /resources/pdflatex-forms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/pdflatex-forms.pdf -------------------------------------------------------------------------------- /resources/pdflatex-outline.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/pdflatex-outline.pdf -------------------------------------------------------------------------------- /resources/reportlab-inline-image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/reportlab-inline-image.pdf -------------------------------------------------------------------------------- /resources/selenium-pypdf-issue-177.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/selenium-pypdf-issue-177.pdf -------------------------------------------------------------------------------- /resources/side-by-side-subfig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/side-by-side-subfig.pdf -------------------------------------------------------------------------------- /resources/test Orient.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/test Orient.pdf -------------------------------------------------------------------------------- /resources/test_watermarking_reportlab_rendering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/test_watermarking_reportlab_rendering.png -------------------------------------------------------------------------------- /resources/toy.layout.txt: -------------------------------------------------------------------------------- 1 | AWAY again1 2 | AWAY again2 3 | 4 | 5 | 
Something[cited] 6 | 7 | Single quote operator 8 | Double quote operator 9 | Last Txt -------------------------------------------------------------------------------- /resources/toy.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.4 2 | 1 0 obj 3 | << /Type /Catalog 4 | /Outlines 2 0 R 5 | /Pages 3 0 R 6 | >> 7 | endobj 8 | 2 0 obj 9 | << /Type /Outlines 10 | /Count 0 11 | >> 12 | endobj 13 | 3 0 obj 14 | << /Type /Pages 15 | /Kids [4 0 R] 16 | /Count 1 17 | >> 18 | endobj 19 | 4 0 obj 20 | << /Type /Page 21 | /Parent 3 0 R 22 | /MediaBox [0 0 612 792] 23 | /Contents 5 0 R 24 | /Resources << /ProcSet 6 0 R 25 | /Font << /F1 7 0 R >> 26 | >> 27 | >> 28 | endobj 29 | 5 0 obj 30 | << /Length 396 >> 31 | stream 32 | q .75000 0 0 .75000 0 792 cm 33 | q .32000 0 0 .32000 0 0 cm 34 | q 35 | q .20812 0 0 .20832 0 0 cm 36 | BT 37 | /F1 200 Tf 38 | 600 -656 Td 39 | [(AWAY again1)] TJ 40 | ET Q 41 | q .20812 0 0 .20832 0 0 cm 42 | BT 43 | /F1 200 Tf 44 | 600 TL 45 | 900 -906 Td 46 | [300 (A) 120 (W) -120 (A) 95 (Y again) (2)] TJ 47 | T* 48 | (Something) Tj 49 | 50 Ts 50 | ([cited]) Tj 51 | 600 -300 TD 52 | 100 Tw 53 | 80 Tz 54 | (Single quote operator) ' 55 | 0 Ts 56 | 200 120 (Double quote operator) " 57 | T* 58 | (Last Txt) Tj 59 | ET Q Q Q Q 60 | endstream 61 | endobj 62 | 6 0 obj 63 | [/PDF /Text] 64 | endobj 65 | 7 0 obj 66 | << /Type /Font 67 | /Subtype /TrueType 68 | /Name /F1 69 | /BaseFont /Arial 70 | /Encoding /WinAnsiEncoding 71 | >> 72 | endobj 73 | xref 74 | 0 8 75 | 0000000000 65535 f 76 | 0000000009 00000 n 77 | 0000000076 00000 n 78 | 0000000120 00000 n 79 | 0000000177 00000 n 80 | 0000000318 00000 n 81 | 0000000765 00000 n 82 | 0000000793 00000 n 83 | trailer 84 | << /Size 8 85 | /Root 1 0 R 86 | >> 87 | startxref 88 | 899 89 | %%EOF -------------------------------------------------------------------------------- /resources/two-different-pages.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/two-different-pages.pdf -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Fixtures that are available automatically for all tests.""" 2 | 3 | import uuid 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | TESTS_ROOT = Path(__file__).parent.resolve() 9 | PROJECT_ROOT = TESTS_ROOT.parent 10 | RESOURCE_ROOT = PROJECT_ROOT / "resources" 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def pdf_file_path(tmp_path_factory): 15 | return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.pdf" 16 | 17 | 18 | @pytest.fixture(scope="session") 19 | def txt_file_path(tmp_path_factory): 20 | return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.txt" 21 | -------------------------------------------------------------------------------- /tests/generic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/tests/generic/__init__.py -------------------------------------------------------------------------------- /tests/generic/test_image_inline.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf.generic._image_inline module.""" 2 | from io import BytesIO 3 | 4 | from pypdf.generic._image_inline import is_followed_by_binary_data 5 | 6 | 7 | def test_is_followed_by_binary_data(): 8 | # Empty/too short stream. 9 | stream = BytesIO() 10 | assert not is_followed_by_binary_data(stream) 11 | 12 | stream = BytesIO(b" q\n") 13 | assert not is_followed_by_binary_data(stream) 14 | 15 | # byte < 32 and no whitespace. 
16 | stream = BytesIO(b"\x00\x11\x13\x37") 17 | assert is_followed_by_binary_data(stream) 18 | assert stream.read(1) == b"\x00" 19 | assert is_followed_by_binary_data(stream) 20 | assert stream.read(1) == b"\x11" 21 | assert is_followed_by_binary_data(stream) 22 | assert stream.read() == b"\x13\x37" 23 | 24 | # byte < 32, but whitespace. 25 | stream = BytesIO(b" q\n") 26 | assert not is_followed_by_binary_data(stream) 27 | 28 | # Whitespace only. 29 | stream = BytesIO(b" \n\n\n \n") 30 | assert not is_followed_by_binary_data(stream) 31 | 32 | # No `operator_end`. 33 | stream = BytesIO(b"\n\n\n\n\n\n\n\nBT\n") 34 | assert not is_followed_by_binary_data(stream) 35 | 36 | # Operator length is <= 3. 37 | stream = BytesIO(b"\n\n\n\n\n\n\nBT\n") 38 | assert not is_followed_by_binary_data(stream) 39 | 40 | # Operator length is > 3. 41 | stream = BytesIO(b"\n\n\n\n\nTEST\n") 42 | assert is_followed_by_binary_data(stream) 43 | 44 | # Just characters. 45 | stream = BytesIO(b" ABCDEF") 46 | assert is_followed_by_binary_data(stream) 47 | 48 | # No `operator_start`. 49 | stream = BytesIO(b"ABCDEFG") 50 | assert is_followed_by_binary_data(stream) 51 | 52 | # Name object. 53 | stream = BytesIO(b"/R10 gs\n/R12 cs\n") 54 | assert not is_followed_by_binary_data(stream) 55 | 56 | # Numbers. 
57 | stream = BytesIO(b"1337 42 m\n") 58 | assert not is_followed_by_binary_data(stream) 59 | 60 | stream = BytesIO(b"1234.56 42 13 37 10 20 c\n") 61 | assert not is_followed_by_binary_data(stream) 62 | -------------------------------------------------------------------------------- /tests/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/tests/scripts/__init__.py -------------------------------------------------------------------------------- /tests/test_constants.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf.constants module.""" 2 | import re 3 | from typing import Callable 4 | 5 | import pytest 6 | 7 | from pypdf.constants import PDF_KEYS, GraphicsStateParameters, UserAccessPermissions 8 | 9 | 10 | def test_slash_prefix(): 11 | """ 12 | Naming conventions of PDF_KEYS (constant names) are followed. 
13 | 14 | This test function validates if PDF key names follow the required pattern: 15 | - Starts with a slash "/" 16 | - Followed by an uppercase letter 17 | - Contains alphanumeric characters (letters and digits) 18 | - The attribute name should be a case-insensitive match, with underscores removed 19 | """ 20 | pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$") 21 | for cls in PDF_KEYS: 22 | for attr in dir(cls): 23 | # Skip magic methods 24 | if attr.startswith("__") and attr.endswith("__"): 25 | continue 26 | 27 | # Skip methods 28 | constant_value = getattr(cls, attr) 29 | if isinstance(constant_value, Callable): 30 | continue 31 | 32 | assert constant_value.startswith("/") 33 | assert attr.replace("_", "").casefold() == constant_value[1:].casefold() 34 | 35 | # There are a few exceptions that may be lowercase 36 | if cls == GraphicsStateParameters and attr in ["ca", "op"]: 37 | continue 38 | 39 | assert pattern.match(constant_value) 40 | 41 | 42 | def test_user_access_permissions__dict_handling(): 43 | # Value is mix of configurable and reserved bits. 44 | # Reserved bits should not be part of the dictionary. 45 | as_dict = UserAccessPermissions(512 + 64 + 8).to_dict() 46 | assert as_dict == { 47 | "add_or_modify": False, 48 | "assemble_doc": False, 49 | "extract": False, 50 | "extract_text_and_graphics": True, 51 | "fill_form_fields": False, 52 | "modify": True, 53 | "print": False, 54 | "print_to_representation": False, 55 | } 56 | 57 | # Convert the dictionary back to an integer. 58 | # This should add the reserved bits automatically. 59 | permissions = UserAccessPermissions.from_dict(as_dict) 60 | assert permissions == 4294963912 61 | 62 | # Roundtrip for valid dictionary. 
63 | data = { 64 | "add_or_modify": True, 65 | "assemble_doc": False, 66 | "extract": False, 67 | "extract_text_and_graphics": True, 68 | "fill_form_fields": False, 69 | "modify": True, 70 | "print": False, 71 | "print_to_representation": True, 72 | } 73 | assert UserAccessPermissions.from_dict(data).to_dict() == data 74 | 75 | # Empty inputs. 76 | assert UserAccessPermissions.from_dict({}) == 4294963392 # Reserved bits. 77 | assert UserAccessPermissions(0).to_dict() == { 78 | "add_or_modify": False, 79 | "assemble_doc": False, 80 | "extract": False, 81 | "extract_text_and_graphics": False, 82 | "fill_form_fields": False, 83 | "modify": False, 84 | "print": False, 85 | "print_to_representation": False, 86 | } 87 | 88 | # Unknown dictionary keys. 89 | data = { 90 | "add_or_modify": True, 91 | "key1": False, 92 | "key2": True, 93 | } 94 | unknown = { 95 | "key1": False, 96 | "key2": True, 97 | } 98 | with pytest.raises( 99 | ValueError, 100 | match=f"Unknown dictionary keys: {unknown!r}" 101 | ): 102 | UserAccessPermissions.from_dict(data) 103 | 104 | 105 | def test_user_access_permissions__all(): 106 | all_permissions = UserAccessPermissions.all() 107 | all_int = int(all_permissions) 108 | all_string = bin(all_permissions) 109 | 110 | assert all_string.startswith("0b") 111 | assert len(all_string[2:]) == 32 # 32-bit integer 112 | 113 | assert all_int & UserAccessPermissions.R1 == 0 114 | assert all_int & UserAccessPermissions.R2 == 0 115 | assert all_int & UserAccessPermissions.PRINT == UserAccessPermissions.PRINT 116 | assert all_int & UserAccessPermissions.R7 == UserAccessPermissions.R7 117 | assert all_int & UserAccessPermissions.R31 == UserAccessPermissions.R31 118 | -------------------------------------------------------------------------------- /tests/test_forms.py: -------------------------------------------------------------------------------- 1 | """Test form-related functionality. 
Separate file to keep overview.""" 2 | 3 | from io import BytesIO 4 | 5 | import pytest 6 | 7 | from pypdf import PdfReader, PdfWriter 8 | from tests import get_data_from_url 9 | 10 | 11 | @pytest.mark.enable_socket 12 | def test_form_button__v_value_should_be_name_object(): 13 | url = "https://github.com/user-attachments/files/18736500/blank-form.pdf" 14 | name = "issue3115.pdf" 15 | reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) 16 | writer = PdfWriter(clone_from=reader) 17 | writer.update_page_form_field_values( 18 | writer.pages[0], 19 | {"Other": "/On"}, 20 | auto_regenerate=False, 21 | ) 22 | stream = BytesIO() 23 | writer.write(stream) 24 | 25 | # Wrong: `/V (/On)`. 26 | assert b"\n/V /On\n" in stream.getvalue() 27 | -------------------------------------------------------------------------------- /tests/test_javascript.py: -------------------------------------------------------------------------------- 1 | """Test topics around the usage of JavaScript in PDF documents.""" 2 | from pathlib import Path 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | from pypdf import PdfReader, PdfWriter 8 | 9 | # Configure path environment 10 | TESTS_ROOT = Path(__file__).parent.resolve() 11 | PROJECT_ROOT = TESTS_ROOT.parent 12 | RESOURCE_ROOT = PROJECT_ROOT / "resources" 13 | 14 | 15 | @pytest.fixture 16 | def pdf_file_writer(): 17 | reader = PdfReader(RESOURCE_ROOT / "issue-604.pdf") 18 | writer = PdfWriter() 19 | writer.append_pages_from_reader(reader) 20 | return writer 21 | 22 | 23 | def test_add_js(pdf_file_writer): 24 | pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 25 | 26 | assert ( 27 | "/Names" in pdf_file_writer._root_object 28 | ), "add_js should add a name catalog in the root object." 29 | assert ( 30 | "/JavaScript" in pdf_file_writer._root_object["/Names"] 31 | ), "add_js should add a JavaScript name tree under the name catalog." 
32 | 33 | 34 | def test_added_js(pdf_file_writer): 35 | def get_javascript_name() -> Any: 36 | assert "/Names" in pdf_file_writer._root_object 37 | assert "/JavaScript" in pdf_file_writer._root_object["/Names"] 38 | assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"] 39 | return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][ 40 | -2 41 | ] # return -2 in order to get the latest javascript 42 | 43 | pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 44 | first_js = get_javascript_name() 45 | 46 | pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 47 | second_js = get_javascript_name() 48 | 49 | assert ( 50 | first_js != second_js 51 | ), "add_js should add to the previous script in the catalog." 52 | -------------------------------------------------------------------------------- /tests/test_pagerange.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf.pagerange module.""" 2 | import pytest 3 | 4 | from pypdf.pagerange import PageRange, ParseError, parse_filename_page_ranges 5 | 6 | 7 | def test_equality(): 8 | pr1 = PageRange(slice(0, 5)) 9 | pr2 = PageRange(slice(0, 5)) 10 | assert pr1 == pr2 11 | 12 | 13 | @pytest.mark.parametrize( 14 | ("page_range", "expected"), 15 | [(slice(0, 5), "0:5"), (slice(0, 5, 2), "0:5:2"), ("-1", "-1:"), ("0", "0")], 16 | ) 17 | def test_str(page_range, expected): 18 | assert str(PageRange(page_range)) == expected 19 | 20 | 21 | @pytest.mark.parametrize( 22 | ("page_range", "expected"), 23 | [(slice(0, 5), "PageRange('0:5')"), (slice(0, 5, 2), "PageRange('0:5:2')")], 24 | ) 25 | def test_repr(page_range, expected): 26 | assert repr(PageRange(page_range)) == expected 27 | 28 | 29 | def test_equality_other_objectc(): 30 | pr1 = PageRange(slice(0, 5)) 31 | pr2 = "PageRange(slice(0, 5))" 32 | assert pr1 != pr2 33 | 34 | 35 | def test_idempotency(): 36 | pr = PageRange(slice(0, 5)) 37 | 
pr2 = PageRange(pr) 38 | assert pr == pr2 39 | 40 | 41 | @pytest.mark.parametrize( 42 | ("range_str", "expected"), 43 | [ 44 | ("42", slice(42, 43)), 45 | ("1:2", slice(1, 2)), 46 | ], 47 | ) 48 | def test_str_init(range_str, expected): 49 | pr = PageRange(range_str) 50 | assert pr._slice == expected 51 | assert PageRange.valid 52 | 53 | 54 | def test_str_init_error(): 55 | init_str = "1-2" 56 | assert PageRange.valid(init_str) is False 57 | with pytest.raises(ParseError) as exc: 58 | PageRange(init_str) 59 | assert exc.value.args[0] == "1-2" 60 | 61 | 62 | @pytest.mark.parametrize( 63 | ("params", "expected"), 64 | [ 65 | (["foo.pdf", "1:5"], [("foo.pdf", PageRange("1:5"))]), 66 | ( 67 | ["foo.pdf", "1:5", "bar.pdf"], 68 | [("foo.pdf", PageRange("1:5")), ("bar.pdf", PageRange(":"))], 69 | ), 70 | ], 71 | ) 72 | def test_parse_filename_page_ranges(params, expected): 73 | assert parse_filename_page_ranges(params) == expected 74 | 75 | 76 | def test_parse_filename_page_ranges_err(): 77 | with pytest.raises(ValueError) as exc: 78 | parse_filename_page_ranges(["1:5", "foo.pdf"]) 79 | assert ( 80 | exc.value.args[0] == "The first argument must be a filename, not a page range." 
81 | ) 82 | 83 | 84 | @pytest.mark.parametrize( 85 | ("a", "b", "expected"), 86 | [ 87 | (PageRange(slice(0, 5)), PageRange(slice(2, 10)), slice(0, 10)), 88 | (PageRange(slice(0, 5)), PageRange(slice(2, 3)), slice(0, 5)), 89 | (PageRange(slice(0, 5)), PageRange(slice(5, 10)), slice(0, 10)), 90 | ], 91 | ) 92 | def test_addition(a, b, expected): 93 | pr1 = PageRange(a) 94 | pr2 = PageRange(b) 95 | assert pr1 + pr2 == PageRange(expected) 96 | assert pr2 + pr1 == PageRange(expected) # addition is commutative 97 | 98 | 99 | @pytest.mark.parametrize( 100 | ("a", "b"), 101 | [ 102 | (PageRange(slice(0, 5)), PageRange(slice(7, 10))), 103 | (PageRange(slice(7, 10)), PageRange(slice(0, 5))), 104 | ], 105 | ) 106 | def test_addition_gap(a: PageRange, b: PageRange): 107 | with pytest.raises(ValueError) as exc: 108 | a + b 109 | assert exc.value.args[0] == "Can't add PageRanges with gap" 110 | 111 | 112 | def test_addition_non_page_range(): 113 | with pytest.raises(TypeError) as exc: 114 | PageRange(slice(0, 5)) + "2:7" 115 | assert exc.value.args[0] == "Can't add PageRange and <class 'str'>" 116 | 117 | 118 | def test_addition_stride(): 119 | a = PageRange(slice(0, 5, 2)) 120 | b = PageRange(slice(7, 9)) 121 | with pytest.raises(ValueError) as exc: 122 | a + b 123 | assert exc.value.args[0] == "Can't add PageRange with stride" 124 | -------------------------------------------------------------------------------- /tests/test_papersizes.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf.papersizes module.""" 2 | import pytest 3 | 4 | from pypdf import papersizes 5 | 6 | 7 | def test_din_a0_paper_size(): 8 | """The dimensions and area of the DIN A0 paper size are correct.""" 9 | dim = papersizes.PaperSize.A0 10 | area_square_pixels = float(dim.width) * dim.height 11 | 12 | # 72 pixels is 1 inch 13 | area_square_inch = area_square_pixels / 72**2 14 | 15 | # 25.4 millimeters are equal to 1 inch 16 | area_square_mm = area_square_inch *
(25.4) ** 2 17 | assert abs(area_square_mm - 999949) < 100 18 | conversion_factor = 72 / 25.4 19 | assert (dim.width - 841 * conversion_factor) < 1 20 | assert (dim.width - 1189 * conversion_factor) < 1 21 | 22 | 23 | @pytest.mark.parametrize("dimensions", papersizes._din_a) 24 | def test_din_a_aspect_ratio(dimensions): 25 | """The aspect ratio of DIN A paper sizes is correct.""" 26 | assert abs(dimensions.height - dimensions.width * 2**0.5) <= 2.5 27 | 28 | 29 | @pytest.mark.parametrize( 30 | ("dimensions_a", "dimensions_b"), 31 | list(zip(papersizes._din_a, papersizes._din_a[1:])), 32 | ) 33 | def test_din_a_size_doubling(dimensions_a, dimensions_b): 34 | """The height of a DIN A paper size is twice the width of the next size in the series.""" 35 | assert abs(dimensions_a.height - 2 * dimensions_b.width) <= 4 36 | -------------------------------------------------------------------------------- /tests/test_pdfa.py: -------------------------------------------------------------------------------- 1 | """Ensure that pypdf doesn't break PDF/A compliance.""" 2 | 3 | from io import BytesIO 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import pytest 8 | 9 | from pypdf import PdfReader, PdfWriter 10 | 11 | TESTS_ROOT = Path(__file__).parent.resolve() 12 | PROJECT_ROOT = TESTS_ROOT.parent 13 | RESOURCE_ROOT = PROJECT_ROOT / "resources" 14 | SAMPLE_ROOT = PROJECT_ROOT / "sample-files" 15 | 16 | 17 | def is_pdfa1b_compliant(src: BytesIO): 18 | """Check if a PDF is PDF/A-1b compliant.""" 19 | 20 | def document_information_has_analoguos_xml(src: BytesIO) -> bool: 21 | reader = PdfReader(src) 22 | meta = reader.metadata 23 | xmp = reader.xmp_metadata 24 | if not meta: 25 | return True 26 | if not xmp: 27 | return False 28 | if meta.title and not xmp.dc_title: 29 | return meta.title == xmp.dc_title 30 | return True 31 | 32 | return document_information_has_analoguos_xml(src) 33 | 34 | 35 | @pytest.mark.samples 36 | @pytest.mark.parametrize( 37 | ("src",
"diagnostic_write_name"), 38 | [ 39 | (SAMPLE_ROOT / "021-pdfa/crazyones-pdfa.pdf", None), 40 | ], 41 | ) 42 | def test_pdfa(src: Path, diagnostic_write_name: Optional[str]): 43 | with open(src, "rb") as fp: 44 | data = BytesIO(fp.read()) 45 | reader = PdfReader(src) 46 | assert is_pdfa1b_compliant(data) 47 | writer = PdfWriter() 48 | writer.clone_document_from_reader(reader) 49 | 50 | stream = BytesIO() 51 | writer.write(stream) 52 | stream.seek(0) 53 | 54 | assert is_pdfa1b_compliant(stream) 55 | if diagnostic_write_name: 56 | with open(diagnostic_write_name, "wb") as fp: 57 | stream.seek(0) 58 | fp.write(stream.read()) 59 | -------------------------------------------------------------------------------- /tests/test_protocols.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf._protocols module.""" 2 | from pypdf._protocols import PdfObjectProtocol 3 | 4 | 5 | class IPdfObjectProtocol(PdfObjectProtocol): 6 | pass 7 | 8 | 9 | def test_pdfobjectprotocol(): 10 | o = IPdfObjectProtocol() 11 | assert o.clone(None, False, None) is None 12 | assert o._reference_clone(None, None) is None 13 | assert o.get_object() is None 14 | assert o.hash_value() is None 15 | assert o.write_to_stream(None) is None 16 | -------------------------------------------------------------------------------- /tests/test_xobject_image_helpers.py: -------------------------------------------------------------------------------- 1 | """Test the pypdf._xobj_image_helpers module.""" 2 | from io import BytesIO 3 | 4 | import pytest 5 | 6 | from pypdf import PdfReader 7 | from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate 8 | from pypdf.errors import EmptyImageDataError, PdfReadError 9 | from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject 10 | 11 | from . 
import get_data_from_url 12 | 13 | 14 | @pytest.mark.enable_socket 15 | def test_get_imagemode_recursion_depth(): 16 | """Avoid infinite recursion for nested color spaces.""" 17 | url = "https://github.com/py-pdf/pypdf/files/12814018/out1.pdf" 18 | name = "issue2240.pdf" 19 | # Simple example: Just let the color space object reference itself. 20 | # The alternative would be to generate a chain of referencing objects. 21 | content = get_data_from_url(url, name=name) 22 | source = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 7 0 R 11 0 R 12 0 R ]\nendobj\n" 23 | target = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 10 0 R 11 0 R 12 0 R ]\nendobj\n" 24 | reader = PdfReader(BytesIO(content.replace(source, target))) 25 | with pytest.raises( 26 | PdfReadError, 27 | match="Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH.", 28 | ): 29 | reader.pages[0].images[0] 30 | 31 | 32 | def test_handle_flate__image_mode_1(caplog): 33 | data = b"\x00\xe0\x00" 34 | lookup = DecodedStreamObject() 35 | expected_data = [ 36 | (66, 66, 66), 37 | (66, 66, 66), 38 | (66, 66, 66), 39 | (0, 19, 55), 40 | (0, 19, 55), 41 | (0, 19, 55), 42 | (66, 66, 66), 43 | (66, 66, 66), 44 | (66, 66, 66), 45 | ] 46 | 47 | # No trailing data. 48 | lookup.set_data(b"\x42\x42\x42\x00\x13\x37") 49 | result = _handle_flate( 50 | size=(3, 3), 51 | data=data, 52 | mode="1", 53 | color_space=ArrayObject( 54 | [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup] 55 | ), 56 | colors=2, 57 | obj_as_text="dummy", 58 | ) 59 | assert expected_data == list(result[0].getdata()) 60 | assert not caplog.text 61 | 62 | # Trailing whitespace. 
63 | lookup.set_data(b"\x42\x42\x42\x00\x13\x37 \x0a") 64 | result = _handle_flate( 65 | size=(3, 3), 66 | data=data, 67 | mode="1", 68 | color_space=ArrayObject( 69 | [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup] 70 | ), 71 | colors=2, 72 | obj_as_text="dummy", 73 | ) 74 | assert expected_data == list(result[0].getdata()) 75 | assert not caplog.text 76 | 77 | # Trailing non-whitespace character. 78 | lookup.set_data(b"\x42\x42\x42\x00\x13\x37\x12") 79 | result = _handle_flate( 80 | size=(3, 3), 81 | data=data, 82 | mode="1", 83 | color_space=ArrayObject( 84 | [ 85 | NameObject("/Indexed"), 86 | NameObject("/DeviceRGB"), 87 | NumberObject(1), 88 | lookup, 89 | ] 90 | ), 91 | colors=2, 92 | obj_as_text="dummy", 93 | ) 94 | assert expected_data == list(result[0].getdata()) 95 | assert "Too many lookup values: Expected 6, got 7." in caplog.text 96 | 97 | # Not enough lookup data. 98 | # `\xe0` of the original input (the middle part) does not use `0x37 = 55` for the lookup 99 | # here, but received a custom padding of `0`. 100 | lookup.set_data(b"\x42\x42\x42\x00\x13") 101 | caplog.clear() 102 | expected_short_data = [entry if entry[0] == 66 else (0, 19, 0) for entry in expected_data] 103 | result = _handle_flate( 104 | size=(3, 3), 105 | data=data, 106 | mode="1", 107 | color_space=ArrayObject( 108 | [ 109 | NameObject("/Indexed"), 110 | NameObject("/DeviceRGB"), 111 | NumberObject(1), 112 | lookup, 113 | ] 114 | ), 115 | colors=2, 116 | obj_as_text="dummy", 117 | ) 118 | assert expected_short_data == list(result[0].getdata()) 119 | assert "Not enough lookup values: Expected 6, got 5." 
in caplog.text 120 | 121 | 122 | def test_extended_image_frombytes_zero_data(): 123 | mode = "RGB" 124 | size = (1, 1) 125 | data = b"" 126 | 127 | with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."): 128 | _extended_image_frombytes(mode, size, data) 129 | --------------------------------------------------------------------------------