├── .git-blame-ignore-revs
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug-report.md
    │   └── feature-request.md
    ├── SECURITY.md
    ├── dependabot.yml
    ├── scripts
    │   └── check_pr_title.py
    └── workflows
    │   ├── benchmark.yaml
    │   ├── create-github-release.yaml
    │   ├── github-ci.yaml
    │   ├── publish-to-pypi.yaml
    │   ├── release.yaml
    │   └── title-check.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
    ├── Makefile
    ├── _static
    │   ├── logo.png
    │   ├── releasing.drawio
    │   └── releasing.drawio.png
    ├── conf.py
    ├── dev
    │   ├── PR_Header_example.png
    │   ├── cmaps.md
    │   ├── deprecations.md
    │   ├── documentation.md
    │   ├── intro.md
    │   ├── pdf-format.md
    │   ├── pypdf-parsing.md
    │   ├── pypdf-writing.md
    │   ├── releasing.md
    │   └── testing.md
    ├── index.rst
    ├── make.bat
    ├── meta
    │   ├── changelog-v1.md
    │   ├── comparisons.md
    │   ├── faq.md
    │   ├── history.md
    │   ├── project-governance.md
    │   ├── scope-of-pypdf.md
    │   └── taking-ownership.md
    ├── modules
    │   ├── Destination.rst
    │   ├── DocumentInformation.rst
    │   ├── Field.rst
    │   ├── Fit.rst
    │   ├── PageObject.rst
    │   ├── PageRange.rst
    │   ├── PaperSize.rst
    │   ├── PdfDocCommon.rst
    │   ├── PdfReader.rst
    │   ├── PdfWriter.rst
    │   ├── RectangleObject.rst
    │   ├── Transformation.rst
    │   ├── XmpInformation.rst
    │   ├── annotations.rst
    │   ├── constants.rst
    │   ├── errors.rst
    │   └── generic.rst
    └── user
    │   ├── add-javascript.md
    │   ├── add-watermark.md
    │   ├── adding-pdf-annotations.md
    │   ├── annotation-circle.png
    │   ├── annotation-highlight.png
    │   ├── annotation-line.png
    │   ├── annotation-polygon.png
    │   ├── annotation-polyline.png
    │   ├── annotation-popup.png
    │   ├── annotation-square.png
    │   ├── cropping-and-transforming.md
    │   ├── encryption-decryption.md
    │   ├── error-hierarchy.png
    │   ├── extract-attachments.md
    │   ├── extract-images.md
    │   ├── extract-text.md
    │   ├── file-size.md
    │   ├── forms.md
    │   ├── free-text-annotation.png
    │   ├── installation.md
    │   ├── merge-45-deg-rot.png
    │   ├── merge-rotate-expand.png
    │   ├── merge-translated.png
    │   ├── merging-pdfs.md
    │   ├── metadata.md
    │   ├── migration-1-to-2.md
    │   ├── nup-dest1.png
    │   ├── nup-dest2.png
    │   ├── nup-source.png
    │   ├── page-stamped.png
    │   ├── page.png
    │   ├── pdf-version-support.md
    │   ├── pdfa-compliance.md
    │   ├── plain-merge.png
    │   ├── post-processing-in-text-extraction.md
    │   ├── reading-pdf-annotations.md
    │   ├── robustness.md
    │   ├── scaling.png
    │   ├── stamp.png
    │   ├── streaming-data.md
    │   ├── suppress-warnings.md
    │   ├── text-annotation.png
    │   ├── viewer-preferences.md
    │   └── watermark.png
├── make_release.py
├── pypdf
    ├── __init__.py
    ├── _cmap.py
    ├── _codecs
    │   ├── __init__.py
    │   ├── _codecs.py
    │   ├── adobe_glyphs.py
    │   ├── pdfdoc.py
    │   ├── std.py
    │   ├── symbol.py
    │   └── zapfding.py
    ├── _crypt_providers
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── _cryptography.py
    │   ├── _fallback.py
    │   └── _pycryptodome.py
    ├── _doc_common.py
    ├── _encryption.py
    ├── _merger.py
    ├── _page.py
    ├── _page_labels.py
    ├── _protocols.py
    ├── _reader.py
    ├── _text_extraction
    │   ├── __init__.py
    │   └── _layout_mode
    │   │   ├── __init__.py
    │   │   ├── _fixed_width_page.py
    │   │   ├── _font.py
    │   │   ├── _font_widths.py
    │   │   ├── _text_state_manager.py
    │   │   └── _text_state_params.py
    ├── _utils.py
    ├── _version.py
    ├── _writer.py
    ├── _xobj_image_helpers.py
    ├── annotations
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── _markup_annotations.py
    │   └── _non_markup_annotations.py
    ├── constants.py
    ├── errors.py
    ├── filters.py
    ├── generic
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── _data_structures.py
    │   ├── _files.py
    │   ├── _fit.py
    │   ├── _image_inline.py
    │   ├── _outline.py
    │   ├── _rectangle.py
    │   ├── _utils.py
    │   └── _viewerpref.py
    ├── pagerange.py
    ├── papersizes.py
    ├── py.typed
    ├── types.py
    └── xmp.py
├── pyproject.toml
├── requirements
    ├── ci-3.11.txt
    ├── ci.in
    ├── ci.txt
    ├── dev.in
    ├── dev.txt
    ├── docs.in
    └── docs.txt
├── resources
    ├── 010-pdflatex-forms.txt
    ├── AEO.1172.layout.rot180.txt
    ├── AEO.1172.layout.txt
    ├── AutoCad_Diagram.pdf
    ├── AutoCad_Simple.pdf
    ├── Claim Maker Alerts Guide_pg2.layout.txt
    ├── Epic.Page.layout.txt
    ├── FormTestFromOo.pdf
    ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf
    ├── SF424_page2.pdf
    ├── Sample_Td-matrix.pdf
    ├── Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf
    ├── Seige_of_Vicksburg_Sample_OCR.pdf
    ├── attachment.pdf
    ├── box.pdf
    ├── bytes.pdf
    ├── commented-xmp.pdf
    ├── commented.pdf
    ├── crazyones-encrypted-256.pdf
    ├── crazyones.pdf
    ├── crazyones.txt
    ├── crazyones_layout_vertical_space.txt
    ├── crazyones_layout_vertical_space_font_height_weight.txt
    ├── encrypted-file.pdf
    ├── encrypted_doc_no_id.pdf
    ├── encryption
    │   ├── r2-empty-password.pdf
    │   ├── r2-owner-password.pdf
    │   ├── r2-user-password.pdf
    │   ├── r3-empty-password.pdf
    │   ├── r3-user-password.pdf
    │   ├── r4-aes-user-password.pdf
    │   ├── r4-owner-password.pdf
    │   ├── r4-user-password.pdf
    │   ├── r5-empty-password.pdf
    │   ├── r5-owner-password.pdf
    │   ├── r5-user-password.pdf
    │   ├── r6-both-passwords.pdf
    │   ├── r6-empty-password.pdf
    │   ├── r6-owner-password.pdf
    │   ├── r6-user-password.pdf
    │   └── unencrypted.pdf
    ├── form.pdf
    ├── form_acrobatReader.pdf
    ├── form_evince.pdf
    ├── git.pdf
    ├── hello-world.pdf
    ├── imagemagick-ASCII85Decode.pdf
    ├── imagemagick-CCITTFaxDecode.pdf
    ├── imagemagick-images.pdf
    ├── imagemagick-lzw.pdf
    ├── indirect-rotation.pdf
    ├── inkscape-abc.pdf
    ├── issue-297.pdf
    ├── issue-301.pdf
    ├── issue-604.pdf
    ├── issue-914-xmp-data.pdf
    ├── jpeg.pdf
    ├── jpeg.txt
    ├── labeled-edges-center-image.pdf
    ├── libreoffice-form.pdf
    ├── libreoffice-writer-password.pdf
    ├── lzw_decoder_table_overflow.bin
    ├── metadata.pdf
    ├── missing_info.pdf
    ├── multicolumn-lorem-ipsum.txt
    ├── multilang.pdf
    ├── outline-without-title.pdf
    ├── outlines-with-invalid-destinations.pdf
    ├── pdflatex-forms.pdf
    ├── pdflatex-outline.pdf
    ├── reportlab-inline-image.pdf
    ├── selenium-pypdf-issue-177.pdf
    ├── side-by-side-subfig.pdf
    ├── test Orient.pdf
    ├── test_watermarking_reportlab_rendering.png
    ├── toy.layout.txt
    ├── toy.pdf
    └── two-different-pages.pdf
└── tests
    ├── __init__.py
    ├── bench.py
    ├── conftest.py
    ├── example_files.yaml
    ├── generic
        ├── __init__.py
        ├── test_files.py
        └── test_image_inline.py
    ├── scripts
        ├── __init__.py
        ├── data
        │   └── commits__version_4_0_1.json
        └── test_make_release.py
    ├── test_annotations.py
    ├── test_cmap.py
    ├── test_codecs.py
    ├── test_constants.py
    ├── test_doc_common.py
    ├── test_encryption.py
    ├── test_filters.py
    ├── test_forms.py
    ├── test_generic.py
    ├── test_images.py
    ├── test_javascript.py
    ├── test_merger.py
    ├── test_page.py
    ├── test_page_labels.py
    ├── test_pagerange.py
    ├── test_papersizes.py
    ├── test_pdfa.py
    ├── test_protocols.py
    ├── test_reader.py
    ├── test_text_extraction.py
    ├── test_utils.py
    ├── test_workflows.py
    ├── test_writer.py
    ├── test_xmp.py
    └── test_xobject_image_helpers.py


/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
 1 | # This file helps us to ignore style / formatting / doc changes
 2 | # in git blame. That is useful when we're trying to find the root cause of an
 3 | # error.
 4 | 
 5 | # Docstring formatting
 6 | a89ff74d8c0203278a039d9496a3d8df4d134f84
 7 | 
 8 | # STY: Apply pre-commit (black, isort) + use snake_case variables (#832)
 9 | eef03d935dfeacaa75848b39082cf94d833d3174
10 | 
11 | # STY: Apply black and isort
12 | baeb7d23278de0f8d00ca9f2b656bf0674f08937
13 | 
14 | # STY: Documentation, Variable names (#839)
15 | 444fca22836df061d9d23e71ffb7d68edcdfa766
16 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Report a bug
 3 | about: Something broke!
 4 | title: ''
 5 | labels: Bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | Replace this: What happened? What were you trying to achieve?
11 | 
12 | ## Environment
13 | 
14 | Which environment were you using when you encountered the problem?
15 | 
16 | ```bash
17 | $ python -m platform
18 | # TODO: Your output goes here
19 | 
20 | $ python -c "import pypdf;print(pypdf._debug_versions)"
21 | # TODO: Your output goes here
22 | ```
23 | 
24 | ## Code + PDF
25 | 
26 | This is a minimal, complete example that shows the issue:
27 | 
28 | ```python
29 | # TODO: Your code goes here
30 | ```
31 | 
32 | Share here the PDF file(s) that cause the issue. The smaller they are, the
33 | better. Let us know if we may add them to our tests!
34 | 
35 | ## Traceback
36 | 
37 | This is the complete traceback I see:
38 | 
39 | ```
40 | # TODO: Your traceback goes here (if applicable)
41 | ```
42 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Request a Feature
 3 | about: What do you think is missing in pypdf?
 4 | title: ''
 5 | labels: Feature Request
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Explanation
11 | 
12 | Explain briefly what you want to achieve.
13 | 
14 | ## Code Example
15 | 
16 | How would your feature be used? (Remove this if it is not applicable.)
17 | 
18 | ```python
19 | from pypdf import PdfReader, PdfWriter
20 | 
21 | ...  # your new feature in action!
22 | ```
23 | 


--------------------------------------------------------------------------------
/.github/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | Security fixes are applied to the latest version.
 6 | 
 7 | ## Reporting a Vulnerability
 8 | 
 9 | If you find a potential security issue, please report it using the
10 | [private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature of GitHub to
11 | automatically inform all relevant team members. Otherwise, please
12 | get in touch with stefan6419846 through e-mail (current maintainer,
13 | address in GitHub profile).
14 | 
15 | We will try to find a fix in a timely manner and will then issue a security
16 | advisory together with the update via GitHub
17 | ([example](https://github.com/py-pdf/pypdf/security/advisories/GHSA-xcjx-m2pj-8g79)).
18 | 
19 | If you don't get a reaction within 30 days, please open a public issue on
20 | GitHub.
21 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # Set update schedule for GitHub Actions
 2 | 
 3 | version: 2
 4 | updates:
 5 | 
 6 |   - package-ecosystem: "github-actions"
 7 |     directory: "/"
 8 |     schedule:
 9 |       interval: "daily"
10 |     commit-message:
11 |       prefix: "DEV"
12 | 


--------------------------------------------------------------------------------
/.github/scripts/check_pr_title.py:
--------------------------------------------------------------------------------
 1 | """Check that all PR titles follow the desired scheme."""  # noqa: INP001
 2 | 
 3 | import os
 4 | import sys
 5 | 
 6 | KNOWN_PREFIXES = (
 7 |     "SEC: ",
 8 |     "BUG: ",
 9 |     "ENH: ",
10 |     "DEP: ",
11 |     "PI: ",
12 |     "ROB: ",
13 |     "DOC: ",
14 |     "TST: ",
15 |     "DEV: ",
16 |     "STY: ",
17 |     "MAINT: ",
18 |     "REL: ",  # For internal use only.
19 | )
20 | PR_TITLE = os.getenv("PR_TITLE", "")
21 | 
22 | if not PR_TITLE.startswith(KNOWN_PREFIXES) or not PR_TITLE.split(": ", maxsplit=1)[1]:
23 |     sys.stderr.write(
24 |         f"The PR title '{PR_TITLE}' does not follow the projects naming scheme: "
25 |         "https://pypdf.readthedocs.io/en/latest/dev/intro.html#commit-messages\n",
26 |     )
27 |     sys.stderr.write(
28 |         "If you do not know which one to choose or if multiple apply, make a best guess. "
29 |         "Nobody will complain if it does not quite fit :-)\n",
30 |     )
31 |     sys.exit(1)
32 | else:
33 |     sys.stdout.write(f"PR title '{PR_TITLE}' appears to be valid.\n")
34 | 


--------------------------------------------------------------------------------
/.github/workflows/benchmark.yaml:
--------------------------------------------------------------------------------
 1 | name: Benchmarking pypdf
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 | 
 7 | permissions:
 8 |   contents: write
 9 |   deployments: write
10 | 
11 | jobs:
12 |   benchmark:
13 |     name: Run pytest-benchmark
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ["3.x"]
18 |     steps:
19 |     - name: Checkout Code
20 |       uses: actions/checkout@v4
21 |       with:
22 |         submodules: 'recursive'
23 |     - name: Setup Python
24 |       uses: actions/setup-python@v5
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install requirements (Python 3)
28 |       run: |
29 |         pip install -r requirements/ci-3.11.txt
30 |     - name: Install pypdf
31 |       run: |
32 |         pip install .
33 |     - name: Run benchmark
34 |       run: |
35 |         pytest tests/bench.py --benchmark-json output.json
36 |     - name: Store benchmark result
37 |       uses: benchmark-action/github-action-benchmark@v1
38 |       with:
39 |         name: Python Benchmark with pytest-benchmark
40 |         tool: 'pytest'
41 |         output-file-path: output.json
42 |         # NOTE: GITHUB_TOKEN is used here, not a personal access token. Pushes made with GITHUB_TOKEN may not trigger a GitHub Pages build, see https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096
43 |         github-token: ${{ secrets.GITHUB_TOKEN }}
44 |         auto-push: true
45 |         # Show alert with commit comment on detecting possible performance regression
46 |         alert-threshold: '200%'
47 |         comment-on-alert: true
48 |         fail-on-alert: true
49 |         alert-comment-cc-users: '@MartinThoma'
50 | 


--------------------------------------------------------------------------------
/.github/workflows/create-github-release.yaml:
--------------------------------------------------------------------------------
 1 | name: Create a GitHub release page
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - '*.*.*'
 7 |   workflow_dispatch:
 8 | 
 9 | permissions:
10 |   contents: write
11 | 
12 | jobs:
13 |   build_and_publish:
14 |     name: Create a GitHub release page
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Checkout Repository
18 |         uses: actions/checkout@v4
19 |       - name: Prepare variables
20 |         id: prepare_variables
21 |         run: |
22 |           git fetch --tags --force
23 |           latest_tag=$(git describe --tags --abbrev=0)
24 |           echo "latest_tag=${latest_tag}" >> "$GITHUB_ENV"
25 |           echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_ENV"
26 |           EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64)
27 |           echo "tag_body<<$EOF" >> "$GITHUB_ENV"
28 |           git --no-pager tag -l "${latest_tag}" --format='%(contents:body)' >> "$GITHUB_ENV"
29 |           echo "$EOF" >> "$GITHUB_ENV"
30 |       - name: Create GitHub Release 🚀
31 |         uses: softprops/action-gh-release@v2
32 |         with:
33 |           tag_name: ${{ env.latest_tag }}
34 |           name: Version ${{ env.latest_tag }}, ${{ env.date }}
35 |           draft: false
36 |           prerelease: false
37 |           body: ${{ env.tag_body }}
38 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yaml:
--------------------------------------------------------------------------------
 1 | name: Publish Python Package to PyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - '*.*.*'
 7 |   workflow_dispatch:
 8 | 
 9 | jobs:
10 |   build:
11 |     name: Build distribution
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: "3.x"
20 |     - name: Install pypa/build
21 |       run: >-
22 |         python3 -m
23 |         pip install
24 |         build
25 |         --user
26 |     - name: Build a binary wheel and a source tarball
27 |       run: python3 -m build
28 |     - name: Store the distribution packages
29 |       uses: actions/upload-artifact@v4
30 |       with:
31 |         name: python-package-distributions
32 |         path: dist/
33 | 
34 |   publish-to-pypi:
35 |     name: Publish Python distribution to PyPI
36 |     needs:
37 |     - build
38 |     runs-on: ubuntu-latest
39 |     environment:
40 |       name: pypi
41 |       url: https://pypi.org/p/pypdf
42 |     permissions:
43 |       id-token: write  # IMPORTANT: mandatory for trusted publishing
44 | 
45 |     steps:
46 |     - name: Download all the dists
47 |       uses: actions/download-artifact@v4
48 |       with:
49 |         name: python-package-distributions
50 |         path: dist/
51 |     - name: Publish distribution to PyPI
52 |       uses: pypa/gh-action-pypi-publish@release/v1
53 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
 1 | # This action assumes that there is a REL-commit which already has a
 2 | # Markdown-formatted git tag. Hence the CHANGELOG is already adjusted
 3 | # and it's decided what should be in the release.
 4 | # This action only ensures the release is done with the proper contents
 5 | # and that it's announced with a GitHub release.
 6 | name: Create git tag
 7 | on:
 8 |   push:
 9 |     branches:
10 |       - main
11 | 
12 | permissions:
13 |   contents: write
14 | 
15 | env:
16 |   HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
17 | 
18 | jobs:
19 |   build_and_publish:
20 |     name: Publish a new version
21 |     runs-on: ubuntu-latest
22 |     if: "${{ startsWith(github.event.head_commit.message, 'REL: ') }}"
23 |     steps:
24 |       - name: Checkout Repository
25 |         uses: actions/checkout@v4
26 | 
27 |       - name: Extract version from commit message
28 |         id: extract_version
29 |         run: |
30 |           VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
31 |           echo "version=$VERSION" >> $GITHUB_OUTPUT
32 | 
33 |       - name: Extract tag message from commit message
34 |         id: extract_message
35 |         run: |
36 |           VERSION="${{ steps.extract_version.outputs.version }}"
37 |           delimiter="$(openssl rand -hex 8)"
38 |           MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
39 |           echo "message<<${delimiter}" >> $GITHUB_OUTPUT
40 |           echo "$MESSAGE" >> $GITHUB_OUTPUT
41 |           echo "${delimiter}" >> $GITHUB_OUTPUT
42 | 
43 |       - name: Create Git Tag
44 |         # Hand the step outputs over as environment variables instead of
45 |         # expanding them inside the script: the tag message is free-form
46 |         # Markdown, so quotes, backticks or `$(...)` in it would otherwise
47 |         # be interpreted by the shell.
48 |         env:
49 |           VERSION: ${{ steps.extract_version.outputs.version }}
50 |           MESSAGE: ${{ steps.extract_message.outputs.message }}
51 |         run: |
52 |           git config user.name github-actions
53 |           git config user.email github-actions@github.com
54 |           git tag "$VERSION" -m "$MESSAGE"
55 |           git push origin "$VERSION"
56 | 


--------------------------------------------------------------------------------
/.github/workflows/title-check.yml:
--------------------------------------------------------------------------------
 1 | name: 'PR Title Check'
 2 | on:
 3 |   pull_request:
 4 |     # check when PR
 5 |     # * is created,
 6 |     # * title is edited, and
 7 |     # * new commits are added (to ensure failing title blocks merging)
 8 |     types: [opened, reopened, edited, synchronize]
 9 | 
10 | jobs:
11 |   title-check:
12 |     name: Title check
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Checkout Code
16 |         uses: actions/checkout@v4
17 |       - name: Check PR title
18 |         env:
19 |           PR_TITLE: ${{ github.event.pull_request.title }}
20 |         run: python .github/scripts/check_pr_title.py
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.swp
 3 | .DS_Store
 4 | .tox
 5 | build
 6 | .idea/*
 7 | *.egg-info/
 8 | dist/*
 9 | __pycache__/
10 | 
11 | # in-project virtual environments
12 | venv/
13 | .venv/
14 | 
15 | # Code coverage artifacts
16 | .coverage*
17 | coverage.xml
18 | 
19 | # Editors / IDEs
20 | .vscode/
21 | 
22 | # Docs
23 | docs/_build/
24 | 
25 | .cspell/
26 | 
27 | # Files generated by some of the scripts
28 | dont_commit_*.pdf
29 | pypdf-output.pdf
30 | annotated-pdf-link.pdf
31 | Image9.png
32 | pypdf_pdfLocation.txt
33 | 
34 | .python-version
35 | tests/pdf_cache/
36 | docs/meta/CHANGELOG.md
37 | docs/meta/CONTRIBUTORS.md
38 | extracted-images/
39 | 
40 | RELEASE_COMMIT_MSG.md
41 | RELEASE_TAG_MSG.md
42 | .envrc
43 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "sample-files"]
2 | 	path = sample-files
3 | 	url = https://github.com/py-pdf/sample-files
4 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # pre-commit run --all-files
 2 | repos:
 3 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 4 |     rev: v5.0.0
 5 |     hooks:
 6 |     -   id: check-ast
 7 |     -   id: check-case-conflict
 8 |     -   id: check-docstring-first
 9 |     -   id: check-yaml
10 |     -   id: debug-statements
11 |     -   id: end-of-file-fixer
12 |         exclude: "resources/.*|docs/make.bat"
13 |     -   id: fix-byte-order-marker
14 |     -   id: trailing-whitespace
15 |     -   id: mixed-line-ending
16 |         args: ['--fix=lf']
17 |         exclude: "docs/make.bat"
18 |     -   id: check-added-large-files
19 |         args: ['--maxkb=1000']
20 | 
21 | -   repo: https://github.com/charliermarsh/ruff-pre-commit
22 |     rev: v0.11.0
23 |     hooks:
24 |     -   id: ruff
25 |         args: ['--fix']
26 | 
27 | -   repo: https://github.com/asottile/pyupgrade
28 |     rev: v3.19.1
29 |     hooks:
30 |     -   id: pyupgrade
31 |         args: [--py38-plus]
32 | 
33 | -   repo: https://github.com/pre-commit/mirrors-mypy
34 |     rev: 'v1.16.0'
35 |     hooks:
36 |       - id: mypy
37 |         additional_dependencies: [types-Pillow==10.2.0.20240822]
38 |         files: ^pypdf/.*
39 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 2 | version: 2
 3 | 
 4 | 
 5 | build:
 6 |   os: ubuntu-22.04
 7 |   tools:
 8 |     python: "3.12"
 9 | 
10 | # Build documentation in the "docs/" directory with Sphinx
11 | sphinx:
12 |    configuration: docs/conf.py
13 | 
14 | # If using Sphinx, optionally build your docs in additional formats such as PDF
15 | formats: all
16 | 
17 | # Optionally declare the Python requirements required to build your docs
18 | python:
19 |   install:
20 |     - requirements: requirements/docs.txt
21 |     - method: pip
22 |       path: .
23 |       extra_requirements:
24 |         - full
25 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | Please check the [documentation page dedicated to development](https://pypdf.readthedocs.io/en/stable/dev/intro.html).
 2 | 
 3 | ## Creating issues / tickets
 4 | 
 5 | Please go here: https://github.com/py-pdf/pypdf/issues
 6 | 
 7 | Typically you should not send e-mails. An e-mail might only reach one person,
 8 | and it could go into spam or that person might be busy. Please create issues on
 9 | GitHub instead.
10 | 
11 | Please use the templates provided.
12 | 
13 | Keep in mind that although PDF has an official specification, there are tons of
14 | variations which might require special handling. Thus, please always provide a
15 | reproducing example file for us to work with. Otherwise, we have to guess possible
16 | issues, leading to unnecessary overhead - especially since most of the contributions
17 | happen during our free time.
18 | 
19 | If you already know a fix, consider opening a pull request after reporting the issue
20 | to make life easier for everyone.
21 | 
22 | ## Creating Pull Requests
23 | 
24 | We appreciate it when people make PRs, but please be aware that pypdf is used by many
25 | people. That means:
26 | 
27 | * We rarely make breaking changes and have a [deprecation process](https://pypdf.readthedocs.io/en/latest/dev/deprecations.html).
28 | * New features, especially adding to the public interface, typically need to be
29 |   discussed first.
30 | 
31 | Before you make bigger changes, open an issue to make the suggestion.
32 | Note which interface changes you want to make.
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2006-2008, Mathieu Fenniak
 2 | Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
 3 | Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
 4 | 
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are
 9 | met:
10 | 
11 | * Redistributions of source code must retain the above copyright notice,
12 | this list of conditions and the following disclaimer.
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 | * The name of the author may not be used to endorse or promote products
17 | derived from this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Extra files to ship in the source distribution.
2 | include CHANGELOG.md
3 | include LICENSE
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | maint:
 2 | 	pre-commit autoupdate
 3 | 	pip-compile -U requirements/ci.in
 4 | 	pip-compile -U requirements/dev.in
 5 | 	pip-compile -U requirements/docs.in
 6 | 
 7 | release:
 8 | 	python make_release.py
 9 | 	git commit -eF RELEASE_COMMIT_MSG.md
10 | 
11 | clean:
12 | 	python -m pip install pyclean
13 | 	pyclean .
14 | 	rm -rf tests/__pycache__ pypdf/__pycache__ htmlcov docs/_build dist pypdf.egg-info .pytest_cache .mypy_cache .benchmarks
15 | 
16 | test:
17 | 	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=60 pypdf
18 | 
19 | testtype:
20 | 	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 --typeguard-packages=pypdf
21 | 
22 | benchmark:
23 | 	pytest tests/bench.py
24 | 
25 | mypy:
26 | 	mypy pypdf --ignore-missing-imports --check-untyped --strict
27 | 
28 | ruff:
29 | 	ruff check pypdf tests make_release.py
30 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/_static/logo.png


--------------------------------------------------------------------------------
/docs/_static/releasing.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/_static/releasing.drawio.png


--------------------------------------------------------------------------------
/docs/dev/PR_Header_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/dev/PR_Header_example.png


--------------------------------------------------------------------------------
/docs/dev/cmaps.md:
--------------------------------------------------------------------------------
 1 | # CMaps
 2 | 
 3 | Looking at the cmap of "crazyones":
 4 | 
 5 | ```bash
 6 | pdftk crazyones.pdf output crazyones-uncomp.pdf uncompress
 7 | ```
 8 | 
 9 | You can see this:
10 | 
11 | ```text
12 | begincmap
13 | /CMapName /T1Encoding-UTF16 def
14 | /CMapType 2 def
15 | /CIDSystemInfo <<
16 |   /Registry (Adobe)
17 |   /Ordering (UCS)
18 |   /Supplement 0
19 | >> def
20 | 1 begincodespacerange
21 | <00> <FF>
22 | endcodespacerange
23 | 1 beginbfchar
24 | <1B> <FB00>
25 | endbfchar
26 | endcmap
27 | CMapName currentdict /CMap defineresource pop
28 | ```
29 | 
30 | ## codespacerange
31 | 
32 | A codespacerange maps a complete sequence of bytes to a range of unicode glyphs.
33 | It defines a starting point:
34 | 
35 | ```text
36 | 1 beginbfchar
37 | <1B> <FB00>
38 | ```
39 | 
40 | That means that `1B` (Hex for 27) maps to the unicode character [`FB00`](https://unicode-table.com/en/FB00/) - the ligature ﬀ (two lowercase f's).
41 | 
42 | The two numbers in `begincodespacerange` mean that it starts with an offset of
43 | 0 (hence from `1B ➜ FB00`) up to an offset of FF (dec: 255), hence 1B+FF = 282
44 | ➜ [FBFF](https://www.compart.com/de/unicode/U+FBFF).
45 | 
46 | Within the text stream, there is
47 | 
48 | ```text
49 | (The)-342(mis\034ts.)
50 | ```
51 | 
52 | `\034` is octal for 28 decimal.
53 | 


--------------------------------------------------------------------------------
/docs/dev/deprecations.md:
--------------------------------------------------------------------------------
 1 | # The Deprecation Process
 2 | 
 3 | pypdf strives to be an excellent library for its current users and for new
 4 | ones. We are careful with introducing potentially breaking changes, but we
 5 | will make them if they provide value for the community in the long run.
 6 | 
 7 | We hope and think that deprecations will not happen frequently. If they do,
 8 | users can rely on the following procedure.
 9 | 
10 | ## Semantic Versioning
11 | 
12 | pypdf uses [semantic versioning](https://semver.org/). If you want to avoid
13 | breaking changes, please use dependency pinning (also known as version pinning).
14 | In Python, this is done by specifying the exact version you want to use in a
15 | `requirements.txt` file. A tool that can support you is `pip-compile` from
16 | [`pip-tools`](https://pypi.org/project/pip-tools/).
17 | 
18 | If you are using [Poetry](https://pypi.org/project/poetry/) it is done with the
19 | `poetry.lock` file.
20 | 
21 | ## How pypdf deprecates features
22 | 
23 | Assume the current version of pypdf is `x.y.z`. After a discussion (e.g. via
24 | GitHub issues) we decided to remove a class / function / method. This is how
25 | we do it:
26 | 
27 | 1. `x.y.(z+1)`: Add a DeprecationWarning. If there is a replacement,
28 |    the replacement is also introduced and the warning informs about the change
29 |    and when it will happen.
30 |    The docs let users know about the deprecation and when it will happen and the new function.
31 |    The CHANGELOG informs about it.
32 | 2. `(x+1).0.0`: Remove / change the code in the breaking way by replacing
33 |    DeprecationWarnings by DeprecationErrors.
34 |    We do this to help people who didn't look at the warnings before.
35 |    The CHANGELOG informs about it.
36 | 3. `(x+2).0.0`: The DeprecationErrors are removed.
37 | 
38 | This means the users have 3 warnings in the CHANGELOG, a DeprecationWarning
39 | until the next major release and a DeprecationError until the major release
40 | after that.
41 | 
42 | Please note that adding warnings can be a breaking change for some users; most
43 | likely just in the CI.
44 | This means it needs to be properly documented.
45 | 


--------------------------------------------------------------------------------
/docs/dev/documentation.md:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | 
 3 | ## API Reference
 4 | 
 5 | ### Method / Function Docstrings
 6 | 
 7 | We use Google-Style Docstrings:
 8 | 
 9 | ```
10 | def example(param1: int, param2: str) -> bool:
11 |     """
12 |     Example function with PEP 484 type annotations.
13 | 
14 |     Args:
15 |       param1: The first parameter.
16 |       param2: The second parameter.
17 | 
18 |     Returns:
19 |       The return value. True for success, False otherwise.
20 | 
21 |     Raises:
22 |       AttributeError: The ``Raises`` section is a list of all exceptions
23 |         that are relevant to the interface.
24 |       ValueError: If `param2` is equal to `param1`.
25 | 
26 |     Examples:
27 |         Examples should be written in doctest format, and should illustrate how
28 |         to use the function.
29 | 
30 |         >>> print([i for i in example_generator(4)])
31 |         [0, 1, 2, 3]
32 |     """
33 | ```
34 | 
35 | * The order of sections is (1) Args (2) Returns (3) Raises (4) Examples
36 | * If there is no return value, remove the 'Returns' block
37 | * Properties should not have any sections
38 | 
39 | 
40 | ## Issues and PRs
41 | 
42 | An issue can be used to discuss what we want to achieve.
43 | 
44 | A PR can be used to discuss how we achieve it.
45 | 
46 | ## Commit Messages
47 | 
48 | We want to have descriptive commits in the `main` branch. For this reason, every
49 | pull request (PR) is squashed. That means no matter how many commits a PR has,
50 | in the end only one combined commit will be in `main`.
51 | 
52 | The title of the PR will be used as the first line of that combined commit message.
53 | 
54 | The first comment within the commit will be used as the message body.
55 | 
56 | See [developer intro](intro.md#commit-messages) for more details.
57 | 


--------------------------------------------------------------------------------
/docs/dev/intro.md:
--------------------------------------------------------------------------------
  1 | # Developer Intro
  2 | 
  3 | pypdf is a library and hence its users are developers. This document is not for
  4 | the users, but for people who want to work on pypdf itself.
  5 | 
  6 | ## Installing Requirements
  7 | 
  8 | ```
  9 | pip install -r requirements/dev.txt
 10 | ```
 11 | 
 12 | ## Running Tests
 13 | 
 14 | See [testing pypdf with pytest](testing.md).
 15 | 
 16 | ## The sample-files git submodule
 17 | The reason for having the submodule `sample-files` is that we want to keep
 18 | the size of the pypdf repository small while we also want to have an extensive
 19 | test suite. Those two goals contradict each other.
 20 | 
 21 | The `resources` folder should contain a select set of core examples that cover
 22 | most cases we typically want to test for. The `sample-files` might cover a lot
 23 | more edge cases, the behavior we get when file sizes get bigger, different
 24 | PDF producers.
 25 | 
 26 | In order to get the sample-files folder, you need to execute:
 27 | 
 28 | ```
 29 | git submodule update --init
 30 | ```
 31 | 
 32 | ## Tools: git and pre-commit
 33 | 
 34 | Git is a command line application for version control. If you don't know it,
 35 | you can [play ohmygit](https://ohmygit.org/) to learn it.
 36 | 
 37 | GitHub is the service where the pypdf project is hosted. While git is free and
 38 | open source, GitHub is a paid service by Microsoft, but free in a lot of
 39 | cases.
 40 | 
 41 | [pre-commit](https://pypi.org/project/pre-commit/) is a command line application
 42 | that uses git hooks to automatically execute code. This allows you to avoid
 43 | style issues and other code quality issues. After you have run `pre-commit install`
 44 | once in your local copy of pypdf, it will automatically be executed when
 45 | you `git commit`.
 46 | 
 47 | ## Commit Messages
 48 | 
 49 | Having a clean commit message helps people to quickly understand what the commit
 50 | is about, without actually looking at the changes. The first line of the
 51 | commit message is used to [auto-generate the CHANGELOG](https://github.com/py-pdf/pypdf/blob/main/make_release.py).
 52 | For this reason, the format should be:
 53 | 
 54 | ```
 55 | PREFIX: DESCRIPTION
 56 | 
 57 | BODY
 58 | ```
 59 | 
 60 | The `PREFIX` can be:
 61 | 
 62 | * `SEC`: Security improvements. Typically an infinite loop that was possible.
 63 | * `BUG`: A bug was fixed. Likely there is one or multiple issues. Then write in
 64 |    the `BODY`: `Closes #123` where 123 is the issue number on GitHub.
 65 |    It would be absolutely amazing if you could write a regression test in those
 66 |    cases. That is a test that would fail without the fix.
 67 |    A bug is always an issue for pypdf users - test code or CI that was fixed is
 68 |    not considered a bug here.
 69 | * `ENH`: A new feature! Describe in the body what it can be used for.
 70 | * `DEP`: A deprecation. Either marking something as "this is going to be removed"
 71 |    or actually removing it.
 72 | * `PI`: A performance improvement. This could also be a reduction in the
 73 |         file size of PDF files generated by pypdf.
 74 | * `ROB`: A robustness change. Dealing better with broken PDF files.
 75 | * `DOC`: A documentation change.
 76 | * `TST`: Adding or adjusting tests.
 77 | * `DEV`: Developer experience improvements, e.g. pre-commit or setting up CI.
 78 | * `MAINT`: Quite a lot of different stuff. Performance improvements are for sure
 79 |            the most interesting changes in here. Refactorings as well.
 80 | * `STY`: A style change. Something that makes pypdf code more consistent.
 81 |          Typically a small change. It could also be better error messages for
 82 |          end users.
 83 | 
 84 | The prefix is used to generate the CHANGELOG. Every PR must have exactly one -
 85 | if you feel like several match, take the top one from this list that matches for
 86 | your PR.
 87 | 
 88 | ## Pull Request Size
 89 | 
 90 | Smaller Pull Requests (PRs) are preferred as it's typically easier to merge
 91 | them. For example, if you have some typos, a few code-style changes, a new
 92 | feature, and a bug-fix, that could be 3 or 4 PRs.
 93 | 
 94 | A PR must be complete. That means if you introduce a new feature it must be
 95 | finished within the PR and have a test for that feature.
 96 | 
 97 | ## Benchmarks
 98 | 
 99 | We need to keep an eye on performance and thus we have a few benchmarks.
100 | 
101 | See [py-pdf.github.io/pypdf/dev/bench](https://py-pdf.github.io/pypdf/dev/bench/)
102 | 


--------------------------------------------------------------------------------
/docs/dev/pypdf-parsing.md:
--------------------------------------------------------------------------------
 1 | # How pypdf parses PDF files
 2 | 
 3 | pypdf uses {class}`~pypdf.PdfReader` to parse PDF files.
 4 | The method {py:meth}`PdfReader.read <pypdf.PdfReader.read>` shows the basic
 5 | structure of parsing:
 6 | 
 7 | 1. **Finding and reading the cross-reference tables / trailer**: The
 8 |    cross-reference table (xref table) is a table of byte offsets that indicate
 9 |    the locations of objects within the file. The trailer provides additional
10 |    information such as the root object (Catalog) and the Info object containing
11 |    metadata.
12 | 2. **Parsing the objects**: After locating the xref table and the trailer, pypdf
13 |    proceeds to parse the objects in the PDF. Objects in a PDF can be of various
14 |    types such as dictionaries, arrays, streams, and simple data types (e.g.,
15 |    integers, strings). pypdf parses these objects and stores them in
16 |    {py:meth}`PdfReader.resolved_objects <pypdf.PdfReader.resolved_objects>`,
17 |    populated by {py:meth}`cache_indirect_object <pypdf.PdfReader.cache_indirect_object>`.
18 | 3. **Decoding content streams**: The content of a PDF is typically stored in
19 |    content streams, which are sequences of PDF operators and operands. pypdf
20 |    decodes these content streams by applying filters (e.g., `FlateDecode`,
21 |    `LZWDecode`) specified in the stream's dictionary. This is only done when the
22 |    object is requested by {py:meth}`PdfReader.get_object <pypdf.PdfReader.get_object>`
23 |    which uses the `PdfReader._get_object_from_stream` method.
24 | 
25 | ## References
26 | 
27 | [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf):
28 | * 7.5 File Structure
29 | * 7.5.4 Cross-Reference Table
30 | * 7.8 Content Streams and Resources
31 | 


--------------------------------------------------------------------------------
/docs/dev/pypdf-writing.md:
--------------------------------------------------------------------------------
 1 | # How pypdf writes PDF files
 2 | 
 3 | pypdf uses {py:class}`PdfWriter <pypdf.PdfWriter>` to write PDF files. pypdf has
 4 | {py:class}`PdfObject <pypdf.generic.PdfObject>` and several subclasses with the
 5 | {py:meth}`write_to_stream <pypdf.generic.PdfObject.write_to_stream>` method.
 6 | The {py:meth}`PdfWriter.write <pypdf.PdfWriter.write>` method uses the
 7 | `write_to_stream` methods of the referenced objects.
 8 | 
 9 | The {py:meth}`PdfWriter.write_stream <pypdf.PdfWriter.write_stream>` method
10 | has the following core steps:
11 | 
12 | 1. `_sweep_indirect_references`: This step ensures that any circular references
13 |    to objects are correctly handled. It adds the object reference numbers of any
14 |    circularly referenced objects to an external reference map, so that
15 |    self-page-referencing trees can reference the correct new object location,
16 |    rather than copying in a new copy of the page object.
17 | 2. **Write the File Header and Body** with `_write_pdf_structure`: In this step,
18 |    the PDF header and objects are written to the output stream. This includes
19 |    the PDF version (e.g., %PDF-1.7) and the objects that make up the content of
20 |    the PDF, such as pages, annotations, and form fields. The locations (byte
21 |    offsets) of these objects are stored for later use in generating the xref
22 |    table.
23 | 3. **Write the Cross-Reference Table** with `_write_xref_table`: Using the stored
24 |    object locations, this step generates and writes the cross-reference table
25 |    (xref table) to the output stream. The cross-reference table contains the
26 |    byte offsets for each object in the PDF file, allowing for quick random
27 |    access to objects when reading the PDF.
28 | 4. **Write the File Trailer** with `_write_trailer`: The trailer is written to
29 |    the output stream in this step. The trailer contains essential information,
30 |    such as the number of objects in the PDF, the location of the root object
31 |    (Catalog), and the Info object containing metadata. The trailer also
32 |    specifies the location of the xref table.
33 | 
34 | 
35 | ## How others do it
36 | 
37 | Looking at alternative software designs and implementations can help to improve
38 | our choices.
39 | 
40 | ### fpdf2
41 | 
42 | [fpdf2](https://pypi.org/project/fpdf2/) has a [`PDFObject` class](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py)
43 | with a serialize method which roughly maps to `pypdf.PdfObject.write_to_stream`.
44 | Some other similarities include:
45 | 
46 | * [fpdf.output.OutputProducer.bufferize](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/output.py#L370-L485) vs {py:meth}`pypdf.PdfWriter.write_stream <pypdf.PdfWriter.write_stream>`
47 | * [fpdf.syntax.Name](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L124) vs {py:class}`pypdf.generic.NameObject <pypdf.generic.NameObject>`
48 | * [fpdf.syntax.build_obj_dict](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L222) vs {py:class}`pypdf.generic.DictionaryObject <pypdf.generic.DictionaryObject>`
49 | * [fpdf.structure_tree.NumberTree](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/structure_tree.py#L17) vs
50 |  {py:class}`pypdf.generic.TreeObject <pypdf.generic.TreeObject>`
51 | 
52 | 
53 | ### pdfrw
54 | 
55 | [pdfrw](https://pypi.org/project/pdfrw/), in contrast, seems to work more with
56 | the standard Python objects (bool, float, string) and not wrap them in custom
57 | objects, if possible. It still has:
58 | 
59 | * [PdfArray](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfarray.py#L13)
60 | * [PdfDict](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfdict.py#L49)
61 | * [PdfName](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfname.py#L65)
62 | * [PdfString](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfstring.py#L322)
63 | * [PdfIndirect](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfindirect.py#L10)
64 | 
65 | The core classes of pdfrw are
66 | [PdfReader](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfreader.py#L26)
67 | and
68 | [PdfWriter](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfwriter.py#L224)
69 | 


--------------------------------------------------------------------------------
/docs/dev/releasing.md:
--------------------------------------------------------------------------------
 1 | # Releasing
 2 | 
 3 | A `pypdf` release contains the following artifacts:
 4 | 
 5 | * A new [release on PyPI](https://pypi.org/project/pypdf/)
 6 | * A [release commit](https://github.com/py-pdf/pypdf/commit/91391b18bb8ec9e6e561e2795d988e8634a01a50)
 7 |     * Containing a changelog update
 8 |     * A new [git tag](https://github.com/py-pdf/pypdf/tags)
 9 |         * A [GitHub release](https://github.com/py-pdf/pypdf/releases/tag/3.15.0)
10 | 
11 | ## Who does it?
12 | 
13 | `pypdf` should typically only be released by one of the core maintainers / the
14 | core maintainer. At the moment, this is either stefan6419846 or pubpub-zz and Martin Thoma.
15 | 
16 | Any owner of the py-pdf organization also has the technical permissions to
17 | release.
18 | 
19 | ## How is it done?
20 | 
21 | ### With direct push permissions
22 | 
23 | This is the typical way for the core maintainer/benevolent dictator.
24 | 
25 | The release contains the following steps:
26 | 
27 | 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`.
28 |    This also prepares the release commit message.
29 | 2. Create a release commit: `git commit -eF RELEASE_COMMIT_MSG.md`.
30 | 3. Push commit: `git push`.
31 | 4. CI now builds a source and a wheels package which it pushes to PyPI. It also
32 |    creates the corresponding tag and a GitHub release.
33 | 
34 | ![](../_static/releasing.drawio.png)
35 | 
36 | ### Using a Pull Request
37 | 
38 | This is the typical way for collaborators who do not have direct push permissions for
39 | the `main` branch.
40 | 
41 | The release contains the following steps:
42 | 
43 | 1. Update the CHANGELOG.md and the _version.py via `python make_release.py`.
44 |    This also prepares the release commit message.
45 | 2. Push the changes to a dedicated branch.
46 | 3. Open a pull request starting with `REL: `, followed by the new version number.
47 | 4. Wait for the approval of another eligible maintainer.
48 | 5. Merge the pull request with the name being the PR title and the body being
49 |    the content of `RELEASE_COMMIT_MSG.md`.
50 | 6. CI now builds a source and a wheels package which it pushes to PyPI. It also
51 |    creates the corresponding tag and a GitHub release.
52 | 
53 | ### The Release Tag
54 | 
55 | * Use the release version as the tag name. No need for a leading "v".
56 | * Use the changelog entry as the body.
57 | 
58 | 
59 | ## When are releases done?
60 | 
61 | There is no need to wait for anything. If the CI is green (all tests succeeded),
62 | we can release.
63 | 
64 | At the moment, there is no fixed release cycle - except that we usually release
65 | on Sunday.
66 | 


--------------------------------------------------------------------------------
/docs/dev/testing.md:
--------------------------------------------------------------------------------
 1 | # Testing
 2 | 
 3 | pypdf uses [`pytest`](https://docs.pytest.org/en/7.1.x/) for testing.
 4 | 
 5 | To run the tests you need to install the CI (Continuous Integration) requirements by running `pip install -r requirements/ci.txt` or
 6 | `pip install -r requirements/ci-3.11.txt` if running Python ≥ 3.11.
 7 | 
 8 | ## Deselecting groups of tests
 9 | 
10 | pypdf makes use of the following pytest markers:
11 | 
12 | * `slow`: Tests that require more than 5 seconds.
13 | * `samples`: Tests that require [the `sample-files` git submodule](https://github.com/py-pdf/sample-files) to be initialized. As of October 2022, this is about 25 MB.
14 | * `enable_socket`: Tests that download PDF documents. They are stored locally and thus only need to be downloaded once. As of October 2022, this is about 200 MB.
15 |   * To successfully run the tests, please download most of the documents beforehand: `python -c "from tests import download_test_pdfs; download_test_pdfs()"`
16 | 
17 | You can disable them by `pytest -m "not enable_socket"` or `pytest -m "not samples"`.
18 | You can even disable all of them: `pytest -m "not enable_socket" -m "not samples" -m "not slow"`.
19 | 
20 | Please note that this reduces test coverage. The CI will always test all files.
21 | 
22 | ## Docstrings in Unit tests
23 | 
24 | The first line of a docstring in a unit test should be written in a way that
25 | you could prefix it with "This test ensures that ...", e.g.
26 | 
27 | * Invalid XML in xmp_metadata is gracefully handled.
28 | * The identity is returning its input.
29 | * xmp_modify_date is extracted correctly.
30 | 
31 | This way, plugins like [`pytest-testdox`](https://pypi.org/project/pytest-testdox/)
32 | can generate really nice output when the tests are running. This looks similar
33 | to the output of [mocha.js](https://mochajs.org/).
34 | 
35 | If the test is a regression test, write
36 | 
37 | > This test is a regression test for issue #1234
38 | 
39 | If the regression test is just one parameter of other tests, then add it as
40 | a comment for that parameter.
41 | 
42 | ## Evaluate a PR in-progress version
43 | 
44 | You may want to test a version from a PR which has not been released yet.
45 | The easiest way is to use pip and install a version from git:
46 | 
47 | a) Go to the PR and identify the repository and branch.
48 | 
49 | Example from below : repository: __pubpub-zz__ / branch: __iss2200__ :
50 | ![PR Header example](PR_Header_example.png)
51 | 
52 | b) you can then install the version using pip from git:
53 | 
54 | Example:
55 | ```
56 | pip install git+https://github.com/pubpub-zz/pypdf.git@iss2200
57 | ```
58 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | .. pypdf documentation main file, created by
  2 |    sphinx-quickstart on Thu Apr  7 20:13:19 2022.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | Welcome to pypdf
  7 | =================
  8 | 
  9 | pypdf is a `free <https://en.wikipedia.org/wiki/Free_software>`_ and open
 10 | source pure-python PDF library capable of splitting,
 11 | merging, cropping, and transforming the pages of PDF files. It can also add
 12 | custom data, viewing options, and passwords to PDF files.
 13 | pypdf can retrieve text and metadata from PDFs as well.
 14 | 
 15 | See `pdfly <https://github.com/py-pdf/pdfly>`_ for a CLI application that uses pypdf to interact with PDFs.
 16 | 
 17 | You can contribute to `pypdf on GitHub <https://github.com/py-pdf/pypdf>`_.
 18 | 
 19 | .. toctree::
 20 |    :caption: User Guide
 21 |    :maxdepth: 1
 22 | 
 23 |    user/installation
 24 |    user/migration-1-to-2
 25 |    user/robustness
 26 |    user/suppress-warnings
 27 |    user/metadata
 28 |    user/extract-text
 29 |    user/post-processing-in-text-extraction
 30 |    user/extract-images
 31 |    user/extract-attachments
 32 |    user/encryption-decryption
 33 |    user/merging-pdfs
 34 |    user/cropping-and-transforming
 35 |    user/reading-pdf-annotations
 36 |    user/adding-pdf-annotations
 37 |    user/add-watermark
 38 |    user/add-javascript
 39 |    user/viewer-preferences
 40 |    user/forms
 41 |    user/streaming-data
 42 |    user/file-size
 43 |    user/pdf-version-support
 44 |    user/pdfa-compliance
 45 | 
 46 | 
 47 | .. toctree::
 48 |    :caption: API Reference
 49 |    :maxdepth: 1
 50 | 
 51 |    modules/PdfReader
 52 |    modules/PdfWriter
 53 |    modules/Destination
 54 |    modules/DocumentInformation
 55 |    modules/Field
 56 |    modules/Fit
 57 |    modules/PageObject
 58 |    modules/PageRange
 59 |    modules/PaperSize
 60 |    modules/RectangleObject
 61 |    modules/Transformation
 62 |    modules/XmpInformation
 63 |    modules/annotations
 64 |    modules/constants
 65 |    modules/errors
 66 |    modules/generic
 67 |    modules/PdfDocCommon
 68 | 
 69 | .. toctree::
 70 |    :caption: Developer Guide
 71 |    :maxdepth: 1
 72 | 
 73 |    dev/intro
 74 |    dev/pdf-format
 75 |    dev/pypdf-parsing
 76 |    dev/pypdf-writing
 77 |    dev/cmaps
 78 |    dev/deprecations
 79 |    dev/documentation
 80 |    dev/testing
 81 |    dev/releasing
 82 | 
 83 | .. toctree::
 84 |    :caption: About pypdf
 85 |    :maxdepth: 1
 86 | 
 87 |    meta/CHANGELOG
 88 |    meta/changelog-v1
 89 |    meta/project-governance
 90 |    meta/taking-ownership
 91 |    meta/history
 92 |    meta/CONTRIBUTORS
 93 |    meta/scope-of-pypdf
 94 |    meta/comparisons
 95 |    meta/faq
 96 | 
 97 | Indices and tables
 98 | ==================
 99 | 
100 | * :ref:`genindex`
101 | * :ref:`modindex`
102 | * :ref:`search`
103 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | REM Work from the directory that contains this script; popd at :end undoes it.
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | REM Probe run: an errorlevel of 9009 or higher is treated as sphinx-build not being found.
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | REM No target given: jump to :help. Note that the exit above returns without running popd.
26 | if "%1" == "" goto help
27 | REM Forward the first argument to sphinx-build as the make-mode target.
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/meta/comparisons.md:
--------------------------------------------------------------------------------
 1 | # pypdf vs X
 2 | 
 3 | pypdf is a [free] and open source pure-python PDF library capable of
 4 | splitting, merging, cropping, and transforming the pages of PDF files.
 5 | It can also add custom data, viewing options, and passwords to PDF
 6 | files. pypdf can retrieve text and metadata from PDFs as well.
 7 | 
 8 | ## PyMuPDF and PikePDF
 9 | 
10 | [PyMuPDF] is a Python binding to [MuPDF] and [PikePDF] is the Python
11 | binding to [QPDF].
12 | 
13 | While both are excellent libraries for various use-cases, using them is
14 | not always possible even when they support the use-case. Both of them
15 | are powered by C libraries which makes installation harder and might
16 | cause security concerns. For MuPDF you might also need to buy a
17 | commercial license.
18 | 
19 | A core feature of pypdf is that it's pure Python. That means there is
20 | no C dependency. It has been used for over 10 years and for this reason
21 | there is a lot of support via StackOverflow and examples on the internet.
22 | 
23 | ## pypdf
24 | 
25 | PyPDF2 was merged back into `pypdf`. The development continues at `pypdf`.
26 | 
27 | ## PyPDF3 and PyPDF4
28 | 
29 | Developing and maintaining open source software is extremely
30 | time-intensive and in the case of pypdf not paid at all. Providing
31 | continuous support is hard.
32 | 
33 | pypdf was initially released in 2012 on PyPI and received releases
34 | until 2016. From 2016 to 2022 there was no update - but people were
35 | still using it.
36 | 
37 | As pypdf is free software, there were attempts to fork it and continue
38 | the development. PyPDF3 was first released in 2018 and still receives
39 | updates. PyPDF4 has only one release from 2018.
40 | 
41 | Martin Thoma has worked on bringing the community back to one path of
42 | development. He deprecated PyPDF2 in favor of pypdf already and pypdf has now
43 | more features and a cleaner interface than PyPDF2. See [history of
44 | pypdf](history.md).
45 | 
46 |   [free]: https://en.wikipedia.org/wiki/Free_software
47 |   [PyMuPDF]: https://pypi.org/project/PyMuPDF/
48 |   [MuPDF]: https://mupdf.com/
49 |   [PikePDF]: https://pypi.org/project/pikepdf/
50 |   [QPDF]: https://github.com/qpdf/qpdf
51 | 
52 | 
53 | ## pdfminer.six and pdfplumber
54 | 
55 | [`pdfminer.six`](https://pypi.org/project/pdfminer.six/) is capable of
56 | extracting the [font size](https://stackoverflow.com/a/69962459/562769)
57 | / font weight (bold-ness). It has no capabilities for writing PDF files.
58 | 
59 | [`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file. Please note that the image conversion is done via ImageMagick (see [`pdfplumber`'s documentation](https://github.com/jsvine/pdfplumber#visual-debugging)).
60 | 
61 | The `pdfplumber` community is active in answering questions and the library is maintained as of May 2023.
62 | 
63 | ## pdfrw / pdfrw2
64 | 
65 | I don't have experience with any of those libraries. Please add a
66 | comparison if you know pypdf and [`pdfrw`](https://pypi.org/project/pdfrw/)!
67 | 
68 | Please be aware that there is also
69 | [`pdfminer`](https://pypi.org/project/pdfminer/) which is not maintained.
70 | Then there is [`pdfrw2`](https://pypi.org/project/pdfrw2/) which doesn't have
71 | a large community behind it.
72 | 
73 | ## Document Generation
74 | 
75 | There are (Python) [tools to generate PDF documents](https://github.com/py-pdf/awesome-pdf#generators).
76 | pypdf is not one of them.
77 | 
78 | 
79 | ## CLI applications
80 | 
81 | pypdf is a pure Python PDF library. If you're looking for an application which
82 | you can use from the terminal, give [`pdfly`](https://pdfly.readthedocs.io/en/latest/)
83 | a shot.
84 | 


--------------------------------------------------------------------------------
/docs/meta/faq.md:
--------------------------------------------------------------------------------
 1 | # Frequently Asked Questions
 2 | 
 3 | ## How is pypdf related to PyPDF2?
 4 | 
 5 | PyPDF2 was a fork from the original pyPdf. After several years, the fork was
 6 | merged back into `pypdf` (now all lowercase).
 7 | 
 8 | ## Which Python versions are supported?
 9 | 
10 | pypdf 3.0+ supports Python 3.6 and later.
11 | PyPDF2 2.0+ supports Python 3.6 and later.
12 | PyPDF2 1.27.10 supported Python 2.7 to 3.10.
13 | 
14 |   [Matthew]: https://github.com/mstamy2
15 |   [source]: https://github.com/py-pdf/PyPDF2/commit/24b270d876518d15773224b5d0d6c2206db29f64#commitcomment-5038317
16 |   [this sort of thing]: https://github.com/py-pdf/PyPDF2/issues/24
17 |   [GitHub issue]: https://github.com/py-pdf/PyPDF2/issues
18 | 
19 | ## Who uses pypdf?
20 | 
21 | pyPdf is vendored [into](https://github.com/Buyanbat/XacCRM/tree/ee78e8df967182f661b6494a86444501e7d89c8f/report/pyPdf) [several](https://github.com/MyBook/calibre/tree/ca1efe3c21f6553e096dab745b3cdeb36244a5a9/src/pyPdf) [projects](https://github.com/Giacomo-De-Florio-Dev/Make_Your_PDF_Safe/tree/ec439f92243d12d54ae024668792470c6b40ee96/MakeYourPDFsafe_V1.3/PyPDF2). That
22 | means the code of pyPdf was copied into those projects.
23 | 
24 | Projects that depend on pypdf:
25 | 
26 | * [Camelot](https://github.com/camelot-dev/camelot): A Python library to extract tabular data from PDFs
27 | * [edi](https://github.com/OCA/edi): Electronic Data Interchange modules
28 | * [amazon-textract-textractor](https://github.com/aws-samples/amazon-textract-textractor/blob/42444b08c672607eadbdcd64f3c5adb2d85383de/helper/setup.py): Analyze documents with Amazon Textract and generate output in multiple formats.
29 | * [maigret](https://github.com/soxoj/maigret): Collect a dossier on a person by username from thousands of sites
30 | * [deda](https://github.com/dfd-tud/deda): tracking Dots Extraction, Decoding and Anonymisation toolkit
31 | * [opencanary](https://github.com/thinkst/opencanary)
32 | * Document Conversions
33 |   * [rst2pdf](https://github.com/rst2pdf/rst2pdf)
34 |   * [xhtml2pdf](https://github.com/xhtml2pdf/xhtml2pdf)
35 |   * [doc2text](https://github.com/jlsutherland/doc2text)
36 | * [pdfalyzer](https://pypi.org/project/pdfalyzer/): A PDF analysis tool for visualizing the inner tree-like data structure of a PDF in spectacularly large and colorful diagrams as well as scanning the binary streams embedded in the PDF for hidden potentially malicious content.
37 | 
38 | ## How do I cite pypdf?
39 | 
40 | In BibTeX format:
41 | 
42 | ```
43 | @misc{pypdf,
44 |  title         = {The {pypdf} library},
45 |  author        = {Mathieu Fenniak and
46 |                   Matthew Stamy and
47 |                   pubpub-zz and
48 |                   Martin Thoma and
49 |                   Matthew Peveler and
50 |                   exiledkingcc and {pypdf Contributors}},
51 |  year          = {2024},
52 |  url           = {https://pypi.org/project/pypdf/},
53 |  note          = {See https://pypdf.readthedocs.io/en/latest/meta/CONTRIBUTORS.html for all contributors}
54 | }
55 | ```
56 | 
57 | ## Which License does pypdf use?
58 | 
59 | `pypdf` uses the [BSD-3-Clause license](https://en.wikipedia.org/wiki/BSD_licenses#3-clause), see the LICENSE file.
60 | 


--------------------------------------------------------------------------------
/docs/meta/history.md:
--------------------------------------------------------------------------------
 1 | # History of pypdf
 2 | 
 3 | ## The Origins: pyPdf (2005-2010)
 4 | 
 5 | In 2005, [Mathieu Fenniak] launched pyPdf "as a PDF toolkit..."
 6 | focused on
 7 | 
 8 | -   document manipulation: by-page splitting, concatenation, and
 9 |     merging;
10 | -   document introspection;
11 | -   page cropping; and
12 | -   document encryption and decryption.
13 | 
14 | The last release on PyPI was [pyPdf 1.13](https://pypi.org/project/pyPdf/#history)
15 | in 2010.
16 | 
17 | ## PyPDF2 is born (2011-2016)
18 | 
19 | At the end of 2011, after consultation with Mathieu and others, Phaseit
20 | sponsored PyPDF2 as a fork of pyPdf on GitHub. The initial impetus was
21 | to handle a wider range of input PDF instances; Phaseit\'s commercial
22 | work often encounters PDF instances \"in the wild\" that it needs to
23 | manage (mostly concatenate and paginate), but that deviate so much from
24 | PDF standards that pyPdf can\'t read them. PyPDF2 reads a considerably
25 | wider range of real-world PDF instances.
26 | 
27 | Neither pyPdf nor PyPDF2 aims to be universal, that is, to provide all
28 | possible PDF-related functionality. Note that the similar-appearing
29 | [pyfpdf] of Mariano Reingart is most comparable to [ReportLab], in that
30 | both ReportLab and pyfpdf emphasize document generation. Interestingly
31 | enough, pyfpdf builds in a basic HTML→PDF converter while PyPDF2 has no
32 | knowledge of HTML.
33 | 
34 | So what is PyPDF2 truly about? Think about popular [pdftk] for a moment.
35 | PyPDF2 does what pdftk does, and it does so within your current Python
36 | process, and it handles a wider range of variant PDF formats
37 | \[explain\]. PyPDF2 has its own FAQ to answer other questions that have
38 | arisen.
39 | 
40 | The Reddit [/r/python crowd chatted] obliquely and briefly about PyPDF2
41 | in March 2012.
42 | 
43 | The core developer / maintainer was Matthew Stamy.
44 | 
45 | ## PyPDF3 and PyPDF4 (2018 - 2022)
46 | 
47 | Two approaches were made to get PyPDF2 active again: PyPDF3 and PyPDF4.
48 | 
49 | PyPDF3 had its first release in 2018 and its last one in February 2022.
50 | It never got the user base from PyPDF2.
51 | 
52 | PyPDF4 only had one release in 2018.
53 | 
54 | ## PyPDF2: Reborn (2022)
55 | 
56 | Martin Thoma took over maintenance of PyPDF2 in April 2022. It had over 100
57 | open PRs and 321 open issues.
58 | 
59 | [pubpub-zz](https://github.com/pubpub-zz) was extremely active, especially
60 | for text extraction.
61 | 
62 | [Matthew Peveler](https://github.com/MasterOdin) helped a lot with reviews
63 | and general project decisions.
64 | 
65 | [exiledkingcc](https://github.com/exiledkingcc) added support for modern
66 | encryption schemes.
67 | 
68 | 
69 | ## pypdf: Back to the Roots (2023-2024)
70 | 
71 | In order to make things simpler for beginners, PyPDF2 was merged back into
72 | pypdf. Now all lowercase, without a number. We hope that the folks who
73 | develop PyPDF3 and PyPDF4 also join us.
74 | 
75 | Compared to `PyPDF2 >= 3.0.0`, `pypdf >= 3.1.0` now offers:
76 | 
77 | * AES reading and writing support. Not only with PyCryptoDome, but also with cryptography.
78 | * Text extraction improvements, e.g. for math content. [pypdf is now comparable with Tika, pypdfium2, and PyMuPDF](https://github.com/py-pdf/benchmarks)
79 | * Annotation support
80 | * Performance Improvements and Bugfixes
81 | * Page Label support
82 | 
83 | stefan6419846 made his [first PR for pypdf](https://github.com/py-pdf/pypdf/pull/2022)
84 | in July 2023 and joined the project.
85 | 
86 | 
87 |   [Mathieu Fenniak]: https://mathieu.fenniak.net/
88 |   [pyfpdf]: https://github.com/reingart/pyfpdf
89 |   [ReportLab]: https://www.reportlab.com/software/opensource/rl-toolkit/
90 |   [pdftk]: https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
91 |   [/r/python crowd chatted]: https://www.reddit.com/r/Python/comments/qsvfm/pypdf2_updates_pypdf_pypdf2_is_an_opensource/
92 | 


--------------------------------------------------------------------------------
/docs/meta/scope-of-pypdf.md:
--------------------------------------------------------------------------------
 1 | # Scope of pypdf
 2 | 
 3 | What features should pypdf have and which features will it never have?
 4 | 
 5 | pypdf aims at making interactions with PDF documents simpler. Core tasks that
 6 | pypdf can perform are:
 7 | 
 8 | * Document manipulation: Splitting, merging, cropping, and transforming the pages of PDF files
 9 | * Data Extraction: Extract text and metadata from PDF documents
10 | * Security: Decrypt / encrypt PDF documents
11 | 
12 | Typical indicators that something should be done by pypdf:
13 | 
14 | * The task needs in-depth knowledge of the PDF format
15 | * It currently requires a lot of code or even is impossible to do with pypdf
16 | * It's neither mentioned in "belongs in user code" nor in "out of scope"
17 | * It already is in the issue list with the [is-feature tag](https://github.com/py-pdf/pypdf/labels/is-feature).
18 | 
19 | The [moonshot extensions](https://github.com/py-pdf/pypdf/discussions/1181) are
20 | features we would like to have, but are currently not able to add (PRs are
21 | welcome 😉)
22 | 
23 | ## Belongs in user code
24 | 
25 | Here are a few indicators that a feature belongs in user code (and not in pypdf):
26 | 
27 | 1. The use-case is very specific. Most people will not encounter the same need.
28 | 2. It can be done without knowledge of the PDF specification
29 | 3. It cannot be done without (non-pdf) domain knowledge. Anything that is
30 |    specific to your industry.
31 | 
32 | ## Out of scope
33 | 
34 | While this list is infinitely long, there are a few topics that are asked
35 | multiple times.
36 | 
37 | Those topics are out of scope for pypdf. They will never be part of pypdf:
38 | 
39 | 1. **Optical Character Recognition (OCR)**: OCR is about extracting text from
40 |    images. That is very different from the kind of text extraction pypdf is
41 |    doing. Please note that images can be within PDF documents. In the case of
42 |    scanned documents, the whole page is an image. Some scanners automatically
43 |    execute OCR and add a text-layer behind the scanned page. That is something
44 |    pypdf can use, if it's present. As a rule-of-thumb: If you cannot mark/copy
45 |    the text, it's likely an image. A noteworthy open source OCR project is
46 |    [tesseract](https://github.com/tesseract-ocr/tesseract).
47 | 2. **Format Conversion**: Converting docx / HTML to PDF or PDF to those formats.
48 |    You might want to have a look at [`pdfkit`](https://pypi.org/project/pdfkit/)
49 |    and similar projects.
50 | 
51 | Out of scope for the moment, but might be added if there are enough contributors:
52 | 
53 | * **Digital Signature Support** ([reference
54 |   ticket](https://github.com/py-pdf/pypdf/issues/302)): Cryptography is
55 |   complicated. It's important to get it right. pypdf currently doesn't have
56 |   enough active contributors to properly add digital signature support. For the
57 |   moment, [pyhanko](https://pypi.org/project/pyHanko/) seems to be the best
58 |   choice.
59 | * **PDF Generation from Scratch**: pypdf can manipulate existing PDF documents,
60 |   add annotations, combine / split / crop / transform. It can add blank pages.
61 |   But if you want to generate invoices, you might want to have a look at
62 |   [`reportlab`](https://pypi.org/project/reportlab/) /
63 |   [`fpdf2`](https://pypi.org/project/fpdf2/) or document conversion tools like
64 |   [`pdfkit`](https://pypi.org/project/pdfkit/).
65 | * **Replacing words within a PDF**: [Extracting text from PDF is hard](../user/extract-text.md#why-text-extraction-is-hard).
66 |    Replacing text in a reliable way is even harder. For example, one word might
67 |    be split into multiple tokens. Hence it's not a simple "search and replace"
68 |    in some cases.
69 | * **(Not) Extracting headers/footers/page numbers**: While you can apply
70 |   heuristics, there is no way to always make it work. PDF documents simply
71 |   don't contain information about what a header/footer/page number is.
72 | 
73 | 
74 | ### Library vs Application
75 | 
76 | It's also worth pointing out that `pypdf` is designed to be a library. It is not
77 | an application. That has several implications:
78 | 
79 | * Execution: pypdf cannot be executed directly, but only be called from within
80 |   a program written by a pypdf user. In contrast, an application is executed
81 |   on its own.
82 | * Dependencies: pypdf should have a minimal set of dependencies and only
83 |   restrict them where it is strictly necessary. In contrast, applications should
84 |   be installed in environments which are isolated from other applications. They
85 |   can pin their dependencies.
86 | 
87 | If you're looking for a way to interact with PDF files via Shell, you should
88 | either write a script using pypdf or use [`pdfly`](https://pypi.org/project/pdfly/).
89 | 


--------------------------------------------------------------------------------
/docs/meta/taking-ownership.md:
--------------------------------------------------------------------------------
 1 | # Taking Ownership of pypdf
 2 | 
 3 | pypdf is currently maintained by stefan6419846. We want to avoid that
 4 | pypdf ever goes unmaintained again. This document serves as a guide to avoid
 5 | that if I become unavailable, e.g. due to severe health issues.
 6 | 
 7 | This currently is just an abstract scenario. I'm fine and I will likely do this
 8 | for several more years, but I have seen how projects stand still for many years
 9 | because of the maintainer becoming inactive.
10 | 
11 | ## What belongs to pypdf?
12 | 
13 | The resources needed for maintaining pypdf are:
14 | 
15 | * PyPI: [pypdf](https://pypi.org/project/pypdf/) and [PyPDF2](https://pypi.org/project/PyPDF2/)
16 | * GitHub: [pypdf](https://github.com/py-pdf/pypdf) (the repository, not the organization)
17 | * ReadTheDocs: [pypdf](https://readthedocs.org/projects/pypdf/) and [PyPDF2](https://readthedocs.org/projects/pypdf2/)
18 | 
19 | ## When may somebody take ownership?
20 | 
21 | **No activity in 180 days**: If I don't answer e-mails (see my GitHub profile)
22 | and don't make any commits / merges for half a year, you can consider pypdf "not
23 | maintained".
24 | 
25 | ## Who may take ownership?
26 | 
27 | Preferably, one of the owners of the GitHub `py-pdf` organization takes care of
28 | that.
29 | 
30 | As of 27th of August 2023, the following people might be candidates:
31 | 
32 | * [Lucas-C](https://github.com/Lucas-C): He maintains fpdf2 and is a py-pdf owner
33 | * [pubpub-zz](https://github.com/pubpub-zz): He is one of the most active contributors
34 |   to pypdf
35 | * [Matthew Peveler](https://github.com/MasterOdin): Less active, but he is very
36 |   careful about breaking changes and an experienced software developer.
37 | * [exiledkingcc](https://github.com/exiledkingcc): He has contributed the core
38 |   changes related to encryption.
39 | 
40 | ## How to take ownership?
41 | 
42 | * PyPI: Follow [PEP 541 – Package Index Name Retention](https://peps.python.org/pep-0541/)
43 | * GitHub: Talk with one of the other py-pdf organization owners
44 | * ReadTheDocs: Follow the [Abandoned projects policy](https://docs.readthedocs.io/en/latest/abandoned-projects.html)
45 | 


--------------------------------------------------------------------------------
/docs/modules/Destination.rst:
--------------------------------------------------------------------------------
1 | The Destination Class
2 | ---------------------
3 | 
4 | .. autoclass:: pypdf.generic.Destination
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/DocumentInformation.rst:
--------------------------------------------------------------------------------
1 | The DocumentInformation Class
2 | -----------------------------
3 | 
4 | .. autoclass:: pypdf.DocumentInformation
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/Field.rst:
--------------------------------------------------------------------------------
1 | The Field Class
2 | ---------------
3 | 
4 | .. autoclass:: pypdf.generic.Field
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/Fit.rst:
--------------------------------------------------------------------------------
1 | The Fit Class
2 | -------------
3 | 
4 | .. autoclass:: pypdf.generic.Fit
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/PageObject.rst:
--------------------------------------------------------------------------------
 1 | The PageObject Class
 2 | --------------------
 3 | 
 4 | .. autoclass:: pypdf._page.PageObject
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | .. autoclass:: pypdf._page.VirtualListImages
10 |     :members:
11 |     :undoc-members:
12 |     :show-inheritance:
13 | 
14 | .. autoclass:: pypdf._page.ImageFile
15 |     :members:
16 |     :inherited-members: File
17 |     :undoc-members:
18 | 
19 | .. autofunction:: pypdf.mult
20 | 


--------------------------------------------------------------------------------
/docs/modules/PageRange.rst:
--------------------------------------------------------------------------------
1 | The PageRange Class
2 | -------------------
3 | 
4 | .. autoclass:: pypdf.PageRange
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/PaperSize.rst:
--------------------------------------------------------------------------------
 1 | The PaperSize Class
 2 | -------------------
 3 | 
 4 | .. autoclass:: pypdf.PaperSize
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | Add blank page with PaperSize
10 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
11 | .. code-block:: python
12 |     :linenos:
13 | 
14 |     from pypdf import PaperSize, PdfWriter
15 | 
16 |     writer = PdfWriter(clone_from="sample.pdf")
17 |     writer.add_blank_page(PaperSize.A8.width, PaperSize.A8.height)
18 |     with open("output.pdf", "wb") as output_stream:
19 |         writer.write(output_stream)
20 | 
21 | Insert blank page with PaperSize
22 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
23 | .. code-block:: python
24 |     :linenos:
25 | 
26 |     from pypdf import PaperSize, PdfWriter
27 | 
28 |     writer = PdfWriter(clone_from="sample.pdf")
29 |     writer.insert_blank_page(PaperSize.A8.width, PaperSize.A8.height, 1)
30 |     with open("output.pdf", "wb") as output_stream:
31 |         writer.write(output_stream)
32 | 


--------------------------------------------------------------------------------
/docs/modules/PdfDocCommon.rst:
--------------------------------------------------------------------------------
 1 | The PdfDocCommon Class
 2 | ----------------------
 3 | 
 4 | **PdfDocCommon** is an abstract class which is inherited by :class:`~pypdf.PdfReader` and :class:`~pypdf.PdfWriter`.
 5 | 
 6 | Where identified in the API, you can use any of the derived classes.
 7 | 
 8 | .. autoclass:: pypdf._doc_common.PdfDocCommon
 9 |     :members:
10 |     :inherited-members:
11 |     :undoc-members:
12 |     :show-inheritance:
13 | 


--------------------------------------------------------------------------------
/docs/modules/PdfReader.rst:
--------------------------------------------------------------------------------
 1 | The PdfReader Class
 2 | -------------------
 3 | 
 4 | .. autoclass:: pypdf.PdfReader
 5 |     :members:
 6 |     :inherited-members:
 7 |     :undoc-members:
 8 |     :show-inheritance:
 9 | 
10 | .. autoclass:: pypdf.PasswordType
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 


--------------------------------------------------------------------------------
/docs/modules/PdfWriter.rst:
--------------------------------------------------------------------------------
 1 | The PdfWriter Class
 2 | -------------------
 3 | 
 4 | .. autoclass:: pypdf.PdfWriter
 5 |     :members:
 6 |     :inherited-members:
 7 |     :undoc-members:
 8 |     :show-inheritance:
 9 | 
10 | .. autoclass:: pypdf.ObjectDeletionFlag
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 


--------------------------------------------------------------------------------
/docs/modules/RectangleObject.rst:
--------------------------------------------------------------------------------
1 | The RectangleObject Class
2 | -------------------------
3 | 
4 | .. autoclass:: pypdf.generic.RectangleObject
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/Transformation.rst:
--------------------------------------------------------------------------------
1 | The Transformation Class
2 | ------------------------
3 | 
4 | .. autoclass:: pypdf.Transformation
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/XmpInformation.rst:
--------------------------------------------------------------------------------
1 | The XmpInformation Class
2 | -------------------------
3 | 
4 | .. autoclass:: pypdf.xmp.XmpInformation
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/annotations.rst:
--------------------------------------------------------------------------------
1 | The annotations module
2 | ----------------------
3 | 
4 | .. automodule:: pypdf.annotations
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/constants.rst:
--------------------------------------------------------------------------------
 1 | Constants
 2 | ---------
 3 | 
 4 | .. autoclass:: pypdf.constants.AnnotationFlag
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | .. autoclass:: pypdf.constants.ImageType
10 |     :members:
11 |     :undoc-members:
12 |     :show-inheritance:
13 | 
14 | .. autoclass:: pypdf.constants.PageLabelStyle
15 |     :members:
16 |     :undoc-members:
17 |     :show-inheritance:
18 | 
19 | .. autoclass:: pypdf.constants.UserAccessPermissions
20 |     :members:
21 |     :undoc-members:
22 |     :show-inheritance:
23 | 
24 | .. autoclass:: pypdf.constants.FieldDictionaryAttributes
25 |        :members:
26 |        :undoc-members:
27 |        :exclude-members: FT, Parent, Kids, T, TU, TM, V, DV, AA, Opt, attributes, attributes_dict
28 |        :show-inheritance:
29 | 


--------------------------------------------------------------------------------
/docs/modules/errors.rst:
--------------------------------------------------------------------------------
1 | Errors
2 | ------
3 | 
4 | .. automodule:: pypdf.errors
5 |     :members:
6 |     :undoc-members:
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/modules/generic.rst:
--------------------------------------------------------------------------------
 1 | Generic PDF objects
 2 | -------------------
 3 | 
 4 | .. automodule:: pypdf.generic
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 |     :exclude-members: Destination, Field, Fit, RectangleObject
 9 | 
10 | 
11 | .. autoclass:: pypdf._protocols.PdfObjectProtocol
12 |     :members:
13 |     :undoc-members:
14 |     :show-inheritance:
15 | 
16 | 
17 | .. autoclass:: pypdf._protocols.XmpInformationProtocol
18 |     :members:
19 |     :undoc-members:
20 |     :show-inheritance:
21 | 
22 | 
23 | .. autoclass:: pypdf._protocols.PdfCommonDocProtocol
24 |        :members:
25 |        :undoc-members:
26 |        :show-inheritance:
27 | 
28 | 
29 | .. autoclass:: pypdf._protocols.PdfReaderProtocol
30 |     :members:
31 |     :undoc-members:
32 |     :show-inheritance:
33 | 
34 | 
35 | .. autoclass:: pypdf._protocols.PdfWriterProtocol
36 |     :members:
37 |     :undoc-members:
38 |     :show-inheritance:
39 | 


--------------------------------------------------------------------------------
/docs/user/add-javascript.md:
--------------------------------------------------------------------------------
 1 | # Adding JavaScript to a PDF
 2 | 
 3 | PDF readers vary in the extent they support JavaScript, with some not supporting it at all.
 4 | 
 5 | Adobe has documentation on its support here:
 6 | [https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html](https://opensource.adobe.com/dc-acrobat-sdk-docs/library/jsapiref/index.html)
 7 | 
 8 | ## Launch print window on opening
 9 | 
10 | ```python
11 | from pypdf import PdfWriter
12 | 
13 | writer = PdfWriter(clone_from="example.pdf")
14 | 
15 | # Add JavaScript to launch the print window on opening this PDF.
16 | writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
17 | 
18 | # Write to pypdf-output.pdf.
19 | with open("pypdf-output.pdf", "wb") as fp:
20 |     writer.write(fp)
21 | ```
22 | 


--------------------------------------------------------------------------------
/docs/user/add-watermark.md:
--------------------------------------------------------------------------------
  1 | # Adding a Stamp or Watermark to a PDF
  2 | 
  3 | Adding stamps or watermarks are two common ways to manipulate PDF files.
  4 | A stamp is adding something on top of the document, a watermark is in the
  5 | background of the document.
  6 | 
  7 | ## Stamp (Overlay) / Watermark (Underlay)
  8 | 
  9 | The process of stamping and watermarking is the same; you just need to set the `over` parameter to `True` for stamping and `False` for watermarking.
 10 | 
 11 | You can use {func}`~pypdf._page.PageObject.merge_page` if you don't need to transform the stamp:
 12 | 
 13 | ```python
 14 | from pypdf import PdfReader, PdfWriter
 15 | 
 16 | stamp = PdfReader("bg.pdf").pages[0]
 17 | writer = PdfWriter(clone_from="source.pdf")
 18 | for page in writer.pages:
 19 |     page.merge_page(stamp, over=False)  # here set to False for watermarking
 20 | 
 21 | writer.write("out.pdf")
 22 | ```
 23 | 
 24 | Otherwise use {func}`~pypdf._page.PageObject.merge_transformed_page` with {class}`~pypdf.Transformation` if you need to translate, rotate, scale, etc. the stamp before merging it to the content page.
 25 | 
 26 | ```python
 27 | from pathlib import Path
 28 | from typing import List, Union
 29 | 
 30 | from pypdf import PdfReader, PdfWriter, Transformation
 31 | 
 32 | 
 33 | def stamp(
 34 |     content_pdf: Union[Path, str],
 35 |     stamp_pdf: Union[Path, str],
 36 |     pdf_result: Union[Path, str],
 37 |     page_indices: Union[None, List[int]] = None,
 38 | ):
 39 |     stamp_page = PdfReader(stamp_pdf).pages[0]
 40 | 
 41 |     writer = PdfWriter()
 42 |     # page_indices can be a List(array) of page, tuples are for range definition
 43 |     reader = PdfReader(content_pdf)
 44 |     writer.append(reader, pages=page_indices)
 45 | 
 46 |     for content_page in writer.pages:
 47 |         content_page.merge_transformed_page(
 48 |             stamp_page,
 49 |             Transformation().scale(0.5),
 50 |         )
 51 | 
 52 |     writer.write(pdf_result)
 53 | 
 54 | 
 55 | stamp("example.pdf", "stamp.pdf", "out.pdf")
 56 | ```
 57 | 
 58 | If you are experiencing wrongly rotated watermarks/stamps, try to use
 59 | {func}`~pypdf._page.PageObject.transfer_rotation_to_content` on the corresponding pages beforehand
 60 | to fix the page boxes.
 61 | 
 62 | Example of stamp:
 63 | ![stamp.png](stamp.png)
 64 | 
 65 | Example of watermark:
 66 | ![watermark.png](watermark.png)
 67 | 
 68 | 
 69 | ## Stamping images directly
 70 | 
 71 | The above code only works for stamps that are already in PDF format.
 72 | However, you can easily convert an image to PDF image using
 73 | [Pillow](https://pypi.org/project/Pillow/).
 74 | 
 75 | 
 76 | ```python
 77 | from io import BytesIO
 78 | from pathlib import Path
 79 | from typing import List, Union
 80 | 
 81 | from PIL import Image
 82 | from pypdf import PageRange, PdfReader, PdfWriter, Transformation
 83 | 
 84 | 
 85 | def image_to_pdf(stamp_img: Union[Path, str]) -> PdfReader:
 86 |     img = Image.open(stamp_img)
 87 |     img_as_pdf = BytesIO()
 88 |     img.save(img_as_pdf, "pdf")
 89 |     return PdfReader(img_as_pdf)
 90 | 
 91 | 
 92 | def stamp_img(
 93 |     content_pdf: Union[Path, str],
 94 |     stamp_img: Union[Path, str],
 95 |     pdf_result: Union[Path, str],
 96 |     page_indices: Union[PageRange, List[int], None] = None,
 97 | ):
 98 |     # Convert the image to a PDF
 99 |     stamp_pdf = image_to_pdf(stamp_img)
100 | 
101 |     # Then use the same stamp code from above
102 |     stamp_page = stamp_pdf.pages[0]
103 | 
104 |     writer = PdfWriter()
105 | 
106 |     reader = PdfReader(content_pdf)
107 |     writer.append(reader, pages=page_indices)
108 |     for content_page in writer.pages:
109 |         content_page.merge_transformed_page(
110 |             stamp_page,
111 |             Transformation(),
112 |         )
113 | 
114 |     with open(pdf_result, "wb") as fp:
115 |         writer.write(fp)
116 | 
117 | 
118 | stamp_img("example.pdf", "example.png", "out.pdf")
119 | ```
120 | 


--------------------------------------------------------------------------------
/docs/user/annotation-circle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-circle.png


--------------------------------------------------------------------------------
/docs/user/annotation-highlight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-highlight.png


--------------------------------------------------------------------------------
/docs/user/annotation-line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-line.png


--------------------------------------------------------------------------------
/docs/user/annotation-polygon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-polygon.png


--------------------------------------------------------------------------------
/docs/user/annotation-polyline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-polyline.png


--------------------------------------------------------------------------------
/docs/user/annotation-popup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-popup.png


--------------------------------------------------------------------------------
/docs/user/annotation-square.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/annotation-square.png


--------------------------------------------------------------------------------
/docs/user/encryption-decryption.md:
--------------------------------------------------------------------------------
 1 | # Encryption and Decryption of PDFs
 2 | 
 3 | PDF encryption makes use of [`RC4`](https://en.wikipedia.org/wiki/RC4) and
 4 | [`AES`](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) algorithms
 5 | with different key lengths. `pypdf` supports all of them until `PDF-2.0`, which
 6 | is the latest PDF standard.
 7 | 
 8 | `pypdf` uses an extra dependency to do encryption or decryption for `AES` algorithms.
 9 | We recommend [`pyca/cryptography`](https://cryptography.io/en/latest/). Alternatively,
10 | you can use [`pycryptodome`](https://pypi.org/project/pycryptodome/).
11 | 
12 | ```{note}
13 | Please see the note in the [installation guide](installation.md)
14 | for installing the extra dependencies if interacting with PDFs that use AES.
15 | ```
16 | 
17 | ## Encrypt
18 | 
19 | You can encrypt a PDF by using a password:
20 | 
21 | ```python
22 | from pypdf import PdfReader, PdfWriter
23 | 
24 | reader = PdfReader("example.pdf")
25 | writer = PdfWriter(clone_from=reader)
26 | 
27 | # Add a password to the new PDF
28 | writer.encrypt("my-secret-password", algorithm="AES-256")
29 | 
30 | # Save the new PDF to a file
31 | with open("encrypted-pdf.pdf", "wb") as f:
32 |     writer.write(f)
33 | ```
34 | The algorithm can be one of `RC4-40`, `RC4-128`, `AES-128`, `AES-256-R5`, `AES-256`.
35 | We recommend using `AES-256-R5`.
36 | 
37 | ```{warning}
38 | pypdf uses `RC4` by default for compatibility if you omit the "algorithm" parameter.
39 | Since `RC4` is insecure, you should use `AES` algorithms.
40 | ```
41 | 
42 | ## Decrypt
43 | 
44 | You can decrypt a PDF using the appropriate password:
45 | 
46 | ```python
47 | from pypdf import PdfReader, PdfWriter
48 | 
49 | reader = PdfReader("encrypted-pdf.pdf")
50 | 
51 | if reader.is_encrypted:
52 |     reader.decrypt("my-secret-password")
53 | 
54 | writer = PdfWriter(clone_from=reader)
55 | 
56 | # Save the new PDF to a file
57 | with open("decrypted-pdf.pdf", "wb") as f:
58 |     writer.write(f)
59 | ```
60 | 


--------------------------------------------------------------------------------
/docs/user/error-hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/error-hierarchy.png


--------------------------------------------------------------------------------
/docs/user/extract-attachments.md:
--------------------------------------------------------------------------------
 1 | # Extract Attachments
 2 | 
 3 | PDF documents can contain attachments. Attachments have a name, but it might not
 4 | be unique. For this reason, the value of `reader.attachments["attachment_name"]`
 5 | is a list.
 6 | 
 7 | You can extract all attachments like this:
 8 | 
 9 | ```python
10 | from pypdf import PdfReader
11 | 
12 | reader = PdfReader("example.pdf")
13 | 
14 | for name, content_list in reader.attachments.items():
15 |     for i, content in enumerate(content_list):
16 |         with open(f"{name}-{i}", "wb") as fp:
17 |             fp.write(content)
18 | ```
19 | 
20 | Alternatively, you can retrieve them in an object-oriented fashion if you need
21 | further details for these files:
22 | 
23 | ```python
24 | from pypdf import PdfReader
25 | 
26 | reader = PdfReader("example.pdf")
27 | 
28 | for attachment in reader.attachment_list:
29 |     print(attachment.name, attachment.alternative_name, attachment.content)
30 | ```
31 | 


--------------------------------------------------------------------------------
/docs/user/extract-images.md:
--------------------------------------------------------------------------------
 1 | # Extract Images
 2 | 
 3 | ```{note}
 4 | In order to use the following code you need to install optional
 5 | dependencies, see [installation guide](installation.md).
 6 | ```
 7 | 
 8 | Every page of a PDF document can contain an arbitrary number of images.
 9 | The names of the files may not be unique.
10 | 
11 | ```python
12 | from pypdf import PdfReader
13 | 
14 | reader = PdfReader("example.pdf")
15 | 
16 | page = reader.pages[0]
17 | 
18 | for count, image_file_object in enumerate(page.images):
19 |     with open(str(count) + image_file_object.name, "wb") as fp:
20 |         fp.write(image_file_object.data)
21 | ```
22 | 
23 | ## Other images
24 | 
25 | Some other objects can contain images, such as stamp annotations.
26 | 
27 | For example, this document contains such stamps:
28 | [test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf)
29 | 
30 | You can extract the image from the annotation with the following code:
31 | 
32 | ```python
33 | from pypdf import PdfReader
34 | 
35 | reader = PdfReader("test_stamp.pdf")
36 | im = (
37 |     reader.pages[0]["/Annots"][0]
38 |     .get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"]
39 |     .decode_as_image()
40 | )
41 | 
42 | im.show()
43 | ```
44 | 


--------------------------------------------------------------------------------
/docs/user/file-size.md:
--------------------------------------------------------------------------------
  1 | # Reduce PDF File Size
  2 | 
  3 | There are multiple ways to reduce the size of a given PDF file. The easiest
  4 | one is to remove content (e.g. images) or pages.
  5 | 
  6 | ## Removing duplication
  7 | 
  8 | Some PDF documents contain the same object multiple times. For example, if an
  9 | image appears three times in a PDF it could be embedded three times. Or it can
 10 | be embedded once and referenced twice.
 11 | 
 12 | When adding data to a PdfWriter, the data is copied while respecting the original format.
 13 | For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object.
 14 | 
 15 | Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or whether the user wants to keep them. When writing the PDF file, these objects will therefore remain hidden inside it (part of the file, but not displayed).
 16 | 
 17 | In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 18 | 
 19 | * `remove_identicals` enables/disables the merging of identical objects.
 20 | * `remove_orphans` enables/disables the removal of unused objects.
 21 | 
 22 | It is recommended to apply this process just before writing to the file/stream.
 23 | 
 24 | It depends on the PDF how well this works, but we have seen an 86% file
 25 | reduction (from 5.7 MB to 0.8 MB) within a real PDF.
 26 | 
 27 | 
 28 | ## Removing Images
 29 | 
 30 | 
 31 | ```python
 32 | from pypdf import PdfWriter
 33 | 
 34 | writer = PdfWriter(clone_from="example.pdf")
 35 | 
 36 | writer.remove_images()
 37 | 
 38 | with open("out.pdf", "wb") as f:
 39 |     writer.write(f)
 40 | ```
 41 | 
 42 | ## Reducing Image Quality
 43 | 
 44 | If we reduce the quality of the images within the PDF, we can **sometimes**
 45 | reduce the file size of the PDF overall. That depends on how well the reduced
 46 | quality image can be compressed.
 47 | 
 48 | ```python
 49 | from pypdf import PdfWriter
 50 | 
 51 | writer = PdfWriter(clone_from="example.pdf")
 52 | 
 53 | for page in writer.pages:
 54 |     for img in page.images:
 55 |         img.replace(img.image, quality=80)
 56 | 
 57 | with open("out.pdf", "wb") as f:
 58 |     writer.write(f)
 59 | ```
 60 | 
 61 | ## Lossless Compression
 62 | 
 63 | pypdf supports the FlateDecode filter which uses the zlib/deflate compression
 64 | method. It is a lossless compression, meaning the resulting PDF looks exactly
 65 | the same.
 66 | 
 67 | Deflate compression can be applied to a page via
 68 | {meth}`page.compress_content_streams <pypdf._page.PageObject.compress_content_streams>`:
 69 | 
 70 | ```python
 71 | from pypdf import PdfWriter
 72 | 
 73 | writer = PdfWriter(clone_from="example.pdf")
 74 | 
 75 | for page in writer.pages:
 76 |     page.compress_content_streams()  # This is CPU intensive!
 77 | 
 78 | with open("out.pdf", "wb") as f:
 79 |     writer.write(f)
 80 | ```
 81 | 
 82 | `page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress)
 83 | and supports the `level` parameter: `level=0` means no compression,
 84 | `level=9` refers to the highest compression.
 85 | 
 86 | Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB)
 87 | with a real PDF.
 88 | 
 89 | ## Removing Sources
 90 | 
 91 | When a page is removed from the page list, its content will still be present in
 92 | the PDF file. This means that the data may still be used elsewhere.
 93 | 
 94 | Simply removing a page from the page list will reduce the page count but not the
 95 | file size. In order to exclude the content completely, the pages should not be
 96 | added to the PDF using the PdfWriter.append() function. Instead, only the
 97 | desired pages should be selected for inclusion
 98 | (note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page
 99 | deletion feature).
100 | 
101 | There can be issues with poor PDF formatting, such as when all pages are linked
102 | to the same resource. In such cases, dropping references to specific pages
103 | becomes useless because there is only one source for all pages.
104 | 
105 | Cropping is an ineffective method for reducing the file size because it only
106 | adjusts the viewboxes and not the external parts of the source image. Therefore,
107 | the content that is no longer visible will still be present in the PDF.
108 | 
109 | ## Going Further
110 | 
111 | The presentation [Putting a Squeeze on Your PDF](https://youtube.com/watch?v=tgOABUhVwFs) has other suggestions. One takeaway is that most of the significant size optimizations usually come from image and font modification. However, font optimization, such as replacing, merging, and subsetting, is not within the functionality of pypdf at the moment.
112 | 


--------------------------------------------------------------------------------
/docs/user/free-text-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/free-text-annotation.png


--------------------------------------------------------------------------------
/docs/user/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | There are several ways to install pypdf. The most common option is to use pip.
 4 | 
 5 | ## pip
 6 | 
 7 | pypdf requires Python 3.8+ to run.
 8 | 
 9 | Typically Python comes with `pip`, a package installer. Using it you can
10 | install pypdf:
11 | 
12 | ```bash
13 | pip install pypdf
14 | ```
15 | 
16 | If you are not a super-user (a system administrator / root), you can also just
17 | install pypdf for your current user:
18 | 
19 | ```bash
20 | pip install --user pypdf
21 | ```
22 | 
23 | ### Optional dependencies
24 | 
25 | pypdf tries to be as self-contained as possible, but for some tasks the amount
26 | of work to properly maintain the code would be too high. This is especially the
27 | case for cryptography and image formats.
28 | 
29 | If you simply want to install all optional dependencies, run:
30 | 
31 | ```
32 | pip install pypdf[full]
33 | ```
34 | 
35 | Alternatively, you can install just some:
36 | 
37 | If you plan to use pypdf for encrypting or decrypting PDFs that use AES, you
38 | will need to install some extra dependencies. Encryption using RC4 is supported
39 | using the regular installation.
40 | 
41 | ```
42 | pip install pypdf[crypto]
43 | ```
44 | 
45 | If you plan to use image extraction, you need Pillow:
46 | 
47 | ```
48 | pip install pypdf[image]
49 | ```
50 | 
51 | For JBIG2 support, you need to install a global OS-level package as well:
52 | [`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure
53 | depends on your operating system. For Ubuntu, for example, just use the following:
54 | 
55 | ```
56 | sudo apt-get install jbig2dec
57 | ```
58 | 
59 | ## Python Version Support
60 | 
61 | Since pypdf 4.0, every release, including point releases, should work with all
62 | supported versions of [Python](https://devguide.python.org/versions/). Thus
63 | every point release is designed to work with all existing Python versions,
64 | excluding end-of-life versions.
65 | 
66 | Previous versions of pypdf support the following versions of Python:
67 | 
68 | | Python                 | 3.11 | 3.10 | 3.9 | 3.8 | 3.7 | 3.6 | 2.7 |
69 | | ---------------------- |:----:|:----:|:---:|:---:|:---:|:---:|:---:|
70 | | pypdf 3.x              | ✅   | ✅  | ✅ | ✅  | ✅  | ✅ | ❌ |
71 | | PyPDF2 >= 2.0          | ✅   | ✅  | ✅ | ✅  | ✅  | ✅ | ❌ |
72 | | PyPDF2 1.20.0 - 1.28.4 | ❌   | ✅  | ✅ | ✅  | ✅  | ✅ | ✅ |
73 | | PyPDF2 1.15.0 - 1.20.0 | ❌   | ❌  | ❌ | ❌  | ❌  | ❌ | ✅ |
74 | 
75 | 
76 | ## Anaconda
77 | 
78 | Anaconda users can [install pypdf via conda-forge](https://anaconda.org/conda-forge/pypdf).
79 | 
80 | 
81 | ## Development Version
82 | 
83 | In case you want to use the current version under development:
84 | 
85 | ```bash
86 | pip install git+https://github.com/py-pdf/pypdf.git
87 | ```
88 | 


--------------------------------------------------------------------------------
/docs/user/merge-45-deg-rot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-45-deg-rot.png


--------------------------------------------------------------------------------
/docs/user/merge-rotate-expand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-rotate-expand.png


--------------------------------------------------------------------------------
/docs/user/merge-translated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/merge-translated.png


--------------------------------------------------------------------------------
/docs/user/metadata.md:
--------------------------------------------------------------------------------
  1 | # Metadata
  2 | 
  3 | ## Reading metadata
  4 | 
  5 | ```python
  6 | from pypdf import PdfReader
  7 | 
  8 | reader = PdfReader("example.pdf")
  9 | 
 10 | meta = reader.metadata
 11 | 
 12 | # All of the following could be None!
 13 | print(meta.title)
 14 | print(meta.author)
 15 | print(meta.subject)
 16 | print(meta.creator)
 17 | print(meta.producer)
 18 | print(meta.creation_date)
 19 | print(meta.modification_date)
 20 | ```
 21 | 
 22 | ## Writing metadata
 23 | 
 24 | ```python
 25 | from datetime import datetime
 26 | from pypdf import PdfReader, PdfWriter
 27 | 
 28 | reader = PdfReader("example.pdf")
 29 | writer = PdfWriter()
 30 | 
 31 | # Add all pages to the writer
 32 | for page in reader.pages:
 33 |     writer.add_page(page)
 34 | 
 35 | # If you want to add the old metadata, include these two lines
 36 | if reader.metadata is not None:
 37 |     writer.add_metadata(reader.metadata)
 38 | 
 39 | # Format the current date and time for the metadata
 40 | utc_time = "-05'00'"  # UTC time optional
 41 | time = datetime.now().strftime(f"D\072%Y%m%d%H%M%S{utc_time}")
 42 | 
 43 | # Add the new metadata
 44 | writer.add_metadata(
 45 |     {
 46 |         "/Author": "Martin",
 47 |         "/Producer": "Libre Writer",
 48 |         "/Title": "Title",
 49 |         "/Subject": "Subject",
 50 |         "/Keywords": "Keywords",
 51 |         "/CreationDate": time,
 52 |         "/ModDate": time,
 53 |         "/Creator": "Creator",
 54 |         "/CustomField": "CustomField",
 55 |     }
 56 | )
 57 | 
 58 | # Save the new PDF to a file
 59 | with open("meta-pdf.pdf", "wb") as f:
 60 |     writer.write(f)
 61 | ```
 62 | 
 63 | ## Updating metadata
 64 | 
 65 | ```python
 66 | from pypdf import PdfWriter
 67 | 
 68 | writer = PdfWriter(clone_from="example.pdf")
 69 | 
 70 | # Change some values
 71 | writer.add_metadata(
 72 |     {
 73 |         "/Author": "Martin",
 74 |         "/Producer": "Libre Writer",
 75 |         "/Title": "Title",
 76 |     }
 77 | )
 78 | 
 79 | # Clear all data but keep the entry in PDF
 80 | writer.metadata = {}
 81 | 
 82 | # Replace all entries with new set of entries
 83 | writer.metadata = {
 84 |     "/Author": "Martin",
 85 |     "/Producer": "Libre Writer",
 86 | }
 87 | 
 88 | # Save the new PDF to a file
 89 | with open("meta-pdf.pdf", "wb") as f:
 90 |     writer.write(f)
 91 | ```
 92 | 
 93 | ## Removing metadata entry
 94 | 
 95 | ```python
 96 | from pypdf import PdfWriter
 97 | 
 98 | writer = PdfWriter("example.pdf")
 99 | 
100 | # Remove Metadata (/Info entry)
101 | writer.metadata = None
102 | 
103 | # Save the new PDF to a file
104 | with open("meta-pdf.pdf", "wb") as f:
105 |     writer.write(f)
106 | ```
107 | 


--------------------------------------------------------------------------------
/docs/user/nup-dest1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-dest1.png


--------------------------------------------------------------------------------
/docs/user/nup-dest2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-dest2.png


--------------------------------------------------------------------------------
/docs/user/nup-source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/nup-source.png


--------------------------------------------------------------------------------
/docs/user/page-stamped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/page-stamped.png


--------------------------------------------------------------------------------
/docs/user/page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/page.png


--------------------------------------------------------------------------------
/docs/user/pdf-version-support.md:
--------------------------------------------------------------------------------
 1 | # PDF Version Support
 2 | 
 3 | PDF comes in the following versions:
 4 | 
 5 | * 1993: 1.0
 6 | * 1994: 1.1
 7 | * 1996: 1.2
 8 | * 1999: 1.3
 9 | * 2001: 1.4
10 | * 2003: 1.5
11 | * 2004: 1.6
12 | * 2008: 1.7, ISO 32000-1:2008
13 | * 2017: 2.0, ISO 32000-2:2017
14 | 
15 | The general format didn't change, but new features got added. pypdf may be
16 | able to do the operations you want on PDF 2.0 files without fully supporting
17 | all features of PDF 2.0.
18 | 
19 | ## PDF Feature Support by pypdf
20 | 
21 | | Feature                                 | PDF Version | pypdf Support  |
22 | | --------------------------------------- |:-----------:|:--------------:|
23 | | CMaps                                   | 1.4         | ✅             |
24 | | Transparent Graphics                    | 1.4         | ✅             |
25 | | Content Stream Compression              | 1.5         | ✅             |
26 | | Cross-reference Streams                 | 1.5         | ❓             |
27 | | Object Streams                          | 1.5         | ✅             |
28 | | Optional Content Groups (OCGs)          | 1.5         | ❓             |
29 | | AES Encryption                          | 1.6         | ✅             |
30 | 
31 | See [History of PDF](https://en.wikipedia.org/wiki/History_of_PDF) for more
32 | features.
33 | 
34 | Some PDF features are not supported by pypdf, but other libraries can be used
35 | for them:
36 | 
37 | * [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF ([#302](https://github.com/py-pdf/pypdf/issues/302))
38 | * [camelot-py](https://pypi.org/project/camelot-py/): Table Extraction ([#231](https://github.com/py-pdf/pypdf/issues/231))
39 | 


--------------------------------------------------------------------------------
/docs/user/plain-merge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/plain-merge.png


--------------------------------------------------------------------------------
/docs/user/post-processing-in-text-extraction.md:
--------------------------------------------------------------------------------
  1 | # Post-Processing of Text Extraction
  2 | 
  3 | Post-processing can recognizably improve the results of text extraction. It is,
  4 | however, outside of the scope of pypdf itself. Hence the library will not give
  5 | any direct support for it. It is a natural language processing (NLP) task.
  6 | 
  7 | This page lists a few examples of what can be done as well as a community recipe
  8 | that can be used as a general purpose post-processing step. If you know more
  9 | about the specific domain of your documents, e.g. the language, it is likely
 10 | that you can find custom solutions that work better in your context.
 11 | 
 12 | ## Ligature Replacement
 13 | 
 14 | ```python
 15 | def replace_ligatures(text: str) -> str:
 16 |     ligatures = {
 17 |         "ﬀ": "ff",
 18 |         "ﬁ": "fi",
 19 |         "ﬂ": "fl",
 20 |         "ﬃ": "ffi",
 21 |         "ﬄ": "ffl",
 22 |         "ﬅ": "ft",
 23 |         "ﬆ": "st",
 24 |         # "Ꜳ": "AA",
 25 |         # "Æ": "AE",
 26 |         "ꜳ": "aa",
 27 |     }
 28 |     for search, replace in ligatures.items():
 29 |         text = text.replace(search, replace)
 30 |     return text
 31 | ```
 32 | 
 33 | ## Dehyphenation
 34 | 
 35 | Hyphens are used to break words up so that the appearance of the page is nicer.
 36 | 
 37 | ```python
 38 | from typing import List
 39 | 
 40 | 
 41 | def remove_hyphens(text: str) -> str:
 42 |     """
 43 | 
 44 |     This fails for:
 45 |     * Natural dashes: well-known, self-replication, use-cases, non-semantic,
 46 |                       Post-processing, Window-wise, viewpoint-dependent
 47 |     * Trailing math operands: 2 - 4
 48 |     * Names: Lopez-Ferreras, VGG-19, CIFAR-100
 49 |     """
 50 |     lines = [line.rstrip() for line in text.split("\n")]
 51 | 
 52 |     # Find dashes
 53 |     line_numbers = []
 54 |     for line_no, line in enumerate(lines[:-1]):
 55 |         if line.endswith("-"):
 56 |             line_numbers.append(line_no)
 57 | 
 58 |     # Replace
 59 |     for line_no in line_numbers:
 60 |         lines = dehyphenate(lines, line_no)
 61 | 
 62 |     return "\n".join(lines)
 63 | 
 64 | 
 65 | def dehyphenate(lines: List[str], line_no: int) -> List[str]:
 66 |     next_line = lines[line_no + 1]
 67 |     word_suffix = next_line.split(" ")[0]
 68 | 
 69 |     lines[line_no] = lines[line_no][:-1] + word_suffix
 70 |     lines[line_no + 1] = lines[line_no + 1][len(word_suffix) :]
 71 |     return lines
 72 | ```
 73 | 
 74 | ## Header/Footer Removal
 75 | 
 76 | The following header/footer removal has several drawbacks:
 77 | 
 78 | * False-positives, e.g. for the first page when there is a date like 2024.
 79 | * False-negatives in many cases:
 80 |     * Dynamic part, e.g. page label is in the header.
 81 |     * Even/odd pages have different headers.
 82 |     * Some pages, e.g. the first one or chapter pages, do not have a header.
 83 | 
 84 | ```python
 85 | def remove_footer(extracted_texts: list[str], page_labels: list[str]):
 86 |     def remove_page_labels(extracted_texts, page_labels):
 87 |         processed = []
 88 |         for text, label in zip(extracted_texts, page_labels):
 89 |             text_left = text.lstrip()
 90 |             if text_left.startswith(label):
 91 |                 text = text_left[len(label) :]
 92 | 
 93 |             text_right = text.rstrip()
 94 |             if text_right.endswith(label):
 95 |                 text = text_right[: -len(label)]
 96 | 
 97 |             processed.append(text)
 98 |         return processed
 99 | 
100 |     extracted_texts = remove_page_labels(extracted_texts, page_labels)
101 |     return extracted_texts
102 | ```
103 | 
104 | ## Other ideas
105 | 
106 | * Whitespaces in units: There should be a space between a number and its unit.
107 |   ([source](https://tex.stackexchange.com/questions/20962/should-i-put-a-space-between-a-number-and-its-unit)).
108 |   That means: 42 ms, 42 GHz, 42 GB.
109 | * Percent: English style guides prescribe writing the percent sign following the number without any space between (e.g. 50%).
110 | * Whitespaces before dots: Should typically be removed.
111 | * Whitespaces after dots: Should typically be added.
112 | 


--------------------------------------------------------------------------------
/docs/user/reading-pdf-annotations.md:
--------------------------------------------------------------------------------
 1 | # Reading PDF Annotations
 2 | 
 3 | PDF 2.0 defines the following annotation types:
 4 | 
 5 | * Text
 6 | * Link
 7 | * FreeText
 8 | * Line
 9 | * Square
10 | * Circle
11 | * Polygon
12 | * PolyLine
13 | * Highlight
14 | * Underline
15 | * Squiggly
16 | * StrikeOut
17 | * Caret
18 | * Stamp
19 | * Ink
20 | * Popup
21 | * FileAttachment
22 | * Sound
23 | * Movie
24 | * Screen
25 | * Widget
26 | * PrinterMark
27 | * TrapNet
28 | * Watermark
29 | * 3D
30 | * Redact
31 | * Projection
32 | * RichMedia
33 | 
34 | In general, annotations can be read like this:
35 | 
36 | ```python
37 | from pypdf import PdfReader
38 | 
39 | reader = PdfReader("annotated.pdf")
40 | 
41 | for page in reader.pages:
42 |     if "/Annots" in page:
43 |         for annotation in page["/Annots"]:
44 |             obj = annotation.get_object()
45 |             print({"subtype": obj["/Subtype"], "location": obj["/Rect"]})
46 | ```
47 | 
48 | Examples of reading three of the most common annotations:
49 | 
50 | ## Text
51 | 
52 | ```python
53 | from pypdf import PdfReader
54 | 
55 | reader = PdfReader("example.pdf")
56 | 
57 | for page in reader.pages:
58 |     if "/Annots" in page:
59 |         for annotation in page["/Annots"]:
60 |             subtype = annotation.get_object()["/Subtype"]
61 |             if subtype == "/Text":
62 |                 print(annotation.get_object()["/Contents"])
63 | ```
64 | 
65 | ## Highlights
66 | 
67 | ```python
68 | from pypdf import PdfReader
69 | 
70 | reader = PdfReader("example.pdf")
71 | 
72 | for page in reader.pages:
73 |     if "/Annots" in page:
74 |         for annotation in page["/Annots"]:
75 |             subtype = annotation.get_object()["/Subtype"]
76 |             if subtype == "/Highlight":
77 |                 coords = annotation.get_object()["/QuadPoints"]
78 |                 x1, y1, x2, y2, x3, y3, x4, y4 = coords
79 | ```
80 | 
81 | ## Attachments
82 | 
83 | ```python
84 | from pypdf import PdfReader
85 | 
86 | reader = PdfReader("example.pdf")
87 | 
88 | attachments = {}
89 | for page in reader.pages:
90 |     if "/Annots" in page:
91 |         for annotation in page["/Annots"]:
92 |             subtype = annotation.get_object()["/Subtype"]
93 |             if subtype == "/FileAttachment":
94 |                 fileobj = annotation.get_object()["/FS"]
95 |                 attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data()
96 | ```
97 | 


--------------------------------------------------------------------------------
/docs/user/robustness.md:
--------------------------------------------------------------------------------
 1 | # Robustness and strict=False
 2 | 
 3 | PDF is [specified in various versions](https://pdfa.org/resource/pdf-specification-archive/).
 4 | The specification of PDF 2.0 has 1003 pages. This length makes it hard to get
 5 | everything right. As a consequence, a lot of PDF files are not strictly following the
 6 | specification.
 7 | 
 8 | If a PDF file does not follow the specification, it is not always possible to
 9 | be certain what the intended effect would be. Think of the following broken
10 | Python code as an example:
11 | 
12 | ```python
13 | # Broken
14 | function (foo, bar):
15 | 
16 | # Potentially intended:
17 | def function(foo, bar):
18 |     ...
19 | 
20 | # Also possible:
21 | function = (foo, bar)
22 | ```
23 | 
24 | When writing a parser, you can go two paths: Either you try to be forgiving and try
25 | to figure out what the user intended, or you are strict and just tell the user
26 | that they should fix their stuff.
27 | 
28 | pypdf gives you the option to be strict or not.
29 | 
30 | pypdf has two core objects:
31 | 
32 | * {class}`~pypdf.PdfReader`
33 | * {class}`~pypdf.PdfWriter`
34 | 
35 | Only the PdfReader has a `strict` parameter, since presumably you do not want
36 | to write a non-conforming PDF.
37 | 
38 | Choosing `strict=True` means that pypdf will raise an exception if a PDF does
39 | not follow the specification.
40 | 
41 | Choosing `strict=False` means that pypdf will try to be forgiving and do
42 | something reasonable, but it will log a warning message. It is a best-effort
43 | approach.
44 | 


--------------------------------------------------------------------------------
/docs/user/scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/scaling.png


--------------------------------------------------------------------------------
/docs/user/stamp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/stamp.png


--------------------------------------------------------------------------------
/docs/user/streaming-data.md:
--------------------------------------------------------------------------------
 1 | # Streaming Data with pypdf
 2 | 
 3 | In some cases you might want to avoid saving things explicitly as a file
 4 | to disk, e.g. when you want to store the PDF in a database or AWS S3.
 5 | 
 6 | pypdf supports streaming data to a file-like object:
 7 | 
 8 | ```python
 9 | from io import BytesIO
10 | 
11 | # Prepare example
12 | with open("example.pdf", "rb") as fh:
13 |     bytes_stream = BytesIO(fh.read())
14 | 
15 | # Read from bytes_stream
16 | reader = PdfReader(bytes_stream)
17 | 
18 | # Write to bytes_stream
19 | writer = PdfWriter()
20 | with BytesIO() as bytes_stream:
21 |     writer.write(bytes_stream)
22 | ```
23 | 
24 | ## Writing a PDF directly to AWS S3
25 | 
26 | Suppose you want to manipulate a PDF and write it directly to AWS S3 without having
27 | to write the document to a file first. We have the original PDF in `raw_bytes_data` as `bytes`
28 | and want to set `my-secret-password`:
29 | 
30 | ```python
31 | from io import BytesIO
32 | 
33 | import boto3
34 | from pypdf import PdfReader, PdfWriter
35 | 
36 | 
37 | reader = PdfReader(BytesIO(raw_bytes_data))
38 | writer = PdfWriter()
39 | 
40 | # Add all pages to the writer
41 | for page in reader.pages:
42 |     writer.add_page(page)
43 | 
44 | # Add a password to the new PDF
45 | writer.encrypt("my-secret-password")
46 | 
47 | # Write the new PDF to an in-memory stream and pass it on to S3
48 | with BytesIO() as bytes_stream:
49 |     writer.write(bytes_stream)
50 |     bytes_stream.seek(0)
51 |     s3 = boto3.client("s3")
52 |     s3.write_get_object_response(
53 |         Body=bytes_stream, RequestRoute=request_route, RequestToken=request_token
54 |     )
55 | ```
56 | 
57 | ## Reading PDFs directly from cloud services
58 | 
59 | One option is to first download the file and then pass the local file path to `PdfReader`.
60 | Another option is to get a byte stream.
61 | 
62 | For AWS S3 it works like this:
63 | 
64 | ```python
65 | from io import BytesIO
66 | 
67 | import boto3
68 | from pypdf import PdfReader
69 | 
70 | 
71 | s3 = boto3.client("s3")
72 | obj = s3.get_object(Bucket="my-bucket", Key="my/doc.pdf")
73 | reader = PdfReader(BytesIO(obj["Body"].read()))
74 | ```
75 | 
76 | To use with Google Cloud storage:
77 | 
78 | ```python
79 | from io import BytesIO
80 | 
81 | from google.cloud import storage
82 | from pypdf import PdfReader
83 | # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] must be set
84 | storage_client = storage.Client()
85 | blob = storage_client.bucket("my-bucket").blob("mydoc.pdf")
86 | file_stream = BytesIO()
87 | blob.download_to_file(file_stream)
88 | reader = PdfReader(file_stream)
89 | ```
90 | 


--------------------------------------------------------------------------------
/docs/user/suppress-warnings.md:
--------------------------------------------------------------------------------
 1 | # Exceptions, Warnings, and Log messages
 2 | 
 3 | pypdf makes use of three mechanisms to show if something went wrong:
 4 | 
 5 | * **Exceptions** are error cases that pypdf users should explicitly handle.
 6 |   In the `strict=True` mode, most log messages with the warning level will
 7 |   become exceptions. This can be useful in applications where you can require
 8 |   a user to fix the broken PDF.
 9 | * **Warnings** are avoidable issues, such as using deprecated classes /
10 |   functions / parameters. Another example is missing capabilities of pypdf.
11 |   In those cases, pypdf users should adjust their code. Warnings
12 |   are issued by the `warnings` module - those are different from the log-level
13 |   "warning".
14 | * **Log messages** are informative messages that can be used for post-mortem
15 |   analysis. Most of the time, users can ignore them. They come in different
16 |   *levels*, such as info / warning / error indicating the severity.
17 |   Examples are non-standard compliant PDF files which pypdf can deal with or
18 |   a missing implementation that leads to a part of the text not being extracted.
19 | 
20 | 
21 | ## Exceptions
22 | 
23 | Exceptions need to be caught if you want to handle them. For example, you might
24 | want to read the text from a PDF as a part of a search function.
25 | 
26 | Most PDF files do not follow the specification. In these cases, pypdf needs to
27 | guess which kinds of mistakes were made when the PDF file was created.
28 | See [the robustness page](robustness.md) for the related issues.
29 | 
30 | As a user, you likely do not care about it. If it is readable in any way, you
31 | want the text. You might use pdfminer.six as a fallback and do this:
32 | 
33 | ```python
34 | from pypdf import PdfReader
35 | from pdfminer.high_level import extract_text as fallback_text_extraction
36 | 
37 | text = ""
38 | try:
39 |     reader = PdfReader("example.pdf")
40 |     for page in reader.pages:
41 |         text += page.extract_text()
42 | except Exception as exc:
43 |     text = fallback_text_extraction("example.pdf")
44 | ```
45 | 
46 | You could also capture [`pypdf.errors.PyPdfError`](https://github.com/py-pdf/pypdf/blob/main/pypdf/errors.py)
47 | if you prefer something more specific.
48 | 
49 | ## Warnings
50 | 
51 | The [`warnings` module](https://docs.python.org/3/library/warnings.html) allows
52 | you to ignore warnings:
53 | 
54 | ```python
55 | import warnings
56 | 
57 | warnings.filterwarnings("ignore")
58 | ```
59 | 
60 | In many cases, you actually want to start Python with the `-W` flag (for example
61 | `python -W default`) so that you see all warnings. This is especially true for Continuous Integration (CI).
62 | 
63 | ## Log messages
64 | 
65 | Log messages can be noisy in some cases. pypdf hopefully has a reasonable
66 | level of log messages, but you can reduce which types of messages you want to
67 | see:
68 | 
69 | ```python
70 | import logging
71 | 
72 | logger = logging.getLogger("pypdf")
73 | logger.setLevel(logging.ERROR)
74 | ```
75 | 
76 | The [`logging` module](https://docs.python.org/3/library/logging.html#logging-levels)
77 | defines six log levels:
78 | 
79 | * CRITICAL
80 | * ERROR
81 | * WARNING
82 | * INFO
83 | * DEBUG
84 | * NOTSET
85 | 


--------------------------------------------------------------------------------
/docs/user/text-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/text-annotation.png


--------------------------------------------------------------------------------
/docs/user/viewer-preferences.md:
--------------------------------------------------------------------------------
 1 | # Adding Viewer Preferences
 2 | 
 3 | It is possible to set viewer preferences of a PDF file. They are described in
 4 | §12.2 of the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf).
 5 | 
 6 | Note that the `/ViewerPreferences` dictionary does not exist by default.
 7 | If it is not already present, it must be created by calling the
 8 | {func}`~pypdf.PdfWriter.create_viewer_preferences` method.
 9 | 
10 | If viewer preferences exist in a PDF file being read with {class}`~pypdf.PdfReader`,
11 | you can access them as properties of {attr}`~pypdf.PdfReader.viewer_preferences`.
12 | Otherwise, the {attr}`~pypdf.PdfReader.viewer_preferences` property will be set to `None`.
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from pypdf import PdfWriter
18 | from pypdf.generic import ArrayObject, NumberObject
19 | 
20 | writer = PdfWriter()
21 | 
22 | writer.create_viewer_preferences()
23 | 
24 | # /HideToolbar
25 | writer.viewer_preferences.hide_toolbar = True
26 | # /HideMenubar
27 | writer.viewer_preferences.hide_menubar = True
28 | # /HideWindowUI
29 | writer.viewer_preferences.hide_windowui = True
30 | # /FitWindow
31 | writer.viewer_preferences.fit_window = True
32 | # /CenterWindow
33 | writer.viewer_preferences.center_window = True
34 | # /DisplayDocTitle
35 | writer.viewer_preferences.display_doctitle = True
36 | 
37 | # /NonFullScreenPageMode
38 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseNone"  # default
39 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseOutlines"
40 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseThumbs"
41 | writer.viewer_preferences.non_fullscreen_pagemode = "/UseOC"
42 | 
43 | # /Direction
44 | writer.viewer_preferences.direction = "/L2R"  # default
45 | writer.viewer_preferences.direction = "/R2L"
46 | 
47 | # /ViewArea
48 | writer.viewer_preferences.view_area = "/CropBox"
49 | # /ViewClip
50 | writer.viewer_preferences.view_clip = "/CropBox"
51 | # /PrintArea
52 | writer.viewer_preferences.print_area = "/CropBox"
53 | # /PrintClip
54 | writer.viewer_preferences.print_clip = "/CropBox"
55 | 
56 | # /PrintScaling
57 | writer.viewer_preferences.print_scaling = "/None"
58 | writer.viewer_preferences.print_scaling = "/AppDefault"  # default according to PDF spec
59 | 
60 | # /Duplex
61 | writer.viewer_preferences.duplex = "/Simplex"
62 | writer.viewer_preferences.duplex = "/DuplexFlipShortEdge"
63 | writer.viewer_preferences.duplex = "/DuplexFlipLongEdge"
64 | 
65 | # /PickTrayByPDFSize
66 | writer.viewer_preferences.pick_tray_by_pdfsize = True
67 | # /PrintPageRange
68 | writer.viewer_preferences.print_pagerange = ArrayObject(
69 |     [NumberObject("1"), NumberObject("10"), NumberObject("20"), NumberObject("30")]
70 | )
71 | # /NumCopies
72 | writer.viewer_preferences.num_copies = 2
73 | 
74 | for i in range(40):
75 |     writer.add_blank_page(10, 10)
76 | 
77 | with open("output.pdf", "wb") as output_stream:
78 |     writer.write(output_stream)
79 | ```
80 | 
81 | The names beginning with a slash character are part of the PDF file format. They are
82 | included here to ease searching the pypdf documentation
83 | for these names from the PDF specification.
84 | 


--------------------------------------------------------------------------------
/docs/user/watermark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/docs/user/watermark.png


--------------------------------------------------------------------------------
/pypdf/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | pypdf is a free and open-source pure-python PDF library capable of splitting,
 3 | merging, cropping, and transforming the pages of PDF files. It can also add
 4 | custom data, viewing options, and passwords to PDF files. pypdf can retrieve
 5 | text and metadata from PDFs as well.
 6 | 
 7 | You can read the full docs at https://pypdf.readthedocs.io/.
 8 | """
 9 | 
10 | from ._crypt_providers import crypt_provider
11 | from ._doc_common import DocumentInformation
12 | from ._encryption import PasswordType
13 | from ._merger import PdfMerger
14 | from ._page import PageObject, Transformation, mult
15 | from ._reader import PdfReader
16 | from ._version import __version__
17 | from ._writer import ObjectDeletionFlag, PdfWriter
18 | from .constants import ImageType
19 | from .pagerange import PageRange, parse_filename_page_ranges
20 | from .papersizes import PaperSize
21 | 
22 | try:
23 |     import PIL
24 | 
25 |     pil_version = PIL.__version__
26 | except ImportError:
27 |     pil_version = "none"
28 | 
29 | _debug_versions = (
30 |     f"pypdf=={__version__}, {crypt_provider=}, PIL={pil_version}"
31 | )
32 | 
33 | __all__ = [
34 |     "DocumentInformation",
35 |     "ImageType",
36 |     "ObjectDeletionFlag",
37 |     "PageObject",
38 |     "PageRange",
39 |     "PaperSize",
40 |     "PasswordType",
41 |     "PdfMerger",
42 |     "PdfReader",
43 |     "PdfWriter",
44 |     "Transformation",
45 |     "__version__",
46 |     "_debug_versions",
47 |     "mult",
48 |     "parse_filename_page_ranges",
49 | ]
50 | 


--------------------------------------------------------------------------------
/pypdf/_codecs/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List
 2 | 
 3 | from .adobe_glyphs import adobe_glyphs
 4 | from .pdfdoc import _pdfdoc_encoding
 5 | from .std import _std_encoding
 6 | from .symbol import _symbol_encoding
 7 | from .zapfding import _zapfding_encoding
 8 | 
 9 | 
10 | def fill_from_encoding(enc: str) -> List[str]:
11 |     lst: List[str] = []
12 |     for x in range(256):
13 |         try:
14 |             lst += (bytes((x,)).decode(enc),)
15 |         except Exception:
16 |             lst += (chr(x),)
17 |     return lst
18 | 
19 | 
20 | def rev_encoding(enc: List[str]) -> Dict[str, int]:
21 |     rev: Dict[str, int] = {}
22 |     for i in range(256):
23 |         char = enc[i]
24 |         if char == "\u0000":
25 |             continue
26 |         assert char not in rev, f"{char} at {i} already at {rev[char]}"
27 |         rev[char] = i
28 |     return rev
29 | 
30 | 
31 | _win_encoding = fill_from_encoding("cp1252")
32 | _mac_encoding = fill_from_encoding("mac_roman")
33 | 
34 | 
35 | _win_encoding_rev: Dict[str, int] = rev_encoding(_win_encoding)
36 | _mac_encoding_rev: Dict[str, int] = rev_encoding(_mac_encoding)
37 | _symbol_encoding_rev: Dict[str, int] = rev_encoding(_symbol_encoding)
38 | _zapfding_encoding_rev: Dict[str, int] = rev_encoding(_zapfding_encoding)
39 | _pdfdoc_encoding_rev: Dict[str, int] = rev_encoding(_pdfdoc_encoding)
40 | 
41 | 
42 | charset_encoding: Dict[str, List[str]] = {
43 |     "/StandardEncoding": _std_encoding,
44 |     "/WinAnsiEncoding": _win_encoding,
45 |     "/MacRomanEncoding": _mac_encoding,
46 |     "/PDFDocEncoding": _pdfdoc_encoding,
47 |     "/Symbol": _symbol_encoding,
48 |     "/ZapfDingbats": _zapfding_encoding,
49 | }
50 | 
51 | __all__ = [
52 |     "_mac_encoding",
53 |     "_pdfdoc_encoding",
54 |     "_pdfdoc_encoding_rev",
55 |     "_std_encoding",
56 |     "_symbol_encoding",
57 |     "_win_encoding",
58 |     "_zapfding_encoding",
59 |     "adobe_glyphs",
60 |     "charset_encoding",
61 | ]
62 | 


--------------------------------------------------------------------------------
/pypdf/_codecs/std.py:
--------------------------------------------------------------------------------
  1 | _std_encoding = [
  2 |     "\x00",
  3 |     "\x01",
  4 |     "\x02",
  5 |     "\x03",
  6 |     "\x04",
  7 |     "\x05",
  8 |     "\x06",
  9 |     "\x07",
 10 |     "\x08",
 11 |     "\t",
 12 |     "\n",
 13 |     "\x0b",
 14 |     "\x0c",
 15 |     "\r",
 16 |     "\x0e",
 17 |     "\x0f",
 18 |     "\x10",
 19 |     "\x11",
 20 |     "\x12",
 21 |     "\x13",
 22 |     "\x14",
 23 |     "\x15",
 24 |     "\x16",
 25 |     "\x17",
 26 |     "\x18",
 27 |     "\x19",
 28 |     "\x1a",
 29 |     "\x1b",
 30 |     "\x1c",
 31 |     "\x1d",
 32 |     "\x1e",
 33 |     "\x1f",
 34 |     " ",
 35 |     "!",
 36 |     '"',
 37 |     "#",
 38 |     "$",
 39 |     "%",
 40 |     "&",
 41 |     "’",
 42 |     "(",
 43 |     ")",
 44 |     "*",
 45 |     "+",
 46 |     ",",
 47 |     "-",
 48 |     ".",
 49 |     "/",
 50 |     "0",
 51 |     "1",
 52 |     "2",
 53 |     "3",
 54 |     "4",
 55 |     "5",
 56 |     "6",
 57 |     "7",
 58 |     "8",
 59 |     "9",
 60 |     ":",
 61 |     ";",
 62 |     "<",
 63 |     "=",
 64 |     ">",
 65 |     "?",
 66 |     "@",
 67 |     "A",
 68 |     "B",
 69 |     "C",
 70 |     "D",
 71 |     "E",
 72 |     "F",
 73 |     "G",
 74 |     "H",
 75 |     "I",
 76 |     "J",
 77 |     "K",
 78 |     "L",
 79 |     "M",
 80 |     "N",
 81 |     "O",
 82 |     "P",
 83 |     "Q",
 84 |     "R",
 85 |     "S",
 86 |     "T",
 87 |     "U",
 88 |     "V",
 89 |     "W",
 90 |     "X",
 91 |     "Y",
 92 |     "Z",
 93 |     "[",
 94 |     "\\",
 95 |     "]",
 96 |     "^",
 97 |     "_",
 98 |     "‘",
 99 |     "a",
100 |     "b",
101 |     "c",
102 |     "d",
103 |     "e",
104 |     "f",
105 |     "g",
106 |     "h",
107 |     "i",
108 |     "j",
109 |     "k",
110 |     "l",
111 |     "m",
112 |     "n",
113 |     "o",
114 |     "p",
115 |     "q",
116 |     "r",
117 |     "s",
118 |     "t",
119 |     "u",
120 |     "v",
121 |     "w",
122 |     "x",
123 |     "y",
124 |     "z",
125 |     "{",
126 |     "|",
127 |     "}",
128 |     "~",
129 |     "\x7f",
130 |     "\x80",
131 |     "\x81",
132 |     "\x82",
133 |     "\x83",
134 |     "\x84",
135 |     "\x85",
136 |     "\x86",
137 |     "\x87",
138 |     "\x88",
139 |     "\x89",
140 |     "\x8a",
141 |     "\x8b",
142 |     "\x8c",
143 |     "\x8d",
144 |     "\x8e",
145 |     "\x8f",
146 |     "\x90",
147 |     "\x91",
148 |     "\x92",
149 |     "\x93",
150 |     "\x94",
151 |     "\x95",
152 |     "\x96",
153 |     "\x97",
154 |     "\x98",
155 |     "\x99",
156 |     "\x9a",
157 |     "\x9b",
158 |     "\x9c",
159 |     "\x9d",
160 |     "\x9e",
161 |     "\x9f",
162 |     "\xa0",
163 |     "¡",
164 |     "¢",
165 |     "£",
166 |     "⁄",
167 |     "¥",
168 |     "ƒ",
169 |     "§",
170 |     "¤",
171 |     "'",
172 |     "“",
173 |     "«",
174 |     "‹",
175 |     "›",
176 |     "ﬁ",
177 |     "ﬂ",
178 |     "°",
179 |     "–",
180 |     "†",
181 |     "‡",
182 |     "·",
183 |     "µ",
184 |     "¶",
185 |     "•",
186 |     "‚",
187 |     "„",
188 |     "”",
189 |     "»",
190 |     "…",
191 |     "‰",
192 |     "¾",
193 |     "¿",
194 |     "À",
195 |     "`",
196 |     "´",
197 |     "ˆ",
198 |     "˜",
199 |     "¯",
200 |     "˘",
201 |     "˙",
202 |     "¨",
203 |     "É",
204 |     "˚",
205 |     "¸",
206 |     "Ì",
207 |     "˝",
208 |     "˛",
209 |     "ˇ",
210 |     "—",
211 |     "Ñ",
212 |     "Ò",
213 |     "Ó",
214 |     "Ô",
215 |     "Õ",
216 |     "Ö",
217 |     "×",
218 |     "Ø",
219 |     "Ù",
220 |     "Ú",
221 |     "Û",
222 |     "Ü",
223 |     "Ý",
224 |     "Þ",
225 |     "ß",
226 |     "à",
227 |     "Æ",
228 |     "â",
229 |     "ª",
230 |     "ä",
231 |     "å",
232 |     "æ",
233 |     "ç",
234 |     "Ł",
235 |     "Ø",
236 |     "Œ",
237 |     "º",
238 |     "ì",
239 |     "í",
240 |     "î",
241 |     "ï",
242 |     "ð",
243 |     "æ",
244 |     "ò",
245 |     "ó",
246 |     "ô",
247 |     "ı",
248 |     "ö",
249 |     "÷",
250 |     "ł",
251 |     "ø",
252 |     "œ",
253 |     "ß",
254 |     "ü",
255 |     "ý",
256 |     "þ",
257 |     "ÿ",
258 | ]
259 | 


--------------------------------------------------------------------------------
/pypdf/_crypt_providers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, exiledkingcc
 2 | # All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | # this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | # * The name of the author may not be used to endorse or promote products
14 | # derived from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | # POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | from pypdf._crypt_providers._base import CryptBase, CryptIdentity
29 | 
30 | try:
31 |     from pypdf._crypt_providers._cryptography import (
32 |         CryptAES,
33 |         CryptRC4,
34 |         aes_cbc_decrypt,
35 |         aes_cbc_encrypt,
36 |         aes_ecb_decrypt,
37 |         aes_ecb_encrypt,
38 |         crypt_provider,
39 |         rc4_decrypt,
40 |         rc4_encrypt,
41 |     )
42 |     from pypdf._utils import Version
43 | 
44 |     if Version(crypt_provider[1]) <= Version("3.0"):
45 |         # This is due to the backend parameter being required back then:
46 |         # https://cryptography.io/en/latest/changelog/#v3-1
47 |         raise ImportError("cryptography<=3.0 is not supported")  # pragma: no cover
48 | except ImportError:
49 |     try:
50 |         from pypdf._crypt_providers._pycryptodome import (  # type: ignore
51 |             CryptAES,
52 |             CryptRC4,
53 |             aes_cbc_decrypt,
54 |             aes_cbc_encrypt,
55 |             aes_ecb_decrypt,
56 |             aes_ecb_encrypt,
57 |             crypt_provider,
58 |             rc4_decrypt,
59 |             rc4_encrypt,
60 |         )
61 |     except ImportError:
62 |         from pypdf._crypt_providers._fallback import (  # type: ignore
63 |             CryptAES,
64 |             CryptRC4,
65 |             aes_cbc_decrypt,
66 |             aes_cbc_encrypt,
67 |             aes_ecb_decrypt,
68 |             aes_ecb_encrypt,
69 |             crypt_provider,
70 |             rc4_decrypt,
71 |             rc4_encrypt,
72 |         )
73 | 
74 | __all__ = [
75 |     "CryptAES",
76 |     "CryptBase",
77 |     "CryptIdentity",
78 |     "CryptRC4",
79 |     "aes_cbc_decrypt",
80 |     "aes_cbc_encrypt",
81 |     "aes_ecb_decrypt",
82 |     "aes_ecb_encrypt",
83 |     "crypt_provider",
84 |     "rc4_decrypt",
85 |     "rc4_encrypt",
86 | ]
87 | 


--------------------------------------------------------------------------------
/pypdf/_crypt_providers/_base.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, exiledkingcc
 2 | # All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | # this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | # * The name of the author may not be used to endorse or promote products
14 | # derived from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | # POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 
29 | class CryptBase:  # base cipher interface; both methods here return the input unchanged
30 |     def encrypt(self, data: bytes) -> bytes:  # pragma: no cover
31 |         return data
32 | 
33 |     def decrypt(self, data: bytes) -> bytes:  # pragma: no cover
34 |         return data
35 | 
36 | 
37 | class CryptIdentity(CryptBase):  # inherits the pass-through encrypt/decrypt as-is
38 |     pass
39 | 


--------------------------------------------------------------------------------
/pypdf/_crypt_providers/_fallback.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, exiledkingcc
 2 | # All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | # this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | # * The name of the author may not be used to endorse or promote products
14 | # derived from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | # POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | from pypdf._crypt_providers._base import CryptBase
29 | from pypdf.errors import DependencyError
30 | 
31 | _DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"
32 | 
33 | 
34 | crypt_provider = ("local_crypt_fallback", "0.0.0")
35 | 
36 | 
37 | class CryptRC4(CryptBase):
38 |     def __init__(self, key: bytes) -> None:
39 |         self.s = bytearray(range(256))
40 |         j = 0
41 |         for i in range(256):
42 |             j = (j + self.s[i] + key[i % len(key)]) % 256
43 |             self.s[i], self.s[j] = self.s[j], self.s[i]
44 | 
45 |     def encrypt(self, data: bytes) -> bytes:
46 |         s = bytearray(self.s)
47 |         out = [0 for _ in range(len(data))]
48 |         i, j = 0, 0
49 |         for k in range(len(data)):
50 |             i = (i + 1) % 256
51 |             j = (j + s[i]) % 256
52 |             s[i], s[j] = s[j], s[i]
53 |             x = s[(s[i] + s[j]) % 256]
54 |             out[k] = data[k] ^ x
55 |         return bytes(out)
56 | 
57 |     def decrypt(self, data: bytes) -> bytes:
58 |         return self.encrypt(data)
59 | 
60 | 
61 | class CryptAES(CryptBase):  # stub used by the fallback provider: no AES implementation
62 |     def __init__(self, key: bytes) -> None:  # the key is accepted but not stored
63 |         pass
64 | 
65 |     def encrypt(self, data: bytes) -> bytes:  # always raises DependencyError
66 |         raise DependencyError(_DEPENDENCY_ERROR_STR)
67 | 
68 |     def decrypt(self, data: bytes) -> bytes:  # always raises DependencyError
69 |         raise DependencyError(_DEPENDENCY_ERROR_STR)
70 | 
71 | 
72 | def rc4_encrypt(key: bytes, data: bytes) -> bytes:  # one-shot: new CryptRC4 per call
73 |     return CryptRC4(key).encrypt(data)
74 | 
75 | 
76 | def rc4_decrypt(key: bytes, data: bytes) -> bytes:  # one-shot: new CryptRC4 per call
77 |     return CryptRC4(key).decrypt(data)
78 | 
79 | 
80 | def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:  # arguments unused; always raises
81 |     raise DependencyError(_DEPENDENCY_ERROR_STR)
82 | 
83 | 
84 | def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:  # arguments unused; always raises
85 |     raise DependencyError(_DEPENDENCY_ERROR_STR)
86 | 
87 | 
88 | def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:  # always raises
89 |     raise DependencyError(_DEPENDENCY_ERROR_STR)
90 | 
91 | 
92 | def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:  # always raises
93 |     raise DependencyError(_DEPENDENCY_ERROR_STR)
94 | 


--------------------------------------------------------------------------------
/pypdf/_crypt_providers/_pycryptodome.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, exiledkingcc
 2 | # All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | # this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | # * The name of the author may not be used to endorse or promote products
14 | # derived from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | # POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | import secrets
29 | 
30 | from Crypto import __version__
31 | from Crypto.Cipher import AES, ARC4
32 | from Crypto.Util.Padding import pad
33 | 
34 | from pypdf._crypt_providers._base import CryptBase
35 | 
36 | crypt_provider = ("pycryptodome", __version__)
37 | 
38 | 
39 | class CryptRC4(CryptBase):  # RC4 via pycryptodome; only the key is kept on the instance
40 |     def __init__(self, key: bytes) -> None:
41 |         self.key = key
42 | 
43 |     def encrypt(self, data: bytes) -> bytes:  # a new ARC4Cipher is built on every call
44 |         return ARC4.ARC4Cipher(self.key).encrypt(data)
45 | 
46 |     def decrypt(self, data: bytes) -> bytes:  # a new ARC4Cipher is built on every call
47 |         return ARC4.ARC4Cipher(self.key).decrypt(data)
48 | 
49 | 
50 | class CryptAES(CryptBase):
51 |     def __init__(self, key: bytes) -> None:
52 |         self.key = key
53 | 
54 |     def encrypt(self, data: bytes) -> bytes:
55 |         iv = secrets.token_bytes(16)
56 |         data = pad(data, 16)
57 |         aes = AES.new(self.key, AES.MODE_CBC, iv)
58 |         return iv + aes.encrypt(data)
59 | 
60 |     def decrypt(self, data: bytes) -> bytes:
61 |         iv = data[:16]
62 |         data = data[16:]
63 |         # for empty encrypted data
64 |         if not data:
65 |             return data
66 | 
67 |         # just for robustness, it does not happen under normal circumstances
68 |         if len(data) % 16 != 0:
69 |             data = pad(data, 16)
70 | 
71 |         aes = AES.new(self.key, AES.MODE_CBC, iv)
72 |         d = aes.decrypt(data)
73 |         return d[: -d[-1]]
74 | 
75 | 
76 | def rc4_encrypt(key: bytes, data: bytes) -> bytes:  # one-shot: new ARC4Cipher from ``key``
77 |     return ARC4.ARC4Cipher(key).encrypt(data)
78 | 
79 | 
80 | def rc4_decrypt(key: bytes, data: bytes) -> bytes:  # one-shot: new ARC4Cipher from ``key``
81 |     return ARC4.ARC4Cipher(key).decrypt(data)
82 | 
83 | 
84 | def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:  # ECB mode; no padding added here
85 |     return AES.new(key, AES.MODE_ECB).encrypt(data)
86 | 
87 | 
88 | def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:  # ECB mode; no padding removed here
89 |     return AES.new(key, AES.MODE_ECB).decrypt(data)
90 | 
91 | 
92 | def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:  # caller supplies the IV; no padding added
93 |     return AES.new(key, AES.MODE_CBC, iv).encrypt(data)
94 | 
95 | 
96 | def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:  # caller supplies the IV; no padding removed
97 |     return AES.new(key, AES.MODE_CBC, iv).decrypt(data)
98 | 


--------------------------------------------------------------------------------
/pypdf/_merger.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2006, Mathieu Fenniak
 2 | # All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | # this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | # * The name of the author may not be used to endorse or promote products
14 | # derived from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | # POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 
29 | from ._utils import (
30 |     deprecation_with_replacement,
31 | )
32 | 
33 | 
34 | class PdfMerger:
35 |     """
36 |     Use :class:`PdfWriter` instead.
37 | 
38 |     .. deprecated:: 5.0.0
39 |     """
40 |     # The constructor only calls the deprecation helper; no instance state is set up.
41 |     def __init__(self) -> None:
42 |         deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
43 | 


--------------------------------------------------------------------------------
/pypdf/_protocols.py:
--------------------------------------------------------------------------------
 1 | """Protocol classes describing the interfaces of PDF objects, readers, and writers."""
 2 | 
 3 | from abc import abstractmethod
 4 | from pathlib import Path
 5 | from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
 6 | 
 7 | from ._utils import StrByteType, StreamType
 8 | 
 9 | 
10 | class PdfObjectProtocol(Protocol):
11 |     indirect_reference: Any
12 | 
13 |     def clone(
14 |         self,
15 |         pdf_dest: Any,
16 |         force_duplicate: bool = False,
17 |         ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
18 |     ) -> Any:
19 |         ...  # pragma: no cover
20 | 
21 |     def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
22 |         ...  # pragma: no cover
23 | 
24 |     def get_object(self) -> Optional["PdfObjectProtocol"]:
25 |         ...  # pragma: no cover
26 | 
27 |     def hash_value(self) -> bytes:
28 |         ...  # pragma: no cover
29 | 
30 |     def write_to_stream(
31 |         self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
32 |     ) -> None:
33 |         ...  # pragma: no cover
34 | 
35 | 
36 | class XmpInformationProtocol(PdfObjectProtocol):
37 |     pass
38 | 
39 | 
40 | class PdfCommonDocProtocol(Protocol):
41 |     @property
42 |     def pdf_header(self) -> str:
43 |         ...  # pragma: no cover
44 | 
45 |     @property
46 |     def pages(self) -> List[Any]:
47 |         ...  # pragma: no cover
48 | 
49 |     @property
50 |     def root_object(self) -> PdfObjectProtocol:
51 |         ...  # pragma: no cover
52 | 
53 |     def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
54 |         ...  # pragma: no cover
55 | 
56 |     @property
57 |     def strict(self) -> bool:
58 |         ...  # pragma: no cover
59 | 
60 | 
61 | class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
62 |     @property
63 |     @abstractmethod
64 |     def xref(self) -> Dict[int, Dict[int, Any]]:
65 |         ...  # pragma: no cover
66 | 
67 |     @property
68 |     @abstractmethod
69 |     def trailer(self) -> Dict[str, Any]:
70 |         ...  # pragma: no cover
71 | 
72 | 
73 | class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
74 |     _objects: List[Any]
75 |     _id_translated: Dict[int, Dict[int, int]]
76 | 
77 |     incremental: bool
78 |     _reader: Any  # PdfReader
79 | 
80 |     @abstractmethod
81 |     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
82 |         ...  # pragma: no cover
83 | 
84 |     @abstractmethod
85 |     def _add_object(self, obj: Any) -> Any:
86 |         ...  # pragma: no cover
87 | 


--------------------------------------------------------------------------------
/pypdf/_text_extraction/_layout_mode/__init__.py:
--------------------------------------------------------------------------------
 1 | """Layout mode text extraction extension for pypdf"""
 2 | from ._fixed_width_page import (
 3 |     fixed_char_width,
 4 |     fixed_width_page,
 5 |     text_show_operations,
 6 |     y_coordinate_groups,
 7 | )
 8 | from ._font import Font
 9 | 
10 | __all__ = [
11 |     "Font",
12 |     "fixed_char_width",
13 |     "fixed_width_page",
14 |     "text_show_operations",
15 |     "y_coordinate_groups",
16 | ]
17 | 


--------------------------------------------------------------------------------
/pypdf/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "5.6.0"
2 | 


--------------------------------------------------------------------------------
/pypdf/annotations/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | PDF specifies several annotation types which pypdf makes available here.
 3 | 
 4 | The names of the annotations and their attributes do not reflect the names in
 5 | the specification in all cases. For example, the PDF standard defines a
 6 | 'Square' annotation that does not actually need to be square. For this reason,
 7 | pypdf calls it 'Rectangle'.
 8 | 
 9 | At their core, all annotation types are DictionaryObjects. That means if pypdf
10 | does not implement a feature, users can easily extend the given functionality.
11 | """
12 | 
13 | 
14 | from ._base import NO_FLAGS, AnnotationDictionary
15 | from ._markup_annotations import (
16 |     Ellipse,
17 |     FreeText,
18 |     Highlight,
19 |     Line,
20 |     MarkupAnnotation,
21 |     Polygon,
22 |     PolyLine,
23 |     Rectangle,
24 |     Text,
25 | )
26 | from ._non_markup_annotations import Link, Popup
27 | 
28 | __all__ = [
29 |     "NO_FLAGS",
30 |     "AnnotationDictionary",
31 |     "Ellipse",
32 |     "FreeText",
33 |     "Highlight",
34 |     "Line",
35 |     "Link",
36 |     "MarkupAnnotation",
37 |     "PolyLine",
38 |     "Polygon",
39 |     "Popup",
40 |     "Rectangle",
41 |     "Text",
42 | ]
43 | 


--------------------------------------------------------------------------------
/pypdf/annotations/_base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC
 2 | 
 3 | from ..constants import AnnotationFlag
 4 | from ..generic import NameObject, NumberObject
 5 | from ..generic._data_structures import DictionaryObject
 6 | 
 7 | 
 8 | class AnnotationDictionary(DictionaryObject, ABC):
 9 |     def __init__(self) -> None:
10 |         from ..generic._base import NameObject
11 | 
12 |         # /Rect should not be added here as Polygon and PolyLine can automatically set it
13 |         self[NameObject("/Type")] = NameObject("/Annot")
14 |         # The flags were NOT added to the constructor on purpose:
15 |         # We expect that most users don't want to change the default.
16 |         # If they do, they can use the property. The default is 0.
17 | 
18 |     @property
19 |     def flags(self) -> AnnotationFlag:
20 |         return self.get(NameObject("/F"), AnnotationFlag(0))
21 | 
22 |     @flags.setter
23 |     def flags(self, value: AnnotationFlag) -> None:
24 |         self[NameObject("/F")] = NumberObject(value)
25 | 
26 | 
27 | NO_FLAGS = AnnotationFlag(0)
28 | 


--------------------------------------------------------------------------------
/pypdf/annotations/_non_markup_annotations.py:
--------------------------------------------------------------------------------
  1 | from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
  2 | 
  3 | from ..generic._base import (
  4 |     BooleanObject,
  5 |     NameObject,
  6 |     NumberObject,
  7 |     TextStringObject,
  8 | )
  9 | from ..generic._data_structures import ArrayObject, DictionaryObject
 10 | from ..generic._fit import DEFAULT_FIT, Fit
 11 | from ..generic._rectangle import RectangleObject
 12 | from ._base import AnnotationDictionary
 13 | 
 14 | 
 15 | class Link(AnnotationDictionary):
 16 |     def __init__(
 17 |         self,
 18 |         *,
 19 |         rect: Union[RectangleObject, Tuple[float, float, float, float]],
 20 |         border: Optional[ArrayObject] = None,
 21 |         url: Optional[str] = None,
 22 |         target_page_index: Optional[int] = None,
 23 |         fit: Fit = DEFAULT_FIT,
 24 |         **kwargs: Any,
 25 |     ) -> None:
 26 |         super().__init__(**kwargs)
 27 |         if TYPE_CHECKING:
 28 |             from ..types import BorderArrayType
 29 | 
 30 |         is_external = url is not None
 31 |         is_internal = target_page_index is not None
 32 |         if not is_external and not is_internal:
 33 |             raise ValueError(
 34 |                 "Either 'url' or 'target_page_index' have to be provided. Both were None."
 35 |             )
 36 |         if is_external and is_internal:
 37 |             raise ValueError(
 38 |                 "Either 'url' or 'target_page_index' have to be provided. "
 39 |                 f"{url=}, {target_page_index=}"
 40 |             )
 41 | 
 42 |         border_arr: BorderArrayType
 43 |         if border is not None:
 44 |             border_arr = [NumberObject(n) for n in border[:3]]
 45 |             if len(border) == 4:
 46 |                 dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
 47 |                 border_arr.append(dash_pattern)
 48 |         else:
 49 |             border_arr = [NumberObject(0)] * 3
 50 | 
 51 |         self.update(
 52 |             {
 53 |                 NameObject("/Type"): NameObject("/Annot"),
 54 |                 NameObject("/Subtype"): NameObject("/Link"),
 55 |                 NameObject("/Rect"): RectangleObject(rect),
 56 |                 NameObject("/Border"): ArrayObject(border_arr),
 57 |             }
 58 |         )
 59 |         if is_external:
 60 |             self[NameObject("/A")] = DictionaryObject(
 61 |                 {
 62 |                     NameObject("/S"): NameObject("/URI"),
 63 |                     NameObject("/Type"): NameObject("/Action"),
 64 |                     NameObject("/URI"): TextStringObject(url),
 65 |                 }
 66 |             )
 67 |         if is_internal:
 68 |             # Deferred destination: only the page index and fit are stored; this needs to be updated later!
 69 |             dest_deferred = DictionaryObject(
 70 |                 {
 71 |                     "target_page_index": NumberObject(target_page_index),
 72 |                     "fit": NameObject(fit.fit_type),
 73 |                     "fit_args": fit.fit_args,
 74 |                 }
 75 |             )
 76 |             self[NameObject("/Dest")] = dest_deferred
 77 | 
 78 | 
 79 | class Popup(AnnotationDictionary):
 80 |     def __init__(
 81 |         self,
 82 |         *,
 83 |         rect: Union[RectangleObject, Tuple[float, float, float, float]],
 84 |         parent: Optional[DictionaryObject] = None,
 85 |         open: bool = False,
 86 |         **kwargs: Any,
 87 |     ) -> None:
 88 |         super().__init__(**kwargs)
 89 |         self.update(
 90 |             {
 91 |                 NameObject("/Subtype"): NameObject("/Popup"),
 92 |                 NameObject("/Rect"): RectangleObject(rect),
 93 |                 NameObject("/Open"): BooleanObject(open),
 94 |             }
 95 |         )
 96 |         if parent:
 97 |             # This needs to be an indirect object
 98 |             try:
 99 |                 self[NameObject("/Parent")] = parent.indirect_reference
100 |             except AttributeError:
101 |                 from .._utils import logger_warning
102 | 
103 |                 logger_warning(
104 |                     "Unregistered Parent object : No Parent field set",
105 |                     __name__,
106 |                 )
107 | 


--------------------------------------------------------------------------------
/pypdf/errors.py:
--------------------------------------------------------------------------------
 1 | """
 2 | All errors/exceptions pypdf raises and all of the warnings it uses.
 3 | 
 4 | Please note that broken PDF files might cause other Exceptions.
 5 | """
 6 | 
 7 | 
 8 | class DeprecationError(Exception):
 9 |     """Raised when a deprecated feature is used."""
10 | 
11 | 
12 | class DependencyError(Exception):
13 |     """
14 |     Raised when a required dependency (a library or module that pypdf depends on)
15 |     is not available or cannot be imported.
16 |     """
17 | 
18 | 
19 | class PyPdfError(Exception):
20 |     """Base class for pypdf's exceptions (DeprecationError and DependencyError derive from Exception directly)."""
21 | 
22 | 
23 | class PdfReadError(PyPdfError):
24 |     """Raised when there is an issue reading a PDF file."""
25 | 
26 | 
27 | class PageSizeNotDefinedError(PyPdfError):
28 |     """Raised when the page size of a PDF document is not defined."""
29 | 
30 | 
31 | class PdfReadWarning(UserWarning):
32 |     """Issued when there is a potential issue reading a PDF file, but it can still be read."""
33 | 
34 | 
35 | class PdfStreamError(PdfReadError):
36 |     """Raised when there is an issue reading the stream of data in a PDF file."""
37 | 
38 | 
39 | class ParseError(PyPdfError):
40 |     """
41 |     Raised when there is an issue parsing (analyzing and understanding the
42 |     structure and meaning of) a PDF file.
43 |     """
44 | 
45 | 
46 | class FileNotDecryptedError(PdfReadError):
47 |     """
48 |     Raised when a PDF file that has been encrypted
49 |     (meaning it requires a password to be accessed) has not been successfully
50 |     decrypted.
51 |     """
52 | 
53 | 
54 | class WrongPasswordError(FileNotDecryptedError):
55 |     """Raised when the wrong password is used to try to decrypt an encrypted PDF file."""
56 | 
57 | 
58 | class EmptyFileError(PdfReadError):
59 |     """Raised when a PDF file is empty or has no content."""
60 | 
61 | 
62 | class EmptyImageDataError(PyPdfError):
63 |     """Raised when trying to process an image that has no data."""
64 | 
65 | 
66 | STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
67 | 


--------------------------------------------------------------------------------
/pypdf/generic/_outline.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | from .._utils import StreamType, deprecation_no_replacement
 4 | from ._base import NameObject
 5 | from ._data_structures import Destination
 6 | 
 7 | 
 8 | class OutlineItem(Destination):
 9 |     def write_to_stream(
10 |         self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
11 |     ) -> None:
12 |         if encryption_key is not None:  # deprecated
13 |             deprecation_no_replacement(
14 |                 "the encryption_key parameter of write_to_stream", "5.0.0"
15 |             )
16 |         stream.write(b"<<\n")
17 |         for key in [
18 |             NameObject(x)
19 |             for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"]
20 |             if x in self
21 |         ]:
22 |             key.write_to_stream(stream)
23 |             stream.write(b" ")
24 |             value = self.raw_get(key)
25 |             value.write_to_stream(stream)
26 |             stream.write(b"\n")
27 |         key = NameObject("/Dest")
28 |         key.write_to_stream(stream)
29 |         stream.write(b" ")
30 |         value = self.dest_array
31 |         value.write_to_stream(stream)
32 |         stream.write(b"\n")
33 |         stream.write(b">>")
34 | 


--------------------------------------------------------------------------------
/pypdf/generic/_rectangle.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Tuple, Union
  2 | 
  3 | from ._base import FloatObject, NumberObject
  4 | from ._data_structures import ArrayObject
  5 | 
  6 | 
  7 | class RectangleObject(ArrayObject):
  8 |     """
  9 |     This class is used to represent *page boxes* in pypdf.
 10 | 
 11 |     These boxes include:
 12 | 
 13 |     * :attr:`artbox <pypdf._page.PageObject.artbox>`
 14 |     * :attr:`bleedbox <pypdf._page.PageObject.bleedbox>`
 15 |     * :attr:`cropbox <pypdf._page.PageObject.cropbox>`
 16 |     * :attr:`mediabox <pypdf._page.PageObject.mediabox>`
 17 |     * :attr:`trimbox <pypdf._page.PageObject.trimbox>`
 18 |     """
 19 | 
 20 |     def __init__(
 21 |         self, arr: Union["RectangleObject", Tuple[float, float, float, float]]
 22 |     ) -> None:
 23 |         # must have exactly four coordinates: left, bottom, right, top
 24 |         assert len(arr) == 4
 25 |         # automatically convert arr[x] into FloatObject(arr[x]) if necessary
 26 |         ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr])  # type: ignore
 27 | 
 28 |     def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
 29 |         if not isinstance(value, (FloatObject, NumberObject)):
 30 |             value = FloatObject(value)
 31 |         return value
 32 | 
 33 |     def scale(self, sx: float, sy: float) -> "RectangleObject":
 34 |         return RectangleObject(
 35 |             (
 36 |                 float(self.left) * sx,
 37 |                 float(self.bottom) * sy,
 38 |                 float(self.right) * sx,
 39 |                 float(self.top) * sy,
 40 |             )
 41 |         )
 42 | 
 43 |     def __repr__(self) -> str:
 44 |         return f"RectangleObject({list(self)!r})"
 45 | 
 46 |     @property
 47 |     def left(self) -> FloatObject:
 48 |         return self[0]
 49 | 
 50 |     @left.setter
 51 |     def left(self, f: float) -> None:
 52 |         self[0] = FloatObject(f)
 53 | 
 54 |     @property
 55 |     def bottom(self) -> FloatObject:
 56 |         return self[1]
 57 | 
 58 |     @bottom.setter
 59 |     def bottom(self, f: float) -> None:
 60 |         self[1] = FloatObject(f)
 61 | 
 62 |     @property
 63 |     def right(self) -> FloatObject:
 64 |         return self[2]
 65 | 
 66 |     @right.setter
 67 |     def right(self, f: float) -> None:
 68 |         self[2] = FloatObject(f)
 69 | 
 70 |     @property
 71 |     def top(self) -> FloatObject:
 72 |         return self[3]
 73 | 
 74 |     @top.setter
 75 |     def top(self, f: float) -> None:
 76 |         self[3] = FloatObject(f)
 77 | 
 78 |     @property
 79 |     def lower_left(self) -> Tuple[float, float]:
 80 |         """
 81 |         Property to read and modify the lower left coordinate of this box
 82 |         in (x,y) form.
 83 |         """
 84 |         return self.left, self.bottom
 85 | 
 86 |     @lower_left.setter
 87 |     def lower_left(self, value: Tuple[float, float]) -> None:
 88 |         self[0], self[1] = (self._ensure_is_number(x) for x in value)
 89 | 
 90 |     @property
 91 |     def lower_right(self) -> Tuple[float, float]:
 92 |         """
 93 |         Property to read and modify the lower right coordinate of this box
 94 |         in (x,y) form.
 95 |         """
 96 |         return self.right, self.bottom
 97 | 
 98 |     @lower_right.setter
 99 |     def lower_right(self, value: Tuple[float, float]) -> None:
100 |         self[2], self[1] = (self._ensure_is_number(x) for x in value)
101 | 
102 |     @property
103 |     def upper_left(self) -> Tuple[float, float]:
104 |         """
105 |         Property to read and modify the upper left coordinate of this box
106 |         in (x,y) form.
107 |         """
108 |         return self.left, self.top
109 | 
110 |     @upper_left.setter
111 |     def upper_left(self, value: Tuple[float, float]) -> None:
112 |         self[0], self[3] = (self._ensure_is_number(x) for x in value)
113 | 
114 |     @property
115 |     def upper_right(self) -> Tuple[float, float]:
116 |         """
117 |         Property to read and modify the upper right coordinate of this box
118 |         in (x,y) form.
119 |         """
120 |         return self.right, self.top
121 | 
122 |     @upper_right.setter
123 |     def upper_right(self, value: Tuple[float, float]) -> None:
124 |         self[2], self[3] = (self._ensure_is_number(x) for x in value)
125 | 
126 |     @property
127 |     def width(self) -> float:
128 |         return self.right - self.left
129 | 
130 |     @property
131 |     def height(self) -> float:
132 |         return self.top - self.bottom
133 | 


--------------------------------------------------------------------------------
/pypdf/papersizes.py:
--------------------------------------------------------------------------------
 1 | """Helper to get paper sizes."""
 2 | 
 3 | from typing import NamedTuple
 4 | 
 5 | 
 6 | class Dimensions(NamedTuple):
 7 |     width: int
 8 |     height: int
 9 | 
10 | 
11 | class PaperSize:
12 |     """(width, height) of the paper in portrait mode in PDF points (1/72 inch, i.e. pixels at 72 ppi)."""
13 | 
14 |     # Notes of how to calculate it:
15 |     # 1. Get the size of the paper in millimeters
16 |     # 2. Convert it to inches (25.4 millimeters is equal to 1 inch)
17 |     # 3. Convert it to pixels at 72dpi (1 inch is equal to 72 pixels)
18 | 
19 |     # All DIN A paper sizes follow this pattern:
20 |     # 2 x A(n) = A(n - 1)
21 |     # So the width of the next bigger one is the height of the smaller one
22 |     # The ratio is always approximately 1:2**0.5
23 |     # Additionally, A0 is defined to have an area of 1 m**2
24 |     # https://en.wikipedia.org/wiki/ISO_216
25 |     # Be aware of rounding issues!
26 |     A0 = Dimensions(2384, 3370)  # 841mm x 1189mm
27 |     A1 = Dimensions(1684, 2384)
28 |     A2 = Dimensions(1191, 1684)
29 |     A3 = Dimensions(842, 1191)
30 |     A4 = Dimensions(
31 |         595, 842
32 |     )  # Printer paper, documents - this is by far the most common
33 |     A5 = Dimensions(420, 595)  # Paperback books
34 |     A6 = Dimensions(298, 420)  # Postcards
35 |     A7 = Dimensions(210, 298)
36 |     A8 = Dimensions(147, 210)
37 | 
38 |     # Envelopes
39 |     C4 = Dimensions(649, 918)
40 | 
41 | 
42 | _din_a = (
43 |     PaperSize.A0,
44 |     PaperSize.A1,
45 |     PaperSize.A2,
46 |     PaperSize.A3,
47 |     PaperSize.A4,
48 |     PaperSize.A5,
49 |     PaperSize.A6,
50 |     PaperSize.A7,
51 |     PaperSize.A8,
52 | )
53 | 


--------------------------------------------------------------------------------
/pypdf/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/pypdf/py.typed


--------------------------------------------------------------------------------
/pypdf/types.py:
--------------------------------------------------------------------------------
 1 | """Helpers for working with PDF types."""
 2 | 
 3 | import sys
 4 | from typing import List, Literal, Union
 5 | 
 6 | if sys.version_info[:2] >= (3, 10):
 7 |     # Python 3.10+: https://peps.python.org/pep-0613/
 8 |     from typing import TypeAlias
 9 | else:
10 |     from typing_extensions import TypeAlias
11 | 
12 | from .generic._base import NameObject, NullObject, NumberObject
13 | from .generic._data_structures import ArrayObject, Destination
14 | from .generic._outline import OutlineItem
15 | 
16 | BorderArrayType: TypeAlias = List[Union[NameObject, NumberObject, ArrayObject]]
17 | OutlineItemType: TypeAlias = Union[OutlineItem, Destination]
18 | FitType: TypeAlias = Literal[
19 |     "/XYZ", "/Fit", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV"
20 | ]
21 | # Those go with the FitType: They specify values for the fit
22 | ZoomArgType: TypeAlias = Union[NumberObject, NullObject, float]
23 | ZoomArgsType: TypeAlias = List[ZoomArgType]
24 | 
25 | # Recursive types like the following are not yet supported by mypy:
26 | #    OutlineType = List[Union[Destination, "OutlineType"]]
27 | # See https://github.com/python/mypy/issues/731
28 | # Hence use this for the moment:
29 | OutlineType = List[Union[Destination, List[Union[Destination, List[Destination]]]]]
30 | 
31 | LayoutType: TypeAlias = Literal[
32 |     "/NoLayout",
33 |     "/SinglePage",
34 |     "/OneColumn",
35 |     "/TwoColumnLeft",
36 |     "/TwoColumnRight",
37 |     "/TwoPageLeft",
38 |     "/TwoPageRight",
39 | ]
40 | PagemodeType: TypeAlias = Literal[
41 |     "/UseNone",
42 |     "/UseOutlines",
43 |     "/UseThumbs",
44 |     "/FullScreen",
45 |     "/UseOC",
46 |     "/UseAttachments",
47 | ]
48 | AnnotationSubtype: TypeAlias = Literal[
49 |     "/Text",
50 |     "/Link",
51 |     "/FreeText",
52 |     "/Line",
53 |     "/Square",
54 |     "/Circle",
55 |     "/Polygon",
56 |     "/PolyLine",
57 |     "/Highlight",
58 |     "/Underline",
59 |     "/Squiggly",
60 |     "/StrikeOut",
61 |     "/Caret",
62 |     "/Stamp",
63 |     "/Ink",
64 |     "/Popup",
65 |     "/FileAttachment",
66 |     "/Sound",
67 |     "/Movie",
68 |     "/Screen",
69 |     "/Widget",
70 |     "/PrinterMark",
71 |     "/TrapNet",
72 |     "/Watermark",
73 |     "/3D",
74 |     "/Redact",
75 |     "/Projection",
76 |     "/RichMedia",
77 | ]
78 | 


--------------------------------------------------------------------------------
/requirements/ci-3.11.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.11
 3 | # by the following command:
 4 | #
 5 | #    pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in
 6 | #
 7 | cffi==1.17.1
 8 |     # via cryptography
 9 | coverage[toml]==7.6.4
10 |     # via
11 |     #   -r requirements/ci.in
12 |     #   pytest-cov
13 | cryptography==44.0.1
14 |     # via -r requirements/ci.in
15 | defusedxml==0.7.1
16 |     # via fpdf2
17 | exceptiongroup==1.2.2
18 |     # via pytest
19 | execnet==2.1.1
20 |     # via pytest-xdist
21 | fonttools==4.54.1
22 |     # via fpdf2
23 | fpdf2==2.8.1
24 |     # via -r requirements/ci.in
25 | iniconfig==2.0.0
26 |     # via pytest
27 | mypy==1.16.0
28 |     # via -r requirements/ci.in
29 | mypy-extensions==1.0.0
30 |     # via mypy
31 | packaging==24.1
32 |     # via pytest
33 | pillow==11.0.0
34 |     # via
35 |     #   -r requirements/ci.in
36 |     #   fpdf2
37 | pluggy==1.5.0
38 |     # via pytest
39 | py-cpuinfo==9.0.0
40 |     # via pytest-benchmark
41 | pycparser==2.22
42 |     # via cffi
43 | pytest==8.3.3
44 |     # via
45 |     #   -r requirements/ci.in
46 |     #   pytest-benchmark
47 |     #   pytest-cov
48 |     #   pytest-socket
49 |     #   pytest-timeout
50 |     #   pytest-xdist
51 | pytest-benchmark==4.0.0
52 |     # via -r requirements/ci.in
53 | pytest-cov==5.0.0
54 |     # via -r requirements/ci.in
55 | pytest-socket==0.7.0
56 |     # via -r requirements/ci.in
57 | pytest-timeout==2.3.1
58 |     # via -r requirements/ci.in
59 | pytest-xdist==3.6.1
60 |     # via -r requirements/ci.in
61 | pyyaml==6.0.2
62 |     # via -r requirements/ci.in
63 | ruff==0.11.0
64 |     # via -r requirements/ci.in
65 | tomli==2.0.2
66 |     # via
67 |     #   coverage
68 |     #   mypy
69 |     #   pytest
70 | typeguard==4.3.0
71 |     # via -r requirements/ci.in
72 | types-pillow==10.2.0.20240822
73 |     # via -r requirements/ci.in
74 | typing-extensions==4.12.2
75 |     # via
76 |     #   mypy
77 |     #   typeguard
78 | 


--------------------------------------------------------------------------------
/requirements/ci.in:
--------------------------------------------------------------------------------
 1 | coverage
 2 | fpdf2
 3 | mypy
 4 | pillow
 5 | cryptography
 6 | pytest
 7 | pytest-benchmark
 8 | pytest-socket
 9 | pytest-timeout
10 | pytest-xdist
11 | pytest-cov
12 | # ruff  # only take this for 3.11
13 | typeguard
14 | types-Pillow
15 | pyyaml
16 | 


--------------------------------------------------------------------------------
/requirements/ci.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.8
 3 | # by the following command:
 4 | #
 5 | #    pip-compile requirements/ci.in
 6 | #
 7 | cffi==1.17.1
 8 |     # via cryptography
 9 | coverage[toml]==7.6.1
10 |     # via
11 |     #   -r requirements/ci.in
12 |     #   pytest-cov
13 | cryptography==44.0.1
14 |     # via -r requirements/ci.in
15 | defusedxml==0.7.1
16 |     # via fpdf2
17 | exceptiongroup==1.2.2
18 |     # via pytest
19 | execnet==2.1.1
20 |     # via pytest-xdist
21 | fonttools==4.54.1
22 |     # via fpdf2
23 | fpdf2==2.8.1
24 |     # via -r requirements/ci.in
25 | importlib-metadata==8.5.0
26 |     # via typeguard
27 | iniconfig==2.0.0
28 |     # via pytest
29 | mypy==1.13.0
30 |     # via -r requirements/ci.in
31 | mypy-extensions==1.0.0
32 |     # via mypy
33 | packaging==24.1
34 |     # via pytest
35 | pillow==10.4.0
36 |     # via
37 |     #   -r requirements/ci.in
38 |     #   fpdf2
39 | pluggy==1.5.0
40 |     # via pytest
41 | py-cpuinfo==9.0.0
42 |     # via pytest-benchmark
43 | pycparser==2.22
44 |     # via cffi
45 | pytest==8.3.3
46 |     # via
47 |     #   -r requirements/ci.in
48 |     #   pytest-benchmark
49 |     #   pytest-cov
50 |     #   pytest-socket
51 |     #   pytest-timeout
52 |     #   pytest-xdist
53 | pytest-benchmark==4.0.0
54 |     # via -r requirements/ci.in
55 | pytest-cov==5.0.0
56 |     # via -r requirements/ci.in
57 | pytest-socket==0.7.0
58 |     # via -r requirements/ci.in
59 | pytest-timeout==2.3.1
60 |     # via -r requirements/ci.in
61 | pytest-xdist==3.6.1
62 |     # via -r requirements/ci.in
63 | pyyaml==6.0.2
64 |     # via -r requirements/ci.in
65 | tomli==2.0.2
66 |     # via
67 |     #   coverage
68 |     #   mypy
69 |     #   pytest
70 | typeguard==4.3.0
71 |     # via -r requirements/ci.in
72 | types-pillow==10.2.0.20240822
73 |     # via -r requirements/ci.in
74 | typing-extensions==4.12.2
75 |     # via
76 |     #   mypy
77 |     #   typeguard
78 | zipp==3.20.2
79 |     # via importlib-metadata
80 | 


--------------------------------------------------------------------------------
/requirements/dev.in:
--------------------------------------------------------------------------------
1 | pillow
2 | pip-tools
3 | pre-commit
4 | pytest-cov
5 | flit
6 | wheel
7 | 


--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.8
 3 | # by the following command:
 4 | #
 5 | #    pip-compile requirements/dev.in
 6 | #
 7 | build==1.2.2.post1
 8 |     # via pip-tools
 9 | certifi==2024.8.30
10 |     # via requests
11 | cfgv==3.4.0
12 |     # via pre-commit
13 | charset-normalizer==3.4.0
14 |     # via requests
15 | click==8.1.7
16 |     # via pip-tools
17 | coverage[toml]==7.6.1
18 |     # via pytest-cov
19 | distlib==0.3.9
20 |     # via virtualenv
21 | docutils==0.20.1
22 |     # via flit
23 | exceptiongroup==1.2.2
24 |     # via pytest
25 | filelock==3.16.1
26 |     # via virtualenv
27 | flit==3.9.0
28 |     # via -r requirements/dev.in
29 | flit-core==3.9.0
30 |     # via flit
31 | identify==2.6.1
32 |     # via pre-commit
33 | idna==3.10
34 |     # via requests
35 | importlib-metadata==8.5.0
36 |     # via build
37 | iniconfig==2.0.0
38 |     # via pytest
39 | nodeenv==1.9.1
40 |     # via pre-commit
41 | packaging==24.1
42 |     # via
43 |     #   build
44 |     #   pytest
45 | pillow==10.4.0
46 |     # via -r requirements/dev.in
47 | pip-tools==7.4.1
48 |     # via -r requirements/dev.in
49 | platformdirs==4.3.6
50 |     # via virtualenv
51 | pluggy==1.5.0
52 |     # via pytest
53 | pre-commit==3.5.0
54 |     # via -r requirements/dev.in
55 | pyproject-hooks==1.2.0
56 |     # via
57 |     #   build
58 |     #   pip-tools
59 | pytest==8.3.3
60 |     # via pytest-cov
61 | pytest-cov==5.0.0
62 |     # via -r requirements/dev.in
63 | pyyaml==6.0.2
64 |     # via pre-commit
65 | requests==2.32.3
66 |     # via flit
67 | tomli==2.0.2
68 |     # via
69 |     #   build
70 |     #   coverage
71 |     #   pip-tools
72 |     #   pytest
73 | tomli-w==1.0.0
74 |     # via flit
75 | urllib3==2.2.3
76 |     # via requests
77 | virtualenv==20.27.0
78 |     # via pre-commit
79 | wheel==0.44.0
80 |     # via
81 |     #   -r requirements/dev.in
82 |     #   pip-tools
83 | zipp==3.20.2
84 |     # via importlib-metadata
85 | 
86 | # The following packages are considered to be unsafe in a requirements file:
87 | # pip
88 | # setuptools
89 | 


--------------------------------------------------------------------------------
/requirements/docs.in:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx_rtd_theme
3 | myst_parser
4 | 


--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.10
 3 | # by the following command:
 4 | #
 5 | #    pip-compile requirements/docs.in
 6 | #
 7 | alabaster==1.0.0
 8 |     # via sphinx
 9 | babel==2.16.0
10 |     # via sphinx
11 | certifi==2024.8.30
12 |     # via requests
13 | charset-normalizer==3.4.0
14 |     # via requests
15 | docutils==0.21.2
16 |     # via
17 |     #   myst-parser
18 |     #   sphinx
19 |     #   sphinx-rtd-theme
20 | idna==3.10
21 |     # via requests
22 | imagesize==1.4.1
23 |     # via sphinx
24 | jinja2==3.1.6
25 |     # via
26 |     #   myst-parser
27 |     #   sphinx
28 | markdown-it-py==3.0.0
29 |     # via
30 |     #   mdit-py-plugins
31 |     #   myst-parser
32 | markupsafe==3.0.1
33 |     # via jinja2
34 | mdit-py-plugins==0.4.2
35 |     # via myst-parser
36 | mdurl==0.1.2
37 |     # via markdown-it-py
38 | myst-parser==4.0.0
39 |     # via -r requirements/docs.in
40 | packaging==24.1
41 |     # via sphinx
42 | pygments==2.18.0
43 |     # via sphinx
44 | pyyaml==6.0.2
45 |     # via myst-parser
46 | requests==2.32.3
47 |     # via sphinx
48 | snowballstemmer==2.2.0
49 |     # via sphinx
50 | sphinx==8.1.3
51 |     # via
52 |     #   -r requirements/docs.in
53 |     #   myst-parser
54 |     #   sphinx-rtd-theme
55 |     #   sphinxcontrib-jquery
56 | sphinx-rtd-theme==3.0.1
57 |     # via -r requirements/docs.in
58 | sphinxcontrib-applehelp==2.0.0
59 |     # via sphinx
60 | sphinxcontrib-devhelp==2.0.0
61 |     # via sphinx
62 | sphinxcontrib-htmlhelp==2.1.0
63 |     # via sphinx
64 | sphinxcontrib-jquery==4.1
65 |     # via sphinx-rtd-theme
66 | sphinxcontrib-jsmath==1.0.1
67 |     # via sphinx
68 | sphinxcontrib-qthelp==2.0.0
69 |     # via sphinx
70 | sphinxcontrib-serializinghtml==2.0.0
71 |     # via sphinx
72 | tomli==2.0.2
73 |     # via sphinx
74 | urllib3==2.2.3
75 |     # via requests
76 | 


--------------------------------------------------------------------------------
/resources/010-pdflatex-forms.txt:
--------------------------------------------------------------------------------
 1 | Name
 2 | 
 3 | Check
 4 | 
 5 | Submit
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 |                                                                    1


--------------------------------------------------------------------------------
/resources/AEO.1172.layout.rot180.txt:
--------------------------------------------------------------------------------
 1 | 9   1of   Page                                                                                                                                                                          2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO.Date Printed: 17/Nov/2022
 2 |                                                                                                                                               PRODUCT SUMMARY
 3 |                                                                                                                                                                                                                    Fit / Other:
 4 |                                                                                                                                                                                        1172 KNIT SHORTIE           Style Desc:
 5 |                                                                                                                                                                                        SUMMER-B 2023               Season:
 6 |                                                                                                                                                                                        50 / 170                    Division / Dept:
 7 |                                                                                                                                                                                        AMERICAN EAGLE OUTFITTERSCompany:
 8 |                               SUMMER-B 2023                                                                                                   1172 KNIT SHORTIE                                                                                          STYLE: 1172
 9 |                                                                                                                                         STATUS: FNL
10 | 


--------------------------------------------------------------------------------
/resources/AEO.1172.layout.txt:
--------------------------------------------------------------------------------
 1 |                                                                                                                 STATUS: FNL
 2 | STYLE: 1172                                                                                               1172 KNIT SHORTIE                                                                                               SUMMER-B 2023
 3 |                                       Company:                    AMERICAN EAGLE OUTFITTERS
 4 |                                       Division / Dept:            50 / 170
 5 |                                       Season:                     SUMMER-B 2023
 6 |                                       Style Desc:                 1172 KNIT SHORTIE
 7 |                                       Fit / Other:
 8 |                                                                                                            PRODUCT SUMMARY
 9 | Date Printed: 17/Nov/2022                                        2022 AEO Management Co. All Rights Reserved. Proprietary and Confidential AEO Business Information. Subject to Legal Action if Disclosed Without Authorization from AEO.     Page  1of 9
10 | 


--------------------------------------------------------------------------------
/resources/AutoCad_Diagram.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/AutoCad_Diagram.pdf


--------------------------------------------------------------------------------
/resources/AutoCad_Simple.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/AutoCad_Simple.pdf


--------------------------------------------------------------------------------
/resources/Epic.Page.layout.txt:
--------------------------------------------------------------------------------
 1 | All Postprocedure Notes
 2 |    Last edited 10/11/23 0919 by Danny Chaung, DO
 3 |    Date of Service 10/11/23 0918
 4 |    Status: Signed
 5 | Anesthesia Post Evaluation
 6 | 
 7 | Procedure Summary
 8 | 
 9 |    Date: 10/11/23                                                Room / Location: EHMC ENDOSCOPY
10 |    Anesthesia Start: 0852                                        Anesthesia Stop: 0918
11 |    Procedure: COLONOSCOPY                                        Diagnosis: Cancer screening
12 |    Scheduled Providers: Walter A Klein, MD; Danny Chaung,        Responsible Provider: Danny Chaung, DO
13 |    DO
14 |    Anesthesia Type: general                                      ASA Status: 2
15 | 
16 | 
17 | Patient location during evaluation: PACU
18 | Post op Vital Signs: stable
19 | 
20 | Level of consciousness: awake and alert
21 | Pain management: adequate analgesia
22 | Airway patency: patent
23 | Anesthetic complications: no
24 | Respiratory status: unassisted
25 | Hydration status: continuing
26 | Post-op Complications: No
27 | 
28 | 
29 | 
30 | Assessment: Nausea and Vomiting: absent
31 | 
32 | 
33 | 
34 | 
35 | MIPS Measure #404 - Smoking Abstinence
36 | Is the patient a current smoker? No (XX404)
37 | 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/resources/FormTestFromOo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/FormTestFromOo.pdf


--------------------------------------------------------------------------------
/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf


--------------------------------------------------------------------------------
/resources/SF424_page2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/SF424_page2.pdf


--------------------------------------------------------------------------------
/resources/Sample_Td-matrix.pdf:
--------------------------------------------------------------------------------
 1 | %PDF-1.4
 2 | %----
 3 | 1 0 obj
 4 | << /Pages 3 0 R
 5 |    /Type /Catalog /Version /1.4 >>
 6 | endobj
 7 | 2 0 obj
 8 | << /CreationDate (D:20220823232400+02'00') /Producer (vim) >>
 9 | endobj
10 | 3 0 obj
11 | << /Count 1 /Kids [ 4 0 R ] /Type /Pages >>
12 | endobj
13 | 4 0 obj
14 | << /Contents [ 5 0 R ] /MediaBox [ 0 0 842 595 ] /Parent 3 0 R
15 |    /Resources << /Font << /F1 6 0 R >> >> /TrimBox [ 0 0 842 595 ]
16 |    /Type /Page >>
17 | endobj
18 | 5 0 obj
19 | << /Length 0575 >>
20 | stream
21 | 0.0 G
22 | 0.0 0.0 1.0 rg 200 100 200 100 re B
23 | 0.2 0.2 1.0 rg 400 100 200 100 re B
24 | 0.4 0.4 1.0 rg 200 200 200 100 re B
25 | 0.6 0.6 1.0 rg 400 200 200 100 re B
26 | 
27 | 0.3 0.0 0.0 rg
28 | BT
29 |    % Move text to 210 110 via Td-operation
30 |    /F1 12 Tf
31 |    210 110 Td
32 |    (Hello PDF\041) Tj
33 | 
34 |    % Tm-operation without scale followed by Td
35 |    1 0 0 1 200 0 Tm
36 |    210 110 Td
37 |    (Hello PDF 200 0 Td!) Tj
38 | 
39 |    % Tm-operation with horizontal scale
40 |    2 0 0 1 0 0 Tm
41 |    105 210 Td
42 |    (Hello PDF 2 1!) Tj
43 | 
44 |    % Tm-operation with dual scale
45 |    /F1 2.5 Tf
46 |    10 0 0 7 0 0 Tm
47 |    41 30 Td
48 |    (Hello PDF 10 7!) Tj
49 | 
50 | ET
51 | endstream
52 | endobj
53 | 6 0 obj
54 | << /Type /Font
55 |    /Subtype /Type1
56 |    /BaseFont /Helvetica-Bold
57 | >>
58 | endobj
59 | xref
60 | 0 7
61 | 0000000000 65535 f 
62 | 0000000015 00000 n 
63 | 0000000081 00000 n 
64 | 0000000158 00000 n 
65 | 0000000217 00000 n 
66 | 0000000380 00000 n 
67 | 0000001006 00000 n 
68 | trailer << /Size 7 /Info 2 0 R /Root 1 0 R >>
69 | startxref
70 | 1087
71 | %%EOF
72 | 
73 | 


--------------------------------------------------------------------------------
/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf


--------------------------------------------------------------------------------
/resources/Seige_of_Vicksburg_Sample_OCR.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/Seige_of_Vicksburg_Sample_OCR.pdf


--------------------------------------------------------------------------------
/resources/attachment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/attachment.pdf


--------------------------------------------------------------------------------
/resources/box.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/box.pdf


--------------------------------------------------------------------------------
/resources/bytes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/bytes.pdf


--------------------------------------------------------------------------------
/resources/commented-xmp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/commented-xmp.pdf


--------------------------------------------------------------------------------
/resources/commented.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/commented.pdf


--------------------------------------------------------------------------------
/resources/crazyones-encrypted-256.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/crazyones-encrypted-256.pdf


--------------------------------------------------------------------------------
/resources/crazyones.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/crazyones.pdf


--------------------------------------------------------------------------------
/resources/crazyones.txt:
--------------------------------------------------------------------------------
 1 | The Crazy Ones
 2 | October 14, 1998
 3 | Heres to the crazy ones. The misﬁts. The rebels. The troublemakers.
 4 | The round pegs in the square holes.
 5 | The ones who see things diﬀerently. Theyre not fond of rules. And
 6 | they have no respect for the status quo. You can quote them,
 7 | disagree with them, glorify or vilify them.
 8 | About the only thing you cant do is ignore them. Because they change
 9 | things. They invent. They imagine. They heal. They explore. They
10 | create. They inspire. They push the human race forward.
11 | Maybe they have to be crazy.
12 | How else can you stare at an empty canvas and see a work of art? Or
13 | sit in silence and hear a song thats never been written? Or gaze at
14 | a red planet and see a laboratory on wheels?
15 | We make tools for these kinds of people.
16 | While some see them as the crazy ones, we see genius. Because the
17 | people who are crazy enough to think they can change the world,
18 | are the ones who do.


--------------------------------------------------------------------------------
/resources/crazyones_layout_vertical_space.txt:
--------------------------------------------------------------------------------
 1 | The Crazy Ones
 2 | October 14, 1998
 3 | 
 4 |    Heres to the crazy ones. The misﬁts. The rebels. The troublemakers.
 5 |        The round pegs in the square holes.
 6 |    The ones who see things diﬀerently. Theyre not fond of rules. And
 7 |        they have no respect for the status quo. You can quote them,
 8 |        disagree with them, glorify or vilify them.
 9 |    About the only thing you cant do is ignore them. Because they change
10 |        things. They invent. They imagine. They heal. They explore. They
11 |        create. They inspire. They push the human race forward.
12 |    Maybe they have to be crazy.
13 |    How else can you stare at an empty canvas and see a work of art? Or
14 |        sit in silence and hear a song thats never been written? Or gaze at
15 |        a red planet and see a laboratory on wheels?
16 |    We make tools for these kinds of people.
17 |    While some see them as the crazy ones, we see genius. Because the
18 |        people who are crazy enough to think they can change the world,
19 |        are the ones who do.


--------------------------------------------------------------------------------
/resources/crazyones_layout_vertical_space_font_height_weight.txt:
--------------------------------------------------------------------------------
 1 | The Crazy Ones
 2 | October 14, 1998
 3 | 
 4 |    Heres to the crazy ones. The misﬁts. The rebels. The troublemakers.
 5 |        The round pegs in the square holes.
 6 | 
 7 |    The ones who see things diﬀerently. Theyre not fond of rules. And
 8 |        they have no respect for the status quo. You can quote them,
 9 |        disagree with them, glorify or vilify them.
10 | 
11 |    About the only thing you cant do is ignore them. Because they change
12 |        things. They invent. They imagine. They heal. They explore. They
13 |        create. They inspire. They push the human race forward.
14 | 
15 |    Maybe they have to be crazy.
16 | 
17 |    How else can you stare at an empty canvas and see a work of art? Or
18 |        sit in silence and hear a song thats never been written? Or gaze at
19 |        a red planet and see a laboratory on wheels?
20 | 
21 |    We make tools for these kinds of people.
22 | 
23 |    While some see them as the crazy ones, we see genius. Because the
24 |        people who are crazy enough to think they can change the world,
25 |        are the ones who do.


--------------------------------------------------------------------------------
/resources/encrypted-file.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encrypted-file.pdf


--------------------------------------------------------------------------------
/resources/encrypted_doc_no_id.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encrypted_doc_no_id.pdf


--------------------------------------------------------------------------------
/resources/encryption/r2-empty-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-empty-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r2-owner-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-owner-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r2-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r2-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r3-empty-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r3-empty-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r3-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r3-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r4-aes-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-aes-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r4-owner-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-owner-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r4-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r4-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r5-empty-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-empty-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r5-owner-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-owner-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r5-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r5-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r6-both-passwords.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-both-passwords.pdf


--------------------------------------------------------------------------------
/resources/encryption/r6-empty-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-empty-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r6-owner-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-owner-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/r6-user-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/r6-user-password.pdf


--------------------------------------------------------------------------------
/resources/encryption/unencrypted.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/encryption/unencrypted.pdf


--------------------------------------------------------------------------------
/resources/form.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form.pdf


--------------------------------------------------------------------------------
/resources/form_acrobatReader.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form_acrobatReader.pdf


--------------------------------------------------------------------------------
/resources/form_evince.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/form_evince.pdf


--------------------------------------------------------------------------------
/resources/git.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/git.pdf


--------------------------------------------------------------------------------
/resources/hello-world.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/hello-world.pdf


--------------------------------------------------------------------------------
/resources/imagemagick-ASCII85Decode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-ASCII85Decode.pdf


--------------------------------------------------------------------------------
/resources/imagemagick-CCITTFaxDecode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-CCITTFaxDecode.pdf


--------------------------------------------------------------------------------
/resources/imagemagick-images.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-images.pdf


--------------------------------------------------------------------------------
/resources/imagemagick-lzw.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/imagemagick-lzw.pdf


--------------------------------------------------------------------------------
/resources/indirect-rotation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/indirect-rotation.pdf


--------------------------------------------------------------------------------
/resources/inkscape-abc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/inkscape-abc.pdf


--------------------------------------------------------------------------------
/resources/issue-297.pdf:
--------------------------------------------------------------------------------
 1 | %PDF-1.1
 2 | 
 3 | 1 0 obj
 4 | <<
 5 |  /Type /Catalog
 6 |  /Outlines 2 0 R
 7 |  /Pages 3 0 R
 8 |  /OpenAction 7 0 R
 9 | >>
10 | endobj
11 | 
12 | 2 0 obj
13 | <<
14 |  /Type /Outlines
15 |  /Count 0
16 | >>
17 | endobj
18 | 
19 | 3 0 obj
20 | <<
21 |  /Type /Pages
22 |  /Kids [4 0 R]
23 |  /Count 1
24 | >>
25 | endobj
26 | 
27 | 4 0 obj
28 | <<
29 |  /Type /Page
30 |  /Parent 3 0 R
31 |  /MediaBox [0 0 612 792]
32 |  /Contents 5 0 R
33 |  /Resources <<
34 |              /ProcSet [/PDF /Text]
35 |              /Font << /F1 6 0 R >>
36 |             >>
37 | >>
38 | endobj
39 | 
40 | 5 0 obj
41 | << /Length 56 >>
42 | stream
43 | BT /F1 12 Tf 100 700 Td 15 TL (test example) Tj ET
44 | endstream
45 | endobj
46 | 
47 | 6 0 obj
48 | <<
49 |  /Type /Font
50 |  /Subtype /Type1
51 |  /Name /F1
52 |  /BaseFont /Helvetica
53 |  /Encoding /MacRomanEncoding
54 | >>
55 | endobj
56 | 
57 | 7 0 obj
58 | <<
59 |  /Type /Action
60 |  /S /JavaScript
61 |  /JS (app.alert({cMsg: 'Hello alert', cTitle: 'Testing PDF', nIcon: 3});)
62 | >>
63 | endobj
64 | 
65 | xref
66 | 0 8
67 | 0000000000 65535 f
68 | 0000000012 00000 n
69 | 0000000109 00000 n
70 | 0000000165 00000 n
71 | 0000000234 00000 n
72 | 0000000439 00000 n
73 | 0000000553 00000 n
74 | 0000000677 00000 n
75 | trailer
76 | <<
77 |  /Size 8
78 |  /Root 1 0 R
79 | >>
80 | startxref
81 | 842
82 | %%EOF


--------------------------------------------------------------------------------
/resources/issue-301.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-301.pdf


--------------------------------------------------------------------------------
/resources/issue-604.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-604.pdf


--------------------------------------------------------------------------------
/resources/issue-914-xmp-data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/issue-914-xmp-data.pdf


--------------------------------------------------------------------------------
/resources/jpeg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/jpeg.pdf


--------------------------------------------------------------------------------
/resources/labeled-edges-center-image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/labeled-edges-center-image.pdf


--------------------------------------------------------------------------------
/resources/libreoffice-form.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/libreoffice-form.pdf


--------------------------------------------------------------------------------
/resources/libreoffice-writer-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/libreoffice-writer-password.pdf


--------------------------------------------------------------------------------
/resources/lzw_decoder_table_overflow.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/lzw_decoder_table_overflow.bin


--------------------------------------------------------------------------------
/resources/metadata.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/metadata.pdf


--------------------------------------------------------------------------------
/resources/missing_info.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/missing_info.pdf


--------------------------------------------------------------------------------
/resources/multilang.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/multilang.pdf


--------------------------------------------------------------------------------
/resources/outline-without-title.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/outline-without-title.pdf


--------------------------------------------------------------------------------
/resources/outlines-with-invalid-destinations.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/outlines-with-invalid-destinations.pdf


--------------------------------------------------------------------------------
/resources/pdflatex-forms.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/pdflatex-forms.pdf


--------------------------------------------------------------------------------
/resources/pdflatex-outline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/pdflatex-outline.pdf


--------------------------------------------------------------------------------
/resources/reportlab-inline-image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/reportlab-inline-image.pdf


--------------------------------------------------------------------------------
/resources/selenium-pypdf-issue-177.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/selenium-pypdf-issue-177.pdf


--------------------------------------------------------------------------------
/resources/side-by-side-subfig.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/side-by-side-subfig.pdf


--------------------------------------------------------------------------------
/resources/test Orient.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/test Orient.pdf


--------------------------------------------------------------------------------
/resources/test_watermarking_reportlab_rendering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/test_watermarking_reportlab_rendering.png


--------------------------------------------------------------------------------
/resources/toy.layout.txt:
--------------------------------------------------------------------------------
1 | AWAY again1
2 |    AWAY again2
3 | 
4 | 
5 |     Something[cited]
6 | 
7 |               Single quote operator
8 |               Double quote operator
9 |               Last Txt


--------------------------------------------------------------------------------
/resources/toy.pdf:
--------------------------------------------------------------------------------
 1 | %PDF-1.4
 2 | 1 0 obj
 3 | << /Type /Catalog
 4 | /Outlines 2 0 R
 5 | /Pages 3 0 R
 6 | >>
 7 | endobj
 8 | 2 0 obj
 9 | << /Type /Outlines
10 | /Count 0
11 | >>
12 | endobj
13 | 3 0 obj
14 | << /Type /Pages
15 | /Kids [4 0 R]
16 | /Count 1
17 | >>
18 | endobj
19 | 4 0 obj
20 | << /Type /Page
21 | /Parent 3 0 R
22 | /MediaBox [0 0 612 792]
23 | /Contents 5 0 R
24 | /Resources << /ProcSet 6 0 R
25 | /Font << /F1 7 0 R >>
26 | >>
27 | >>
28 | endobj
29 | 5 0 obj
30 | << /Length 396 >>
31 | stream
32 | q .75000 0 0 .75000 0 792 cm
33 | q .32000 0 0 .32000 0 0 cm
34 | q
35 | q .20812 0 0 .20832 0 0 cm
36 | BT
37 | /F1 200 Tf
38 | 600 -656 Td
39 | [(AWAY again1)] TJ
40 | ET Q
41 | q .20812 0 0 .20832 0 0 cm
42 | BT
43 | /F1 200 Tf
44 | 600 TL
45 | 900 -906 Td
46 | [300 (A) 120 (W) -120 (A) 95 (Y again) (2)] TJ
47 | T*
48 | (Something) Tj
49 | 50 Ts
50 | ([cited]) Tj
51 | 600 -300 TD
52 | 100 Tw
53 | 80 Tz
54 | (Single quote operator) '
55 | 0 Ts
56 | 200 120 (Double quote operator) "
57 | T*
58 | (Last Txt) Tj
59 | ET Q Q Q Q
60 | endstream
61 | endobj
62 | 6 0 obj
63 | [/PDF /Text]
64 | endobj
65 | 7 0 obj
66 | << /Type /Font
67 | /Subtype /TrueType
68 | /Name /F1
69 | /BaseFont /Arial
70 | /Encoding /WinAnsiEncoding
71 | >>
72 | endobj
73 | xref
74 | 0 8
75 | 0000000000 65535 f
76 | 0000000009 00000 n
77 | 0000000076 00000 n
78 | 0000000120 00000 n
79 | 0000000177 00000 n
80 | 0000000318 00000 n
81 | 0000000765 00000 n
82 | 0000000793 00000 n
83 | trailer
84 | << /Size 8
85 | /Root 1 0 R
86 | >>
87 | startxref
88 | 899
89 | %%EOF


--------------------------------------------------------------------------------
/resources/two-different-pages.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/resources/two-different-pages.pdf


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | """Fixtures that are available automatically for all tests."""
 2 | 
 3 | import uuid
 4 | from pathlib import Path
 5 | 
 6 | import pytest
 7 | 
 8 | TESTS_ROOT = Path(__file__).parent.resolve()
 9 | PROJECT_ROOT = TESTS_ROOT.parent
10 | RESOURCE_ROOT = PROJECT_ROOT / "resources"
11 | 
12 | 
13 | @pytest.fixture(scope="session")  # one value per test session
14 | def pdf_file_path(tmp_path_factory):  # only builds a path; no file is written here
15 |     return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.pdf"  # random UUID as file name
16 | 
17 | 
18 | @pytest.fixture(scope="session")  # one value per test session
19 | def txt_file_path(tmp_path_factory):  # only builds a path; no file is written here
20 |     return tmp_path_factory.mktemp("pypdf-data") / f"{uuid.uuid4()}.txt"  # random UUID as file name
21 | 


--------------------------------------------------------------------------------
/tests/generic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/tests/generic/__init__.py


--------------------------------------------------------------------------------
/tests/generic/test_image_inline.py:
--------------------------------------------------------------------------------
 1 | """Test the pypdf.generic._image_inline module."""
 2 | from io import BytesIO
 3 | 
 4 | from pypdf.generic._image_inline import is_followed_by_binary_data
 5 | 
 6 | 
 7 | def test_is_followed_by_binary_data():
 8 |     # Empty stream, followed by a stream of only three bytes.
 9 |     stream = BytesIO()
10 |     assert not is_followed_by_binary_data(stream)
11 | 
12 |     stream = BytesIO(b" q\n")
13 |     assert not is_followed_by_binary_data(stream)
14 | 
15 |     # Bytes < 32 which are not whitespace.
16 |     stream = BytesIO(b"\x00\x11\x13\x37")
17 |     assert is_followed_by_binary_data(stream)
18 |     assert stream.read(1) == b"\x00"  # the check left the stream position at the start
19 |     assert is_followed_by_binary_data(stream)
20 |     assert stream.read(1) == b"\x11"  # again, only this explicit read advanced the position
21 |     assert is_followed_by_binary_data(stream)
22 |     assert stream.read() == b"\x13\x37"
23 | 
24 |     # The only byte < 32 is whitespace (same input as the short stream above).
25 |     stream = BytesIO(b" q\n")
26 |     assert not is_followed_by_binary_data(stream)
27 | 
28 |     # Whitespace only.
29 |     stream = BytesIO(b" \n\n\n  \n")
30 |     assert not is_followed_by_binary_data(stream)
31 | 
32 |     # No `operator_end`.
33 |     stream = BytesIO(b"\n\n\n\n\n\n\n\nBT\n")
34 |     assert not is_followed_by_binary_data(stream)
35 | 
36 |     # Operator length is <= 3.
37 |     stream = BytesIO(b"\n\n\n\n\n\n\nBT\n")
38 |     assert not is_followed_by_binary_data(stream)
39 | 
40 |     # Operator length is > 3.
41 |     stream = BytesIO(b"\n\n\n\n\nTEST\n")
42 |     assert is_followed_by_binary_data(stream)
43 | 
44 |     # Just characters.
45 |     stream = BytesIO(b" ABCDEF")
46 |     assert is_followed_by_binary_data(stream)
47 | 
48 |     # No `operator_start`.
49 |     stream = BytesIO(b"ABCDEFG")
50 |     assert is_followed_by_binary_data(stream)
51 | 
52 |     # Name objects, each followed by a two-letter operator.
53 |     stream = BytesIO(b"/R10 gs\n/R12 cs\n")
54 |     assert not is_followed_by_binary_data(stream)
55 | 
56 |     # Numeric operands followed by a one-letter operator.
57 |     stream = BytesIO(b"1337 42 m\n")
58 |     assert not is_followed_by_binary_data(stream)
59 | 
60 |     stream = BytesIO(b"1234.56 42 13 37 10 20 c\n")
61 |     assert not is_followed_by_binary_data(stream)
62 | 


--------------------------------------------------------------------------------
/tests/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-pdf/pypdf/40752eba72fc2116812b032d4e28025249417870/tests/scripts/__init__.py


--------------------------------------------------------------------------------
/tests/test_constants.py:
--------------------------------------------------------------------------------
  1 | """Test the pypdf.constants module."""
  2 | import re
  3 | from typing import Callable
  4 | 
  5 | import pytest
  6 | 
  7 | from pypdf.constants import PDF_KEYS, GraphicsStateParameters, UserAccessPermissions
  8 | 
  9 | 
 10 | def test_slash_prefix():
 11 |     """
 12 |     Naming conventions of PDF_KEYS (constant names) are followed.
 13 | 
 14 |     This test function validates if PDF key names follow the required pattern:
 15 |     - Starts with a slash "/"
 16 |     - Followed by an uppercase letter
 17 |     - Contains alphanumeric characters (letters and digits)
 18 |     - The attribute name should be a case-insensitive match, with underscores removed
 19 |     """
 20 |     pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$")
 21 |     for cls in PDF_KEYS:
 22 |         for attr in dir(cls):
 23 |             # Skip magic methods
 24 |             if attr.startswith("__") and attr.endswith("__"):
 25 |                 continue
 26 | 
 27 |             # Skip methods
 28 |             constant_value = getattr(cls, attr)
 29 |             if isinstance(constant_value, Callable):
 30 |                 continue
 31 | 
 32 |             assert constant_value.startswith("/")
 33 |             assert attr.replace("_", "").casefold() == constant_value[1:].casefold()
 34 | 
 35 |             # There are a few exceptions that may be lowercase
 36 |             if cls == GraphicsStateParameters and attr in ["ca", "op"]:
 37 |                 continue
 38 | 
 39 |             assert pattern.match(constant_value)
 40 | 
 41 | 
 42 | def test_user_access_permissions__dict_handling():
 43 |     # Value is mix of configurable and reserved bits.
 44 |     # Reserved bits should not be part of the dictionary.
 45 |     as_dict = UserAccessPermissions(512 + 64 + 8).to_dict()  # three bits set, only two keys are True below
 46 |     assert as_dict == {
 47 |         "add_or_modify": False,
 48 |         "assemble_doc": False,
 49 |         "extract": False,
 50 |         "extract_text_and_graphics": True,
 51 |         "fill_form_fields": False,
 52 |         "modify": True,
 53 |         "print": False,
 54 |         "print_to_representation": False,
 55 |     }
 56 | 
 57 |     # Convert the dictionary back to an integer.
 58 |     # This should add the reserved bits automatically.
 59 |     permissions = UserAccessPermissions.from_dict(as_dict)
 60 |     assert permissions == 4294963912  # 4294963392 (result for an empty dictionary, see below) + 512 + 8
 61 | 
 62 |     # Roundtrip for valid dictionary.
 63 |     data = {
 64 |         "add_or_modify": True,
 65 |         "assemble_doc": False,
 66 |         "extract": False,
 67 |         "extract_text_and_graphics": True,
 68 |         "fill_form_fields": False,
 69 |         "modify": True,
 70 |         "print": False,
 71 |         "print_to_representation": True,
 72 |     }
 73 |     assert UserAccessPermissions.from_dict(data).to_dict() == data
 74 | 
 75 |     # Empty inputs.
 76 |     assert UserAccessPermissions.from_dict({}) == 4294963392  # Reserved bits.
 77 |     assert UserAccessPermissions(0).to_dict() == {
 78 |         "add_or_modify": False,
 79 |         "assemble_doc": False,
 80 |         "extract": False,
 81 |         "extract_text_and_graphics": False,
 82 |         "fill_form_fields": False,
 83 |         "modify": False,
 84 |         "print": False,
 85 |         "print_to_representation": False,
 86 |     }
 87 | 
 88 |     # Unknown dictionary keys are rejected and listed in the error message.
 89 |     data = {
 90 |         "add_or_modify": True,
 91 |         "key1": False,
 92 |         "key2": True,
 93 |     }
 94 |     unknown = {  # `data` without its only known key
 95 |         "key1": False,
 96 |         "key2": True,
 97 |     }
 98 |     with pytest.raises(
 99 |         ValueError,
100 |         match=f"Unknown dictionary keys: {unknown!r}"
101 |     ):
102 |         UserAccessPermissions.from_dict(data)
103 | 
104 | 
105 | def test_user_access_permissions__all():
106 |     all_permissions = UserAccessPermissions.all()
107 |     all_int = int(all_permissions)
108 |     all_string = bin(all_permissions)
109 | 
110 |     assert all_string.startswith("0b")
111 |     assert len(all_string[2:]) == 32  # 32-bit integer
112 | 
113 |     assert all_int & UserAccessPermissions.R1 == 0
114 |     assert all_int & UserAccessPermissions.R2 == 0
115 |     assert all_int & UserAccessPermissions.PRINT == UserAccessPermissions.PRINT
116 |     assert all_int & UserAccessPermissions.R7 == UserAccessPermissions.R7
117 |     assert all_int & UserAccessPermissions.R31 == UserAccessPermissions.R31
118 | 


--------------------------------------------------------------------------------
/tests/test_forms.py:
--------------------------------------------------------------------------------
 1 | """Test form-related functionality. Separate file to keep overview."""
 2 | 
 3 | from io import BytesIO
 4 | 
 5 | import pytest
 6 | 
 7 | from pypdf import PdfReader, PdfWriter
 8 | from tests import get_data_from_url
 9 | 
10 | 
11 | @pytest.mark.enable_socket
12 | def test_form_button__v_value_should_be_name_object():
13 |     url = "https://github.com/user-attachments/files/18736500/blank-form.pdf"
14 |     name = "issue3115.pdf"
15 |     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
16 |     writer = PdfWriter(clone_from=reader)
17 |     writer.update_page_form_field_values(
18 |         writer.pages[0],
19 |         {"Other": "/On"},
20 |         auto_regenerate=False,
21 |     )
22 |     stream = BytesIO()
23 |     writer.write(stream)
24 | 
25 |     # Wrong: `/V (/On)`.
26 |     assert b"\n/V /On\n" in stream.getvalue()
27 | 


--------------------------------------------------------------------------------
/tests/test_javascript.py:
--------------------------------------------------------------------------------
 1 | """Test topics around the usage of JavaScript in PDF documents."""
 2 | from pathlib import Path
 3 | from typing import Any
 4 | 
 5 | import pytest
 6 | 
 7 | from pypdf import PdfReader, PdfWriter
 8 | 
 9 | # Configure path environment
10 | TESTS_ROOT = Path(__file__).parent.resolve()
11 | PROJECT_ROOT = TESTS_ROOT.parent
12 | RESOURCE_ROOT = PROJECT_ROOT / "resources"
13 | 
14 | 
15 | @pytest.fixture
16 | def pdf_file_writer():
17 |     reader = PdfReader(RESOURCE_ROOT / "issue-604.pdf")
18 |     writer = PdfWriter()
19 |     writer.append_pages_from_reader(reader)
20 |     return writer
21 | 
22 | 
23 | def test_add_js(pdf_file_writer):
24 |     pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
25 | 
26 |     assert (
27 |         "/Names" in pdf_file_writer._root_object
28 |     ), "add_js should add a name catalog in the root object."
29 |     assert (
30 |         "/JavaScript" in pdf_file_writer._root_object["/Names"]
31 |     ), "add_js should add a JavaScript name tree under the name catalog."
32 | 
33 | 
34 | def test_added_js(pdf_file_writer):
35 |     def get_javascript_name() -> Any:
36 |         assert "/Names" in pdf_file_writer._root_object
37 |         assert "/JavaScript" in pdf_file_writer._root_object["/Names"]
38 |         assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"]
39 |         return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][
40 |             -2
41 |         ]  # return -2 in order to get the latest javascript
42 | 
43 |     pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
44 |     first_js = get_javascript_name()
45 | 
46 |     pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
47 |     second_js = get_javascript_name()
48 | 
49 |     assert (
50 |         first_js != second_js
51 |     ), "add_js should add to the previous script in the catalog."
52 | 


--------------------------------------------------------------------------------
/tests/test_pagerange.py:
--------------------------------------------------------------------------------
  1 | """Test the pypdf.pagerange module."""
  2 | import pytest
  3 | 
  4 | from pypdf.pagerange import PageRange, ParseError, parse_filename_page_ranges
  5 | 
  6 | 
  7 | def test_equality():
  8 |     pr1 = PageRange(slice(0, 5))
  9 |     pr2 = PageRange(slice(0, 5))
 10 |     assert pr1 == pr2
 11 | 
 12 | 
 13 | @pytest.mark.parametrize(
 14 |     ("page_range", "expected"),  # constructor input and the expected str() result
 15 |     [(slice(0, 5), "0:5"), (slice(0, 5, 2), "0:5:2"), ("-1", "-1:"), ("0", "0")],
 16 | )
 17 | def test_str(page_range, expected):
 18 |     assert str(PageRange(page_range)) == expected  # slices and strings are both accepted as input
 19 | 
 20 | 
 21 | @pytest.mark.parametrize(
 22 |     ("page_range", "expected"),  # constructor input and the expected repr() result
 23 |     [(slice(0, 5), "PageRange('0:5')"), (slice(0, 5, 2), "PageRange('0:5:2')")],
 24 | )
 25 | def test_repr(page_range, expected):
 26 |     assert repr(PageRange(page_range)) == expected  # repr wraps the string form in PageRange('...')
 27 | 
 28 | 
 29 | def test_equality_other_objectc():
 30 |     pr1 = PageRange(slice(0, 5))
 31 |     pr2 = "PageRange(slice(0, 5))"
 32 |     assert pr1 != pr2
 33 | 
 34 | 
 35 | def test_idempotency():
 36 |     pr = PageRange(slice(0, 5))
 37 |     pr2 = PageRange(pr)
 38 |     assert pr == pr2
 39 | 
 40 | 
 41 | @pytest.mark.parametrize(
 42 |     ("range_str", "expected"),
 43 |     [
 44 |         ("42", slice(42, 43)),
 45 |         ("1:2", slice(1, 2)),
 46 |     ],
 47 | )
 48 | def test_str_init(range_str, expected):
 49 |     pr = PageRange(range_str)
 50 |     assert pr._slice == expected
 51 |     assert PageRange.valid
 52 | 
 53 | 
 54 | def test_str_init_error():
 55 |     init_str = "1-2"
 56 |     assert PageRange.valid(init_str) is False
 57 |     with pytest.raises(ParseError) as exc:
 58 |         PageRange(init_str)
 59 |     assert exc.value.args[0] == "1-2"
 60 | 
 61 | 
 62 | @pytest.mark.parametrize(
 63 |     ("params", "expected"),  # flat argument list and the expected (filename, PageRange) pairs
 64 |     [
 65 |         (["foo.pdf", "1:5"], [("foo.pdf", PageRange("1:5"))]),
 66 |         (
 67 |             ["foo.pdf", "1:5", "bar.pdf"],  # "bar.pdf" has no explicit page range
 68 |             [("foo.pdf", PageRange("1:5")), ("bar.pdf", PageRange(":"))],
 69 |         ),
 70 |     ],
 71 | )
 72 | def test_parse_filename_page_ranges(params, expected):
 73 |     assert parse_filename_page_ranges(params) == expected
 74 | 
 75 | 
 76 | def test_parse_filename_page_ranges_err():
 77 |     with pytest.raises(ValueError) as exc:
 78 |         parse_filename_page_ranges(["1:5", "foo.pdf"])
 79 |     assert (
 80 |         exc.value.args[0] == "The first argument must be a filename, not a page range."
 81 |     )
 82 | 
 83 | 
 84 | @pytest.mark.parametrize(
 85 |     ("a", "b", "expected"),
 86 |     [
 87 |         (PageRange(slice(0, 5)), PageRange(slice(2, 10)), slice(0, 10)),
 88 |         (PageRange(slice(0, 5)), PageRange(slice(2, 3)), slice(0, 5)),
 89 |         (PageRange(slice(0, 5)), PageRange(slice(5, 10)), slice(0, 10)),
 90 |     ],
 91 | )
 92 | def test_addition(a, b, expected):
 93 |     pr1 = PageRange(a)
 94 |     pr2 = PageRange(b)
 95 |     assert pr1 + pr2 == PageRange(expected)
 96 |     assert pr2 + pr1 == PageRange(expected)  # addition is commutative
 97 | 
 98 | 
 99 | @pytest.mark.parametrize(
100 |     ("a", "b"),
101 |     [
102 |         (PageRange(slice(0, 5)), PageRange(slice(7, 10))),  # indices 5 and 6 are covered by neither range
103 |         (PageRange(slice(7, 10)), PageRange(slice(0, 5))),  # same ranges, operands swapped
104 |     ],
105 | )
106 | def test_addition_gap(a: PageRange, b: PageRange):
107 |     with pytest.raises(ValueError) as exc:
108 |         a + b
109 |     assert exc.value.args[0] == "Can't add PageRanges with gap"
110 | 
111 | 
112 | def test_addition_non_page_range():
113 |     with pytest.raises(TypeError) as exc:
114 |         PageRange(slice(0, 5)) + "2:7"  # the right operand is a plain str, not a PageRange
115 |     assert exc.value.args[0] == "Can't add PageRange and <class 'str'>"
116 | 
117 | 
118 | def test_addition_stride():
119 |     a = PageRange(slice(0, 5, 2))
120 |     b = PageRange(slice(7, 9))
121 |     with pytest.raises(ValueError) as exc:
122 |         a + b
123 |     assert exc.value.args[0] == "Can't add PageRange with stride"
124 | 


--------------------------------------------------------------------------------
/tests/test_papersizes.py:
--------------------------------------------------------------------------------
 1 | """Test the pypdf.papersizes module."""
 2 | import pytest
 3 | 
 4 | from pypdf import papersizes
 5 | 
 6 | 
 7 | def test_din_a0_paper_size():
 8 |     """The dimensions and area of the DIN A0 paper size are correct."""
 9 |     dim = papersizes.PaperSize.A0
10 |     area_square_pixels = float(dim.width) * dim.height
11 | 
12 |     # 72 pixels is 1 inch
13 |     area_square_inch = area_square_pixels / 72**2
14 | 
15 |     # 25.4 millimeter is equal to 1 inches
16 |     area_square_mm = area_square_inch * (25.4) ** 2
17 |     assert abs(area_square_mm - 999949) < 100
18 |     conversion_factor = 72 / 25.4
19 |     assert (dim.width - 841 * conversion_factor) < 1
20 |     assert (dim.width - 1189 * conversion_factor) < 1
21 | 
22 | 
23 | @pytest.mark.parametrize("dimensions", papersizes._din_a)
24 | def test_din_a_aspect_ratio(dimensions):
25 |     """The height of each DIN A paper size is its width times sqrt(2), within a tolerance of 2.5."""
26 |     assert abs(dimensions.height - dimensions.width * 2**0.5) <= 2.5
27 | 
28 | 
29 | @pytest.mark.parametrize(
30 |     ("dimensions_a", "dimensions_b"),
31 |     list(zip(papersizes._din_a, papersizes._din_a[1:])),  # each entry paired with its successor
32 | )
33 | def test_din_a_size_doubling(dimensions_a, dimensions_b):
34 |     """The height of a DIN A paper size is twice the width of the following entry in `_din_a`."""
35 |     assert abs(dimensions_a.height - 2 * dimensions_b.width) <= 4
36 | 


--------------------------------------------------------------------------------
/tests/test_pdfa.py:
--------------------------------------------------------------------------------
 1 | """Ensure that pypdf doesn't break PDF/A compliance."""
 2 | 
 3 | from io import BytesIO
 4 | from pathlib import Path
 5 | from typing import Optional
 6 | 
 7 | import pytest
 8 | 
 9 | from pypdf import PdfReader, PdfWriter
10 | 
11 | TESTS_ROOT = Path(__file__).parent.resolve()
12 | PROJECT_ROOT = TESTS_ROOT.parent
13 | RESOURCE_ROOT = PROJECT_ROOT / "resources"
14 | SAMPLE_ROOT = PROJECT_ROOT / "sample-files"
15 | 
16 | 
17 | def is_pdfa1b_compliant(src: BytesIO):
18 |     """Check one necessary condition for PDF/A-1b compliance; this is not a full validation."""
19 | 
20 |     def document_information_has_analoguos_xml(src: BytesIO) -> bool:
21 |         reader = PdfReader(src)
22 |         meta = reader.metadata
23 |         xmp = reader.xmp_metadata
24 |         if not meta:  # no document information: nothing that XMP would have to mirror
25 |             return True
26 |         if not xmp:  # document information without any XMP metadata
27 |             return False
28 |         if meta.title and not xmp.dc_title:  # title in the document information, but none in XMP
29 |             return meta.title == xmp.dc_title  # NOTE(review): truthy vs. falsy operand, presumably always False
30 |         return True  # NOTE(review): titles present on both sides are not compared
31 | 
32 |     return document_information_has_analoguos_xml(src)
33 | 
34 | 
35 | @pytest.mark.samples
36 | @pytest.mark.parametrize(
37 |     ("src", "diagnostic_write_name"),
38 |     [
39 |         (SAMPLE_ROOT / "021-pdfa/crazyones-pdfa.pdf", None),
40 |     ],
41 | )
42 | def test_pdfa(src: Path, diagnostic_write_name: Optional[str]):
43 |     with open(src, "rb") as fp:
44 |         data = BytesIO(fp.read())
45 |     reader = PdfReader(src)
46 |     assert is_pdfa1b_compliant(data)
47 |     writer = PdfWriter()
48 |     writer.clone_document_from_reader(reader)
49 | 
50 |     stream = BytesIO()
51 |     writer.write(stream)
52 |     stream.seek(0)
53 | 
54 |     assert is_pdfa1b_compliant(stream)
55 |     if diagnostic_write_name:
56 |         with open(diagnostic_write_name, "wb") as fp:
57 |             stream.seek(0)
58 |             fp.write(stream.read())
59 | 


--------------------------------------------------------------------------------
/tests/test_protocols.py:
--------------------------------------------------------------------------------
 1 | """Test the pypdf._protocols module."""
 2 | from pypdf._protocols import PdfObjectProtocol
 3 | 
 4 | 
 5 | class IPdfObjectProtocol(PdfObjectProtocol):  # subclass that the test below instantiates
 6 |     pass  # adds nothing: every method comes from PdfObjectProtocol
 7 | 
 8 | 
 9 | def test_pdfobjectprotocol():
10 |     o = IPdfObjectProtocol()
11 |     assert o.clone(None, False, None) is None
12 |     assert o._reference_clone(None, None) is None
13 |     assert o.get_object() is None
14 |     assert o.hash_value() is None
15 |     assert o.write_to_stream(None) is None
16 | 


--------------------------------------------------------------------------------
/tests/test_xobject_image_helpers.py:
--------------------------------------------------------------------------------
  1 | """Test the pypdf._xobj_image_helpers module."""
  2 | from io import BytesIO
  3 | 
  4 | import pytest
  5 | 
  6 | from pypdf import PdfReader
  7 | from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate
  8 | from pypdf.errors import EmptyImageDataError, PdfReadError
  9 | from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject
 10 | 
 11 | from . import get_data_from_url
 12 | 
 13 | 
 14 | @pytest.mark.enable_socket
 15 | def test_get_imagemode_recursion_depth():
 16 |     """Avoid infinite recursion for nested color spaces."""
 17 |     url = "https://github.com/py-pdf/pypdf/files/12814018/out1.pdf"
 18 |     name = "issue2240.pdf"
 19 |     # Simple example: Just let the color space object reference itself.
 20 |     # The alternative would be to generate a chain of referencing objects.
 21 |     content = get_data_from_url(url, name=name)
 22 |     source = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 7 0 R 11 0 R 12 0 R ]\nendobj\n"
 23 |     target = b"\n10 0 obj\n[ /DeviceN [ /HKS#2044#20K /Magenta /Yellow /Black ] 10 0 R 11 0 R 12 0 R ]\nendobj\n"
 24 |     reader = PdfReader(BytesIO(content.replace(source, target)))
 25 |     with pytest.raises(
 26 |         PdfReadError,
 27 |         match="Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH.",
 28 |     ):
 29 |         reader.pages[0].images[0]
 30 | 
 31 | 
 32 | def test_handle_flate__image_mode_1(caplog):
 33 |     data = b"\x00\xe0\x00"  # 0xe0 = 0b11100000; presumably one padded byte per row of the 3x3 image
 34 |     lookup = DecodedStreamObject()
 35 |     expected_data = [
 36 |         (66, 66, 66),
 37 |         (66, 66, 66),
 38 |         (66, 66, 66),
 39 |         (0, 19, 55),
 40 |         (0, 19, 55),
 41 |         (0, 19, 55),
 42 |         (66, 66, 66),
 43 |         (66, 66, 66),
 44 |         (66, 66, 66),
 45 |     ]
 46 | 
 47 |     # Lookup data of exactly six bytes: no trailing data, nothing is logged.
 48 |     lookup.set_data(b"\x42\x42\x42\x00\x13\x37")
 49 |     result = _handle_flate(
 50 |         size=(3, 3),
 51 |         data=data,
 52 |         mode="1",
 53 |         color_space=ArrayObject(
 54 |             [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]
 55 |         ),
 56 |         colors=2,
 57 |         obj_as_text="dummy",
 58 |     )
 59 |     assert expected_data == list(result[0].getdata())
 60 |     assert not caplog.text
 61 | 
 62 |     # Trailing whitespace is accepted without logging anything.
 63 |     lookup.set_data(b"\x42\x42\x42\x00\x13\x37  \x0a")
 64 |     result = _handle_flate(
 65 |         size=(3, 3),
 66 |         data=data,
 67 |         mode="1",
 68 |         color_space=ArrayObject(
 69 |             [NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]
 70 |         ),
 71 |         colors=2,
 72 |         obj_as_text="dummy",
 73 |     )
 74 |     assert expected_data == list(result[0].getdata())
 75 |     assert not caplog.text
 76 | 
 77 |     # A trailing non-whitespace byte yields the same image, but is logged.
 78 |     lookup.set_data(b"\x42\x42\x42\x00\x13\x37\x12")
 79 |     result = _handle_flate(
 80 |         size=(3, 3),
 81 |         data=data,
 82 |         mode="1",
 83 |         color_space=ArrayObject(
 84 |             [
 85 |                 NameObject("/Indexed"),
 86 |                 NameObject("/DeviceRGB"),
 87 |                 NumberObject(1),
 88 |                 lookup,
 89 |             ]
 90 |         ),
 91 |         colors=2,
 92 |         obj_as_text="dummy",
 93 |     )
 94 |     assert expected_data == list(result[0].getdata())
 95 |     assert "Too many lookup values: Expected 6, got 7." in caplog.text
 96 | 
 97 |     # Not enough lookup data.
 98 |     # The middle part (`\xe0` of the original input) cannot use `0x37 = 55` for the lookup
 99 |     # here, but receives a padding of `0` instead.
100 |     lookup.set_data(b"\x42\x42\x42\x00\x13")
101 |     caplog.clear()  # drop the message logged by the previous case
102 |     expected_short_data = [entry if entry[0] == 66 else (0, 19, 0) for entry in expected_data]
103 |     result = _handle_flate(
104 |         size=(3, 3),
105 |         data=data,
106 |         mode="1",
107 |         color_space=ArrayObject(
108 |             [
109 |                 NameObject("/Indexed"),
110 |                 NameObject("/DeviceRGB"),
111 |                 NumberObject(1),
112 |                 lookup,
113 |             ]
114 |         ),
115 |         colors=2,
116 |         obj_as_text="dummy",
117 |     )
118 |     assert expected_short_data == list(result[0].getdata())
119 |     assert "Not enough lookup values: Expected 6, got 5." in caplog.text
120 | 
121 | 
122 | def test_extended_image_frombytes_zero_data():
123 |     mode = "RGB"
124 |     size = (1, 1)
125 |     data = b""
126 | 
127 |     with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."):
128 |         _extended_image_frombytes(mode, size, data)
129 | 


--------------------------------------------------------------------------------